import pandas as pd
from sklearn.preprocessing import LabelEncoder
from Data_Preprocessing import DataPreprocessing
import seaborn as sns
import matplotlib.pyplot as plt
from tabulate import tabulate
import os
# --- Data loading and feature engineering ---------------------------------
# Prefer a key supplied through the NASA_API_KEY environment variable so the
# credential does not have to live in the source file; the in-file placeholder
# is kept as the fallback so existing setups keep working.
api_key = os.environ.get("NASA_API_KEY", "**********")
NASA_Data = DataPreprocessing(api_key)

# Build the path with os.path.join instead of a hard-coded backslash so the
# script also runs on non-Windows systems.
csv_file_path = os.path.join('..', 'data')
solar_data = pd.read_csv(os.path.join(csv_file_path, 'null_removed.csv'))

# Project helpers; NOTE(review): presumably these only report on the frame
# (their return values are not used here) - confirm in Data_Preprocessing.
NASA_Data.check_missing_values(solar_data)
NASA_Data.summarize_columns(solar_data)

# Split classType into its first letter and its first numeric magnitude.
# The magnitude is converted to a float (it was previously left as a string)
# and the decimal part is optional so integer magnitudes are not dropped.
solar_data['classType_Agg'] = solar_data['classType'].str.extract(
    r'([A-Za-z])', expand=False
)
solar_data['classType_Scale'] = pd.to_numeric(
    solar_data['classType'].str.extract(r'(\d+(?:\.\d+)?)', expand=False)
)

# Time-derived features, all based on the parsed begin/end timestamps.
solar_data['beginTime'] = pd.to_datetime(solar_data['beginTime'])
solar_data['endTime'] = pd.to_datetime(solar_data['endTime'])
solar_data['flare_duration'] = (
    solar_data['endTime'] - solar_data['beginTime']
).dt.total_seconds()

# Sort chronologically so diff() measures the gap to the previous flare;
# the first row has no predecessor and is filled with 0 seconds.
solar_data = solar_data.sort_values(by='beginTime')
solar_data['time_since_last_flare'] = (
    solar_data['beginTime'].diff().dt.total_seconds().fillna(0)
)
solar_data['flare_hour'] = solar_data['beginTime'].dt.hour
solar_data['flare_month'] = solar_data['beginTime'].dt.month

# Integer-encode the categorical columns. Each column gets its own fitted
# encoder, kept in `label_encoders`, so every mapping can later be inverted
# (refitting one shared encoder discarded all but the last mapping).
# `label_encoder` still ends up bound to the encoder fitted on
# sourceLocation_x, exactly as before.
label_encoders = {}
for source_column, encoded_column in (
    ('classType', 'classType_encoded'),
    ('classType_Agg', 'classTypeAgg_encoded'),
    ('type', 'Type_encoded'),
    ('sourceLocation_x', 'sourceLocation_x_encoded'),
):
    label_encoder = LabelEncoder()
    solar_data[encoded_column] = label_encoder.fit_transform(
        solar_data[source_column]
    )
    label_encoders[encoded_column] = label_encoder

# Dump the full engineered frame to stdout for inspection.
print(tabulate(solar_data, headers='keys', tablefmt='pretty'))
# --- Category count plots ---------------------------------------------------
# savefig does not create missing directories, so make sure the output
# folder exists before the first figure is written.
os.makedirs('images', exist_ok=True)

# (column to count, figure title, output file) for each count plot.
count_plot_specs = (
    (
        'classType',
        'Distribution of Solar Flare Classes Granular',
        'images/solar_flare_class_distribution_grn.png',
    ),
    (
        'classType_Agg',
        'Distribution of Solar Flare Classes Aggregated',
        'images/solar_flare_class_distribution_agg.png',
    ),
    (
        'type',
        'Distribution of CME Types',
        'images/cme_type_distribution.png',
    ),
)

for count_column, count_title, count_output in count_plot_specs:
    plt.figure(figsize=(8, 6))
    sns.countplot(x=count_column, data=solar_data)
    plt.title(count_title)
    # Save before show(): the figure may be released once the window closes.
    plt.savefig(count_output)
    plt.show()
# --- Correlation with the encoded flare class -------------------------------
# Columns included in the correlation matrix and the pair plot.
numeric_vars = [
    'latitude',
    'longitude',
    'halfAngle',
    'speed',
    'classType_encoded',
    'Type_encoded',
    'classTypeAgg_encoded',
    'activeRegionNum_x',
    'sourceLocation_x_encoded',
]
correlation_df = solar_data[numeric_vars]

# Correlation of every selected column with the encoded granular class
# (pandas' default method), strongest positive correlation first.
target_corr = (
    correlation_df.corr()['classType_encoded'].sort_values(ascending=False)
)
print(target_corr)

# savefig does not create missing directories, so make sure the output
# folder exists before the figure is written.
os.makedirs('images', exist_ok=True)

# Pairwise scatter plots coloured by encoded class, with KDE on the diagonal.
# plt.savefig writes the current figure, which is the one pairplot just made.
sns.pairplot(correlation_df, hue='classType_encoded', diag_kind='kde')
plt.savefig('images/pairplot.png')
plt.show()
# --- CME speed box plots ----------------------------------------------------
# savefig does not create missing directories, so make sure the output
# folder exists before the first figure is written.
os.makedirs('images', exist_ok=True)

# Speed by granular flare class, default axis labels.
plt.figure(figsize=(10, 6))
sns.boxplot(x='classType', y='speed', data=solar_data)
plt.title('CME Speed Distribution by Solar Flare Class')
plt.savefig('images/cme_speed_distribution_by_class.png')
plt.show()

# Speed by aggregated (single-letter) flare class.
plt.figure(figsize=(10, 6))
sns.boxplot(x='classType_Agg', y='speed', data=solar_data)
plt.title('CME Speed Distribution by Solar Flare Class')
plt.xlabel('Solar Flare Class')
plt.ylabel('Speed (km/s)')
plt.savefig('images/cme_speed_distribution_by_flare_class_agg.png')
plt.show()

# Speed by CME type. This figure used to be written to
# 'images/solar_flare_class_distribution_grn.png', silently overwriting the
# granular class count plot, and its x axis was labelled as the flare class.
plt.figure(figsize=(10, 6))
sns.boxplot(x='type', y='speed', data=solar_data)
plt.title('CME Speed Distribution by CME Class')
plt.xlabel('CME Type')
plt.ylabel('Speed (km/s)')
plt.savefig('images/cme_speed_distribution_by_cme_type.png')
plt.show()

# Speed by granular flare class again, on a wider canvas with rotated tick
# labels so the many class names stay readable.
plt.figure(figsize=(15, 8))
sns.boxplot(x='classType', y='speed', data=solar_data)
plt.title('CME Speed Distribution by Solar Flare Class')
plt.xlabel('Solar Flare Class')
plt.ylabel('CME Speed (km/s)')
plt.xticks(rotation=90)
plt.savefig('images/cme_speed_distribution_by_flare_class.png')
plt.show()
# Language: Python