import pandas as pd
from sklearn.preprocessing import LabelEncoder
from Data_Preprocessing import DataPreprocessing
import seaborn as sns
import matplotlib.pyplot as plt
from tabulate import tabulate

import os

# API key handed to the project-local DataPreprocessing helper.
# The NASA_API_KEY environment variable takes precedence so a real key does
# not have to live in source control; the original literal is the fallback.
api_key = os.environ.get("NASA_API_KEY", "**********")
NASA_Data = DataPreprocessing(api_key)

# Build the CSV path with os.path.join so the separator matches the host OS
# (a hard-coded backslash only resolves on Windows).
csv_file_path = os.path.join('..', 'data')
solar_data = pd.read_csv(os.path.join(csv_file_path, 'null_removed.csv'))


####################################Checking Missing Values######################################
# Both calls are delegated to DataPreprocessing; what they report is defined
# in that module and is not visible from this script.
NASA_Data.check_missing_values(solar_data)
NASA_Data.summarize_columns(solar_data)

##########################################END####################################################

########################Creating new features###################################################
# Extracting flare information.
# classType is split into its first letter and its first numeric magnitude
# (presumably values look like 'M1.2' or 'X2' -- TODO confirm against the data).
# expand=False makes str.extract return a Series instead of a 1-column frame.
solar_data['classType_Agg'] = solar_data['classType'].str.extract(
    r'([A-Za-z])', expand=False
)
# The fractional part is optional so integer magnitudes are captured too, and
# the result is converted to a number so it can be used in numeric analysis.
solar_data['classType_Scale'] = pd.to_numeric(
    solar_data['classType'].str.extract(r'(\d+(?:\.\d+)?)', expand=False)
)

# Converting timestamps to datetime
solar_data['beginTime'] = pd.to_datetime(solar_data['beginTime'])
solar_data['endTime'] = pd.to_datetime(solar_data['endTime'])

# Calculating duration (seconds between beginTime and endTime)
solar_data['flare_duration'] = (
    solar_data['endTime'] - solar_data['beginTime']
).dt.total_seconds()

# Time since last flare calculation: rows must be in chronological order
# before diff(); the first row has no predecessor, so its gap is set to 0.
solar_data = solar_data.sort_values(by='beginTime')
solar_data['time_since_last_flare'] = (
    solar_data['beginTime'].diff().dt.total_seconds().fillna(0)
)

# Extracting hour and month
solar_data['flare_hour'] = solar_data['beginTime'].dt.hour
solar_data['flare_month'] = solar_data['beginTime'].dt.month


# Encode categorical columns as integer labels.
# Each column gets its own LabelEncoder: re-fitting one shared encoder would
# overwrite its classes_, making inverse_transform impossible for every
# column except the last one fitted.
label_encoders = {}
for source_column, encoded_column in (
    ('classType', 'classType_encoded'),
    ('classType_Agg', 'classTypeAgg_encoded'),
    ('type', 'Type_encoded'),
    ('sourceLocation_x', 'sourceLocation_x_encoded'),
):
    column_encoder = LabelEncoder()
    solar_data[encoded_column] = column_encoder.fit_transform(solar_data[source_column])
    label_encoders[source_column] = column_encoder

# Kept for backward compatibility: the original script left `label_encoder`
# fitted on the last encoded column.
label_encoder = label_encoders['sourceLocation_x']

#############################################End of feature creation##################################

#######################################Printing the table############################################
# Render the whole DataFrame (column names as headers) as a 'pretty' text
# table, then write it to stdout.
table_text = tabulate(solar_data, headers='keys', tablefmt='pretty')
print(table_text)

########################################End of table print###########################################

#########################################Visualizations##############################################

# Count plots for the categorical columns.
# savefig does not create missing directories, so make sure the output folder
# exists before the first figure is written.
os.makedirs('images', exist_ok=True)

# (column plotted on the x-axis, figure title, output file)
count_plot_specs = [
    # Solar flare class distribution (granular and aggregated)
    ('classType',
     'Distribution of Solar Flare Classes Granular',
     'images/solar_flare_class_distribution_grn.png'),
    ('classType_Agg',
     'Distribution of Solar Flare Classes Aggregated',
     'images/solar_flare_class_distribution_agg.png'),
    # CME type distribution
    ('type',
     'Distribution of CME Types',
     'images/cme_type_distribution.png'),
]

for count_column, count_title, count_output in count_plot_specs:
    plt.figure(figsize=(8, 6))
    sns.countplot(x=count_column, data=solar_data)
    plt.title(count_title)
    # Save before show(): the figure may be cleared once the window closes.
    plt.savefig(count_output)
    plt.show()


# Columns that take part in the correlation analysis and the pair plot.
# The order is preserved because it determines the pair plot's grid layout.
numeric_vars = [
    'latitude', 'longitude', 'halfAngle', 'speed',
    'classType_encoded', 'Type_encoded', 'classTypeAgg_encoded',
    'activeRegionNum_x', 'sourceLocation_x_encoded',
]

# Restrict the data to just those columns
correlation_df = solar_data[numeric_vars]

# Target correlation: correlate every selected column with the encoded flare
# class and list the results from highest to lowest.
correlation_matrix = correlation_df.corr()
target_corr = correlation_matrix['classType_encoded'].sort_values(ascending=False)
print(target_corr)

# Pairplot coloured by the encoded flare class, KDE curves on the diagonal
sns.pairplot(
    correlation_df,
    hue='classType_encoded',
    diag_kind='kde',
)
plt.savefig('images/pairplot.png')
plt.show()

# CME speed per granular solar flare class (box plot, default axis labels)
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='classType', y='speed', data=solar_data, ax=ax)
ax.set_title('CME Speed Distribution by Solar Flare Class')
fig.savefig('images/cme_speed_distribution_by_class.png')
plt.show()

# CME speed per aggregated solar flare class (box plot, explicit axis labels)
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='classType_Agg', y='speed', data=solar_data, ax=ax)
ax.set_title('CME Speed Distribution by Solar Flare Class')
ax.set_xlabel('Solar Flare Class')
ax.set_ylabel('Speed (km/s)')
fig.savefig('images/cme_speed_distribution_by_flare_class_agg.png')
plt.show()

# CME Speed Distribution by CME Class
plt.figure(figsize=(10, 6))
sns.boxplot(x='type', y='speed', data=solar_data)
plt.title('CME Speed Distribution by CME Class')
# The x-axis shows the CME 'type' column, not the flare class.
plt.xlabel('CME Type')
plt.ylabel('Speed (km/s)')
# Dedicated file name: the previous target was
# 'images/solar_flare_class_distribution_grn.png', which overwrote the
# granular flare-class count plot saved earlier in this script.
plt.savefig('images/cme_speed_distribution_by_cme_type.png')
plt.show()

# CME speed distribution by granular flare class, drawn on a wider figure
# with labelled axes and vertical tick labels.
plt.figure(figsize=(15, 8))
sns.boxplot(x='classType', y='speed', data=solar_data)
plt.title('CME Speed Distribution by Solar Flare Class')
plt.xlabel('Solar Flare Class')
plt.ylabel('CME Speed (km/s)')
plt.xticks(rotation=90)  # Rotate x-axis labels for readability
plt.savefig('images/cme_speed_distribution_by_flare_class.png')
# Stray trailing text after show() was removed; it made the file unparsable.
plt.show()