import pandas as pd
import os
import sqlite3
script_dir = os.path.dirname(os.path.abspath(__file__))
data_dir = os.path.abspath(os.path.join(script_dir, '..', 'data'))
sharp_csv = os.path.join(data_dir,"SHARPevents.csv")
sharp_events_df = pd.read_csv(sharp_csv)
solar_flare_df = pd.read_csv( os.path.join(data_dir,"solar_flare_data.csv"))
sharp_events_df['event_date'] = pd.to_datetime(sharp_events_df['event_date']).dt.date
solar_flare_df['beginDate'] = pd.to_datetime(solar_flare_df['beginTime']).dt.date
solar_flare_df = solar_flare_df[(solar_flare_df['activeRegionNum']!='') & (solar_flare_df['activeRegionNum'].notna())]
sharp_events_df['noaa_active_region'] = sharp_events_df['noaa_active_region'].astype(int)
solar_flare_df['activeRegionNum'] = solar_flare_df['activeRegionNum'].astype(int)
merged_df = pd.merge(sharp_events_df, solar_flare_df, left_on=['event_date', 'noaa_active_region'], right_on=['beginDate', 'activeRegionNum'], how='inner')
conn = sqlite3.connect(os.path.join(data_dir,"HMI_SHARP_SWPC_FINAL_NEW.db"))
unmatched_solar_flare_df = solar_flare_df[~solar_flare_df['flrID'].isin(merged_df['flrID'])]
additional_matches = []
for date, AR in unmatched_solar_flare_df[['beginDate', 'activeRegionNum']].drop_duplicates().values:
temp_unmatched_df = unmatched_solar_flare_df[(unmatched_solar_flare_df['beginDate'] == date) & (unmatched_solar_flare_df['activeRegionNum'] == AR)]
query_hmi = f'SELECT * FROM [VW_HMIDAILY] WHERE T_REC = "{date}" AND NOAA_AR = "{AR}"'
hmi_df = pd.read_sql(query_hmi, conn)
hmi_df['T_REC_date'] = pd.to_datetime(hmi_df['T_REC']).dt.date
date_matches_df = pd.merge(temp_unmatched_df, hmi_df, left_on=['beginDate', 'activeRegionNum'], right_on=['T_REC_date', 'NOAA_AR'], how='inner')
additional_matches.append(date_matches_df)
if additional_matches:
additional_matches_df = pd.concat(additional_matches, ignore_index=True)
else:
additional_matches_df = pd.DataFrame()
final_merged_df = pd.concat([merged_df, additional_matches_df], ignore_index=True)
columns = ['flrID', 'instruments', 'beginTime', 'peakTime', 'endTime', 'classType', 'sourceLocation',
'activeRegionNum', 'note', 'submissionTime', 'versionId', 'link', 'linkedEvents', 'T_REC', 'HARPNUM',
'TOTUSJH', 'TOTPOT', 'TOTUSJZ', 'ABSNJZH', 'SAVNCPP', 'USFLUX', 'AREA_ACR', 'MEANPOT', 'R_VALUE',
'SHRGT45', 'MEANSHR', 'MEANGAM', 'MEANGBT', 'MEANGBZ', 'MEANGBH', 'MEANJZH', 'MEANJZD', 'MEANALP',
'NOAA_AR', 'NOAA_NUM', 'NOAA_ARS', 'QUALITY']
final_dataset = final_merged_df[columns]
final_csv = os.path.join(data_dir, 'final_merged_data.csv')
final_dataset.to_csv(final_csv, index=False)
conn.close()
Language:Python