import pandas as pd
import os
import sqlite3 

# Resolve the data directory relative to this script's own location, so the
# script works regardless of the current working directory.
script_dir = os.path.dirname(os.path.abspath(__file__))
data_dir = os.path.abspath(os.path.join(script_dir, '..', 'data'))

# Load the SHARP event list and the SWPC solar-flare catalogue.
sharp_csv = os.path.join(data_dir, "SHARPevents.csv")
sharp_events_df = pd.read_csv(sharp_csv)

flare_csv = os.path.join(data_dir, "solar_flare_data.csv")
solar_flare_df = pd.read_csv(flare_csv)

# Derive calendar-date join keys: the event date on the SHARP side and the
# date portion of each flare's beginTime on the SWPC side.
sharp_events_df['event_date'] = pd.to_datetime(sharp_events_df['event_date']).dt.date
solar_flare_df['beginDate'] = pd.to_datetime(solar_flare_df['beginTime']).dt.date

# Keep only flares with a known active region (drop NA and empty entries).
has_region = solar_flare_df['activeRegionNum'].notna() & (solar_flare_df['activeRegionNum'] != '')
solar_flare_df = solar_flare_df[has_region]

# Both active-region columns must share an integer dtype to act as merge keys.
sharp_events_df['noaa_active_region'] = sharp_events_df['noaa_active_region'].astype(int)
solar_flare_df['activeRegionNum'] = solar_flare_df['activeRegionNum'].astype(int)

# Inner join on (date, active-region) pairs across the two catalogues.
merged_df = sharp_events_df.merge(
    solar_flare_df,
    left_on=['event_date', 'noaa_active_region'],
    right_on=['beginDate', 'activeRegionNum'],
    how='inner',
)

conn = sqlite3.connect(os.path.join(data_dir, "HMI_SHARP_SWPC_FINAL_NEW.db"))


# Flares whose flrID never appeared in the direct CSV-to-CSV merge still need
# SHARP parameters; recover them day-by-day from the HMI database view.
unmatched_solar_flare_df = solar_flare_df[~solar_flare_df['flrID'].isin(merged_df['flrID'])]

# Step 3: Handle unmatched entries day-by-day
additional_matches = []

# FIX: the query was previously built with an f-string using double-quoted
# values. In SQLite, double quotes denote identifiers (they only match as
# string literals via a legacy fallback), and string interpolation is
# injection-prone. A parameterized query is both safe and unambiguous; passing
# the values as strings reproduces the old f-string formatting exactly.
query_hmi = 'SELECT * FROM [VW_HMIDAILY] WHERE T_REC = ? AND NOAA_AR = ?'

# One lookup per unique (date, active-region) pair among the unmatched flares.
for date, AR in unmatched_solar_flare_df[['beginDate', 'activeRegionNum']].drop_duplicates().values:
    temp_unmatched_df = unmatched_solar_flare_df[
        (unmatched_solar_flare_df['beginDate'] == date)
        & (unmatched_solar_flare_df['activeRegionNum'] == AR)
    ]
    hmi_df = pd.read_sql(query_hmi, conn, params=(str(date), str(AR)))
    # Date portion of T_REC, used as the right-hand merge key below.
    hmi_df['T_REC_date'] = pd.to_datetime(hmi_df['T_REC']).dt.date

    # Join unmatched solar flare entries for the specific date and active
    # region with the [VW_HMIDAILY] data.
    date_matches_df = pd.merge(
        temp_unmatched_df,
        hmi_df,
        left_on=['beginDate', 'activeRegionNum'],
        right_on=['T_REC_date', 'NOAA_AR'],
        how='inner',
    )
    additional_matches.append(date_matches_df)

# Combine all additional matches (empty frame when nothing was recovered).
if additional_matches:
    additional_matches_df = pd.concat(additional_matches, ignore_index=True)
else:
    additional_matches_df = pd.DataFrame()

# Combine the initial merged dataframe with the additional matches.
final_merged_df = pd.concat([merged_df, additional_matches_df], ignore_index=True)

# Select required columns for the final dataset
columns = ['flrID', 'instruments', 'beginTime', 'peakTime', 'endTime', 'classType', 'sourceLocation', 
           'activeRegionNum', 'note', 'submissionTime', 'versionId', 'link', 'linkedEvents', 'T_REC', 'HARPNUM', 
           'TOTUSJH', 'TOTPOT', 'TOTUSJZ', 'ABSNJZH', 'SAVNCPP', 'USFLUX', 'AREA_ACR', 'MEANPOT', 'R_VALUE', 
           'SHRGT45', 'MEANSHR', 'MEANGAM', 'MEANGBT', 'MEANGBZ', 'MEANGBH', 'MEANJZH', 'MEANJZD', 'MEANALP', 
           'NOAA_AR', 'NOAA_NUM', 'NOAA_ARS', 'QUALITY']

final_dataset = final_merged_df[columns]

# Save the final dataset to a new CSV file
final_csv = os.path.join(data_dir, 'final_merged_data.csv')
final_dataset.to_csv(final_csv, index=False)

# Close the database connection
conn.close()Language:Python