import http.client
import json
import pandas as pd
import os
import ast
class DataPreprocessing:
    """
    Helpers for downloading NASA DONKI data and tidying it with pandas.

    Bundles one HTTP fetch helper with DataFrame utilities for saving,
    joining, summarizing, null filtering and flattening nested columns.
    """

    def __init__(self, api_key):
        """
        Store the API key and the host that get_data talks to.

        :param api_key: value sent as the ``api_key`` query parameter.
        """
        self.api_key = api_key
        self.base_url = "api.nasa.gov"

    def get_data(self, data_string, start_date, end_date):
        """
        Request one DONKI endpoint for a date range and return a DataFrame.

        :param data_string: endpoint name appended to ``/DONKI/`` in the path.
        :param start_date: value sent as the ``startDate`` query parameter.
        :param end_date: value sent as the ``endDate`` query parameter.
        :return: DataFrame produced by ``pd.json_normalize`` from the JSON
            response, or an empty DataFrame when the response body is blank.
        """
        path = (
            f"/DONKI/{data_string}"
            f"?startDate={start_date}&endDate={end_date}"
            f"&api_key={self.api_key}"
        )
        conn = http.client.HTTPSConnection(self.base_url)
        try:
            conn.request("GET", path)
            payload = conn.getresponse().read().decode("utf-8")
        finally:
            # Release the socket even when the request or the read raises.
            conn.close()

        # NOTE(review): the HTTP status code is not inspected, so an error
        # payload that happens to be valid JSON is normalized like real data.
        if not payload.strip():
            # A blank body cannot be parsed as JSON; treat it as "no rows".
            return pd.DataFrame()
        return pd.json_normalize(json.loads(payload))

    def save_to_csv(self, df, file_name, folder):
        """
        Write a DataFrame to ``../<folder>/<file_name>.csv``.

        The target directory is created when it does not exist and the
        index is not written.

        :param df: DataFrame to write.
        :param file_name: file name without the ``.csv`` extension.
        :param folder: directory name, resolved relative to the parent of
            the current working directory.
        :return: None; the saved path is printed.
        """
        directory = os.path.join("..", folder)
        os.makedirs(directory, exist_ok=True)
        file_path = os.path.join(directory, f"{file_name}.csv")
        df.to_csv(file_path, index=False)
        print(f"Data saved to {file_path}")

    def join_data(self, df_list, join_field, join_type='inner'):
        """
        Merge several DataFrames left to right on a shared key.

        :param df_list: non-empty sequence of DataFrames; an empty sequence
            raises IndexError.
        :param join_field: column name (or list of names) passed to
            ``pd.merge`` as ``on``.
        :param join_type: merge strategy passed to ``pd.merge`` as ``how``.
        :return: the merged DataFrame (the first frame itself when the
            sequence has a single element).
        """
        join_df = df_list[0]
        for df in df_list[1:]:
            join_df = pd.merge(join_df, df, how=join_type, on=join_field)
        return join_df

    def check_missing_values(self, df):
        """
        Count and print the null values in every column.

        :param df: DataFrame to inspect.
        :return: Series of null counts indexed by column name.
        """
        missing_values = df.isnull().sum()
        print(f"Missing values per column:\n{missing_values}")
        return missing_values

    def summarize_columns(self, df):
        """
        Build and print a per-column summary.

        :param df: DataFrame to summarize.
        :return: DataFrame indexed by column name with the columns
            'Data Type', 'Null Count' and 'Distinct Values'.
        """
        summary = pd.DataFrame({
            'Data Type': df.dtypes,
            'Null Count': df.isnull().sum(),
            'Distinct Values': df.nunique()
        })
        print(f"Column Summary:\n{summary}")
        return summary

    def remove_null_rows(self, df, column_name):
        """
        Drop the rows whose value in one column is null.

        :param df: DataFrame to filter; it is not modified.
        :param column_name: column checked for nulls.
        :return: new DataFrame without the rows that were null in that column.
        """
        cleaned_df = df.dropna(subset=[column_name])
        print(f"Rows with null values in column '{column_name}' have been removed.")
        return cleaned_df

    @staticmethod
    def _wrap_dict(value):
        """Return a dict as a one-element list; pass anything else through."""
        return [value] if isinstance(value, dict) else value

    def _explode_and_normalize(self, df, columns, nested_only, verbose):
        """
        Shared implementation of explode_columns and explode_columns1.

        All requested columns are exploded first and only then normalized,
        so every normalized frame has the same rows as the final exploded
        frame and the column-wise concat lines up row by row.

        :param df: source DataFrame; it is not modified.
        :param columns: names of the columns to flatten.
        :param nested_only: when True, columns holding no list or dict are
            skipped instead of being exploded.
        :param verbose: when True, print the progress messages of
            explode_columns1.
        :return: DataFrame without the requested columns, with the
            normalized fields appended.
        """
        working = df.copy()
        plan = []  # (column, status) in request order, replayed below
        for column in columns:
            if column not in working.columns:
                plan.append((column, "missing"))
                continue
            cells = working[column]
            if nested_only and not cells.apply(
                lambda x: isinstance(x, (list, dict))
            ).any():
                plan.append((column, "flat"))
                continue
            # explode() treats a dict as list-like and would expand it into
            # its keys, losing the values; wrap dict cells so that each one
            # survives as a single record.
            if cells.dtype == object and cells.map(
                lambda x: isinstance(x, dict)
            ).any():
                working[column] = cells.map(self._wrap_dict)
            working = working.explode(column).reset_index(drop=True)
            plan.append((column, "exploded"))

        normalized_dfs = []
        for column, status in plan:
            if status == "exploded":
                normalized_df = pd.json_normalize(working[column])
                if verbose:
                    print(f"Normalized Data for {column}:\n", normalized_df.head())
                normalized_dfs.append(normalized_df)
            elif status == "missing":
                print(f"Warning: Column '{column}' not found in DataFrame.")
            elif verbose:
                print(f"Column '{column}' does not contain list or dict data.")

        # NOTE(review): every requested column is dropped, including the
        # ones reported above as not containing list or dict data.
        working = working.drop(columns=columns, errors='ignore')
        if normalized_dfs:
            return pd.concat([working] + normalized_dfs, axis=1)
        if verbose:
            print("No normalized data frames to concatenate.")
        return working

    def explode_columns(self, df, columns):
        """
        Flatten columns whose cells hold lists of key/value records.

        Each listed column is exploded to one row per list element, the
        records are expanded into their own columns with
        ``pd.json_normalize``, and the original nested columns are removed.
        A listed column that is absent only triggers a printed warning.

        :param df: source DataFrame; it is not modified.
        :param columns: names of the columns to flatten.
        :return: exploded DataFrame with the normalized fields appended.
        """
        return self._explode_and_normalize(
            df, columns, nested_only=False, verbose=False
        )

    def explode_columns1(self, df, columns):
        """
        Variant of explode_columns that skips flat columns and reports progress.

        Only columns containing at least one list or dict cell are exploded
        and normalized; a preview of each normalized frame and a message for
        every skipped or missing column are printed.

        :param df: source DataFrame; it is not modified.
        :param columns: names of the columns to flatten.
        :return: exploded DataFrame with the normalized fields appended.
        """
        return self._explode_and_normalize(
            df, columns, nested_only=True, verbose=True
        )
# Language: Python