This guide demonstrates how to use the ai_aquatica.data_cleaning module
to clean and prepare water quality datasets before modeling or visualization.
from ai_aquatica.data_cleaning import (
remove_duplicates,
handle_missing_values,
normalize_data,
standardize_data
)
import pandas as pd
import numpy as np
# --- Sample data -----------------------------------------------------------
# Five readings per parameter; np.nan marks a missing measurement
# (one gap in pH, one gap in NO3).
data = pd.DataFrame(
    dict(
        pH=[7.0, 6.8, np.nan, 7.2, 7.0],
        NO3=[1.5, 1.7, 1.6, np.nan, 1.5],
    )
)

# Append a copy of the first row under the next free integer label
# (len(data) == 5 here) so the frame contains an explicit duplicate.
data.loc[len(data)] = data.loc[0]

# --- Duplicates ------------------------------------------------------------
# remove_duplicates is expected to return the frame without repeated rows
# (inferred from its name; see the ai_aquatica docs for exact semantics).
data_no_duplicates = remove_duplicates(data)

# --- Missing values --------------------------------------------------------
# The same input frame is filled three times, once per strategy, so the
# results can be compared side by side.
# NOTE(review): these calls receive `data`, which still holds the duplicate
# row, not `data_no_duplicates` -- confirm this is intended for the demo.
data_filled_mean = handle_missing_values(data, strategy='mean')
data_filled_median = handle_missing_values(data, strategy='median')
data_filled_interpolated = handle_missing_values(data, strategy='interpolate')

# --- Scaling ---------------------------------------------------------------
# Both scalers are fed the mean-filled frame; presumably they require input
# without NaN values -- verify against the library documentation.
data_normalized = normalize_data(data_filled_mean)
data_standardized = standardize_data(data_filled_mean)