"""Cleaning and standardisation steps applied to extracted DataFrames."""
import logging

import pandas as pd

logger = logging.getLogger(__name__)

# Substrings that mark a column label as holding date/time values.
_DATE_KEYWORDS = ('date', 'time', 'created', 'updated')
# Every recognised date column is rendered in this single textual format.
_DATE_OUTPUT_FORMAT = '%Y-%m-%d %H:%M:%S'


def _remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without fully duplicated rows.

    The original index labels of the surviving rows are kept (the index is
    not reset). The number of dropped rows is logged when it is non-zero.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A new DataFrame with duplicate rows removed.
    """
    deduplicated = df.drop_duplicates()
    duplicates_removed = len(df) - len(deduplicated)
    if duplicates_removed > 0:
        logger.info("✅ Removed %d duplicate rows.", duplicates_removed)
    return deduplicated


def _handle_nulls(df: pd.DataFrame) -> pd.DataFrame:
    """Placeholder for null handling; currently returns ``df`` unchanged.

    Args:
        df: Input DataFrame.

    Returns:
        The very same DataFrame object, untouched.
    """
    # TODO (Find & Fix): Nulls in numeric columns are not handled
    # TODO (Find & Fix): Nulls in text columns are not handled
    return df


def _standardize_dates(df: pd.DataFrame) -> pd.DataFrame:
    """Render date-like columns as ``'YYYY-MM-DD HH:MM:SS'`` strings.

    A column is treated as date-like when its label, lower-cased, contains
    one of ``_DATE_KEYWORDS``. Values that cannot be parsed become missing
    values. A column whose conversion fails is left exactly as it was and a
    warning is logged.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        ``df`` itself when no column matches, otherwise a copy with the
        matching columns converted.
    """
    # str() so that non-string labels (e.g. integer column names) are simply
    # skipped instead of raising AttributeError on .lower().
    date_columns = [
        col for col in df.columns
        if any(keyword in str(col).lower() for keyword in _DATE_KEYWORDS)
    ]
    if not date_columns:
        return df

    # Work on a copy so a direct caller's frame is never mutated.
    result = df.copy()
    for col in date_columns:
        try:
            parsed = pd.to_datetime(result[col], errors='coerce', format='mixed')
            formatted = parsed.dt.strftime(_DATE_OUTPUT_FORMAT)
        except Exception as exc:
            # Deliberately best-effort: one bad column must not abort the
            # whole transformation, so report it and move on.
            logger.warning("⚠️ Could not standardize column '%s': %s", col, exc)
            continue

        # Assign only after both steps succeeded, so a failure can never
        # leave the column half-converted.
        result[col] = formatted
        if formatted.empty:
            logger.info("✅ Standardized date column '%s' (no rows)", col)
        else:
            logger.info(
                "✅ Standardized date column '%s' (e.g., %s)",
                col,
                formatted.iloc[0],
            )
    return result


def _validate_types(df: pd.DataFrame) -> pd.DataFrame:
    """Placeholder for type/text validation; currently returns ``df`` unchanged.

    Args:
        df: Input DataFrame.

    Returns:
        The very same DataFrame object, untouched.
    """
    # TODO (Find & Fix): Text columns are not cleaned (strip, lowercase)
    return df


def transform(df: pd.DataFrame) -> pd.DataFrame:
    """Transform data by cleaning and standardizing it.

    Runs, in order: duplicate removal, null handling, date standardisation
    and type validation.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        Transformed DataFrame.

    Raises:
        ValueError: If ``df`` is empty.
    """
    if df.empty:
        raise ValueError("Cannot transform an empty DataFrame")

    # Create a copy to avoid modifying the caller's frame.
    df_transformed = df.copy()

    logger.info("🔄 Starting transformation of %d rows", len(df_transformed))
    df_transformed = _remove_duplicates(df_transformed)
    df_transformed = _handle_nulls(df_transformed)
    df_transformed = _standardize_dates(df_transformed)
    df_transformed = _validate_types(df_transformed)

    return df_transformed
"""Unit tests for the private helpers in ``app.etl.transform``."""
import numpy as np
import pandas as pd
import pytest
from pandas.testing import assert_frame_equal

from app.etl.transform import (
    _handle_nulls,
    _remove_duplicates,
    _standardize_dates,
    _validate_types,
)


def test_remove_duplicates():
    """Fully duplicated rows are dropped and the first occurrence is kept."""
    input_df = pd.DataFrame({'id': [1, 1, 2], 'name': ['a', 'a', 'b']})
    expected_df = pd.DataFrame({'id': [1, 2], 'name': ['a', 'b']})

    # drop_duplicates keeps the original index labels, so normalise them
    # before comparing against the freshly built expected frame.
    result_df = _remove_duplicates(input_df).reset_index(drop=True)

    assert len(result_df) == 2
    assert_frame_equal(result_df, expected_df)


def test_handle_nulls_does_nothing():
    """_handle_nulls is a placeholder: nulls must come back untouched."""
    input_df = pd.DataFrame({'numeric': [1, np.nan], 'text': ['a', np.nan]})
    expected_df = input_df.copy()

    result_df = _handle_nulls(input_df)

    assert_frame_equal(result_df, expected_df)


def test_standardize_dates():
    """Parsable values are reformatted; unparsable ones become missing."""
    df = pd.DataFrame(
        {'report_date': ['2023-01-01 12:00:00', '02/25/2022', 'bad-date']}
    )

    result = _standardize_dates(df)['report_date'].tolist()

    assert result[:2] == ['2023-01-01 12:00:00', '2022-02-25 00:00:00']
    # NaN never compares equal to itself, so a list `==` would only pass via
    # the object-identity shortcut; check the missing value explicitly.
    assert len(result) == 3
    assert pd.isna(result[2])


def test_validate_types_does_nothing():
    """
    Tests that _validate_types currently does nothing
    (as per the 'pass' in the logic).
    """
    input_df = pd.DataFrame({'name': ['  Alice  ', ' Bob ']})
    expected_df = input_df.copy()

    result_df = _validate_types(input_df)

    assert_frame_equal(result_df, expected_df)