81 changes: 38 additions & 43 deletions app/etl/transform.py
@@ -1,62 +1,57 @@
 import pandas as pd
 # TODO (Find & Fix)
 
-def transform(df: pd.DataFrame) -> pd.DataFrame:
-    """
-    Transform data by cleaning and standardizing it.
-
-    Args:
-        df: Input DataFrame
-
-    Returns:
-        Transformed DataFrame
-    """
-    if df.empty:
-        # TODO (Find & Fix): Should raise a ValueError if DataFrame is empty
-        pass
-
-    # Create a copy to avoid modifying original
-    df_transformed = df.copy()
-
-    print(f"🔄 Starting transformation of {len(df_transformed)} rows") # TODO (Find & Fix): Use logging instead of print
-
-    # Handle duplicates
-    initial_rows = len(df_transformed)
-    # Removing duplicates
-    df_transformed=df_transformed.drop_duplicates()
-
-    duplicates_removed = initial_rows - len(df_transformed)
+def _remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
+    """Handles duplicate removal."""
+    initial_rows = len(df)
+    df = df.drop_duplicates()
+    duplicates_removed = initial_rows - len(df)
     if duplicates_removed > 0:
         # Number of duplicates removed
         print(f"✅ Removed {duplicates_removed} duplicate rows.")
 
-
-    # Handle null values in numeric columns
-    numeric_columns = df_transformed.select_dtypes(include=['number']).columns
+    return df
+
+def _handle_nulls(df: pd.DataFrame) -> pd.DataFrame:
+    """Handles null values (currently a placeholder)."""
+    numeric_columns = df.select_dtypes(include=['number']).columns
     for col in numeric_columns:
         # TODO (Find & Fix): Nulls in numeric columns are not handled
         pass
 
-    # Handle null values in text columns
-    text_columns = df_transformed.select_dtypes(include=['object']).columns
+    text_columns = df.select_dtypes(include=['object']).columns
     for col in text_columns:
         # TODO (Find & Fix): Nulls in text columns are not handled
         pass
 
-    # Standardize date columns (look for common date column names)
-    date_columns = [col for col in df_transformed.columns
-                    if any(keyword in col.lower() for keyword in ['date', 'time', 'created', 'updated'])]
-
+    return df
+
+def _standardize_dates(df: pd.DataFrame) -> pd.DataFrame:
+    """Handles date standardization."""
+    date_columns = [col for col in df.columns
+                    if any(keyword in col.lower() for keyword in ['date', 'time', 'created', 'updated'])]
     for col in date_columns:
         try:
-            df_transformed[col] = pd.to_datetime(df_transformed[col], errors='coerce', infer_datetime_format=True)
+            df[col] = pd.to_datetime(df[col], errors='coerce', format='mixed')
             # Standardize all dates to 'YYYY-MM-DD HH:MM:SS'
-            df_transformed[col] = df_transformed[col].dt.strftime('%Y-%m-%d %H:%M:%S')
-
-            print(f"✅ Standardized date column '{col}' (e.g., {df_transformed[col].iloc[0]})")
+            df[col] = df[col].dt.strftime('%Y-%m-%d %H:%M:%S')
+            print(f"✅ Standardized date column '{col}' (e.g., {df[col].iloc[0]})")
         except Exception as e:
             print(f"⚠️ Could not standardize column '{col}': {e}")
+    return df
 
+def _validate_types(df: pd.DataFrame) -> pd.DataFrame:
     # TODO (Find & Fix): Text columns are not cleaned (strip, lowercase)
-    return df_transformed
+    pass
+    return df
+def transform(df: pd.DataFrame) -> pd.DataFrame:
+    if df.empty:
+        # TODO (Find & Fix): Should raise a ValueError if DataFrame is empty
+        pass
+
+    df_transformed = df.copy()
+
+    print(f"🔄 Starting transformation of {len(df_transformed)} rows") # TODO (Find & Fix): Use logging instead of print
+    df_transformed = _remove_duplicates(df_transformed)
+    df_transformed = _handle_nulls(df_transformed)
+    df_transformed = _standardize_dates(df_transformed)
+    df_transformed = _validate_types(df_transformed)
+
+    return df_transformed
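
The refactor deliberately keeps every TODO placeholder in place, and the new tests below pin that placeholder behaviour. Purely for reference, here is one way the open TODOs (raise on empty input, log instead of print, fill nulls, clean text) might eventually be addressed; the logging setup, the median/'unknown' fill strategy, and the lowercasing rule are illustrative assumptions, not part of this change.

```python
# Illustrative sketch only -- not part of this PR. Assumes median/'unknown'
# null fills and stdlib logging are acceptable for the open TODOs.
import logging

import pandas as pd

from app.etl.transform import _remove_duplicates, _standardize_dates

logger = logging.getLogger(__name__)


def _handle_nulls(df: pd.DataFrame) -> pd.DataFrame:
    """Fill numeric nulls with the column median, text nulls with 'unknown'."""
    for col in df.select_dtypes(include=['number']).columns:
        df[col] = df[col].fillna(df[col].median())
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].fillna('unknown')
    return df


def _validate_types(df: pd.DataFrame) -> pd.DataFrame:
    """Strip surrounding whitespace and lowercase text columns."""
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].str.strip().str.lower()
    return df


def transform(df: pd.DataFrame) -> pd.DataFrame:
    if df.empty:
        raise ValueError("Input DataFrame is empty")  # instead of the current `pass`
    df_transformed = df.copy()
    # Logging instead of print, per the TODO.
    logger.info("Starting transformation of %d rows", len(df_transformed))
    df_transformed = _remove_duplicates(df_transformed)
    df_transformed = _handle_nulls(df_transformed)
    df_transformed = _standardize_dates(df_transformed)
    df_transformed = _validate_types(df_transformed)
    return df_transformed
```

If something along these lines lands later, test_handle_nulls_does_nothing and test_validate_types_does_nothing would need rewriting, since they currently assert the placeholder behaviour.
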
Binary file modified tests/__pycache__/__init__.cpython-310.pyc
Binary file not shown.
Binary file modified tests/__pycache__/test_extract.cpython-310-pytest-8.4.2.pyc
Binary file not shown.
52 changes: 52 additions & 0 deletions tests/test_transform.py
@@ -0,0 +1,52 @@
+import pytest
+import numpy as np
+import pandas as pd
+from pandas.testing import assert_frame_equal
+from app.etl.transform import (
+    _remove_duplicates,
+    _handle_nulls,
+    _standardize_dates,
+    _validate_types
+)
+
+def test_remove_duplicates():
+    data = {'id': [1, 1, 2], 'name': ['a', 'a', 'b']}
+    input_df = pd.DataFrame(data)
+    expected_data = {'id': [1, 2], 'name': ['a', 'b']}
+    expected_df = pd.DataFrame(expected_data).reset_index(drop=True)
+    result_df = _remove_duplicates(input_df).reset_index(drop=True)
+    assert len(result_df) == 2
+    assert_frame_equal(result_df, expected_df)
+
+def test_handle_nulls_does_nothing():
+    data = {'numeric': [1, np.nan], 'text': ['a', np.nan]}
+    input_df = pd.DataFrame(data)
+    expected_df = input_df.copy()
+    result_df = _handle_nulls(input_df)
+    assert_frame_equal(result_df, expected_df)
+
+def test_standardize_dates():
+    data = {'report_date': ['2023-01-01 12:00:00', '02/25/2022', 'bad-date']}
+    df = pd.DataFrame(data)
+
+    result_df = _standardize_dates(df)
+
+    expected = [
+        '2023-01-01 12:00:00',
+        '2022-02-25 00:00:00',
+        np.nan
+    ]
+    assert result_df['report_date'].tolist() == expected
+
+def test_validate_types_does_nothing():
+    """
+    Tests that _validate_types currently does nothing
+    (as per the 'pass' in the logic).
+    """
+    data = {'name': [' Alice ', ' Bob ']}
+    input_df = pd.DataFrame(data)
+    expected_df = input_df.copy()
+
+    result_df = _validate_types(input_df)
+    assert_frame_equal(result_df, expected_df)
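
On top of the unit tests above, an end-to-end check of the public transform function could be worth considering. The sketch below is only a suggestion; the test name and data are hypothetical, and it assumes pandas ≥ 2.0 (for format='mixed') plus the current placeholder behaviour of _handle_nulls and _validate_types.

```python
import pandas as pd

from app.etl.transform import transform


def test_transform_end_to_end():
    # Hypothetical integration check: duplicates dropped and a date-like
    # column ('created') standardized to 'YYYY-MM-DD HH:MM:SS' strings.
    data = {
        'id': [1, 1, 2],
        'created': ['2023-01-01', '2023-01-01', '02/25/2022'],
    }
    result = transform(pd.DataFrame(data))
    assert len(result) == 2
    assert result['created'].tolist() == ['2023-01-01 00:00:00', '2022-02-25 00:00:00']
```
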