Skip to content

Commit ccb4d02

Browse files
authored
Merge branch 'main' into feat/logging-new
2 parents 15a0ed2 + aa97624 commit ccb4d02

File tree

4 files changed

+91
-50
lines changed

4 files changed

+91
-50
lines changed

app/etl/transform.py

Lines changed: 39 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -2,68 +2,57 @@
22
import logging as lg
33
# TODO (Find & Fix)
44

5-
logger = lg.getLogger(__name__)
6-
logger.setLevel(lg.DEBUG)
7-
8-
9-
def transform(df: pd.DataFrame) -> pd.DataFrame:
10-
"""
11-
Transform data by cleaning and standardizing it.
12-
13-
Args:
14-
df: Input DataFrame
15-
16-
Returns:
17-
Transformed DataFrame
18-
"""
19-
if df.empty:
20-
# TODO (Find & Fix): Should raise a ValueError if DataFrame is empty
21-
pass
22-
23-
# Create a copy to avoid modifying original
24-
df_transformed = df.copy()
25-
26-
# TODO (Find & Fix): Use logging instead of print
27-
logger.info(f"🔄 Starting transformation of {len(df_transformed)} rows")
28-
29-
# Handle duplicates
30-
initial_rows = len(df_transformed)
31-
# Removing duplicates
32-
df_transformed = df_transformed.drop_duplicates()
33-
34-
duplicates_removed = initial_rows - len(df_transformed)
5+
def _remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
6+
"""Handles duplicate removal."""
7+
initial_rows = len(df)
8+
df = df.drop_duplicates()
9+
duplicates_removed = initial_rows - len(df)
3510
if duplicates_removed > 0:
36-
# Number of duplicates removed
37-
logger.info(f"✅ Removed {duplicates_removed} duplicate rows.")
11+
print(f"✅ Removed {duplicates_removed} duplicate rows.")
12+
return df
3813

39-
# Handle null values in numeric columns
40-
numeric_columns = df_transformed.select_dtypes(include=['number']).columns
14+
def _handle_nulls(df: pd.DataFrame) -> pd.DataFrame:
15+
"""Handles null values (currently a placeholder)."""
16+
numeric_columns = df.select_dtypes(include=['number']).columns
4117
for col in numeric_columns:
4218
# TODO (Find & Fix): Nulls in numeric columns are not handled
4319
pass
44-
45-
# Handle null values in text columns
46-
text_columns = df_transformed.select_dtypes(include=['object']).columns
20+
text_columns = df.select_dtypes(include=['object']).columns
4721
for col in text_columns:
4822
# TODO (Find & Fix): Nulls in text columns are not handled
4923
pass
24+
return df
5025

51-
# Standardize date columns (look for common date column names)
52-
date_columns = [col for col in df_transformed.columns
53-
if any(keyword in col.lower() for keyword in ['date', 'time', 'created', 'updated'])]
54-
26+
def _standardize_dates(df: pd.DataFrame) -> pd.DataFrame:
27+
"""Handles date standardization."""
28+
date_columns = [col for col in df.columns
29+
if any(keyword in col.lower() for keyword in ['date', 'time', 'created', 'updated'])]
5530
for col in date_columns:
5631
try:
57-
df_transformed[col] = pd.to_datetime(
58-
df_transformed[col], errors='coerce', infer_datetime_format=True)
32+
df[col] = pd.to_datetime(df[col], errors='coerce', format='mixed')
5933
# Standardize all dates to 'YYYY-MM-DD HH:MM:SS'
60-
df_transformed[col] = df_transformed[col].dt.strftime(
61-
'%Y-%m-%d %H:%M:%S')
62-
63-
logger.info(
64-
f"✅ Standardized date column '{col}' (e.g., {df_transformed[col].iloc[0]})")
34+
df[col] = df[col].dt.strftime('%Y-%m-%d %H:%M:%S')
35+
36+
print(f"✅ Standardized date column '{col}' (e.g., {df[col].iloc[0]})")
6537
except Exception as e:
66-
logger.error(f"⚠️ Could not standardize column '{col}': {e}")
38+
print(f"⚠️ Could not standardize column '{col}': {e}")
39+
return df
6740

41+
def _validate_types(df: pd.DataFrame) -> pd.DataFrame:
6842
# TODO (Find & Fix): Text columns are not cleaned (strip, lowercase)
69-
return df_transformed
43+
pass
44+
return df
45+
def transform(df: pd.DataFrame) -> pd.DataFrame:
    """Clean and standardize *df* through the transformation pipeline.

    Stages: duplicate removal, null handling, date standardization, and
    type validation (the null/type stages are placeholders — see their TODOs).

    Args:
        df: Input DataFrame.

    Returns:
        A transformed copy; the caller's DataFrame is never modified.

    Raises:
        ValueError: If *df* is empty.
    """
    if df.empty:
        # Fixed TODO: fail fast on empty input instead of silently passing.
        raise ValueError("Cannot transform an empty DataFrame")

    # Work on a copy to avoid modifying the original.
    df_transformed = df.copy()

    # Fixed TODO: use logging instead of print.
    lg.getLogger(__name__).info("🔄 Starting transformation of %d rows", len(df_transformed))

    df_transformed = _remove_duplicates(df_transformed)
    df_transformed = _handle_nulls(df_transformed)
    df_transformed = _standardize_dates(df_transformed)
    df_transformed = _validate_types(df_transformed)

    return df_transformed
0 Bytes
Binary file not shown.
0 Bytes
Binary file not shown.

tests/test_transform.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import numpy as np
import pandas as pd
import pytest
from pandas.testing import assert_frame_equal

from app.etl.transform import (
    _remove_duplicates,
    _handle_nulls,
    _standardize_dates,
    _validate_types,
)
12+
13+
def test_remove_duplicates():
    """Duplicate rows are dropped, leaving one copy of each unique row."""
    input_df = pd.DataFrame({'id': [1, 1, 2], 'name': ['a', 'a', 'b']})
    result_df = _remove_duplicates(input_df).reset_index(drop=True)
    expected_df = pd.DataFrame({'id': [1, 2], 'name': ['a', 'b']}).reset_index(drop=True)
    assert len(result_df) == 2
    assert_frame_equal(result_df, expected_df)
21+
22+
def test_handle_nulls_does_nothing():
    """_handle_nulls is a placeholder: input passes through unchanged."""
    input_df = pd.DataFrame({'numeric': [1, np.nan], 'text': ['a', np.nan]})
    snapshot = input_df.copy()
    assert_frame_equal(_handle_nulls(input_df), snapshot)
28+
29+
def test_standardize_dates():
    """Date-like columns become 'YYYY-MM-DD HH:MM:SS'; bad values become NaN."""
    df = pd.DataFrame({'report_date': ['2023-01-01 12:00:00', '02/25/2022', 'bad-date']})

    result = _standardize_dates(df)['report_date'].tolist()

    assert result[0] == '2023-01-01 12:00:00'
    assert result[1] == '2022-02-25 00:00:00'
    # Fixed: comparing whole lists containing np.nan only passed via the
    # list-equality identity shortcut (nan != nan by value); assert
    # missingness explicitly instead.
    assert pd.isna(result[2])
41+
42+
def test_validate_types_does_nothing():
    """_validate_types is a placeholder: text is returned uncleaned."""
    input_df = pd.DataFrame({'name': [' Alice ', ' Bob ']})
    snapshot = input_df.copy()
    assert_frame_equal(_validate_types(input_df), snapshot)

0 commit comments

Comments
 (0)