Skip to content

Commit aa97624

Browse files
Merge pull request #40 from Satvik-Singh192/feat/transform-split
feat: task done
2 parents f69fbea + 467284e commit aa97624

File tree

4 files changed

+90
-43
lines changed

4 files changed

+90
-43
lines changed

app/etl/transform.py

Lines changed: 38 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,62 +1,57 @@
11
import pandas as pd
22
# TODO (Find & Fix)
33

4-
def transform(df: pd.DataFrame) -> pd.DataFrame:
5-
"""
6-
Transform data by cleaning and standardizing it.
7-
8-
Args:
9-
df: Input DataFrame
10-
11-
Returns:
12-
Transformed DataFrame
13-
"""
14-
if df.empty:
15-
# TODO (Find & Fix): Should raise a ValueError if DataFrame is empty
16-
pass
17-
18-
# Create a copy to avoid modifying original
19-
df_transformed = df.copy()
20-
21-
print(f"🔄 Starting transformation of {len(df_transformed)} rows") # TODO (Find & Fix): Use logging instead of print
22-
23-
# Handle duplicates
24-
initial_rows = len(df_transformed)
25-
# Removing duplicates
26-
df_transformed=df_transformed.drop_duplicates()
27-
28-
duplicates_removed = initial_rows - len(df_transformed)
4+
def _remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
5+
"""Handles duplicate removal."""
6+
initial_rows = len(df)
7+
df = df.drop_duplicates()
8+
duplicates_removed = initial_rows - len(df)
299
if duplicates_removed > 0:
30-
# Number of duplicates removed
3110
print(f"✅ Removed {duplicates_removed} duplicate rows.")
32-
33-
34-
# Handle null values in numeric columns
35-
numeric_columns = df_transformed.select_dtypes(include=['number']).columns
11+
return df
12+
13+
def _handle_nulls(df: pd.DataFrame) -> pd.DataFrame:
14+
"""Handles null values (currently a placeholder)."""
15+
numeric_columns = df.select_dtypes(include=['number']).columns
3616
for col in numeric_columns:
3717
# TODO (Find & Fix): Nulls in numeric columns are not handled
3818
pass
39-
40-
# Handle null values in text columns
41-
text_columns = df_transformed.select_dtypes(include=['object']).columns
19+
text_columns = df.select_dtypes(include=['object']).columns
4220
for col in text_columns:
4321
# TODO (Find & Fix): Nulls in text columns are not handled
4422
pass
45-
46-
# Standardize date columns (look for common date column names)
47-
date_columns = [col for col in df_transformed.columns
48-
if any(keyword in col.lower() for keyword in ['date', 'time', 'created', 'updated'])]
49-
23+
return df
24+
25+
def _standardize_dates(df: pd.DataFrame) -> pd.DataFrame:
26+
"""Handles date standardization."""
27+
date_columns = [col for col in df.columns
28+
if any(keyword in col.lower() for keyword in ['date', 'time', 'created', 'updated'])]
5029
for col in date_columns:
5130
try:
52-
df_transformed[col] = pd.to_datetime(df_transformed[col], errors='coerce', infer_datetime_format=True)
31+
df[col] = pd.to_datetime(df[col], errors='coerce', format='mixed')
5332
# Standardize all dates to 'YYYY-MM-DD HH:MM:SS'
54-
df_transformed[col] = df_transformed[col].dt.strftime('%Y-%m-%d %H:%M:%S')
33+
df[col] = df[col].dt.strftime('%Y-%m-%d %H:%M:%S')
5534

56-
print(f"✅ Standardized date column '{col}' (e.g., {df_transformed[col].iloc[0]})")
35+
print(f"✅ Standardized date column '{col}' (e.g., {df[col].iloc[0]})")
5736
except Exception as e:
5837
print(f"⚠️ Could not standardize column '{col}': {e}")
38+
return df
5939

60-
40+
def _validate_types(df: pd.DataFrame) -> pd.DataFrame:
6141
# TODO (Find & Fix): Text columns are not cleaned (strip, lowercase)
62-
return df_transformed
42+
pass
43+
return df
44+
def transform(df: pd.DataFrame) -> pd.DataFrame:
    """Transform data by cleaning and standardizing it.

    Runs the pipeline steps in order on a copy of the input: duplicate
    removal, null handling, date standardization, type validation.

    Args:
        df: Input DataFrame. A copy is made before any step runs.

    Returns:
        Transformed DataFrame.

    Raises:
        ValueError: If ``df`` is empty.
    """
    if df.empty:
        raise ValueError("Cannot transform an empty DataFrame")

    # Work on a copy so the steps never touch the caller's frame.
    df_transformed = df.copy()

    print(f"🔄 Starting transformation of {len(df_transformed)} rows")  # TODO (Find & Fix): Use logging instead of print
    df_transformed = _remove_duplicates(df_transformed)
    df_transformed = _handle_nulls(df_transformed)
    df_transformed = _standardize_dates(df_transformed)
    df_transformed = _validate_types(df_transformed)

    return df_transformed
0 Bytes
Binary file not shown.
0 Bytes
Binary file not shown.

tests/test_transform.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import pytest
2+
import numpy as np
3+
import pandas as pd
4+
import numpy as np
5+
from pandas.testing import assert_frame_equal
6+
from app.etl.transform import (
7+
_remove_duplicates,
8+
_handle_nulls,
9+
_standardize_dates,
10+
_validate_types
11+
)
12+
13+
def test_remove_duplicates():
    """_remove_duplicates keeps a single copy of each repeated row."""
    source = pd.DataFrame({'id': [1, 1, 2], 'name': ['a', 'a', 'b']})
    wanted = pd.DataFrame({'id': [1, 2], 'name': ['a', 'b']}).reset_index(drop=True)

    # Reset the index so the comparison ignores which row labels survived.
    actual = _remove_duplicates(source).reset_index(drop=True)

    assert len(actual) == 2
    assert_frame_equal(actual, wanted)
21+
22+
def test_handle_nulls_does_nothing():
    """_handle_nulls is a placeholder: nulls must come back untouched."""
    source = pd.DataFrame({'numeric': [1, np.nan], 'text': ['a', np.nan]})
    # Snapshot taken before the call so the comparison does not depend on
    # the function returning a fresh object.
    snapshot = source.copy()

    outcome = _handle_nulls(source)

    assert_frame_equal(outcome, snapshot)
28+
29+
def test_standardize_dates():
    """Parseable dates are reformatted; unparseable ones become missing."""
    data = {'report_date': ['2023-01-01 12:00:00', '02/25/2022', 'bad-date']}
    df = pd.DataFrame(data)

    result_df = _standardize_dates(df)

    values = result_df['report_date'].tolist()
    assert len(values) == 3
    assert values[:2] == [
        '2023-01-01 12:00:00',
        '2022-02-25 00:00:00',
    ]
    # NaN never compares equal to itself, so a list comparison against
    # np.nan only passes by object identity. Check missingness explicitly.
    assert pd.isna(values[2])
41+
42+
def test_validate_types_does_nothing():
    """
    _validate_types is still a stub, so surrounding whitespace in text
    values must come back exactly as supplied.
    """
    source = pd.DataFrame({'name': ['  Alice  ', ' Bob ']})
    snapshot = source.copy()

    outcome = _validate_types(source)

    assert_frame_equal(outcome, snapshot)

0 commit comments

Comments
 (0)