From 467284e6bba3a73d842cccee05a10cac748dccae Mon Sep 17 00:00:00 2001 From: Satvik-Singh192 Date: Fri, 7 Nov 2025 17:48:23 +0530 Subject: [PATCH] feat: task done --- app/etl/transform.py | 81 ++++++++---------- tests/__pycache__/__init__.cpython-310.pyc | Bin 131 -> 131 bytes .../test_extract.cpython-310-pytest-8.4.2.pyc | Bin 2380 -> 2380 bytes tests/test_transform.py | 52 +++++++++++ 4 files changed, 90 insertions(+), 43 deletions(-) create mode 100644 tests/test_transform.py diff --git a/app/etl/transform.py b/app/etl/transform.py index ed7b936..f44915a 100644 --- a/app/etl/transform.py +++ b/app/etl/transform.py @@ -1,62 +1,57 @@ import pandas as pd # TODO (Find & Fix) -def transform(df: pd.DataFrame) -> pd.DataFrame: - """ - Transform data by cleaning and standardizing it. - - Args: - df: Input DataFrame - - Returns: - Transformed DataFrame - """ - if df.empty: - # TODO (Find & Fix): Should raise a ValueError if DataFrame is empty - pass - - # Create a copy to avoid modifying original - df_transformed = df.copy() - - print(f"🔄 Starting transformation of {len(df_transformed)} rows") # TODO (Find & Fix): Use logging instead of print - - # Handle duplicates - initial_rows = len(df_transformed) - # Removing duplicates - df_transformed=df_transformed.drop_duplicates() - - duplicates_removed = initial_rows - len(df_transformed) +def _remove_duplicates(df: pd.DataFrame) -> pd.DataFrame: + """Handles duplicate removal.""" + initial_rows = len(df) + df = df.drop_duplicates() + duplicates_removed = initial_rows - len(df) if duplicates_removed > 0: - # Number of duplicates removed print(f"✅ Removed {duplicates_removed} duplicate rows.") - - - # Handle null values in numeric columns - numeric_columns = df_transformed.select_dtypes(include=['number']).columns + return df + +def _handle_nulls(df: pd.DataFrame) -> pd.DataFrame: + """Handles null values (currently a placeholder).""" + numeric_columns = df.select_dtypes(include=['number']).columns for col in numeric_columns: # TODO (Find & Fix): Nulls in numeric columns are not handled pass - - # Handle null values in text columns - text_columns = df_transformed.select_dtypes(include=['object']).columns + text_columns = df.select_dtypes(include=['object']).columns for col in text_columns: # TODO (Find & Fix): Nulls in text columns are not handled pass - - # Standardize date columns (look for common date column names) - date_columns = [col for col in df_transformed.columns - if any(keyword in col.lower() for keyword in ['date', 'time', 'created', 'updated'])] - + return df + +def _standardize_dates(df: pd.DataFrame) -> pd.DataFrame: + """Handles date standardization.""" + date_columns = [col for col in df.columns + if any(keyword in col.lower() for keyword in ['date', 'time', 'created', 'updated'])] for col in date_columns: try: - df_transformed[col] = pd.to_datetime(df_transformed[col], errors='coerce', infer_datetime_format=True) + df[col] = pd.to_datetime(df[col], errors='coerce', format='mixed') # Standardize all dates to 'YYYY-MM-DD HH:MM:SS' - df_transformed[col] = df_transformed[col].dt.strftime('%Y-%m-%d %H:%M:%S') + df[col] = df[col].dt.strftime('%Y-%m-%d %H:%M:%S') - print(f"✅ Standardized date column '{col}' (e.g., {df_transformed[col].iloc[0]})") + print(f"✅ Standardized date column '{col}' (e.g., {df[col].iloc[0]})") except Exception as e: print(f"⚠️ Could not standardize column '{col}': {e}") + return df - +def _validate_types(df: pd.DataFrame) -> pd.DataFrame: # TODO (Find & Fix): Text columns are not cleaned (strip, lowercase) - return df_transformed + pass + return df +def transform(df: pd.DataFrame) -> pd.DataFrame: + if df.empty: + # TODO (Find &Fix): Should raise a ValueError if DataFrame is empty + pass + + df_transformed = df.copy() + + print(f"🔄 Starting transformation of {len(df_transformed)} rows") # TODO (Find & Fix): Use logging instead of print + df_transformed = _remove_duplicates(df_transformed) + df_transformed = _handle_nulls(df_transformed) + df_transformed = _standardize_dates(df_transformed) + df_transformed = _validate_types(df_transformed) + + return df_transformed \ No newline at end of file diff --git a/tests/__pycache__/__init__.cpython-310.pyc b/tests/__pycache__/__init__.cpython-310.pyc index cf1af2325ec051ad0e688cbd9c42bde1e75f05c9..143c3adec9a81d21c8ba0bcc56f1fd5e3af80e54 100644 GIT binary patch delta 20 ZcmZo>Y-Z%i=jG*M0D=`k?3ojJ$^j=&1SS9g delta 20 ZcmZo>Y-Z%i=jG*M0D_M6-x(8m$^k2_1r7iJ diff --git a/tests/__pycache__/test_extract.cpython-310-pytest-8.4.2.pyc b/tests/__pycache__/test_extract.cpython-310-pytest-8.4.2.pyc index dbb29cd183a345d66d794cfb9ae74243f60d753e..c7fd053063c68104846618e8b175be46f10bd700 100644 GIT binary patch delta 21 bcmX>jbVi6LpO=@50SHzEv1e}Nao_|1IerAp delta 21 bcmX>jbVi6LpO=@50SI2C|IXOR