22import logging as lg
# TODO (Find & Fix)
44
logger = lg.getLogger(__name__)
logger.setLevel(lg.DEBUG)

# Substrings that mark a column label as holding dates/timestamps.
_DATE_KEYWORDS = ('date', 'time', 'created', 'updated')
# Canonical text layout every recognised date column is rewritten to.
_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
# Stand-in for missing text values; lowercase so it survives text cleaning.
_TEXT_NULL_PLACEHOLDER = 'unknown'
# Selecting both keeps text handling working whether strings are stored as
# plain ``object`` columns or as a dedicated pandas string dtype.
_TEXT_DTYPES = ['object', 'string']


def _date_columns(df: pd.DataFrame) -> list:
    """Return labels of non-numeric columns whose name looks date-like.

    Numeric (and boolean) columns are skipped on purpose: a column such as
    ``time_spent`` would otherwise be reinterpreted as epoch timestamps and
    its values destroyed. Non-string labels are skipped because they cannot
    be matched against the keyword list.
    """
    return [
        col
        for col, dtype in df.dtypes.items()
        if isinstance(col, str)
        and not pd.api.types.is_numeric_dtype(dtype)
        and any(keyword in col.lower() for keyword in _DATE_KEYWORDS)
    ]


def _clean_text_value(value):
    """Strip and lowercase ``value`` when it is a string; pass anything else through."""
    return value.strip().lower() if isinstance(value, str) else value


def _remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame without fully duplicated rows; ``df`` is not modified."""
    initial_rows = len(df)
    deduplicated = df.drop_duplicates()
    duplicates_removed = initial_rows - len(deduplicated)
    if duplicates_removed > 0:
        logger.info("✅ Removed %d duplicate rows.", duplicates_removed)
    return deduplicated


def _handle_nulls(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing values in place and return ``df``.

    Numeric columns are filled with the column median; text columns are filled
    with ``_TEXT_NULL_PLACEHOLDER``. Date-like columns are left alone so that
    unparseable/missing dates stay missing instead of becoming a fake string.
    """
    for col in df.select_dtypes(include=['number']).columns:
        if not df[col].isna().any():
            continue
        fill_value = df[col].median()
        if pd.isna(fill_value):
            # Column is entirely null: there is nothing sensible to impute.
            continue
        if pd.api.types.is_integer_dtype(df[col].dtype):
            # Nullable integer columns cannot hold a fractional median.
            fill_value = int(round(fill_value))
        df[col] = df[col].fillna(fill_value)

    date_columns = set(_date_columns(df))
    for col in df.select_dtypes(include=_TEXT_DTYPES).columns:
        if col in date_columns:
            continue
        df[col] = df[col].fillna(_TEXT_NULL_PLACEHOLDER)
    return df


def _standardize_dates(df: pd.DataFrame) -> pd.DataFrame:
    """Rewrite date-like columns as ``YYYY-MM-DD HH:MM:SS`` strings, in place.

    Values that cannot be parsed become missing. Conversion is best-effort:
    a column that fails to convert is logged and left exactly as it was.
    """
    for col in _date_columns(df):
        try:
            parsed = pd.to_datetime(df[col], errors='coerce', format='mixed')
            formatted = parsed.dt.strftime(_DATE_FORMAT)
        except Exception as exc:  # deliberate best-effort: never abort the run
            logger.error("⚠️ Could not standardize column '%s': %s", col, exc)
            continue
        # Assign only after both steps succeeded so a failure cannot leave
        # the column half-converted.
        df[col] = formatted
        sample = formatted.iloc[0] if len(formatted) else None
        logger.info("✅ Standardized date column '%s' (e.g., %s)", col, sample)
    return df


def _validate_types(df: pd.DataFrame) -> pd.DataFrame:
    """Clean text columns in place: strip surrounding whitespace and lowercase.

    Only actual ``str`` cells are touched, so missing values and stray
    non-string objects in a text column are preserved unchanged.
    """
    for col in df.select_dtypes(include=_TEXT_DTYPES).columns:
        df[col] = df[col].map(_clean_text_value)
    return df


def transform(df: pd.DataFrame) -> pd.DataFrame:
    """
    Transform data by cleaning and standardizing it.

    Steps, in order: drop duplicate rows, fill missing values, standardize
    date-like columns, then clean text columns. The input is never modified.

    Args:
        df: Input DataFrame

    Returns:
        Transformed DataFrame

    Raises:
        ValueError: If ``df`` is empty.
    """
    if df.empty:
        raise ValueError("Cannot transform an empty DataFrame")

    logger.info("🔄 Starting transformation of %d rows", len(df))

    # drop_duplicates() already returns a new frame, so the caller's data is
    # safe; the extra copy detaches the result from its parent so the column
    # assignments in the helpers below cannot raise SettingWithCopyWarning.
    df_transformed = _remove_duplicates(df).copy()
    df_transformed = _handle_nulls(df_transformed)
    df_transformed = _standardize_dates(df_transformed)
    df_transformed = _validate_types(df_transformed)

    return df_transformed
0 commit comments