import logging

import pandas as pd
# TODO (Find & Fix)

logger = logging.getLogger(__name__)

# Substrings that flag a column name as holding date/time values.
_DATE_KEYWORDS = ('date', 'time', 'created', 'updated')

# Value written into text columns where the original value is missing.
_TEXT_NULL_PLACEHOLDER = 'Unknown'

# dtypes treated as "text": classic object columns plus pandas string dtypes.
_TEXT_DTYPES = ['object', 'string']


def _remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without fully duplicated rows (first occurrence is kept)."""
    initial_rows = len(df)
    df = df.drop_duplicates()
    duplicates_removed = initial_rows - len(df)
    if duplicates_removed > 0:
        logger.info("✅ Removed %d duplicate rows.", duplicates_removed)
    return df


def _handle_nulls(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing values: column median for numeric, a placeholder for text.

    Assigns the filled columns back onto ``df`` and returns it. Columns
    without nulls are left untouched so their dtype does not change.
    """
    numeric_columns = df.select_dtypes(include=['number']).columns
    for col in numeric_columns:
        if not df[col].isna().any():
            continue
        median = df[col].median()
        if pd.isna(median):
            # Every value is missing, so there is nothing to derive a fill from.
            continue
        try:
            df[col] = df[col].fillna(median)
        except (TypeError, ValueError) as e:
            # e.g. a nullable integer column whose median is fractional.
            logger.warning("⚠️ Could not fill nulls in column '%s': %s", col, e)

    text_columns = df.select_dtypes(include=_TEXT_DTYPES).columns
    for col in text_columns:
        if df[col].isna().any():
            df[col] = df[col].fillna(_TEXT_NULL_PLACEHOLDER)

    return df


def _standardize_dates(df: pd.DataFrame) -> pd.DataFrame:
    """Rewrite date-like columns as 'YYYY-MM-DD HH:MM:SS' strings.

    A column is date-like when its name contains one of ``_DATE_KEYWORDS``
    (case-insensitive). Values that cannot be parsed become missing. A column
    whose conversion raises is left unchanged and a warning is logged.
    """
    # str(col) keeps non-string column labels (e.g. integers) from raising.
    date_columns = [
        col for col in df.columns
        if any(keyword in str(col).lower() for keyword in _DATE_KEYWORDS)
    ]
    for col in date_columns:
        try:
            parsed = pd.to_datetime(df[col], errors='coerce', format='mixed')
            # Assign only once both steps succeeded, so a failure cannot leave
            # the column half-converted.
            df[col] = parsed.dt.strftime('%Y-%m-%d %H:%M:%S')
        except Exception as e:  # best effort: one bad column must not abort the run
            logger.warning("⚠️ Could not standardize column '%s': %s", col, e)
            continue

        example = df[col].iloc[0] if len(df) else None
        logger.info("✅ Standardized date column '%s' (e.g., %s)", col, example)
    return df


def _validate_types(df: pd.DataFrame) -> pd.DataFrame:
    """Clean text columns by stripping whitespace and lowercasing.

    Only ``str`` values are modified; missing values and any non-string
    objects stored in a text column pass through unchanged.
    """
    text_columns = df.select_dtypes(include=_TEXT_DTYPES).columns
    for col in text_columns:
        df[col] = df[col].map(
            lambda value: value.strip().lower() if isinstance(value, str) else value
        )
    return df


def transform(df: pd.DataFrame) -> pd.DataFrame:
    """
    Transform data by cleaning and standardizing it.

    Works on a copy, so the caller's DataFrame is not modified. Steps, in
    order: drop duplicate rows, fill nulls, standardize date-like columns,
    clean text columns. Progress is reported through the module logger.

    Args:
        df: Input DataFrame

    Returns:
        Transformed DataFrame

    Raises:
        ValueError: If ``df`` is empty.
    """
    if df.empty:
        raise ValueError("Cannot transform an empty DataFrame")

    # Create a copy to avoid modifying original
    df_transformed = df.copy()

    logger.info("🔄 Starting transformation of %d rows", len(df_transformed))
    df_transformed = _remove_duplicates(df_transformed)
    df_transformed = _handle_nulls(df_transformed)
    df_transformed = _standardize_dates(df_transformed)
    df_transformed = _validate_types(df_transformed)

    return df_transformed
# 0 commit comments