import pandas as pd
import logging as lg
# TODO (Find & Fix)

logger = lg.getLogger(__name__)
logger.setLevel(lg.DEBUG)


def transform(df: pd.DataFrame) -> pd.DataFrame:
    """
    Transform data by cleaning and standardizing it.

    Args:
        df: Input DataFrame

    Returns:
        Transformed DataFrame
    """
    if df.empty:
        # TODO (Find & Fix): Should raise a ValueError if DataFrame is empty
        raise ValueError("Input DataFrame is empty; nothing to transform")

    # Create a copy to avoid modifying original
    df_transformed = df.copy()

    # TODO (Find & Fix): Use logging instead of print
    logger.info(f"🔄 Starting transformation of {len(df_transformed)} rows")

    # Handle duplicates
    initial_rows = len(df_transformed)
    # Removing duplicates
    df_transformed = df_transformed.drop_duplicates()

    duplicates_removed = initial_rows - len(df_transformed)
    if duplicates_removed > 0:
        # Number of duplicates removed
        logger.info(f"✅ Removed {duplicates_removed} duplicate rows.")

    # Handle null values in numeric columns
    numeric_columns = df_transformed.select_dtypes(include=['number']).columns
    for col in numeric_columns:
        # TODO (Find & Fix): Nulls in numeric columns are not handled
        # Fill with the column median (assumed strategy; swap in mean or a constant if preferred)
        if df_transformed[col].isnull().any():
            df_transformed[col] = df_transformed[col].fillna(df_transformed[col].median())

    # Handle null values in text columns
    text_columns = df_transformed.select_dtypes(include=['object']).columns
    for col in text_columns:
        # TODO (Find & Fix): Nulls in text columns are not handled
        # Fill with a placeholder value (assumed strategy)
        if df_transformed[col].isnull().any():
            df_transformed[col] = df_transformed[col].fillna('unknown')

    # Standardize date columns (look for common date column names)
    date_columns = [col for col in df_transformed.columns
                    if any(keyword in col.lower() for keyword in ['date', 'time', 'created', 'updated'])]

    for col in date_columns:
        try:
            # infer_datetime_format is deprecated since pandas 2.0; the default parser suffices
            df_transformed[col] = pd.to_datetime(df_transformed[col], errors='coerce')
            # Standardize all dates to 'YYYY-MM-DD HH:MM:SS'
            df_transformed[col] = df_transformed[col].dt.strftime('%Y-%m-%d %H:%M:%S')

            logger.info(f"✅ Standardized date column '{col}' (e.g., {df_transformed[col].iloc[0]})")
        except Exception as e:
            logger.error(f"⚠️ Could not standardize column '{col}': {e}")

    # TODO (Find & Fix): Text columns are not cleaned (strip, lowercase)
    # Strip surrounding whitespace and lowercase text values; non-string entries
    # come back as NaN from the .str accessor
    for col in text_columns:
        df_transformed[col] = df_transformed[col].str.strip().str.lower()

    return df_transformed
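

# Hedged usage sketch (not part of the original module): a minimal demo on a toy
# DataFrame with a duplicate row, a numeric null, a text null, and a date column.
# Column names and values below are illustrative assumptions only.
if __name__ == "__main__":
    lg.basicConfig(level=lg.INFO)  # attach a root handler so the INFO logs above are visible

    sample = pd.DataFrame({
        "name": ["  Alice ", "  Alice ", None, "BOB"],
        "amount": [10.0, 10.0, None, 25.5],
        "created_date": ["2024-01-01", "2024-01-01", "2024-02-15", "not a date"],
    })
    print(transform(sample))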