import pandas as pd
import logging as lg
# TODO (Find & Fix)

logger = lg.getLogger(__name__)
logger.setLevel(lg.DEBUG)


def transform(df: pd.DataFrame) -> pd.DataFrame:
    """
    Transform data by cleaning and standardizing it.

    Args:
        df: Input DataFrame

    Returns:
        Transformed DataFrame
    """
    if df.empty:
        # TODO (Find & Fix): Should raise a ValueError if DataFrame is empty
        raise ValueError("Input DataFrame is empty; nothing to transform")

    # Create a copy to avoid modifying original
    df_transformed = df.copy()

    # TODO (Find & Fix): Use logging instead of print
    logger.info(f"🔄 Starting transformation of {len(df_transformed)} rows")

    # Handle duplicates
    initial_rows = len(df_transformed)
    # Removing duplicates
    df_transformed = df_transformed.drop_duplicates()

    duplicates_removed = initial_rows - len(df_transformed)
    if duplicates_removed > 0:
        # Number of duplicates removed
        logger.info(f"✅ Removed {duplicates_removed} duplicate rows.")

    # Handle null values in numeric columns
    numeric_columns = df_transformed.select_dtypes(include=['number']).columns
    for col in numeric_columns:
        # TODO (Find & Fix): Nulls in numeric columns are not handled
        # Fill with the column median (assumed strategy; swap in mean or a constant if preferred)
        if df_transformed[col].isnull().any():
            df_transformed[col] = df_transformed[col].fillna(df_transformed[col].median())

    # Handle null values in text columns
    text_columns = df_transformed.select_dtypes(include=['object']).columns
    for col in text_columns:
        # TODO (Find & Fix): Nulls in text columns are not handled
        # Fill with a placeholder value (assumed strategy)
        if df_transformed[col].isnull().any():
            df_transformed[col] = df_transformed[col].fillna('unknown')

    # Standardize date columns (look for common date column names)
    date_columns = [col for col in df_transformed.columns
                    if any(keyword in col.lower() for keyword in ['date', 'time', 'created', 'updated'])]

    for col in date_columns:
        try:
            # infer_datetime_format is deprecated since pandas 2.0; the default parser suffices
            df_transformed[col] = pd.to_datetime(df_transformed[col], errors='coerce')
            # Standardize all dates to 'YYYY-MM-DD HH:MM:SS'
            df_transformed[col] = df_transformed[col].dt.strftime('%Y-%m-%d %H:%M:%S')

            logger.info(f"✅ Standardized date column '{col}' (e.g., {df_transformed[col].iloc[0]})")
        except Exception as e:
            logger.error(f"⚠️ Could not standardize column '{col}': {e}")

    # TODO (Find & Fix): Text columns are not cleaned (strip, lowercase)
    # Strip surrounding whitespace and lowercase text values; non-string entries
    # come back as NaN from the .str accessor
    for col in text_columns:
        df_transformed[col] = df_transformed[col].str.strip().str.lower()

    return df_transformed
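

# Hedged usage sketch (not part of the original module): a minimal demo on a toy
# DataFrame with a duplicate row, a numeric null, a text null, and a date column.
# Column names and values below are illustrative assumptions only.
if __name__ == "__main__":
    lg.basicConfig(level=lg.INFO)  # attach a root handler so the INFO logs above are visible

    sample = pd.DataFrame({
        "name": ["  Alice ", "  Alice ", None, "BOB"],
        "amount": [10.0, 10.0, None, 25.5],
        "created_date": ["2024-01-01", "2024-01-01", "2024-02-15", "not a date"],
    })
    print(transform(sample))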