Skip to content

Commit 32e1b96

Browse files
committed
Added the statements
1 parent 569a4a6 commit 32e1b96

File tree

10 files changed

+100293
-2
lines changed

10 files changed

+100293
-2
lines changed

README.md

Lines changed: 81 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,81 @@
1-
# template
2-
A Template Repository for OpenSpringFest (OSF)
1+
# 📘 ETL Problems – Open Source Learning Project
2+
3+
Welcome to **ETL Problems**, an open-source project designed for learning, experimenting, and contributing to real-world data engineering workflows.
4+
5+
This repository contains a **deliberately broken ETL pipeline** that mimics issues data engineers face daily. The goal is for contributors to **identify, fix, and enhance** the pipeline — while learning best practices in **data extraction, transformation, and loading**.
6+
7+
---
8+
9+
## 🚀 What’s Inside?
10+
The pipeline follows a simple **ETL flow**:
11+
12+
1. **Extract** → Reads data from a CSV file (with encoding fallback).
13+
2. **Transform** → Cleans, deduplicates, and prepares the dataset.
14+
3. **Load** → Stores processed data into an SQLite database (with idempotency).
15+
16+
---
17+
18+
## ⚠️ Find and Fix Issues
19+
20+
These bugs are intentionally introduced and marked in the code with
21+
`# TODO (Find & Fix): ...`
22+
Contributors should search for these comments and fix the issues.
23+
24+
### Examples:
25+
- Unused imports
26+
- Incorrect default values
27+
- Wrong file extension checks
28+
- Missing error handling
29+
- Print statements instead of logging
30+
- Missing idempotency in database load
31+
- No duplicate removal in transform
32+
- Missing actual logic in extract/transform/load steps
33+
34+
---
35+
36+
## 🎯 Ways to Contribute
37+
38+
- Fix bugs marked with `# TODO (Find & Fix): ...`
39+
- Improve error handling and logging
40+
- Add tests and validation
41+
- Enhance documentation
42+
- Add new features (scrapers, data quality checks, visualizations)
43+
44+
---
45+
46+
## 🛠 Setup Instructions
47+
48+
Clone the repo and install dependencies:
49+
50+
```bash
51+
git clone https://github.com/<your-username>/etl-problems.git
52+
cd etl-problems
53+
pip install -r requirements.txt
54+
python main.py
55+
```
56+
57+
---
58+
59+
## 🧪 Testing
60+
61+
Unit tests can be added in the `tests/` folder.
62+
Run them with:
63+
64+
```bash
65+
pytest tests/
66+
```
67+
68+
---
69+
70+
## 💡 Tips for Contributors
71+
72+
- Search for `# TODO (Find & Fix): ...` in the codebase.
73+
- Check the [Issues](https://github.com/<your-username>/etl-problems/issues) for tasks and guidance.
74+
- If you find a new bug, open an issue and suggest a fix.
75+
- All contributions, big or small, are welcome!
76+
77+
---
78+
79+
## 📬 Questions?
80+
81+
Open an issue or start a discussion in the repo. Happy hacking!

app/data.csv

Lines changed: 100001 additions & 0 deletions
Large diffs are not rendered by default.
2.35 KB
Binary file not shown.
3.75 KB
Binary file not shown.
3.75 KB
Binary file not shown.

app/etl/extract.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import pandas as pd
2+
import os
3+
# TODO (Find & Fix)
4+
from typing import Optional
5+
6+
def extract(path: str = "xyz.csv") -> pd.DataFrame:
    """
    Extract data from a CSV file, trying several encodings in order.

    Args:
        path: Path to the CSV file.

    Returns:
        pd.DataFrame containing the extracted data.

    Raises:
        FileNotFoundError: If the file doesn't exist.
        ValueError: If the file is not a CSV, is empty, cannot be decoded,
            or cannot be parsed.
    """
    import logging
    logger = logging.getLogger(__name__)

    # Validate file path before attempting any reads.
    if not os.path.exists(path):
        raise FileNotFoundError(f"❌ File not found: {path}")

    # Fixed: the original checked for '.txt' while the error demanded a CSV.
    if not path.lower().endswith('.csv'):
        raise ValueError(f"❌ File must be a CSV: {path}")

    try:
        # Try common encodings in order; the first successful read wins.
        encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
        df = None

        for encoding in encodings:
            try:
                df = pd.read_csv(path, encoding=encoding)
                break
            except UnicodeDecodeError:
                # Wrong encoding for this file — try the next candidate.
                continue

        if df is None:
            raise ValueError(f"❌ Could not decode file with any of: {encodings}")

        # Validate data: an empty frame is useless downstream.
        if df.empty:
            raise ValueError("❌ File contains no data")

        logger.info("✅ Extracted %d rows and %d columns", len(df), len(df.columns))
        return df

    # NOTE: pandas EmptyDataError/ParserError subclass ValueError, so they
    # must be caught before the bare ValueError pass-through below.
    except pd.errors.EmptyDataError:
        raise ValueError("❌ File contains no data")
    except pd.errors.ParserError as e:
        raise ValueError(f"❌ Error parsing CSV: {e}")
    except ValueError:
        # Our own validation errors above — re-raise unwrapped.
        raise
    except Exception as e:
        raise ValueError(f"❌ Unexpected error reading file: {e}")

app/etl/load.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import pandas as pd
2+
import sqlite3
3+
import os
4+
# TODO (Find & Fix)
5+
from typing import Optional
6+
7+
def load(df: pd.DataFrame, db_path: str = "etl_data.db", table_name: str = "processed_data"):
    """
    Load a DataFrame into an SQLite database idempotently.

    The target table is fully rewritten on every call (schema derived from
    the frame), so re-running the pipeline does not duplicate rows.

    Args:
        df: DataFrame to load.
        db_path: Path to the SQLite database file.
        table_name: Name of the table to create/replace.

    Raises:
        sqlite3.Error: If a database operation fails.
    """
    import logging
    logger = logging.getLogger(__name__)

    if df.empty:
        logger.warning("⚠️ Warning: Empty DataFrame received, nothing to load")
        return

    logger.info("🔄 Loading %d rows into database '%s'", len(df), db_path)

    # Ensure the target directory exists before connecting.
    db_dir = os.path.dirname(db_path)
    if db_dir and not os.path.exists(db_dir):
        os.makedirs(db_dir)

    conn = None
    try:
        conn = sqlite3.connect(db_path)

        # Idempotency: replace the table instead of blindly appending, so
        # repeated runs leave the same row count (no duplicate inserts).
        df.to_sql(table_name, conn, if_exists="replace", index=False)
        conn.commit()
        logger.info("✅ Loaded %d rows into table '%s'", len(df), table_name)

    except sqlite3.Error as e:
        # Surface DB failures instead of silently swallowing them.
        logger.error("❌ Database error while loading: %s", e)
        raise
    except Exception as e:
        logger.error("❌ Unexpected error while loading: %s", e)
        raise
    finally:
        # Always release the connection, even on failure.
        if conn is not None:
            conn.close()

app/etl/transform.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import pandas as pd
2+
from datetime import datetime
3+
# TODO (Find & Fix)
4+
from typing import Optional
5+
6+
def transform(df: pd.DataFrame) -> pd.DataFrame:
    """
    Transform data by cleaning and standardizing it.

    Steps: remove duplicate rows, fill nulls (numeric -> column median,
    text -> "unknown"), parse likely date columns, and normalize text
    columns (strip whitespace, lowercase).

    Args:
        df: Input DataFrame.

    Returns:
        Transformed DataFrame (the input frame is not modified).

    Raises:
        ValueError: If the input DataFrame is empty.
    """
    import logging
    logger = logging.getLogger(__name__)

    if df.empty:
        raise ValueError("❌ Cannot transform an empty DataFrame")

    # Create a copy to avoid modifying the caller's frame.
    df_transformed = df.copy()

    logger.info("🔄 Starting transformation of %d rows", len(df_transformed))

    # Handle duplicates.
    initial_rows = len(df_transformed)
    df_transformed = df_transformed.drop_duplicates().reset_index(drop=True)
    duplicates_removed = initial_rows - len(df_transformed)
    if duplicates_removed > 0:
        logger.info("🧹 Removed %d duplicate rows", duplicates_removed)

    # Handle null values in numeric columns (median is robust to outliers).
    numeric_columns = df_transformed.select_dtypes(include=['number']).columns
    for col in numeric_columns:
        if df_transformed[col].isna().any():
            df_transformed[col] = df_transformed[col].fillna(df_transformed[col].median())

    # Handle null values in text columns with an explicit placeholder.
    text_columns = df_transformed.select_dtypes(include=['object']).columns
    for col in text_columns:
        if df_transformed[col].isna().any():
            df_transformed[col] = df_transformed[col].fillna("unknown")

    # Standardize date columns (look for common date column names).
    date_columns = [col for col in df_transformed.columns
                    if any(keyword in col.lower() for keyword in ['date', 'time', 'created', 'updated'])]

    for col in date_columns:
        # errors="coerce" turns unparseable values into NaT instead of raising.
        df_transformed[col] = pd.to_datetime(df_transformed[col], errors="coerce")

    # Clean text columns: strip surrounding whitespace and lowercase.
    for col in text_columns:
        if col not in date_columns:
            df_transformed[col] = df_transformed[col].astype(str).str.strip().str.lower()

    return df_transformed

app/main.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
from app.etl.extract import extract
2+
from app.etl.transform import transform
3+
from app.etl.load import load
4+
5+
def run_pipeline(csv_path: str = "data.csv", db_path: str = "etl_data.db"):
    """
    Run the complete ETL pipeline: extract -> transform -> load.

    Args:
        csv_path: Path to the input CSV file.
        db_path: Path to the output SQLite database.

    Raises:
        FileNotFoundError: If the input file does not exist.
        ValueError: If extraction or transformation fails.
    """
    try:
        print("🚀 Starting ETL Pipeline")
        print(f"📁 Input file: {csv_path}")
        print(f"🗄️ Output database: {db_path}")
        print("-" * 50)

        # Extract
        print("📥 STEP 1: EXTRACT")
        df = extract(csv_path)
        print(f"✅ Extracted {len(df)} rows")
        print(f"📊 Columns: {list(df.columns)}")
        print()

        # Transform
        print("🔄 STEP 2: TRANSFORM")
        df_transformed = transform(df)
        print(f"✅ Transformed data ready")
        print()

        # Load
        print("📤 STEP 3: LOAD")
        load(df_transformed, db_path)
        print()

        print("🎉 ETL Pipeline completed successfully!")
        print(f"📈 Final dataset: {len(df_transformed)} rows, {len(df_transformed.columns)} columns")

    except FileNotFoundError as e:
        # Report and re-raise so callers/CI see the failure.
        print(f"❌ File Error: {e}")
        raise

    except ValueError as e:
        # Fixed: previously swallowed silently, hiding data errors.
        print(f"❌ Data Error: {e}")
        raise
    except Exception as e:
        # Fixed: previously swallowed silently, masking all failures.
        print(f"❌ Unexpected Error: {e}")
        raise

if __name__ == "__main__":
    # Run the pipeline
    run_pipeline()

app/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
pandas>=2.0.0

0 commit comments

Comments
 (0)