Skip to content

Commit 32e1b96

Browse files
committed
Added the statements
1 parent 569a4a6 commit 32e1b96

File tree

10 files changed

+100293
-2
lines changed

10 files changed

+100293
-2
lines changed

README.md

Lines changed: 81 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,81 @@
1-
# template
2-
A Template Repository for OpenSpringFest (OSF)
1+
# 📘 ETL Problems – Open Source Learning Project
2+
3+
Welcome to **ETL Problems**, an open-source project designed for learning, experimenting, and contributing to real-world data engineering workflows.
4+
5+
This repository contains a **deliberately broken ETL pipeline** that mimics issues data engineers face daily. The goal is for contributors to **identify, fix, and enhance** the pipeline — while learning best practices in **data extraction, transformation, and loading**.
6+
7+
---
8+
9+
## 🚀 What’s Inside?
10+
The pipeline follows a simple **ETL flow**:
11+
12+
1. **Extract** → Reads data from a CSV file (with encoding fallback).
13+
2. **Transform** → Cleans, deduplicates, and prepares the dataset.
14+
3. **Load** → Stores processed data into an SQLite database (with idempotency).
15+
16+
---
17+
18+
## ⚠️ Find and Fix Issues
19+
20+
These bugs are intentionally introduced and marked in the code with
21+
`# TODO (Find & Fix): ...`
22+
Contributors should search for these comments and fix the issues.
23+
24+
### Examples:
25+
- Unused imports
26+
- Incorrect default values
27+
- Wrong file extension checks
28+
- Missing error handling
29+
- Print statements instead of logging
30+
- Missing idempotency in database load
31+
- No duplicate removal in transform
32+
- Missing actual logic in extract/transform/load steps
33+
34+
---
35+
36+
## 🎯 Ways to Contribute
37+
38+
- Fix bugs marked with `# TODO (Find & Fix): ...`
39+
- Improve error handling and logging
40+
- Add tests and validation
41+
- Enhance documentation
42+
- Add new features (scrapers, data quality checks, visualizations)
43+
44+
---
45+
46+
## 🛠 Setup Instructions
47+
48+
Clone the repo and install dependencies:
49+
50+
```bash
51+
git clone https://github.com/<your-username>/etl-problems.git
52+
cd etl-problems
53+
pip install -r requirements.txt
54+
python main.py
55+
```
56+
57+
---
58+
59+
## 🧪 Testing
60+
61+
Unit tests can be added in the `tests/` folder.
62+
Run them with:
63+
64+
```bash
65+
pytest tests/
66+
```
67+
68+
---
69+
70+
## 💡 Tips for Contributors
71+
72+
- Search for `# TODO (Find & Fix): ...` in the codebase.
73+
- Check the [Issues](https://github.com/<your-username>/etl-problems/issues) for tasks and guidance.
74+
- If you find a new bug, open an issue and suggest a fix.
75+
- All contributions, big or small, are welcome!
76+
77+
---
78+
79+
## 📬 Questions?
80+
81+
Open an issue or start a discussion in the repo. Happy hacking!

app/data.csv

Lines changed: 100001 additions & 0 deletions
Large diffs are not rendered by default.
2.35 KB
Binary file not shown.
3.75 KB
Binary file not shown.
3.75 KB
Binary file not shown.

app/etl/extract.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import pandas as pd
2+
import os
3+
# TODO (Find & Fix)
4+
from typing import Optional
5+
6+
def extract(path: str = "xyz.csv") -> pd.DataFrame:
    """
    Extract data from a CSV file, trying several encodings in order.

    Args:
        path: Path to the CSV file.

    Returns:
        pd.DataFrame containing the extracted data.

    Raises:
        FileNotFoundError: If the file doesn't exist.
        ValueError: If the file is not a CSV, is empty, cannot be decoded,
            or cannot be parsed.
    """
    import logging
    logger = logging.getLogger(__name__)

    # Validate file path before attempting any reads.
    if not os.path.exists(path):
        raise FileNotFoundError(f"❌ File not found: {path}")

    # Fixed: the original checked for '.txt' while the error demanded a CSV.
    if not path.lower().endswith('.csv'):
        raise ValueError(f"❌ File must be a CSV: {path}")

    try:
        # Try common encodings in order; the first successful read wins.
        encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
        df = None

        for encoding in encodings:
            try:
                df = pd.read_csv(path, encoding=encoding)
                break
            except UnicodeDecodeError:
                # Wrong encoding for this file — try the next candidate.
                continue

        if df is None:
            raise ValueError(f"❌ Could not decode file with any of: {encodings}")

        # Validate data: an empty frame is useless downstream.
        if df.empty:
            raise ValueError("❌ File contains no data")

        logger.info("✅ Extracted %d rows and %d columns", len(df), len(df.columns))
        return df

    # NOTE: pandas EmptyDataError/ParserError subclass ValueError, so they
    # must be caught before the bare ValueError pass-through below.
    except pd.errors.EmptyDataError:
        raise ValueError("❌ File contains no data")
    except pd.errors.ParserError as e:
        raise ValueError(f"❌ Error parsing CSV: {e}")
    except ValueError:
        # Our own validation errors above — re-raise unwrapped.
        raise
    except Exception as e:
        raise ValueError(f"❌ Unexpected error reading file: {e}")

app/etl/load.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
import pandas as pd
2+
import sqlite3
3+
import os
4+
# TODO (Find & Fix)
5+
from typing import Optional
6+
7+
def load(df: pd.DataFrame, db_path: str = "etl_data.db", table_name: str = "processed_data"):
    """
    Load a DataFrame into an SQLite database idempotently.

    The target table is fully rewritten on every call (schema derived from
    the frame), so re-running the pipeline does not duplicate rows.

    Args:
        df: DataFrame to load.
        db_path: Path to the SQLite database file.
        table_name: Name of the table to create/replace.

    Raises:
        sqlite3.Error: If a database operation fails.
    """
    import logging
    logger = logging.getLogger(__name__)

    if df.empty:
        logger.warning("⚠️ Warning: Empty DataFrame received, nothing to load")
        return

    logger.info("🔄 Loading %d rows into database '%s'", len(df), db_path)

    # Ensure the target directory exists before connecting.
    db_dir = os.path.dirname(db_path)
    if db_dir and not os.path.exists(db_dir):
        os.makedirs(db_dir)

    conn = None
    try:
        conn = sqlite3.connect(db_path)

        # Idempotency: replace the table instead of blindly appending, so
        # repeated runs leave the same row count (no duplicate inserts).
        df.to_sql(table_name, conn, if_exists="replace", index=False)
        conn.commit()
        logger.info("✅ Loaded %d rows into table '%s'", len(df), table_name)

    except sqlite3.Error as e:
        # Surface DB failures instead of silently swallowing them.
        logger.error("❌ Database error while loading: %s", e)
        raise
    except Exception as e:
        logger.error("❌ Unexpected error while loading: %s", e)
        raise
    finally:
        # Always release the connection, even on failure.
        if conn is not None:
            conn.close()

app/etl/transform.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import pandas as pd
2+
from datetime import datetime
3+
# TODO (Find & Fix)
4+
from typing import Optional
5+
6+
def transform(df: pd.DataFrame) -> pd.DataFrame:
    """
    Transform data by cleaning and standardizing it.

    Steps: remove duplicate rows, fill nulls (numeric -> column median,
    text -> "unknown"), parse likely date columns, and normalize text
    columns (strip whitespace, lowercase).

    Args:
        df: Input DataFrame.

    Returns:
        Transformed DataFrame (the input frame is not modified).

    Raises:
        ValueError: If the input DataFrame is empty.
    """
    import logging
    logger = logging.getLogger(__name__)

    if df.empty:
        raise ValueError("❌ Cannot transform an empty DataFrame")

    # Create a copy to avoid modifying the caller's frame.
    df_transformed = df.copy()

    logger.info("🔄 Starting transformation of %d rows", len(df_transformed))

    # Handle duplicates.
    initial_rows = len(df_transformed)
    df_transformed = df_transformed.drop_duplicates().reset_index(drop=True)
    duplicates_removed = initial_rows - len(df_transformed)
    if duplicates_removed > 0:
        logger.info("🧹 Removed %d duplicate rows", duplicates_removed)

    # Handle null values in numeric columns (median is robust to outliers).
    numeric_columns = df_transformed.select_dtypes(include=['number']).columns
    for col in numeric_columns:
        if df_transformed[col].isna().any():
            df_transformed[col] = df_transformed[col].fillna(df_transformed[col].median())

    # Handle null values in text columns with an explicit placeholder.
    text_columns = df_transformed.select_dtypes(include=['object']).columns
    for col in text_columns:
        if df_transformed[col].isna().any():
            df_transformed[col] = df_transformed[col].fillna("unknown")

    # Standardize date columns (look for common date column names).
    date_columns = [col for col in df_transformed.columns
                    if any(keyword in col.lower() for keyword in ['date', 'time', 'created', 'updated'])]

    for col in date_columns:
        # errors="coerce" turns unparseable values into NaT instead of raising.
        df_transformed[col] = pd.to_datetime(df_transformed[col], errors="coerce")

    # Clean text columns: strip surrounding whitespace and lowercase.
    for col in text_columns:
        if col not in date_columns:
            df_transformed[col] = df_transformed[col].astype(str).str.strip().str.lower()

    return df_transformed

app/main.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
from app.etl.extract import extract
2+
from app.etl.transform import transform
3+
from app.etl.load import load
4+
5+
def run_pipeline(csv_path: str = "data.csv", db_path: str = "etl_data.db"):
    """
    Run the complete ETL pipeline: extract -> transform -> load.

    Args:
        csv_path: Path to the input CSV file.
        db_path: Path to the output SQLite database.

    Raises:
        FileNotFoundError: If the input file does not exist.
        ValueError: If extraction or transformation fails.
    """
    try:
        print("🚀 Starting ETL Pipeline")
        print(f"📁 Input file: {csv_path}")
        print(f"🗄️ Output database: {db_path}")
        print("-" * 50)

        # Extract
        print("📥 STEP 1: EXTRACT")
        df = extract(csv_path)
        print(f"✅ Extracted {len(df)} rows")
        print(f"📊 Columns: {list(df.columns)}")
        print()

        # Transform
        print("🔄 STEP 2: TRANSFORM")
        df_transformed = transform(df)
        print(f"✅ Transformed data ready")
        print()

        # Load
        print("📤 STEP 3: LOAD")
        load(df_transformed, db_path)
        print()

        print("🎉 ETL Pipeline completed successfully!")
        print(f"📈 Final dataset: {len(df_transformed)} rows, {len(df_transformed.columns)} columns")

    except FileNotFoundError as e:
        # Report and re-raise so callers/CI see the failure.
        print(f"❌ File Error: {e}")
        raise

    except ValueError as e:
        # Fixed: previously swallowed silently, hiding data errors.
        print(f"❌ Data Error: {e}")
        raise
    except Exception as e:
        # Fixed: previously swallowed silently, masking all failures.
        print(f"❌ Unexpected Error: {e}")
        raise

if __name__ == "__main__":
    # Run the pipeline
    run_pipeline()

app/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
pandas>=2.0.0

0 commit comments

Comments
 (0)