
Commit 15a0ed2

feat: implemented logging
1 parent ae242dd commit 15a0ed2

4 files changed: +88 -63 lines changed

app/etl/extract.py

Lines changed: 32 additions & 21 deletions
```diff
@@ -1,68 +1,79 @@
 import pandas as pd
 import os
+import logging as lg
+
 # TODO (Find & Fix)
 
-def extract(path: str = "xyz.csv") -> pd.DataFrame :
+
+logger = lg.getLogger(__name__)
+logger.setLevel(lg.DEBUG)
+
+
+def extract(path: str = "xyz.csv") -> pd.DataFrame:
     """
     Extracts data from CSV, Excel, or JSON file.
-
+
     Args:
         path: Path to the data file (supports .csv, .xlsx, .json)
-
+
     Returns:
         pd.DataFrame: DataFrame containing the extracted data
-
+
     Raises:
         FileNotFoundError: If the file doesn't exist
         ValueError: If the file is empty or invalid
     """
     # Validate file path
     if not os.path.exists(path):
         raise FileNotFoundError(f"❌ File not found: {path}")
-
+
     # Get file extension
     ext = os.path.splitext(path)[-1].lower()
-
+
     # Check if file format is supported
     if ext not in ['.csv', '.xlsx', '.xls', '.json']:
         raise ValueError(f"Unsupported file format: {ext}")
-
+
     try:
         if ext == '.csv':
             # Try different encodings for CSV files
             encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
             df = None
-
+
             for encoding in encodings:
                 try:
                     df = pd.read_csv(path, encoding=encoding)
-                    print(f"Successfully read CSV with encoding: {encoding}")
+                    logger.info(f"Successfully read CSV with encoding: {encoding}")
                     break
                 except UnicodeDecodeError:
-                    print(f"Failed to read with encoding '{encoding}'")
+                    logger.error(f"Failed to read with encoding '{encoding}'")
                     continue
                 except Exception as e:
-                    print(f"Error reading with encoding '{encoding}': {e}")
+                    logger.error(
+                        f"Error reading with encoding '{encoding}': {e}")
                     continue
-
+
             if df is None:
-                raise ValueError(f"Could not read CSV with tried encodings: {encodings}")
-
+                raise ValueError(
+                    f"Could not read CSV with tried encodings: {encodings}")
+
         elif ext in ['.xls', '.xlsx']:
             df = pd.read_excel(path)
-            print(f"Successfully read Excel file: {path}")
-
+            logger.info(f"Successfully read Excel file: {path}")
+
         elif ext == '.json':
             df = pd.read_json(path)
-            print(f"Successfully read JSON file: {path}")
-
+            logger.info(f"Successfully read JSON file: {path}")
+
         # Validate data
         if df.empty:
             raise ValueError("File contains no data")
-
-        print(f"✅ Extracted {len(df)} rows and {len(df.columns)} columns")  # TODO: Use logging instead of print
+
+        # TODO: Use logging instead of print
+        logger.info(
+            f"✅ Extracted {len(df)} rows and {len(df.columns)} columns")
         return df
-
+
     except pd.errors.EmptyDataError:
         raise ValueError("❌ File contains no data")
     except pd.errors.ParserError as e:
```
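A note on the pattern each module adopts here: `lg.getLogger(__name__)` gives every file its own named logger, but calling `logger.setLevel(lg.DEBUG)` inside the module hard-codes verbosity and overrides whatever the application configures later. Below is a minimal sketch of the more conventional split, where modules only create loggers and the entry point sets the level and format; the format string and the `read_rows` helper are illustrative assumptions, not part of this commit.

```python
import logging

# Module code: create a named logger, configure nothing here.
logger = logging.getLogger(__name__)


def read_rows(path: str) -> None:
    # Hypothetical helper; %-style lazy formatting defers interpolation
    # until the record is actually emitted (the commit uses f-strings).
    logger.info("Reading %s", path)


if __name__ == "__main__":
    # Entry point: configure level, handlers, and format exactly once.
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s %(name)s %(levelname)s: %(message)s",
    )
    read_rows("data.csv")
```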

app/etl/load.py

Lines changed: 6 additions & 2 deletions
```diff
@@ -1,7 +1,11 @@
 import pandas as pd
 import sqlite3
 import os
+import logging as lg
 # TODO (Find & Fix)
+logger = lg.getLogger(__name__)
+logger.setLevel(lg.DEBUG)
+
 
 def load(df: pd.DataFrame, db_path: str = "etl_data.db", table_name: str = "processed_data"):
     """
@@ -13,10 +17,10 @@ def load(df: pd.DataFrame, db_path: str = "etl_data.db", table_name: str = "proc
         table_name: Name of the table to create/update
     """
     if df.empty:
-        print("⚠️ Warning: Empty DataFrame received, nothing to load")  # TODO (Find & Fix)
+        logger.warning("⚠️ Warning: Empty DataFrame received, nothing to load")  # TODO (Find & Fix)
         return
 
-    print(f"🔄 Loading {len(df)} rows into database '{db_path}'")  # TODO (Find & Fix)
+    logger.info(f"🔄 Loading {len(df)} rows into database '{db_path}'")  # TODO (Find & Fix)
 
     # Ensure directory exists
     db_dir = os.path.dirname(db_path)
```
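This hunk ends before the actual database write. Assuming the usual pandas-to-SQLite pattern, `load()` presumably continues roughly as sketched below; the `to_sql` arguments and connection handling are assumptions, since the diff does not show them.

```python
import logging
import os
import sqlite3

import pandas as pd

logger = logging.getLogger(__name__)


def load(df: pd.DataFrame, db_path: str = "etl_data.db",
         table_name: str = "processed_data") -> None:
    """Sketch of a plausible continuation of load(); not from the diff."""
    if df.empty:
        logger.warning("Empty DataFrame received, nothing to load")
        return
    # Ensure the target directory exists (this part matches the diff context).
    db_dir = os.path.dirname(db_path)
    if db_dir:
        os.makedirs(db_dir, exist_ok=True)
    # Assumed write policy: replace the table wholesale on each run.
    with sqlite3.connect(db_path) as conn:
        df.to_sql(table_name, conn, if_exists="replace", index=False)
    logger.info("Loaded %d rows into %s:%s", len(df), db_path, table_name)
```

One small nit the diff keeps: a message that begins with "⚠️ Warning:" duplicates the WARNING level name that a standard formatter already prints.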

app/etl/transform.py

Lines changed: 30 additions & 23 deletions
```diff
@@ -1,62 +1,69 @@
 import pandas as pd
+import logging as lg
 # TODO (Find & Fix)
 
+logger = lg.getLogger(__name__)
+logger.setLevel(lg.DEBUG)
+
+
 def transform(df: pd.DataFrame) -> pd.DataFrame:
     """
     Transform data by cleaning and standardizing it.
-
+
     Args:
         df: Input DataFrame
-
+
     Returns:
         Transformed DataFrame
     """
     if df.empty:
         # TODO (Find & Fix): Should raise a ValueError if DataFrame is empty
         pass
-
-    # Create a copy to avoid modifying original
+
+    # Create a copy to avoid modifying original
     df_transformed = df.copy()
-
-    print(f"🔄 Starting transformation of {len(df_transformed)} rows")  # TODO (Find & Fix): Use logging instead of print
-
+
+    # TODO (Find & Fix): Use logging instead of print
+    logger.info(f"🔄 Starting transformation of {len(df_transformed)} rows")
+
     # Handle duplicates
     initial_rows = len(df_transformed)
-    # Removing duplicates
-    df_transformed=df_transformed.drop_duplicates()
+    # Removing duplicates
+    df_transformed = df_transformed.drop_duplicates()
 
     duplicates_removed = initial_rows - len(df_transformed)
     if duplicates_removed > 0:
         # Number of duplicates removed
-        print(f"✅ Removed {duplicates_removed} duplicate rows.")
-
-
+        logger.info(f"✅ Removed {duplicates_removed} duplicate rows.")
+
     # Handle null values in numeric columns
     numeric_columns = df_transformed.select_dtypes(include=['number']).columns
     for col in numeric_columns:
         # TODO (Find & Fix): Nulls in numeric columns are not handled
         pass
-
+
     # Handle null values in text columns
     text_columns = df_transformed.select_dtypes(include=['object']).columns
     for col in text_columns:
         # TODO (Find & Fix): Nulls in text columns are not handled
         pass
-
+
     # Standardize date columns (look for common date column names)
-    date_columns = [col for col in df_transformed.columns
-                    if any(keyword in col.lower() for keyword in ['date', 'time', 'created', 'updated'])]
-
+    date_columns = [col for col in df_transformed.columns
+                    if any(keyword in col.lower() for keyword in ['date', 'time', 'created', 'updated'])]
+
     for col in date_columns:
         try:
-            df_transformed[col] = pd.to_datetime(df_transformed[col], errors='coerce', infer_datetime_format=True)
+            df_transformed[col] = pd.to_datetime(
+                df_transformed[col], errors='coerce', infer_datetime_format=True)
             # Standardize all dates to 'YYYY-MM-DD HH:MM:SS'
-            df_transformed[col] = df_transformed[col].dt.strftime('%Y-%m-%d %H:%M:%S')
-
-            print(f"✅ Standardized date column '{col}' (e.g., {df_transformed[col].iloc[0]})")
+            df_transformed[col] = df_transformed[col].dt.strftime(
+                '%Y-%m-%d %H:%M:%S')
+
+            logger.info(
+                f"✅ Standardized date column '{col}' (e.g., {df_transformed[col].iloc[0]})")
         except Exception as e:
-            print(f"⚠️ Could not standardize column '{col}': {e}")
+            logger.error(f"⚠️ Could not standardize column '{col}': {e}")
 
-
     # TODO (Find & Fix): Text columns are not cleaned (strip, lowercase)
     return df_transformed
```
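Two things worth flagging in this file. First, `infer_datetime_format=True` is deprecated as of pandas 2.0 (strict format inference became the default), so newer pandas versions will warn on that `pd.to_datetime` call. Second, the empty-DataFrame check and the null-handling loops still `pass`; the sketch below shows one way the remaining TODOs could be closed, where the median and "unknown" fill policies are illustrative assumptions rather than decisions made in this commit.

```python
import pandas as pd


def fill_and_clean(df: pd.DataFrame) -> pd.DataFrame:
    """One possible resolution of the null-handling and text-cleanup TODOs."""
    if df.empty:
        raise ValueError("Cannot transform an empty DataFrame")
    out = df.copy()
    # Numeric nulls: fill with the column median (assumed policy).
    for col in out.select_dtypes(include=["number"]).columns:
        out[col] = out[col].fillna(out[col].median())
    # Text nulls: fill with a placeholder, then strip and lowercase
    # (also covers the final "strip, lowercase" TODO).
    for col in out.select_dtypes(include=["object"]).columns:
        out[col] = out[col].fillna("unknown").str.strip().str.lower()
    return out
```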

app/main.py

Lines changed: 20 additions & 17 deletions
```diff
@@ -1,8 +1,11 @@
 import os
-
 from app.etl.extract import extract
 from app.etl.transform import transform
 from app.etl.load import load
+import logging as lg
+lg.basicConfig(level=lg.DEBUG)
+
+logger = lg.getLogger(__name__)
 
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 data_path = os.path.join(BASE_DIR, "data.csv")
@@ -16,34 +19,34 @@ def run_pipeline(csv_path: str =data_path, db_path: str = "etl_data.db"):
         db_path: Path to the output SQLite database
     """
     try:
-        print("🚀 Starting ETL Pipeline")  # TODO (Find & Fix): Use logging instead of print
-        print(f"📁 Input file: {csv_path}")
-        print(f"🗄️ Output database: {db_path}")
-        print("-" * 50)
+        lg.info("🚀 Starting ETL Pipeline")  # TODO (Find & Fix): Use logging instead of print
+        lg.info(f"📁 Input file: {csv_path}")
+        lg.info(f"🗄️ Output database: {db_path}")
+        lg.info("-" * 50)
 
         # Extract
-        print("📥 STEP 1: EXTRACT")
+        lg.info("📥 STEP 1: EXTRACT")
         df = extract(csv_path)
-        print(f"✅ Extracted {len(df)} rows")
-        print(f"📊 Columns: {list(df.columns)}")
-        print()
+        lg.info(f"✅ Extracted {len(df)} rows")
+        lg.info(f"📊 Columns: {list(df.columns)}")
+        lg.info("")
 
         # Transform
-        print("🔄 STEP 2: TRANSFORM")
+        lg.info("🔄 STEP 2: TRANSFORM")
         df_transformed = transform(df)
-        print(f"✅ Transformed data ready")
-        print()
+        lg.info(f"✅ Transformed data ready")
+        lg.info("")
 
         # Load
-        print("📤 STEP 3: LOAD")
+        lg.info("📤 STEP 3: LOAD")
         load(df_transformed, db_path)
-        print()
+        lg.info("")
 
-        print("🎉 ETL Pipeline completed successfully!")
-        print(f"📈 Final dataset: {len(df_transformed)} rows, {len(df_transformed.columns)} columns")
+        lg.info("🎉 ETL Pipeline completed successfully!")
+        lg.info(f"📈 Final dataset: {len(df_transformed)} rows, {len(df_transformed.columns)} columns")
 
     except FileNotFoundError as e:
-        print(f"❌ File Error: {e}")
+        lg.error(f"❌ File Error: {e}")
 
     except ValueError as e:
         # TODO (Find & Fix): Error handling missing
```
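Two wiring details matter in `main.py`: `basicConfig` must receive a level constant such as `lg.DEBUG` (calling `lg.debug()` with no message raises a TypeError), and the module defines `logger = lg.getLogger(__name__)` but then logs through the root logger via `lg.info(...)`, leaving the named logger unused. A minimal sketch of the intended entry-point wiring, with an assumed format string:

```python
import logging as lg

# Configure the root logger once, before any logging call runs.
lg.basicConfig(
    level=lg.DEBUG,  # a level constant, not a call like lg.debug()
    format="%(asctime)s %(name)s %(levelname)s: %(message)s",
)
logger = lg.getLogger(__name__)


def run_pipeline() -> None:
    logger.info("🚀 Starting ETL Pipeline")  # named logger, not root lg.info
    logger.info("-" * 50)                    # separator instead of a bare lg.info()


if __name__ == "__main__":
    run_pipeline()
```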
