Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,9 @@
node_modules
node_modules
# Ignore virtual environments
myenv/
venv/
.env
.venv
ENV/
env.bak/
venv.bak/
1 change: 1 addition & 0 deletions app/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# This file exists so that Python treats this directory as a package and its modules can be imported (needed to import the extract function in root/tests/test_extract.py).
Binary file added app/__pycache__/__init__.cpython-310.pyc
Binary file not shown.
1 change: 1 addition & 0 deletions app/etl/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# This file exists so that Python treats this directory as a package and its modules can be imported (needed to import the extract function in root/tests/test_extract.py).
Binary file added app/etl/__pycache__/__init__.cpython-310.pyc
Binary file not shown.
Binary file added app/etl/__pycache__/extract.cpython-310.pyc
Binary file not shown.
8 changes: 3 additions & 5 deletions app/etl/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def extract(path: str = "xyz.csv") -> pd.DataFrame :
if not os.path.exists(path):
raise FileNotFoundError(f"❌ File not found: {path}")

if not path.lower().endswith('.csv'): # TODO (Find & Fix)
if not str(path).lower().endswith('.csv'):
raise ValueError(f"❌ File must be a CSV: {path}")

try:
Expand All @@ -32,12 +32,12 @@ def extract(path: str = "xyz.csv") -> pd.DataFrame :
for encoding in encodings:
try:
# TODO (Find & Fix)
pass
pass
except UnicodeDecodeError:
print(f"Failed to read with encoding '{encoding}'") # Log the encoding that failed

if df is None:
raise ValueError(f" Could not read CSV with tried encodings: {encodings}")
raise ValueError(f"Could not read CSV with tried encodings: {encodings}")

# Validate data
if df.empty:
Expand All @@ -50,5 +50,3 @@ def extract(path: str = "xyz.csv") -> pd.DataFrame :
raise ValueError("❌ File contains no data")
except pd.errors.ParserError as e:
raise ValueError(f"❌ Error parsing CSV: {e}")
except Exception as e:
raise ValueError(f"❌ Unexpected error reading file: {e}")
1 change: 1 addition & 0 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# This file exists so that Python treats this directory as a package and its modules can be imported (needed to import the extract function in root/tests/test_extract.py).
Binary file added tests/__pycache__/__init__.cpython-310.pyc
Binary file not shown.
Binary file not shown.
Empty file added tests/data/empty.csv
Empty file.
1 change: 1 addition & 0 deletions tests/data/invalid_path.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
this is a dummy file for unit tests
3 changes: 3 additions & 0 deletions tests/data/valid.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
name,age,city
Alice,25,Paris
Bob,30,London
40 changes: 40 additions & 0 deletions tests/test_extract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from pathlib import Path
import pytest
import re
import pandas as pd
from app.etl.extract import extract

def test_empty_file_path():
    """An empty path string is rejected by extract with FileNotFoundError."""
    missing_path = ""
    # The expected message embeds the (empty) path exactly as it was passed in.
    expected_message = f"❌ File not found: {missing_path}"
    with pytest.raises(FileNotFoundError, match=expected_message):
        extract(missing_path)

def test_invalid_file_extension():
    """An existing file whose name does not end in .csv raises ValueError.

    The fixture is located relative to this test module rather than the
    current working directory, so the test passes no matter where pytest
    is launched from.
    """
    # The fixture must exist on disk: extract checks for existence before it
    # checks the extension, and a missing file would raise FileNotFoundError.
    fixture = Path(__file__).resolve().parent / "data" / "invalid_path.txt"
    # extract is annotated as taking a str path, so hand it a plain string.
    path = str(fixture)
    with pytest.raises(ValueError, match="File must be a CSV"):
        extract(path)

def test_invalid_encoding(tmp_path):
    """A CSV that no attempted encoding can read raises ValueError.

    The expected message interpolates the list of encodings, so the list
    below has to render identically to the one extract puts in its error.
    """
    # NOTE(review): latin-1 can decode any byte sequence, so this presumably
    # only passes while the read step inside extract is still a stub -
    # confirm once the read is implemented.
    undecodable_csv = tmp_path / "invalid_encoding.csv"
    undecodable_csv.write_bytes(b"\xff\xfe\x00\x00\xff")

    tried = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
    # Escape the message: the list repr contains '[' and ']' which are regex
    # metacharacters, and pytest treats `match` as a regular expression.
    expected_pattern = re.escape(f"Could not read CSV with tried encodings: {tried}")

    with pytest.raises(ValueError, match=expected_pattern):
        extract(undecodable_csv)

def test_empty_file():
    """A zero-byte CSV file raises ValueError reporting that it has no data.

    The fixture is located relative to this test module rather than the
    current working directory, so the test passes no matter where pytest
    is launched from.
    """
    fixture = Path(__file__).resolve().parent / "data" / "empty.csv"
    # extract is annotated as taking a str path, so hand it a plain string.
    path = str(fixture)
    with pytest.raises(ValueError, match="File contains no data"):
        extract(path)

def test_valid_file(tmp_path):
    """A well-formed UTF-8 CSV is returned as a DataFrame with matching content."""
    # Frame the CSV written below is expected to be parsed into.
    expected_df = pd.DataFrame({
        "name": ["Alice", "Bob"],
        "age": [25, 30],
        "city": ["Paris", "London"]
    })

    # Write a small CSV into pytest's per-test temporary directory.
    source_csv = tmp_path / "valid.csv"
    source_csv.write_text(
        "name,age,city\nAlice,25,Paris\nBob,30,London\n",
        encoding="utf-8",
    )

    result = extract(source_csv)

    assert isinstance(result, pd.DataFrame)
    # Normalise the index so only column names, dtypes and values are compared.
    normalised = result.reset_index(drop=True)
    pd.testing.assert_frame_equal(normalised, expected_df)
Loading