diff --git a/.gitignore b/.gitignore index b512c09..2c9d39e 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,9 @@ -node_modules \ No newline at end of file +node_modules +# Ignore virtual environments +myenv/ +venv/ +.env +.venv +ENV/ +env.bak/ +venv.bak/ \ No newline at end of file diff --git a/app/__init__.py b/app/__init__.py new file mode 100644 index 0000000..59d9f98 --- /dev/null +++ b/app/__init__.py @@ -0,0 +1 @@ +# i just created this file so python know this directory is package and can import from it (needed to import extract function in root/tests/test_extract.py) \ No newline at end of file diff --git a/app/__pycache__/__init__.cpython-310.pyc b/app/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..c98e2b2 Binary files /dev/null and b/app/__pycache__/__init__.cpython-310.pyc differ diff --git a/app/etl/__init__.py b/app/etl/__init__.py new file mode 100644 index 0000000..59d9f98 --- /dev/null +++ b/app/etl/__init__.py @@ -0,0 +1 @@ +# i just created this file so python know this directory is package and can import from it (needed to import extract function in root/tests/test_extract.py) \ No newline at end of file diff --git a/app/etl/__pycache__/__init__.cpython-310.pyc b/app/etl/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..7dbe7b9 Binary files /dev/null and b/app/etl/__pycache__/__init__.cpython-310.pyc differ diff --git a/app/etl/__pycache__/extract.cpython-310.pyc b/app/etl/__pycache__/extract.cpython-310.pyc new file mode 100644 index 0000000..793239e Binary files /dev/null and b/app/etl/__pycache__/extract.cpython-310.pyc differ diff --git a/app/etl/extract.py b/app/etl/extract.py index 94714f2..bdf0f68 100644 --- a/app/etl/extract.py +++ b/app/etl/extract.py @@ -21,7 +21,7 @@ def extract(path: str = "xyz.csv") -> pd.DataFrame : if not os.path.exists(path): raise FileNotFoundError(f"❌ File not found: {path}") - if not path.lower().endswith('.csv'): # TODO (Find & Fix) + if not str(path).lower().endswith('.csv'): raise ValueError(f"❌ File must be a CSV: {path}") try: @@ -32,12 +32,12 @@ def extract(path: str = "xyz.csv") -> pd.DataFrame : for encoding in encodings: try: # TODO (Find & Fix) - pass + pass except UnicodeDecodeError: print(f"Failed to read with encoding '{encoding}'") # Log the encoding that failed if df is None: - raise ValueError(f" Could not read CSV with tried encodings: {encodings}") + raise ValueError(f"Could not read CSV with tried encodings: {encodings}") # Validate data if df.empty: @@ -50,5 +50,3 @@ def extract(path: str = "xyz.csv") -> pd.DataFrame : raise ValueError("❌ File contains no data") except pd.errors.ParserError as e: raise ValueError(f"❌ Error parsing CSV: {e}") - except Exception as e: - raise ValueError(f"❌ Unexpected error reading file: {e}") \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..59d9f98 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +# i just created this file so python know this directory is package and can import from it (needed to import extract function in root/tests/test_extract.py) \ No newline at end of file diff --git a/tests/__pycache__/__init__.cpython-310.pyc b/tests/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..cf1af23 Binary files /dev/null and b/tests/__pycache__/__init__.cpython-310.pyc differ diff --git a/tests/__pycache__/test_extract.cpython-310-pytest-8.4.2.pyc b/tests/__pycache__/test_extract.cpython-310-pytest-8.4.2.pyc new file mode 100644 index 0000000..dbb29cd Binary files /dev/null and b/tests/__pycache__/test_extract.cpython-310-pytest-8.4.2.pyc differ diff --git a/tests/data/empty.csv b/tests/data/empty.csv new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/invalid_path.txt b/tests/data/invalid_path.txt new file mode 100644 index 0000000..20461b3 --- /dev/null +++ b/tests/data/invalid_path.txt @@ -0,0 +1 @@ +this is a dummy file for unit tests \ No newline at end of file diff --git a/tests/data/valid.csv b/tests/data/valid.csv new file mode 100644 index 0000000..5c5fd75 --- /dev/null +++ b/tests/data/valid.csv @@ -0,0 +1,3 @@ +name,age,city +Alice,25,Paris +Bob,30,London \ No newline at end of file diff --git a/tests/test_extract.py b/tests/test_extract.py new file mode 100644 index 0000000..85c1f52 --- /dev/null +++ b/tests/test_extract.py @@ -0,0 +1,40 @@ +from pathlib import Path +import pytest +import re +import pandas as pd +from app.etl.extract import extract + +def test_empty_file_path(): + path = "" + with pytest.raises(FileNotFoundError, match=f"❌ File not found: {path}"): + extract(path) + +def test_invalid_file_extension(): + path = "./tests/data/invalid_path.txt" + with pytest.raises(ValueError, match="File must be a CSV"): + extract(path) + +def test_invalid_encoding(tmp_path): + bad_file = tmp_path / "invalid_encoding.csv" + bad_file.write_bytes(b"\xff\xfe\x00\x00\xff") + encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1'] + with pytest.raises(ValueError, match=re.escape(f"Could not read CSV with tried encodings: {encodings}")): + extract(bad_file) + +def test_empty_file(): + path = "./tests/data/empty.csv" + with pytest.raises(ValueError, match="File contains no data"): + extract(path) + +def test_valid_file(tmp_path): + csv_file = tmp_path / "valid.csv" + csv_data = "name,age,city\nAlice,25,Paris\nBob,30,London\n" + csv_file.write_text(csv_data, encoding="utf-8") + df = extract(csv_file) + expected_df = pd.DataFrame({ + "name": ["Alice", "Bob"], + "age": [25, 30], + "city": ["Paris", "London"] + }) + assert isinstance(df, pd.DataFrame) + pd.testing.assert_frame_equal(df.reset_index(drop=True), expected_df)