Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 37 additions & 23 deletions app/etl/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@

def extract(path: str = "xyz.csv") -> pd.DataFrame :
"""
Extracts data from CSV file.
Extracts data from CSV, Excel, or JSON file.

Args:
path: Path to the CSV file
path: Path to the data file (supports .csv, .xlsx, .json)

Returns:
DataFrame containing the extracted data # TODO (Find & Fix): Should specify pd.DataFrame in docstring
pd.DataFrame: DataFrame containing the extracted data

Raises:
FileNotFoundError: If the file doesn't exist
Expand All @@ -20,27 +20,41 @@ def extract(path: str = "xyz.csv") -> pd.DataFrame :
if not os.path.exists(path):
raise FileNotFoundError(f"❌ File not found: {path}")

if not str(path).lower().endswith('.csv'):
raise ValueError(f"❌ File must be a CSV: {path}")
# Get file extension
ext = os.path.splitext(path)[-1].lower()

# Check if file format is supported
if ext not in ['.csv', '.xlsx', '.xls', '.json']:
raise ValueError(f"Unsupported file format: {ext}")

try:
# Try different encodings
encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
df = None

for encoding in encodings:
try:
# TODO (Find & Fix)
pass
except UnicodeDecodeError:
print(f"Failed to read with encoding '{encoding}'") # Log the encoding that failed
continue
except Exception as e:
print(f"Error reading with encoding '{encoding}': {e}")
continue

if df is None:
raise ValueError(f"Could not read CSV with tried encodings: {encodings}")
if ext == '.csv':
# Try different encodings for CSV files
encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
df = None

for encoding in encodings:
try:
df = pd.read_csv(path, encoding=encoding)
print(f"Successfully read CSV with encoding: {encoding}")
break
except UnicodeDecodeError:
print(f"Failed to read with encoding '{encoding}'")
continue
except Exception as e:
print(f"Error reading with encoding '{encoding}': {e}")
continue

if df is None:
raise ValueError(f"Could not read CSV with tried encodings: {encodings}")

elif ext in ['.xls', '.xlsx']:
df = pd.read_excel(path)
print(f"Successfully read Excel file: {path}")

elif ext == '.json':
df = pd.read_json(path)
print(f"Successfully read JSON file: {path}")

# Validate data
if df.empty:
Expand All @@ -52,6 +66,6 @@ def extract(path: str = "xyz.csv") -> pd.DataFrame :
except pd.errors.EmptyDataError:
raise ValueError("❌ File contains no data")
except pd.errors.ParserError as e:
raise ValueError(f"❌ Error parsing CSV: {e}")
raise ValueError(f"❌ Error parsing file: {e}")
except Exception as e:
raise ValueError(f"❌ Unexpected error reading file: {e}")
1 change: 1 addition & 0 deletions app/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
pandas>=2.0.0
openpyxl>=3.0.0
Loading