Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 47 additions & 16 deletions app/etl/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,41 +5,72 @@

def extract(path: str = "xyz.csv") -> pd.DataFrame :
"""
Extracts data from CSV file.
Extracts data from CSV, Excel, or JSON file.

Args:
path: Path to the CSV file
path: Path to the data file (supports .csv, .xlsx, .json)

Returns:
DataFrame containing the extracted data # TODO (Find & Fix): Should specify pd.DataFrame in docstring
pd.DataFrame: DataFrame containing the extracted data

Raises:
FileNotFoundError: If the file doesn't exist
ValueError: If the file is empty or invalid
ValueError: If the file format is unsupported or file is empty/invalid
"""
# Validate file path
if not os.path.exists(path):
raise FileNotFoundError(f"❌ File not found: {path}")

if not path.lower().endswith('.csv'): # TODO (Find & Fix)
raise ValueError(f"❌ File must be a CSV: {path}")
# Get file extension
file_ext = os.path.splitext(path)[-1].lower()

# Check if file format is supported
supported_formats = ['.csv', '.xlsx', '.xls', '.json']
if file_ext not in supported_formats:
raise ValueError(f"❌ Unsupported file format: {file_ext}. Supported formats: {supported_formats}")

try:
# Try different encodings
encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
df = None

for encoding in encodings:
if file_ext == '.csv':
# Try different encodings for CSV files
encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']

for encoding in encodings:
try:
df = pd.read_csv(path, encoding=encoding)
print(f"Successfully read CSV with encoding: {encoding}")
break
except UnicodeDecodeError:
print(f"Failed to read with encoding '{encoding}'") # Log the encoding that failed
continue
except Exception as e:
print(f"Error reading with encoding '{encoding}': {e}")
continue

if df is None:
raise ValueError(f"Could not read CSV with tried encodings: {encodings}")

elif file_ext in ['.xlsx', '.xls']:
# Read Excel files
try:
df = pd.read_excel(path)
print(f"Successfully read Excel file: {path}")
except Exception as e:
raise ValueError(f"❌ Error reading Excel file: {e}")

elif file_ext == '.json':
# Read JSON files
try:
# TODO (Find & Fix)
pass
except UnicodeDecodeError:
print(f"Failed to read with encoding '{encoding}'") # Log the encoding that failed
df = pd.read_json(path)
print(f"Successfully read JSON file: {path}")
except Exception as e:
raise ValueError(f"❌ Error reading JSON file: {e}")

# Validate data
if df is None:
raise ValueError(f" Could not read CSV with tried encodings: {encodings}")
raise ValueError("❌ Failed to read data from file")

# Validate data
if df.empty:
raise ValueError("File contains no data")

Expand All @@ -49,6 +80,6 @@ def extract(path: str = "xyz.csv") -> pd.DataFrame :
except pd.errors.EmptyDataError:
raise ValueError("❌ File contains no data")
except pd.errors.ParserError as e:
raise ValueError(f"❌ Error parsing CSV: {e}")
raise ValueError(f"❌ Error parsing file: {e}")
except Exception as e:
raise ValueError(f"❌ Unexpected error reading file: {e}")
1 change: 1 addition & 0 deletions app/requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
pandas>=2.0.0
openpyxl>=3.0.0