Skip to content

Commit 2309c78

Browse files
Abhi Suryawanshi
authored and committed
feat: extend extract() to support Excel and JSON while preserving CSV functionality
- Added file type detection using os.path.splitext()
- Added support for .xlsx, .xls, and .json
- Implemented format-specific reading logic
- Enhanced error handling for unsupported formats
- Preserved CSV functionality including multi-encoding support
- Added 'openpyxl>=3.0.0' to requirements
1 parent 7f97b56 commit 2309c78

File tree

2 files changed

+49
-22
lines changed

2 files changed

+49
-22
lines changed

app/etl/extract.py

Lines changed: 48 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -3,57 +3,83 @@
33
# TODO (Find & Fix)
44
from typing import Optional
55

6-
def extract(path: str = "xyz.csv") -> pd.DataFrame :
6+
def extract(path: str = "xyz.csv") -> pd.DataFrame:
77
"""
8-
Extracts data from CSV file.
8+
Extracts data from CSV, Excel, or JSON file.
99
1010
Args:
11-
path: Path to the CSV file
11+
path: Path to the data file (supports .csv, .xlsx, .json)
1212
1313
Returns:
14-
DataFrame containing the extracted data # TODO (Find & Fix): Should specify pd.DataFrame in docstring
14+
pd.DataFrame: DataFrame containing the extracted data
1515
1616
Raises:
1717
FileNotFoundError: If the file doesn't exist
18-
ValueError: If the file is empty or invalid
18+
ValueError: If the file format is unsupported or file is empty/invalid
1919
"""
2020
# Validate file path
2121
if not os.path.exists(path):
2222
raise FileNotFoundError(f"❌ File not found: {path}")
2323

24-
if not path.lower().endswith('.csv'): # TODO (Find & Fix)
25-
raise ValueError(f"❌ File must be a CSV: {path}")
24+
# Get file extension
25+
file_ext = os.path.splitext(path)[-1].lower()
26+
27+
# Check if file format is supported
28+
supported_formats = ['.csv', '.xlsx', '.xls', '.json']
29+
if file_ext not in supported_formats:
30+
raise ValueError(f"❌ Unsupported file format: {file_ext}. Supported formats: {supported_formats}")
2631

2732
try:
28-
# Try different encodings
29-
encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
3033
df = None
3134

32-
for encoding in encodings:
35+
if file_ext == '.csv':
36+
# Try different encodings for CSV files
37+
encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
38+
39+
for encoding in encodings:
40+
try:
41+
df = pd.read_csv(path, encoding=encoding)
42+
print(f"Successfully read CSV with encoding: {encoding}")
43+
break
44+
except UnicodeDecodeError:
45+
print(f"Failed to read with encoding '{encoding}'") # Log the encoding that failed
46+
continue
47+
except Exception as e:
48+
print(f"Error reading with encoding '{encoding}': {e}")
49+
continue
50+
51+
if df is None:
52+
raise ValueError(f"Could not read CSV with tried encodings: {encodings}")
53+
54+
elif file_ext in ['.xlsx', '.xls']:
55+
# Read Excel files
56+
try:
57+
df = pd.read_excel(path)
58+
print(f"Successfully read Excel file: {path}")
59+
except Exception as e:
60+
raise ValueError(f"❌ Error reading Excel file: {e}")
61+
62+
elif file_ext == '.json':
63+
# Read JSON files
3364
try:
34-
df = pd.read_csv(path, encoding=encoding)
35-
print(f"Successfully read CSV with encoding: {encoding}")
36-
break
37-
except UnicodeDecodeError:
38-
print(f"Failed to read with encoding '{encoding}'") # Log the encoding that failed
39-
continue
65+
df = pd.read_json(path)
66+
print(f"Successfully read JSON file: {path}")
4067
except Exception as e:
41-
print(f"Error reading with encoding '{encoding}': {e}")
42-
continue
68+
raise ValueError(f"❌ Error reading JSON file: {e}")
4369

70+
# Validate data
4471
if df is None:
45-
raise ValueError(f" Could not read CSV with tried encodings: {encodings}")
72+
raise ValueError("❌ Failed to read data from file")
4673

47-
# Validate data
4874
if df.empty:
49-
raise ValueError("File contains no data")
75+
raise ValueError("File contains no data")
5076

5177
print(f"✅ Extracted {len(df)} rows and {len(df.columns)} columns") # TODO: Use logging instead of print
5278
return df
5379

5480
except pd.errors.EmptyDataError:
5581
raise ValueError("❌ File contains no data")
5682
except pd.errors.ParserError as e:
57-
raise ValueError(f"❌ Error parsing CSV: {e}")
83+
raise ValueError(f"❌ Error parsing file: {e}")
5884
except Exception as e:
5985
raise ValueError(f"❌ Unexpected error reading file: {e}")

app/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
pandas>=2.0.0
2+
openpyxl>=3.0.0

0 commit comments

Comments
 (0)