Skip to content

Commit d8a7a23

Browse files
Merge pull request #33 from Abhi2006-cloud/feature/multi-format-support-clean
Fix #15: Support Other File Formats in Extract Function
2 parents f1410a1 + d29d1be commit d8a7a23

File tree

2 files changed

+38
-23
lines changed

2 files changed

+38
-23
lines changed

app/etl/extract.py

Lines changed: 37 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -4,13 +4,13 @@
44

55
def extract(path: str = "xyz.csv") -> pd.DataFrame :
66
"""
7-
Extracts data from CSV file.
7+
Extracts data from CSV, Excel, or JSON file.
88
99
Args:
10-
path: Path to the CSV file
10+
path: Path to the data file (supports .csv, .xlsx, .json)
1111
1212
Returns:
13-
DataFrame containing the extracted data # TODO (Find & Fix): Should specify pd.DataFrame in docstring
13+
pd.DataFrame: DataFrame containing the extracted data
1414
1515
Raises:
1616
FileNotFoundError: If the file doesn't exist
@@ -20,27 +20,41 @@ def extract(path: str = "xyz.csv") -> pd.DataFrame :
2020
if not os.path.exists(path):
2121
raise FileNotFoundError(f"❌ File not found: {path}")
2222

23-
if not str(path).lower().endswith('.csv'):
24-
raise ValueError(f"❌ File must be a CSV: {path}")
23+
# Get file extension
24+
ext = os.path.splitext(path)[-1].lower()
25+
26+
# Check if file format is supported
27+
if ext not in ['.csv', '.xlsx', '.xls', '.json']:
28+
raise ValueError(f"Unsupported file format: {ext}")
2529

2630
try:
27-
# Try different encodings
28-
encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
29-
df = None
30-
31-
for encoding in encodings:
32-
try:
33-
# TODO (Find & Fix)
34-
pass
35-
except UnicodeDecodeError:
36-
print(f"Failed to read with encoding '{encoding}'") # Log the encoding that failed
37-
continue
38-
except Exception as e:
39-
print(f"Error reading with encoding '{encoding}': {e}")
40-
continue
41-
42-
if df is None:
43-
raise ValueError(f"Could not read CSV with tried encodings: {encodings}")
31+
if ext == '.csv':
32+
# Try different encodings for CSV files
33+
encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
34+
df = None
35+
36+
for encoding in encodings:
37+
try:
38+
df = pd.read_csv(path, encoding=encoding)
39+
print(f"Successfully read CSV with encoding: {encoding}")
40+
break
41+
except UnicodeDecodeError:
42+
print(f"Failed to read with encoding '{encoding}'")
43+
continue
44+
except Exception as e:
45+
print(f"Error reading with encoding '{encoding}': {e}")
46+
continue
47+
48+
if df is None:
49+
raise ValueError(f"Could not read CSV with tried encodings: {encodings}")
50+
51+
elif ext in ['.xls', '.xlsx']:
52+
df = pd.read_excel(path)
53+
print(f"Successfully read Excel file: {path}")
54+
55+
elif ext == '.json':
56+
df = pd.read_json(path)
57+
print(f"Successfully read JSON file: {path}")
4458

4559
# Validate data
4660
if df.empty:
@@ -52,6 +66,6 @@ def extract(path: str = "xyz.csv") -> pd.DataFrame :
5266
except pd.errors.EmptyDataError:
5367
raise ValueError("❌ File contains no data")
5468
except pd.errors.ParserError as e:
55-
raise ValueError(f"❌ Error parsing CSV: {e}")
69+
raise ValueError(f"❌ Error parsing file: {e}")
5670
except Exception as e:
5771
raise ValueError(f"❌ Unexpected error reading file: {e}")

app/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1 +1,2 @@
11
pandas>=2.0.0
2+
openpyxl>=3.0.0

0 commit comments

Comments
 (0)