44
55def extract (path : str = "xyz.csv" ) -> pd .DataFrame :
66 """
7- Extracts data from CSV file.
7+ Extracts data from CSV, Excel, or JSON file.
88
99 Args:
10- path: Path to the CSV file
10+ path: Path to the data file (supports .csv, .xlsx, .xls, .json)
1111
1212 Returns:
13- DataFrame containing the extracted data # TODO (Find & Fix): Should specify pd.DataFrame in docstring
13+ pd.DataFrame: DataFrame containing the extracted data
1414
1515 Raises:
1616 FileNotFoundError: If the file doesn't exist
@@ -20,27 +20,41 @@ def extract(path: str = "xyz.csv") -> pd.DataFrame :
2020 if not os .path .exists (path ):
2121 raise FileNotFoundError (f"❌ File not found: { path } " )
2222
23- if not str (path ).lower ().endswith ('.csv' ):
24- raise ValueError (f"❌ File must be a CSV: { path } " )
23+ # Get file extension
24+ ext = os .path .splitext (path )[- 1 ].lower ()
25+
26+ # Check if file format is supported
27+ if ext not in ['.csv' , '.xlsx' , '.xls' , '.json' ]:
28+ raise ValueError (f"Unsupported file format: { ext } " )
2529
2630 try :
27- # Try different encodings
28- encodings = ['utf-8' , 'latin-1' , 'cp1252' , 'iso-8859-1' ]
29- df = None
30-
31- for encoding in encodings :
32- try :
33- # TODO (Find & Fix)
34- pass
35- except UnicodeDecodeError :
36- print (f"Failed to read with encoding '{ encoding } '" ) # Log the encoding that failed
37- continue
38- except Exception as e :
39- print (f"Error reading with encoding '{ encoding } ': { e } " )
40- continue
41-
42- if df is None :
43- raise ValueError (f"Could not read CSV with tried encodings: { encodings } " )
31+ if ext == '.csv' :
32+ # Try different encodings for CSV files
33+ encodings = ['utf-8' , 'latin-1' , 'cp1252' , 'iso-8859-1' ]
34+ df = None
35+
36+ for encoding in encodings :
37+ try :
38+ df = pd .read_csv (path , encoding = encoding )
39+ print (f"Successfully read CSV with encoding: { encoding } " )
40+ break
41+ except UnicodeDecodeError :
42+ print (f"Failed to read with encoding '{ encoding } '" )
43+ continue
44+ except Exception as e :
45+ print (f"Error reading with encoding '{ encoding } ': { e } " )
46+ continue
47+
48+ if df is None :
49+ raise ValueError (f"Could not read CSV with tried encodings: { encodings } " )
50+
51+ elif ext in ['.xls' , '.xlsx' ]:
52+ df = pd .read_excel (path )
53+ print (f"Successfully read Excel file: { path } " )
54+
55+ elif ext == '.json' :
56+ df = pd .read_json (path )
57+ print (f"Successfully read JSON file: { path } " )
4458
4559 # Validate data
4660 if df .empty :
@@ -52,6 +66,6 @@ def extract(path: str = "xyz.csv") -> pd.DataFrame :
5266 except pd .errors .EmptyDataError :
5367 raise ValueError ("❌ File contains no data" )
5468 except pd .errors .ParserError as e :
55- raise ValueError (f"❌ Error parsing CSV : { e } " )
69+ raise ValueError (f"❌ Error parsing file : { e } " )
5670 except Exception as e :
5771 raise ValueError (f"❌ Unexpected error reading file: { e } " )
0 commit comments