33# TODO (Find & Fix)
44from typing import Optional
55
def extract(path: str = "xyz.csv") -> pd.DataFrame:
    """
    Extract tabular data from a CSV, Excel, or JSON file.

    The reader is chosen from the file extension (case-insensitive).
    CSV files are decoded by trying a fixed list of encodings in order;
    Excel and JSON files are read with the pandas defaults.

    Args:
        path: Path to the data file (supports .csv, .xlsx, .xls, .json)

    Returns:
        pd.DataFrame: DataFrame containing the extracted data

    Raises:
        FileNotFoundError: If the file doesn't exist
        ValueError: If the file format is unsupported, the file cannot be
            read or parsed, or it contains no data
    """
    # Validate file path
    if not os.path.exists(path):
        raise FileNotFoundError(f"❌ File not found: {path}")

    # Get file extension
    file_ext = os.path.splitext(path)[-1].lower()

    # Check if file format is supported
    supported_formats = ['.csv', '.xlsx', '.xls', '.json']
    if file_ext not in supported_formats:
        raise ValueError(
            f"❌ Unsupported file format: {file_ext}. "
            f"Supported formats: {supported_formats}"
        )

    if file_ext == '.csv':
        # Try different encodings for CSV files.
        # NOTE: 'latin-1' maps every byte to a character, so it never raises
        # UnicodeDecodeError and the entries after it are not reached.
        encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
        df = None
        try:
            for encoding in encodings:
                try:
                    df = pd.read_csv(path, encoding=encoding)
                except UnicodeDecodeError:
                    # Only a decoding failure justifies trying the next
                    # encoding; any other error would fail identically.
                    print(f"Failed to read with encoding '{encoding}'")
                    continue
                print(f"Successfully read CSV with encoding: {encoding}")
                break
        except pd.errors.EmptyDataError as e:
            raise ValueError("❌ File contains no data") from e
        except pd.errors.ParserError as e:
            raise ValueError(f"❌ Error parsing file: {e}") from e
        except Exception as e:
            raise ValueError(f"❌ Unexpected error reading file: {e}") from e

        if df is None:
            raise ValueError(f"Could not read CSV with tried encodings: {encodings}")

    elif file_ext in ('.xlsx', '.xls'):
        # Read Excel files
        try:
            df = pd.read_excel(path)
        except Exception as e:
            raise ValueError(f"❌ Error reading Excel file: {e}") from e
        print(f"Successfully read Excel file: {path}")

    else:
        # Read JSON files (the only remaining supported extension)
        try:
            df = pd.read_json(path)
        except Exception as e:
            raise ValueError(f"❌ Error reading JSON file: {e}") from e
        print(f"Successfully read JSON file: {path}")

    # Validate data. Raised outside any try block so the message reaches the
    # caller as-is instead of being re-wrapped as an "unexpected" error.
    if df.empty:
        raise ValueError("❌ File contains no data")

    print(f"✅ Extracted {len(df)} rows and {len(df.columns)} columns")  # TODO: Use logging instead of print
    return df
0 commit comments