diff --git a/app/etl/extract.py b/app/etl/extract.py index 94714f2..f1a69ee 100644 --- a/app/etl/extract.py +++ b/app/etl/extract.py @@ -5,41 +5,72 @@ def extract(path: str = "xyz.csv") -> pd.DataFrame : """ - Extracts data from CSV file. + Extracts data from CSV, Excel, or JSON file. Args: - path: Path to the CSV file + path: Path to the data file (supports .csv, .xlsx, .json) Returns: - DataFrame containing the extracted data # TODO (Find & Fix): Should specify pd.DataFrame in docstring + pd.DataFrame: DataFrame containing the extracted data Raises: FileNotFoundError: If the file doesn't exist - ValueError: If the file is empty or invalid + ValueError: If the file format is unsupported or file is empty/invalid """ # Validate file path if not os.path.exists(path): raise FileNotFoundError(f"❌ File not found: {path}") - if not path.lower().endswith('.csv'): # TODO (Find & Fix) - raise ValueError(f"❌ File must be a CSV: {path}") + # Get file extension + file_ext = os.path.splitext(path)[-1].lower() + + # Check if file format is supported + supported_formats = ['.csv', '.xlsx', '.xls', '.json'] + if file_ext not in supported_formats: + raise ValueError(f"❌ Unsupported file format: {file_ext}. Supported formats: {supported_formats}") try: - # Try different encodings - encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1'] df = None - for encoding in encodings: + if file_ext == '.csv': + # Try different encodings for CSV files + encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1'] + + for encoding in encodings: + try: + df = pd.read_csv(path, encoding=encoding) + print(f"Successfully read CSV with encoding: {encoding}") + break + except UnicodeDecodeError: + print(f"Failed to read with encoding '{encoding}'") # Log the encoding that failed + continue + except Exception as e: + print(f"Error reading with encoding '{encoding}': {e}") + continue + + if df is None: + raise ValueError(f"Could not read CSV with tried encodings: {encodings}") + + elif file_ext in ['.xlsx', '.xls']: + # Read Excel files + try: + df = pd.read_excel(path) + print(f"Successfully read Excel file: {path}") + except Exception as e: + raise ValueError(f"❌ Error reading Excel file: {e}") + + elif file_ext == '.json': + # Read JSON files try: - # TODO (Find & Fix) - pass - except UnicodeDecodeError: - print(f"Failed to read with encoding '{encoding}'") # Log the encoding that failed + df = pd.read_json(path) + print(f"Successfully read JSON file: {path}") + except Exception as e: + raise ValueError(f"❌ Error reading JSON file: {e}") + # Validate data if df is None: - raise ValueError(f" Could not read CSV with tried encodings: {encodings}") + raise ValueError("❌ Failed to read data from file") - # Validate data if df.empty: raise ValueError("File contains no data") @@ -49,6 +80,6 @@ def extract(path: str = "xyz.csv") -> pd.DataFrame : except pd.errors.EmptyDataError: raise ValueError("❌ File contains no data") except pd.errors.ParserError as e: - raise ValueError(f"❌ Error parsing CSV: {e}") + raise ValueError(f"❌ Error parsing file: {e}") except Exception as e: raise ValueError(f"❌ Unexpected error reading file: {e}") \ No newline at end of file diff --git a/app/requirements.txt b/app/requirements.txt index 6951741..fc36e3c 100644 --- a/app/requirements.txt +++ b/app/requirements.txt @@ -1 +1,2 @@ pandas>=2.0.0 +openpyxl>=3.0.0