|
1 | 1 | import pandas as pd |
2 | 2 | import os |
| 3 | +import logging as lg |
3 | 4 |
|
4 | | -# Get the base directory (app/) relative to this file (app/etl/extract.py) |
5 | | -BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) |
6 | | -DEFAULT_DATA_PATH = os.path.join(BASE_DIR, "data.csv") |
| 5 | +# TODO (Find & Fix) |
7 | 6 |
|
8 | | -def extract(path: str = DEFAULT_DATA_PATH) -> pd.DataFrame : |
| 7 | + |
| 8 | +logger = lg.getLogger(__name__) |
| 9 | +logger.setLevel(lg.DEBUG) |
| 10 | + |
| 11 | + |
| 12 | +def extract(path: str = "xyz.csv") -> pd.DataFrame: |
9 | 13 | """ |
10 | 14 | Extracts data from CSV, Excel, or JSON file. |
11 | | - |
| 15 | +
|
12 | 16 | Args: |
13 | 17 | path: Path to the data file (supports .csv, .xlsx, .json) |
14 | | - |
| 18 | +
|
15 | 19 | Returns: |
16 | 20 | pd.DataFrame: DataFrame containing the extracted data |
17 | | - |
| 21 | +
|
18 | 22 | Raises: |
19 | 23 | FileNotFoundError: If the file doesn't exist |
20 | 24 | ValueError: If the file is empty or invalid |
21 | 25 | """ |
22 | 26 | # Validate file path |
23 | 27 | if not os.path.exists(path): |
24 | 28 | raise FileNotFoundError(f"❌ File not found: {path}") |
25 | | - |
| 29 | + |
26 | 30 | # Get file extension |
27 | 31 | ext = os.path.splitext(path)[-1].lower() |
28 | | - |
| 32 | + |
29 | 33 | # Check if file format is supported |
30 | 34 | if ext not in ['.csv', '.xlsx', '.xls', '.json']: |
31 | 35 | raise ValueError(f"Unsupported file format: {ext}") |
32 | | - |
| 36 | + |
33 | 37 | try: |
34 | 38 | if ext == '.csv': |
35 | 39 | # Try different encodings for CSV files |
36 | 40 | encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1'] |
37 | 41 | df = None |
38 | | - |
| 42 | + |
39 | 43 | for encoding in encodings: |
40 | 44 | try: |
41 | 45 | df = pd.read_csv(path, encoding=encoding) |
42 | | - print(f"Successfully read CSV with encoding: {encoding}") |
| 46 | + logger.info(f"Successfully read CSV with encoding: {encoding}") |
43 | 47 | break |
44 | 48 | except UnicodeDecodeError: |
45 | | - print(f"Failed to read with encoding '{encoding}'") |
| 49 | + logger.error(f"Failed to read with encoding '{encoding}'") |
46 | 50 | continue |
47 | 51 | except Exception as e: |
48 | | - print(f"Error reading with encoding '{encoding}': {e}") |
| 52 | + logger.error( |
| 53 | + f"Error reading with encoding '{encoding}': {e}") |
49 | 54 | continue |
50 | | - |
| 55 | + |
51 | 56 | if df is None: |
52 | | - raise ValueError(f"Could not read CSV with tried encodings: {encodings}") |
53 | | - |
| 57 | + raise ValueError( |
| 58 | + f"Could not read CSV with tried encodings: {encodings}") |
| 59 | + |
54 | 60 | elif ext in ['.xls', '.xlsx']: |
55 | 61 | df = pd.read_excel(path) |
56 | | - print(f"Successfully read Excel file: {path}") |
57 | | - |
| 62 | + logger.info(f"Successfully read Excel file: {path}") |
| 63 | + |
58 | 64 | elif ext == '.json': |
59 | 65 | df = pd.read_json(path) |
60 | | - print(f"Successfully read JSON file: {path}") |
61 | | - |
| 66 | + logger.info(f"Successfully read JSON file: {path}") |
| 67 | + |
62 | 68 | # Validate data |
63 | 69 | if df.empty: |
64 | 70 | raise ValueError("File contains no data") |
65 | | - |
66 | | - print(f"✅ Extracted {len(df)} rows and {len(df.columns)} columns") # TODO: Use logging instead of print |
| 71 | + |
| 72 | + # TODO: Use logging instead of print |
| 73 | + logger.info( |
| 74 | + f"✅ Extracted {len(df)} rows and {len(df.columns)} columns") |
67 | 75 | return df |
68 | | - |
| 76 | + |
69 | 77 | except pd.errors.EmptyDataError: |
70 | 78 | raise ValueError("❌ File contains no data") |
71 | 79 | except pd.errors.ParserError as e: |
|
0 commit comments