diff --git a/.gitignore b/.gitignore index 2c9d39e..c4f4de9 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,12 @@ venv/ .venv ENV/ env.bak/ -venv.bak/ \ No newline at end of file +venv.bak/ + +etl_data.db + +# Ignore Python cache files +__pycache__/ +*.pyc +*.pyo +*.pyd diff --git a/app/__pycache__/__init__.cpython-310.pyc b/app/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index c98e2b2..0000000 Binary files a/app/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/app/etl/__pycache__/__init__.cpython-310.pyc b/app/etl/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 7dbe7b9..0000000 Binary files a/app/etl/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/app/etl/__pycache__/extract.cpython-310.pyc b/app/etl/__pycache__/extract.cpython-310.pyc deleted file mode 100644 index 793239e..0000000 Binary files a/app/etl/__pycache__/extract.cpython-310.pyc and /dev/null differ diff --git a/app/etl/__pycache__/extract.cpython-313.pyc b/app/etl/__pycache__/extract.cpython-313.pyc deleted file mode 100644 index 10b5268..0000000 Binary files a/app/etl/__pycache__/extract.cpython-313.pyc and /dev/null differ diff --git a/app/etl/__pycache__/load.cpython-313.pyc b/app/etl/__pycache__/load.cpython-313.pyc deleted file mode 100644 index 2790389..0000000 Binary files a/app/etl/__pycache__/load.cpython-313.pyc and /dev/null differ diff --git a/app/etl/__pycache__/transform.cpython-313.pyc b/app/etl/__pycache__/transform.cpython-313.pyc deleted file mode 100644 index 4347539..0000000 Binary files a/app/etl/__pycache__/transform.cpython-313.pyc and /dev/null differ diff --git a/app/etl/load.py b/app/etl/load.py index 66a1284..4ae4589 100644 --- a/app/etl/load.py +++ b/app/etl/load.py @@ -21,8 +21,7 @@ def load(df: pd.DataFrame, db_path: str = "etl_data.db", table_name: str = "proc # Ensure directory exists db_dir = os.path.dirname(db_path) if db_dir and not os.path.exists(db_dir): - os.makedirs(db_dir) - + 
os.makedirs(db_dir) conn = None try: # Connect to database @@ -30,11 +29,9 @@ def load(df: pd.DataFrame, db_path: str = "etl_data.db", table_name: str = "proc cursor = conn.cursor() # TODO (Find & Fix): Table creation and schema logic missing - - # Idempotency check (should avoid duplicate inserts) cursor.execute(f""" CREATE TABLE IF NOT EXISTS {table_name} ( - employee_id INTEGER PRIMARY KEY, + employee_id TEXT PRIMARY KEY, name TEXT, email TEXT, age INTEGER, @@ -50,15 +47,11 @@ def load(df: pd.DataFrame, db_path: str = "etl_data.db", table_name: str = "proc ) """) - data_to_insert = [tuple(row) for row in df.itertuples(index=False, name=None)] placeholders = ", ".join(["?"] * len(df.columns)) column_names = ", ".join(df.columns) - sql_query = f"INSERT OR IGNORE INTO {table_name} ({column_names}) VALUES ({placeholders})" - cursor.executemany(sql_query, data_to_insert) + sql_query = f"INSERT OR REPLACE INTO {table_name} ({column_names}) VALUES ({placeholders})" + cursor.executemany(sql_query, df.itertuples(index=False, name=None)) conn.commit() - # TODO (Find & Fix): Bulk insert without checking for duplicates - - except sqlite3.Error as e: if conn: conn.rollback() diff --git a/app/main.py b/app/main.py index e61920b..e881ef4 100644 --- a/app/main.py +++ b/app/main.py @@ -1,8 +1,13 @@ +import os + from app.etl.extract import extract from app.etl.transform import transform from app.etl.load import load -def run_pipeline(csv_path: str = "data.csv", db_path: str = "etl_data.db"): +BASE_DIR = os.path.dirname(os.path.abspath(__file__)) +data_path = os.path.join(BASE_DIR, "data.csv") + +def run_pipeline(csv_path: str = data_path, db_path: str = "etl_data.db"): """ Run the complete ETL pipeline.