|
1 | | -import sqlite3 |
2 | 1 | import pandas as pd |
| 2 | +import sqlite3 |
3 | 3 | import os |
| 4 | +import logging |
4 | 5 | from typing import Union |
5 | 6 |
|
6 | 7 |
|
def create_sqlite_db(
    df: pd.DataFrame,
    schema_file: Union[str, os.PathLike],
    db_file: Union[str, os.PathLike],
    table_name: str,
    log_dir: Union[str, os.PathLike, None] = None
) -> None:
    """
    Create or update an SQLite database using a schema file and load data into
    a specified table from a pandas DataFrame.

    Parameters
    ----------
    df : pd.DataFrame
        The data to be loaded into the database. Every DataFrame column must
        exist in the target table.

    schema_file : Union[str, os.PathLike]
        Path to the SQL file containing the schema definition. It is read as
        UTF-8 and executed on every call.

    db_file : Union[str, os.PathLike]
        Path to the SQLite database file to be created or updated. A missing
        parent directory is created.

    table_name : str
        Name of the table to insert the DataFrame into. It must be defined by
        a ``CREATE TABLE`` statement in the schema file.

    log_dir : Union[str, os.PathLike], optional
        Directory where the log file ``create_sqlite_db.log`` is written.
        Default is the current working directory.

    Raises
    ------
    FileNotFoundError
        If the schema file does not exist.

    ValueError
        If the specified table name is not found in the schema or the schema
        does not match the DataFrame structure.

    sqlite3.Error
        If SQLite rejects the schema or the insert. The error is logged and
        re-raised.

    Notes
    -----
    The schema script is executed on every call, including against an
    existing database, so its statements should be re-runnable (for example
    ``CREATE TABLE IF NOT EXISTS``).

    Log handlers are attached to this module's logger only for the duration
    of the call. Messages are echoed to the console only when the application
    has not configured any logging handlers itself.

    Examples
    --------
    .. code-block:: python

        import pandas as pd
        from sqlite_manager import create_sqlite_db

        # Define the DataFrame to insert
        data = {
            "InvoiceNo": ["A001", "A002", "A003"],
            "StockCode": ["P001", "P002", "P003"],
            "Description": ["Product 1", "Product 2", "Product 3"],
            "Quantity": [10, 5, 20],
            "InvoiceDate": ["2023-01-01", "2023-01-02", "2023-01-03"],
            "UnitPrice": [12.5, 8.0, 15.0],
            "CustomerID": ["C001", "C002", "C003"],
            "Country": ["USA", "UK", "Germany"]
        }
        df = pd.DataFrame(data)

        # Schema file (SQL file defining the database schema)
        schema_file = "schema.sql"
        # Contents of schema.sql:
        # CREATE TABLE IF NOT EXISTS OnlineRetail (
        #     InvoiceNo TEXT NOT NULL,
        #     StockCode TEXT NOT NULL,
        #     Description TEXT,
        #     Quantity INTEGER NOT NULL,
        #     InvoiceDate TEXT NOT NULL,
        #     UnitPrice REAL NOT NULL,
        #     CustomerID TEXT,
        #     Country TEXT
        # );

        # SQLite database file to create or update
        db_file = "data/online_retail.db"

        # Create or update the database and insert data into the table
        create_sqlite_db(
            df=df,
            schema_file=schema_file,
            db_file=db_file,
            table_name="OnlineRetail",
            log_dir="."  # Optional
        )
    """
    if log_dir is None:
        log_dir = os.getcwd()

    os.makedirs(log_dir, exist_ok=True)

    # A named logger with per-call handlers is used instead of
    # logging.basicConfig: basicConfig does nothing once the root logger has
    # handlers, which silently ignored log_dir on every call after the first.
    logger = logging.getLogger(__name__)
    log_file = os.path.join(log_dir, "create_sqlite_db.log")
    handlers, previous_level = _attach_call_handlers(logger, log_file)
    try:
        _load_dataframe(logger, df, schema_file, db_file, table_name)
    finally:
        # Detach and close so the log file is released and repeated calls do
        # not accumulate handlers (which would duplicate every message).
        for handler in handlers:
            logger.removeHandler(handler)
            handler.close()
        logger.setLevel(previous_level)


def _attach_call_handlers(logger, log_file):
    """Attach per-call handlers to ``logger``; return them and the old level."""
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

    # Decide on console output before adding the file handler, because adding
    # it makes hasHandlers() true. When the application already has handlers,
    # records reach them through propagation and a second console handler
    # would print everything twice.
    needs_console = not logger.hasHandlers()

    handlers = [logging.FileHandler(log_file)]
    if needs_console:
        handlers.append(logging.StreamHandler())

    for handler in handlers:
        handler.setLevel(logging.INFO)
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    previous_level = logger.level
    if not logger.isEnabledFor(logging.INFO):
        logger.setLevel(logging.INFO)
    return handlers, previous_level


def _schema_defines_table(schema, table_name):
    """Return True if ``schema`` has a CREATE TABLE statement for the table."""
    import re

    name = re.escape(table_name)
    # SQL keywords match case-insensitively and with any whitespace. The table
    # name stays case-sensitive because pandas looks the table up by exact
    # name. An unquoted name must not be followed by another word character,
    # so "Online" does not match "OnlineRetail".
    pattern = (
        r"(?i:CREATE\s+TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?)"
        r"(?:\"" + name + r"\"|`" + name + r"`|\[" + name + r"\]|"
        + name + r"(?!\w))"
    )
    return re.search(pattern, schema) is not None


def _quote_identifier(name):
    """Return ``name`` as a double-quoted SQL identifier."""
    return '"' + name.replace('"', '""') + '"'


def _load_dataframe(logger, df, schema_file, db_file, table_name):
    """Validate the inputs, apply the schema and append ``df`` to the table."""
    # Validate input files
    if not os.path.exists(schema_file):
        message = f"Schema file '{schema_file}' not found."
        logger.error(message)
        raise FileNotFoundError(message)

    # Read and validate schema
    with open(schema_file, 'r', encoding="utf-8") as file:
        schema = file.read()

    if not _schema_defines_table(schema, table_name):
        message = f"Table '{table_name}' is not defined in the schema file."
        logger.error(message)
        raise ValueError(message)

    # Check if the database already exists
    db_exists = os.path.exists(db_file)

    # sqlite3.connect cannot create missing parent directories.
    db_dir = os.path.dirname(db_file)
    if db_dir:
        os.makedirs(db_dir, exist_ok=True)

    # Connect to SQLite database
    conn = sqlite3.connect(db_file)
    try:
        cursor = conn.cursor()

        # Record whether the table exists BEFORE the schema runs; afterwards
        # it always exists and the distinction would be lost.
        cursor.execute(
            "SELECT 1 FROM sqlite_master WHERE type='table' AND name=?;",
            (table_name,)
        )
        table_existed = cursor.fetchone() is not None

        if db_exists:
            logger.info("Using existing database at: %s", db_file)
        else:
            logger.info(
                "Database does not exist. Creating new database at: %s",
                db_file
            )

        # The schema is applied in both cases so newly defined tables are
        # added to an existing database.
        conn.executescript(schema)
        conn.commit()
        if db_exists:
            logger.info(
                "Schema re-applied to ensure all definitions are current.")
        else:
            logger.info("Schema applied successfully.")

        if table_existed:
            logger.info(
                "Table '%s' already exists. Data will be appended.",
                table_name
            )
        else:
            logger.info(
                "Table '%s' was created from the schema.", table_name)

        # Validate table schema against DataFrame columns
        cursor.execute(
            f"PRAGMA table_info({_quote_identifier(table_name)});")
        schema_columns = {row[1] for row in cursor.fetchall()}
        if not schema_columns:
            raise ValueError(
                f"Table '{table_name}' does not exist after applying"
                f" the schema."
            )

        missing_columns = [col for col in df.columns
                           if col not in schema_columns]
        if missing_columns:
            raise ValueError(
                f"Table schema is missing required columns: {missing_columns}")

        # Insert DataFrame into the specified table
        df.to_sql(table_name, conn, if_exists='append', index=False)
        logger.info(
            "Inserted %d rows into table '%s'.", len(df), table_name)

    except sqlite3.Error as e:
        logger.error("SQLite Error: %s", e)
        raise
    except ValueError as ve:
        # Single logging point for validation failures raised above.
        logger.error("Validation Error: %s", ve)
        raise
    finally:
        conn.close()
        logger.info("Database connection closed: %s", db_file)
0 commit comments