|
| 1 | +""" |
| 2 | +Utilities to bootstrap the simple eFP databases directly from the shared schema |
| 3 | +definitions. Shared by the CLI script and the Flask endpoint so we only maintain |
| 4 | +one implementation. |
| 5 | +""" |
| 6 | + |
| 7 | +from __future__ import annotations |
| 8 | + |
| 9 | +import re |
| 10 | +import hashlib |
| 11 | +from typing import Dict, Iterable, List |
| 12 | + |
| 13 | +from sqlalchemy import Column, Index, MetaData, Table, create_engine, text |
| 14 | +from sqlalchemy.dialects.mysql import FLOAT, INTEGER, TEXT, VARCHAR |
| 15 | +from sqlalchemy.engine import URL |
| 16 | + |
| 17 | +from api.models.efp_schemas import SIMPLE_EFP_DATABASE_SCHEMAS |
| 18 | + |
| 19 | + |
def _column_type(column_spec):
    """
    Map one column entry of a schema definition onto a MySQL-dialect SQLAlchemy type.

    :param column_spec: Column metadata; ``type`` selects the SQL type, ``length`` is
        read for ``string`` columns and ``unsigned`` (default ``False``) for ``integer`` columns
    :type column_spec: Dict[str, Any]
    :return: A VARCHAR, INTEGER, FLOAT or TEXT type instance
    :rtype: sqlalchemy.types.TypeEngine
    :raises ValueError: If ``type`` is missing or is not one of: string, integer, float, text
    :raises KeyError: If a ``string`` column does not define ``length``
    """
    kind = column_spec.get("type")
    if kind == "string":
        sql_type = VARCHAR(column_spec["length"])
    elif kind == "integer":
        is_unsigned = column_spec.get("unsigned", False)
        sql_type = INTEGER(unsigned=is_unsigned)
    elif kind == "float":
        # The explicit MySQL FLOAT keeps parity with the original dumps.
        sql_type = FLOAT()
    elif kind == "text":
        sql_type = TEXT()
    else:
        raise ValueError(f"Unsupported column type: {kind}")
    return sql_type
| 41 | + |
| 42 | + |
def _build_table(metadata: MetaData, spec, db_name: str) -> Table:
    """
    Build a SQLAlchemy Table object from a schema specification.

    Creates the columns (type, nullability, primary key, server default) and, when
    the spec lists index columns, a single composite index over them. The returned
    Table is attached to ``metadata`` so ``metadata.create_all()`` can emit the DDL.

    String defaults are rendered as quoted SQL literals with embedded single quotes
    doubled; backslashes are passed through unchanged. Non-string defaults are
    rendered with ``str()``.

    :param metadata: SQLAlchemy MetaData object to attach the table to
    :type metadata: sqlalchemy.schema.MetaData
    :param spec: Schema specification; reads ``columns``, ``table_name`` and the
        optional ``charset`` and ``index`` keys
    :type spec: Dict[str, Any]
    :param db_name: Name of the database (used for index naming)
    :type db_name: str
    :return: SQLAlchemy Table object with all columns and the optional index defined
    :rtype: sqlalchemy.schema.Table
    :raises ValueError: If a column uses an unsupported type
    :raises KeyError: If a required spec key is missing or an index column is not a
        column of the table
    """
    columns = []
    for column in spec["columns"]:
        kwargs = {"nullable": column.get("nullable", True)}
        if column.get("primary_key"):
            kwargs["primary_key"] = True
        default_value = column.get("default")
        if default_value is not None:
            if isinstance(default_value, str):
                # Double embedded single quotes so a default such as "it's"
                # still renders as a valid SQL string literal.
                escaped = default_value.replace("'", "''")
                kwargs["server_default"] = text(f"'{escaped}'")
            else:
                kwargs["server_default"] = text(str(default_value))

        columns.append(Column(column["name"], _column_type(column), **kwargs))

    # Only pass the charset option when the spec defines one; otherwise the table
    # inherits the database default instead of SQLAlchemy receiving a None option.
    table_options = {}
    charset = spec.get("charset")
    if charset:
        table_options["mysql_charset"] = charset

    table = Table(spec["table_name"], metadata, *columns, **table_options)

    index_cols = spec.get("index") or []
    if index_cols:
        # MySQL cannot index TEXT/BLOB columns without a prefix length.
        # Filter them out to avoid bootstrap failures when schemas use TEXT.
        text_cols = {col["name"] for col in spec["columns"] if col.get("type") == "text"}
        index_cols = [col for col in index_cols if col not in text_cols]
    if index_cols:
        index_name = _make_index_name(db_name, index_cols)
        # Constructing the Index from the table's own columns attaches it to the table.
        Index(index_name, *[table.c[col] for col in index_cols])
    return table
| 85 | + |
| 86 | + |
def _make_index_name(db_name: str, index_cols: Iterable[str], max_len: int = 64) -> str:
    """
    Create a deterministic index name that never exceeds ``max_len`` characters.

    The preferred name is ``ix_<db_name>_<col1>_<col2>...``. If that is too long,
    the name falls back to ``ix_<truncated db_name>_<digest>``, where the digest is
    the first 8 hex characters of the SHA-1 of the preferred name. If ``max_len``
    leaves no room for any part of ``db_name``, the name is ``ix_<digest>``,
    truncated to ``max_len`` when necessary.

    :param db_name: Database name used as the readable part of the index name
    :type db_name: str
    :param index_cols: Column names covered by the index
    :type index_cols: Iterable[str]
    :param max_len: Maximum allowed length of the name (defaults to 64)
    :type max_len: int
    :return: Index name no longer than ``max_len`` characters
    :rtype: str
    """
    prefix = "ix_"
    base = f"{prefix}{db_name}_{'_'.join(index_cols)}"
    if len(base) <= max_len:
        return base

    # The hash is only a stable fingerprint of the full name, not a security measure.
    digest = hashlib.sha1(base.encode("utf-8")).hexdigest()[:8]
    # Room left for db_name after the prefix, one separator and the digest.
    db_len = max_len - (len(prefix) + 1 + len(digest))
    if db_len <= 0:
        # No room for db_name: keep prefix + digest, but still honour the cap
        # when max_len is shorter than that (max() guards negative values).
        return f"{prefix}{digest}"[: max(max_len, 0)]
    return f"{prefix}{db_name[:db_len]}_{digest}"
| 104 | + |
| 105 | + |
def _build_url(host: str, port: int, user: str, password: str, database: str | None = None) -> URL:
    """
    Assemble the SQLAlchemy URL used for MySQL connections via the ``mysql+mysqldb`` driver.

    :param host: MySQL server hostname (e.g., 'localhost', 'BAR_mysqldb')
    :type host: str
    :param port: MySQL server port number (typically 3306)
    :type port: int
    :param user: MySQL username for authentication
    :type user: str
    :param password: MySQL password for authentication
    :type password: str
    :param database: Optional database name; ``None`` leaves the database part of the URL unset
    :type database: str or None
    :return: SQLAlchemy URL object for mysql+mysqldb connections
    :rtype: sqlalchemy.engine.URL
    """
    connection_parts = {
        "drivername": "mysql+mysqldb",
        "username": user,
        "password": password,
        "host": host,
        "port": port,
        "database": database,
    }
    return URL.create(**connection_parts)
| 131 | + |
| 132 | + |
def ensure_database(server_url: URL, db_name: str, charset: str) -> None:
    """
    Create a MySQL database if it doesn't already exist.

    Executes ``CREATE DATABASE IF NOT EXISTS`` with the specified default character
    set, so repeated calls for an existing database do not fail. The engine created
    for the statement is disposed before returning.

    :param server_url: SQLAlchemy URL for MySQL server connection (without database selected)
    :type server_url: sqlalchemy.engine.URL
    :param db_name: Name of the database to create
    :type db_name: str
    :param charset: MySQL character set (e.g., 'latin1', 'utf8mb4')
    :type charset: str
    :return: None
    :rtype: None
    :raises ValueError: If db_name or charset is empty or contains invalid characters
    """
    # Both values are interpolated into the SQL text below, so the whole string must
    # consist of safe identifier characters. fullmatch is used because a '$' anchor
    # with re.match would also accept a trailing newline.
    if not re.fullmatch(r'[a-zA-Z0-9_$]+', db_name):
        raise ValueError(f"Invalid database name: {db_name}. Only alphanumeric, underscore, and dollar sign characters are allowed.")

    if not re.fullmatch(r'[a-zA-Z0-9_]+', charset):
        raise ValueError(f"Invalid charset name: {charset}. Only alphanumeric and underscore characters are allowed.")

    server_engine = create_engine(server_url)
    try:
        with server_engine.begin() as conn:
            # Safe to use f-string here since we've validated the inputs above
            conn.execute(text(f"CREATE DATABASE IF NOT EXISTS `{db_name}` DEFAULT CHARACTER SET {charset}"))
    finally:
        # This engine is only used for the single statement above; release its pool.
        server_engine.dispose()
| 162 | + |
| 163 | + |
def ensure_schema(db_url: URL, spec, db_name: str) -> Dict[str, object]:
    """
    Create the table described by ``spec`` and insert seed data if the table is empty.

    Uses SQLAlchemy's ``metadata.create_all()`` (with ``checkfirst=True``) to emit the
    CREATE TABLE statement. If the spec defines ``seed_rows`` and the table holds no
    rows, the seed rows are inserted. The emptiness check and the insert run in the
    same transaction, and the engine is disposed before returning.

    :param db_url: SQLAlchemy URL for the specific database connection
    :type db_url: sqlalchemy.engine.URL
    :param spec: Database schema specification from SIMPLE_EFP_DATABASE_SCHEMAS
    :type spec: Dict[str, Any]
    :param db_name: Name of the database (passed on for index naming)
    :type db_name: str
    :return: Dictionary with 'table' (table name) and 'seeded_rows' (count of inserted rows)
    :rtype: Dict[str, object]
    """
    metadata = MetaData()
    table = _build_table(metadata, spec, db_name)
    seed_rows = spec.get("seed_rows") or []
    inserted = 0

    engine = create_engine(db_url)
    try:
        metadata.create_all(engine, checkfirst=True)

        if seed_rows:
            with engine.begin() as conn:
                # Probe for a single row through the Table object so the table name
                # is quoted by SQLAlchemy rather than interpolated into raw SQL.
                is_empty = conn.execute(table.select().limit(1)).first() is None
                if is_empty:
                    conn.execute(table.insert(), seed_rows)
                    inserted = len(seed_rows)
    finally:
        # The engine is local to this call; release its pooled connections.
        engine.dispose()

    return {"table": table.name, "seeded_rows": inserted}
| 197 | + |
| 198 | + |
def bootstrap_simple_efp_databases(
    *,
    host: str,
    port: int,
    user: str,
    password: str,
    databases: Iterable[str] | None = None,
) -> List[Dict[str, object]]:
    """
    Bootstrap simple eFP databases in MySQL from schema definitions.

    This is the main entry point for creating eFP databases. All requested names are
    validated first, so an unknown name aborts the call before anything is created.
    Then, for each database:
    1. Creates the database if it doesn't exist
    2. Creates the table described by its entry in SIMPLE_EFP_DATABASE_SCHEMAS
    3. Inserts seed rows if the table is empty and seed_rows are defined

    Used by:
    - scripts/bootstrap_simple_efp_dbs.py (CLI tool)
    - config/init.sh (Docker/CI initialization)
    - api/resources/efp_proxy.py (HTTP bootstrap endpoint)

    :param host: MySQL server hostname (e.g., 'localhost', 'BAR_mysqldb' for Docker)
    :type host: str
    :param port: MySQL server port number (typically 3306)
    :type port: int
    :param user: MySQL username with CREATE DATABASE privileges
    :type user: str
    :param password: MySQL password for authentication
    :type password: str
    :param databases: Optional list of specific databases to bootstrap; if None, bootstraps all databases in SIMPLE_EFP_DATABASE_SCHEMAS
    :type databases: Iterable[str] or None
    :return: List of result dictionaries, each containing 'database' (name), 'table' (table name), and 'seeded_rows' (count)
    :rtype: List[Dict[str, object]]
    :raises ValueError: If a requested database is not defined in SIMPLE_EFP_DATABASE_SCHEMAS

    Example::

        results = bootstrap_simple_efp_databases(
            host='localhost',
            port=3306,
            user='root',
            password='password',
            databases=['cannabis', 'dna_damage']
        )
        # Returns: [
        #     {'database': 'cannabis', 'table': 'sample_data', 'seeded_rows': 1},
        #     {'database': 'dna_damage', 'table': 'sample_data', 'seeded_rows': 1}
        # ]
    """
    target_dbs = list(databases) if databases is not None else list(SIMPLE_EFP_DATABASE_SCHEMAS.keys())

    # Fail fast: reject unknown names before any database is created, so a typo
    # late in the list cannot leave the server half-bootstrapped.
    for db_name in target_dbs:
        if db_name not in SIMPLE_EFP_DATABASE_SCHEMAS:
            raise ValueError(f"Unknown simple eFP database: {db_name}")

    results: List[Dict[str, object]] = []
    # URL without a database selected; used only for the CREATE DATABASE statements.
    server_url = _build_url(host, port, user, password, database=None)
    for db_name in target_dbs:
        spec = SIMPLE_EFP_DATABASE_SCHEMAS[db_name]
        charset = spec.get("charset", "utf8mb4")
        ensure_database(server_url, db_name, charset)

        db_url = _build_url(host, port, user, password, database=db_name)
        schema_result = ensure_schema(db_url, spec, db_name)
        results.append(
            {
                "database": db_name,
                "table": schema_result["table"],
                "seeded_rows": schema_result["seeded_rows"],
            }
        )

    return results
| 271 | + |
| 272 | + |
| 273 | +__all__ = ["bootstrap_simple_efp_databases"] |
0 commit comments