Skip to content

Commit 16c51c2

Browse files
rmobmina and claude committed
restore bootstrap_simple_efp_dbs.py and efp_bootstrap.py needed for CI
These scripts are called by config/init.sh to create simple eFP MySQL databases from schema definitions in CI, where SQLite mirrors are not available. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent a472927 commit 16c51c2

File tree

2 files changed

+371
-0
lines changed

2 files changed

+371
-0
lines changed

api/services/efp_bootstrap.py

Lines changed: 273 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,273 @@
1+
"""
2+
Utilities to bootstrap the simple eFP databases directly from the shared schema
3+
definitions. Shared by the CLI script and the Flask endpoint so we only maintain
4+
one implementation.
5+
"""
6+
7+
from __future__ import annotations
8+
9+
import re
10+
import hashlib
11+
from typing import Dict, Iterable, List
12+
13+
from sqlalchemy import Column, Index, MetaData, Table, create_engine, text
14+
from sqlalchemy.dialects.mysql import FLOAT, INTEGER, TEXT, VARCHAR
15+
from sqlalchemy.engine import URL
16+
17+
from api.models.efp_schemas import SIMPLE_EFP_DATABASE_SCHEMAS
18+
19+
20+
def _column_type(column_spec):
    """
    Translate one column entry of a schema definition into a MySQL column type.

    The ``"type"`` key selects the result: ``"string"`` gives a ``VARCHAR`` sized
    by the mandatory ``"length"`` key, ``"integer"`` gives an ``INTEGER`` honouring
    the optional ``"unsigned"`` flag (default ``False``), ``"float"`` gives a
    ``FLOAT`` and ``"text"`` gives a ``TEXT``.

    :param column_spec: Dictionary containing column metadata from schema definition
    :type column_spec: Dict[str, Any]
    :return: SQLAlchemy column type object (VARCHAR, INTEGER, FLOAT, or TEXT)
    :rtype: sqlalchemy.types.TypeEngine
    :raises ValueError: If column type is not one of: string, integer, float, text
    """
    kind = column_spec.get("type")
    if kind == "string":
        sql_type = VARCHAR(column_spec["length"])
    elif kind == "integer":
        sql_type = INTEGER(unsigned=column_spec.get("unsigned", False))
    elif kind == "float":
        # explicit mysql float keeps parity with the original dumps
        sql_type = FLOAT()
    elif kind == "text":
        sql_type = TEXT()
    else:
        raise ValueError(f"Unsupported column type: {kind}")
    return sql_type
41+
42+
43+
def _build_table(metadata: MetaData, spec, db_name: str) -> Table:
    """
    Turn a schema specification into a SQLAlchemy Table attached to ``metadata``.

    Every entry of ``spec["columns"]`` becomes a Column carrying its nullability
    (default ``True``), optional primary-key flag and optional server default.
    String defaults are emitted as quoted SQL literals, any other default via
    ``str()``. When ``spec["index"]`` lists columns, a single index is declared
    over the ones that are not TEXT columns.

    :param metadata: SQLAlchemy MetaData object to attach the table to
    :type metadata: sqlalchemy.schema.MetaData
    :param spec: Database schema specification from SIMPLE_EFP_DATABASE_SCHEMAS
    :type spec: Dict[str, Any]
    :param db_name: Name of the database (used for index naming)
    :type db_name: str
    :return: SQLAlchemy Table object with all columns and indexes defined
    :rtype: sqlalchemy.schema.Table
    """
    column_specs = spec["columns"]

    built_columns = []
    for col_spec in column_specs:
        options = {"nullable": col_spec.get("nullable", True)}
        if col_spec.get("primary_key"):
            options["primary_key"] = True

        default = col_spec.get("default")
        if default is not None:
            # strings need SQL quoting; everything else is rendered verbatim
            literal = f"'{default}'" if isinstance(default, str) else str(default)
            options["server_default"] = text(literal)

        built_columns.append(Column(col_spec["name"], _column_type(col_spec), **options))

    table = Table(spec["table_name"], metadata, *built_columns, mysql_charset=spec.get("charset"))

    requested = spec.get("index") or []
    if not requested:
        return table

    # MySQL cannot index TEXT/BLOB columns without a prefix length, so TEXT
    # columns are dropped from the index instead of failing the bootstrap.
    text_names = {col_spec["name"] for col_spec in column_specs if col_spec.get("type") == "text"}
    indexable = [name for name in requested if name not in text_names]
    if indexable:
        # constructing the Index from the table's own columns attaches it to the table
        Index(_make_index_name(db_name, indexable), *[table.c[name] for name in indexable])
    return table
85+
86+
87+
def _make_index_name(db_name: str, index_cols: Iterable[str], max_len: int = 64) -> str:
    """
    Derive a deterministic index name no longer than ``max_len`` (64 by default).

    The preferred name is ``ix_<db_name>_<columns joined by "_">``. When that is
    too long, the name becomes ``ix_<truncated db_name>_<digest>``, where the
    digest is the first 8 hex characters of the SHA-1 of the preferred name. If
    no room is left for any part of ``db_name``, the result is ``ix_<digest>``.

    :param db_name: Name of the database the index belongs to
    :type db_name: str
    :param index_cols: Names of the indexed columns, in index order
    :type index_cols: Iterable[str]
    :param max_len: Maximum allowed length of the generated name
    :type max_len: int
    :return: Index name derived from the inputs
    :rtype: str
    """
    candidate = "ix_{}_{}".format(db_name, "_".join(index_cols))
    if len(candidate) > max_len:
        suffix = hashlib.sha1(candidate.encode("utf-8")).hexdigest()[:8]
        # budget left for db_name after the "ix_" prefix, one "_" separator and the digest
        room_for_db = max_len - (len("ix_") + 1 + len(suffix))
        if room_for_db > 0:
            candidate = f"ix_{db_name[:room_for_db]}_{suffix}"
        else:
            candidate = f"ix_{suffix}"
    return candidate
104+
105+
106+
def _build_url(host: str, port: int, user: str, password: str, database: str | None = None) -> URL:
    """
    Assemble the SQLAlchemy URL used for every MySQL connection in this module.

    :param host: MySQL server hostname (e.g., 'localhost', 'BAR_mysqldb')
    :type host: str
    :param port: MySQL server port number (typically 3306)
    :type port: int
    :param user: MySQL username for authentication
    :type user: str
    :param password: MySQL password for authentication
    :type password: str
    :param database: Optional database name; ``None`` yields a server-level URL with no database selected
    :type database: str or None
    :return: SQLAlchemy URL object for mysql+mysqldb connections
    :rtype: sqlalchemy.engine.URL
    """
    url_parts = {
        "drivername": "mysql+mysqldb",
        "username": user,
        "password": password,
        "host": host,
        "port": port,
        "database": database,
    }
    return URL.create(**url_parts)
131+
132+
133+
def ensure_database(server_url: URL, db_name: str, charset: str) -> None:
    """
    Create a MySQL database if it doesn't already exist.

    Executes CREATE DATABASE IF NOT EXISTS with the specified character set.
    Safe to call multiple times - will not error if database already exists.
    Both identifiers are validated before any connection is opened, and the
    engine created for the statement is disposed before returning.

    :param server_url: SQLAlchemy URL for MySQL server connection (without database selected)
    :type server_url: sqlalchemy.engine.URL
    :param db_name: Name of the database to create
    :type db_name: str
    :param charset: MySQL character set (e.g., 'latin1', 'utf8mb4')
    :type charset: str
    :return: None
    :rtype: None
    :raises ValueError: If db_name or charset contains invalid characters
    """
    # Validate database name to prevent SQL injection - only allow safe identifier characters.
    # fullmatch is required: with re.match, a trailing "$" also matches just before a
    # final newline, so "name\n" would slip through the check.
    if not re.fullmatch(r'[a-zA-Z0-9_$]+', db_name):
        raise ValueError(f"Invalid database name: {db_name}. Only alphanumeric, underscore, and dollar sign characters are allowed.")

    # Validate charset name to prevent SQL injection - only allow safe characters
    if not re.fullmatch(r'[a-zA-Z0-9_]+', charset):
        raise ValueError(f"Invalid charset name: {charset}. Only alphanumeric and underscore characters are allowed.")

    server_engine = create_engine(server_url)
    try:
        with server_engine.begin() as conn:
            # Safe to use f-string here since we've validated the inputs above
            conn.execute(text(f"CREATE DATABASE IF NOT EXISTS `{db_name}` DEFAULT CHARACTER SET {charset}"))
    finally:
        # This engine is single-use; release its connection pool instead of
        # leaving idle connections behind for every bootstrapped database.
        server_engine.dispose()
162+
163+
164+
def ensure_schema(db_url: URL, spec, db_name: str) -> Dict[str, object]:
    """
    Create database tables and insert seed data if the table is empty.

    Uses SQLAlchemy's metadata.create_all() to generate CREATE TABLE statements
    from the schema specification. If seed_rows are defined in the spec and the
    table is empty, inserts the seed data. The emptiness check and the insert
    run on one connection inside one transaction, and the engine is disposed
    before returning, whether or not an error occurred.

    :param db_url: SQLAlchemy URL for the specific database connection
    :type db_url: sqlalchemy.engine.URL
    :param spec: Database schema specification from SIMPLE_EFP_DATABASE_SCHEMAS
    :type spec: Dict[str, Any]
    :param db_name: Name of the database (used for index naming when the table is built)
    :type db_name: str
    :return: Dictionary with 'table' (table name) and 'seeded_rows' (count of inserted rows)
    :rtype: Dict[str, object]
    """
    metadata = MetaData()
    table = _build_table(metadata, spec, db_name)
    seed_rows = spec.get("seed_rows") or []
    inserted = 0

    engine = create_engine(db_url)
    try:
        metadata.create_all(engine, checkfirst=True)

        if seed_rows:
            # One connection and one transaction for both the check and the insert;
            # the transaction commits when the block exits without an exception.
            with engine.begin() as conn:
                count = conn.execute(text(f"SELECT COUNT(1) FROM {table.name}")).scalar() or 0
                if count == 0:
                    conn.execute(table.insert(), seed_rows)
                    inserted = len(seed_rows)
    finally:
        # The engine is created per call, so release its pooled connections here.
        engine.dispose()

    return {"table": table.name, "seeded_rows": inserted}
197+
198+
199+
def bootstrap_simple_efp_databases(
    *,
    host: str,
    port: int,
    user: str,
    password: str,
    databases: Iterable[str] | None = None,
) -> List[Dict[str, object]]:
    """
    Bootstrap simple eFP databases in MySQL from schema definitions.

    This is the main entry point for creating eFP databases. All requested names
    are validated first, so an unknown name raises before anything is created on
    the server. Then, for each database:
    1. Creates the database if it doesn't exist
    2. Creates the table described by its entry in SIMPLE_EFP_DATABASE_SCHEMAS
    3. Inserts seed rows if the table is empty and seed_rows are defined

    Used by:
    - scripts/bootstrap_simple_efp_dbs.py (CLI tool)
    - config/init.sh (Docker/CI initialization)
    - api/resources/efp_proxy.py (HTTP bootstrap endpoint)

    :param host: MySQL server hostname (e.g., 'localhost', 'BAR_mysqldb' for Docker)
    :type host: str
    :param port: MySQL server port number (typically 3306)
    :type port: int
    :param user: MySQL username with CREATE DATABASE privileges
    :type user: str
    :param password: MySQL password for authentication
    :type password: str
    :param databases: Optional list of specific databases to bootstrap; if None, bootstraps all databases in SIMPLE_EFP_DATABASE_SCHEMAS
    :type databases: Iterable[str] or None
    :return: List of result dictionaries, each containing 'database' (name), 'table' (table name), and 'seeded_rows' (count)
    :rtype: List[Dict[str, object]]
    :raises ValueError: If a requested database is not defined in SIMPLE_EFP_DATABASE_SCHEMAS

    Example::

        results = bootstrap_simple_efp_databases(
            host='localhost',
            port=3306,
            user='root',
            password='password',
            databases=['cannabis', 'dna_damage']
        )
        # Returns: [
        #   {'database': 'cannabis', 'table': 'sample_data', 'seeded_rows': 1},
        #   {'database': 'dna_damage', 'table': 'sample_data', 'seeded_rows': 1}
        # ]
    """
    # Materialise once: ``databases`` may be a one-shot iterable and is walked twice below.
    target_dbs = list(databases) if databases is not None else list(SIMPLE_EFP_DATABASE_SCHEMAS)

    # Reject unknown names before touching the server, so a typo late in the
    # list cannot leave a partially bootstrapped set of databases behind.
    for db_name in target_dbs:
        if db_name not in SIMPLE_EFP_DATABASE_SCHEMAS:
            raise ValueError(f"Unknown simple eFP database: {db_name}")

    results: List[Dict[str, object]] = []
    server_url = _build_url(host, port, user, password, database=None)
    for db_name in target_dbs:
        spec = SIMPLE_EFP_DATABASE_SCHEMAS[db_name]
        charset = spec.get("charset", "utf8mb4")
        ensure_database(server_url, db_name, charset)

        db_url = _build_url(host, port, user, password, database=db_name)
        schema_result = ensure_schema(db_url, spec, db_name)
        results.append(
            {
                "database": db_name,
                "table": schema_result["table"],
                "seeded_rows": schema_result["seeded_rows"],
            }
        )

    return results
271+
272+
273+
# Star-imports of this module expose only the top-level bootstrap entry point;
# every other name defined here has to be imported explicitly.
__all__ = ["bootstrap_simple_efp_databases"]
scripts/bootstrap_simple_efp_dbs.py

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
#!/usr/bin/env python3
2+
"""
3+
bootstrap the simple efp databases from the shared schema definitions
4+
"""
5+
6+
from __future__ import annotations
7+
8+
import argparse
9+
import os
10+
import sys
11+
from pathlib import Path
12+
13+
from sqlalchemy.exc import SQLAlchemyError
14+
15+
# ensure the repository root is importable when this script is executed standalone
16+
ROOT_DIR = Path(__file__).resolve().parents[1]
17+
if str(ROOT_DIR) not in sys.path:
18+
sys.path.insert(0, str(ROOT_DIR))
19+
20+
from api.services.efp_bootstrap import bootstrap_simple_efp_databases # noqa: E402
21+
22+
23+
def _default_host() -> str:
    """
    Pick the MySQL hostname to use when ``--host`` is not given.

    The first non-empty value wins, in this order:
    1. DB_HOST environment variable (explicit override)
    2. MYSQL_HOST environment variable (alternative name)
    3. 'localhost' (default for local dev and GitHub Actions CI)

    Docker deployments should set DB_HOST=BAR_mysqldb explicitly.

    :return: MySQL hostname to use for connections
    :rtype: str
    """
    # An empty value counts as unset, so ``or`` falls through to the next
    # candidate and finally to localhost.
    return os.environ.get("DB_HOST") or os.environ.get("MYSQL_HOST") or "localhost"
45+
46+
47+
def parse_args() -> argparse.Namespace:
    """
    Build the CLI parser for the bootstrap script and parse ``sys.argv``.

    Connection defaults come from the environment: the host from
    ``_default_host()``, the port from ``DB_PORT`` (3306), the user from
    ``DB_USER`` ("root") and the password from ``DB_PASS`` ("root").

    :return: Parsed arguments containing host, port, user, password, and optional database list
    :rtype: argparse.Namespace
    """
    parser = argparse.ArgumentParser(description="Create simple eFP MySQL databases from in-memory schemas.")

    # (flag, add_argument keyword arguments), registered in display order
    options = (
        ("--host", {"default": _default_host(), "help": "MySQL hostname (default: %(default)s)"}),
        ("--port", {"type": int, "default": int(os.environ.get("DB_PORT", 3306)), "help": "MySQL port"}),
        ("--user", {"default": os.environ.get("DB_USER", "root"), "help": "MySQL user"}),
        ("--password", {"default": os.environ.get("DB_PASS", "root"), "help": "MySQL password"}),
        (
            "--databases",
            {
                "nargs": "*",
                "help": "Optional list of databases to bootstrap (defaults to every simple schema).",
            },
        ),
    )
    for flag, settings in options:
        parser.add_argument(flag, **settings)

    return parser.parse_args()
65+
66+
67+
def main():
    """
    Run the bootstrap from the command line and report each result.

    Reads the connection settings via parse_args(), hands them to
    bootstrap_simple_efp_databases() and prints one line per returned entry.

    Output format: [ok] ensured database_name.table_name (seeded N rows)

    :raises SQLAlchemyError: If database creation or connection fails
    """
    cli = parse_args()
    outcomes = bootstrap_simple_efp_databases(
        host=cli.host,
        port=cli.port,
        user=cli.user,
        password=cli.password,
        databases=cli.databases,
    )
    for outcome in outcomes:
        row_count = outcome["seeded_rows"]
        if row_count:
            detail = f"seeded {row_count} rows"
        else:
            detail = "no seed rows inserted"
        print(f"[ok] ensured {outcome['database']}.{outcome['table']} ({detail})")
90+
91+
92+
if __name__ == "__main__":
    try:
        main()
    except SQLAlchemyError as err:
        # One-line summary in the same style as init.sh keeps CI logs readable;
        # re-raising afterwards preserves the traceback and the failing exit status.
        print(f"failed to initialize simple efp databases: {err}")
        raise

0 commit comments

Comments
 (0)