Skip to content

Commit b3cdeae

Browse files
committed
Add BARUtils validation, Sphinx docstrings, and fix SQL injection
- Replace manual REGEX with BARUtils validators for all species - Add descriptive error messages for each species (e.g., "Invalid Cannabis gene ID") - Add comprehensive Sphinx/reST docstrings to all ORM files - Improve validation logic to handle both AGI and probeset IDs correctly - Fix SQL injection vulnerability by validating schema identifiers - Add identifier validation (alphanumeric + underscore only) before SQL construction Security: Addresses CodeQL high-severity SQL injection alert by validating all database/table/column identifiers match safe pattern before use in queries
1 parent 722369c commit b3cdeae

File tree

3 files changed

+223
-22
lines changed

3 files changed

+223
-22
lines changed

api/models/efp_dynamic.py

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
"""
2-
dynamic sqlalchemy model generation for simple efp databases
2+
Dynamic SQLAlchemy model generation for simple eFP databases.
3+
4+
This module provides runtime generation of SQLAlchemy ORM models from schema
5+
definitions, enabling dynamic database access without hardcoded model classes.
36
"""
47

58
from __future__ import annotations
@@ -14,7 +17,23 @@
1417

1518

1619
def _to_sqla_type(column_spec):
17-
"""map a simple column spec to a sqlalchemy column type"""
20+
"""
21+
Map a column specification dictionary to a SQLAlchemy column type.
22+
23+
Converts the simple type descriptors used in schema definitions to the
24+
appropriate SQLAlchemy type objects for ORM model generation.
25+
26+
:param column_spec: Column specification with 'type', 'length', and 'unsigned' keys
27+
:type column_spec: Dict[str, Any]
28+
:return: SQLAlchemy column type (String, Integer, Float, or Text)
29+
:rtype: sqlalchemy.types.TypeEngine
30+
:raises ValueError: If column type is not one of: string, integer, float, text
31+
32+
Example::
33+
34+
col_spec = {"type": "string", "length": 24}
35+
sqla_type = _to_sqla_type(col_spec) # Returns String(24)
36+
"""
1837
col_type = column_spec.get("type")
1938
if col_type == "string":
2039
return String(column_spec["length"])
@@ -30,7 +49,26 @@ def _to_sqla_type(column_spec):
3049

3150

3251
def _generate_model(bind_key: str, spec) -> db.Model:
33-
"""build a concrete sqlalchemy model for the given schema"""
52+
"""
53+
Build a concrete SQLAlchemy model class for the given schema specification.
54+
55+
Dynamically creates an ORM model with the specified table name, bind key,
56+
and columns based on the schema definition. The generated model class can
57+
be used like any Flask-SQLAlchemy model.
58+
59+
:param bind_key: Database bind key (e.g., 'cannabis', 'embryo')
60+
:type bind_key: str
61+
:param spec: Database schema specification from SIMPLE_EFP_DATABASE_SCHEMAS
62+
:type spec: Dict[str, Any]
63+
:return: Dynamically generated SQLAlchemy model class
64+
:rtype: db.Model
65+
66+
Example::
67+
68+
schema = SIMPLE_EFP_DATABASE_SCHEMAS['cannabis']
69+
CannabisModel = _generate_model('cannabis', schema)
70+
# Returns class: CannabisSampleData(db.Model)
71+
"""
3472
attrs = {"__bind_key__": bind_key, "__tablename__": spec["table_name"]}
3573

3674
for column in spec["columns"]:

api/models/efp_schemas.py

Lines changed: 67 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
"""
2-
simple schema definitions for efp databases that only expose a sample_data table
2+
Simple schema definitions for eFP databases that only expose a sample_data table.
3+
4+
This module provides the single source of truth for all simple eFP database schemas,
5+
including column definitions, indexes, seed data, and metadata. These schemas are used
6+
by both the ORM model generator and the database bootstrap scripts.
37
"""
48

59
from __future__ import annotations
@@ -20,6 +24,34 @@ def _column(
2024
default: Any | None = None,
2125
primary_key: bool = False,
2226
) -> ColumnSpec:
27+
"""
28+
Create a column specification dictionary for schema definitions.
29+
30+
Helper function to construct column metadata with type, constraints, and defaults
31+
in a consistent format.
32+
33+
:param name: Column name (e.g., 'data_probeset_id', 'data_signal')
34+
:type name: str
35+
:param col_type: Column type ('string', 'integer', 'float', or 'text')
36+
:type col_type: str
37+
:param length: Maximum length for string types (required for 'string')
38+
:type length: int or None
39+
:param unsigned: Whether integer type is unsigned (MySQL-specific)
40+
:type unsigned: bool
41+
:param nullable: Whether column allows NULL values
42+
:type nullable: bool
43+
:param default: Default value for the column
44+
:type default: Any or None
45+
:param primary_key: Whether column is part of primary key
46+
:type primary_key: bool
47+
:return: Column specification dictionary
48+
:rtype: ColumnSpec
49+
50+
Example::
51+
52+
col = _column("data_signal", "float", nullable=False, default=0)
53+
# Returns: {"name": "data_signal", "type": "float", "nullable": False, "default": 0}
54+
"""
2355
column: ColumnSpec = {"name": name, "type": col_type, "nullable": nullable}
2456
if length is not None:
2557
column["length"] = length
@@ -57,6 +89,40 @@ def _build_schema(
5789
identifier_type: str = "agi",
5890
metadata: Dict[str, Any] | None = None,
5991
) -> DatabaseSpec:
92+
"""
93+
Build a complete database schema specification from base columns and customizations.
94+
95+
Constructs a schema by starting with the default BASE_COLUMNS, applying any
96+
overrides, and adding extra columns. The resulting schema dictionary is used by
97+
both the ORM generator and bootstrap scripts to ensure consistency.
98+
99+
:param charset: MySQL character set (e.g., 'latin1', 'utf8mb4')
100+
:type charset: str
101+
:param table_name: Name of the table to create (typically 'sample_data')
102+
:type table_name: str
103+
:param column_overrides: Dictionary of column names to property overrides
104+
:type column_overrides: Dict[str, Dict[str, Any]] or None
105+
:param extra_columns: Additional columns beyond the base set
106+
:type extra_columns: List[ColumnSpec] or None
107+
:param index: List of column names to include in the index
108+
:type index: List[str] or None
109+
:param seed_rows: Initial rows to insert if table is empty
110+
:type seed_rows: List[Dict[str, Any]] or None
111+
:param identifier_type: Gene ID format ('agi' or 'probeset')
112+
:type identifier_type: str
113+
:param metadata: Additional metadata (species, sample_regex, etc.)
114+
:type metadata: Dict[str, Any] or None
115+
:return: Complete database schema specification
116+
:rtype: DatabaseSpec
117+
118+
Example::
119+
120+
schema = _build_schema(
121+
charset="utf8mb4",
122+
column_overrides={"proj_id": {"length": 5}},
123+
metadata={"species": "arabidopsis"}
124+
)
125+
"""
60126
overrides = column_overrides or {}
61127
columns: List[ColumnSpec] = []
62128

api/services/efp_data.py

Lines changed: 115 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from __future__ import annotations
66

7+
import re
78
import traceback
89
from pathlib import Path
910
from typing import Any, Dict, Iterable, List, Optional, Tuple
@@ -91,7 +92,23 @@ def _build_schema_catalog() -> Dict[str, Dict[str, Any]]:
9192

9293

9394
def agi_to_probset(gene_id: str) -> Optional[str]:
94-
"""convert an arabidopsis agi to its probeset when needed"""
95+
"""
96+
Convert an Arabidopsis AGI identifier to its corresponding probeset ID.
97+
98+
Looks up the most recent mapping in the AtAgiLookup table, ordered by date
99+
descending. This ensures the newest array design mapping is used when multiple
100+
mappings exist for the same AGI.
101+
102+
:param gene_id: Arabidopsis gene ID in AGI format (e.g., 'AT1G01010')
103+
:type gene_id: str
104+
:return: Probeset ID (e.g., '261585_at') if found, None otherwise
105+
:rtype: Optional[str]
106+
107+
Example::
108+
109+
probeset = agi_to_probset('AT1G01010')
110+
# Returns: '261585_at' (if mapping exists)
111+
"""
95112
try:
96113
subquery = (
97114
db.select(AtAgiLookup.probeset)
@@ -162,7 +179,34 @@ def query_efp_database_dynamic(
162179
allow_empty_results: bool = False,
163180
sample_case_insensitive: bool = False,
164181
) -> Dict[str, object]:
165-
"""dynamically query any efp database using the shared schema catalog"""
182+
"""
183+
Dynamically query any eFP database using the shared schema catalog.
184+
185+
This function provides a unified interface for querying expression data across
186+
different eFP databases, handling species-specific gene ID validation and
187+
automatic probeset conversion when needed.
188+
189+
:param database: Database name (e.g., 'cannabis', 'embryo', 'sample_data')
190+
:type database: str
191+
:param gene_id: Gene identifier (AGI format, probeset, or species-specific format)
192+
:type gene_id: str
193+
:param sample_ids: Optional list of sample IDs to filter results; if None, returns all samples
194+
:type sample_ids: Optional[List[str]]
195+
:param allow_empty_results: If True, return success even when no data found; if False, return 404 error
196+
:type allow_empty_results: bool
197+
:param sample_case_insensitive: If True, compare sample IDs case-insensitively
198+
:type sample_case_insensitive: bool
199+
:return: Dictionary with 'success' boolean, data or error message, and HTTP status code
200+
:rtype: Dict[str, object]
201+
202+
Example::
203+
204+
result = query_efp_database_dynamic('embryo', 'AT1G01010')
205+
# Returns: {'success': True, 'gene_id': 'AT1G01010', 'data': [...]}
206+
207+
result = query_efp_database_dynamic('sample_data', 'AT1G01010')
208+
# Auto-converts to probeset, returns: {'probset_id': '261585_at', ...}
209+
"""
166210
try:
167211
database = str(database)
168212
gene_id = str(gene_id)
@@ -178,20 +222,54 @@ def query_efp_database_dynamic(
178222
"error_code": 400,
179223
}
180224

225+
# Extract species information from schema metadata
226+
species = schema.get("metadata", {}).get("species", "").lower()
227+
181228
query_id = gene_id
182229
probset_display = None
183230
gene_case_insensitive = False
184231
upper_id = gene_id.upper()
185232
is_agi_id = upper_id.startswith("AT") and "G" in upper_id
186233

234+
# Validate gene ID format based on species and ID pattern
235+
# Only validate if the ID looks like it's in the species-specific format
187236
if is_agi_id:
237+
# This looks like an Arabidopsis AGI ID - validate it
188238
if not BARUtils.is_arabidopsis_gene_valid(upper_id):
189-
return {
190-
"success": False,
191-
"error": "Invalid Arabidopsis gene ID format",
192-
"error_code": 400,
193-
}
194-
239+
return {"success": False, "error": "Invalid Arabidopsis gene ID format", "error_code": 400}
240+
elif species and schema["identifier_type"] == "agi":
241+
# For non-AGI formatted IDs in species databases that expect AGI format,
242+
# validate against the specific species validator
243+
if species == "arachis":
244+
if not BARUtils.is_arachis_gene_valid(upper_id):
245+
return {"success": False, "error": "Invalid Arachis gene ID", "error_code": 400}
246+
elif species == "cannabis":
247+
if not BARUtils.is_cannabis_gene_valid(upper_id):
248+
return {"success": False, "error": "Invalid Cannabis gene ID", "error_code": 400}
249+
elif species == "kalanchoe":
250+
if not BARUtils.is_kalanchoe_gene_valid(upper_id):
251+
return {"success": False, "error": "Invalid Kalanchoe gene ID", "error_code": 400}
252+
elif species == "phelipanche":
253+
if not BARUtils.is_phelipanche_gene_valid(upper_id):
254+
return {"success": False, "error": "Invalid Phelipanche gene ID", "error_code": 400}
255+
elif species == "physcomitrella":
256+
if not BARUtils.is_physcomitrella_gene_valid(upper_id):
257+
return {"success": False, "error": "Invalid Physcomitrella gene ID", "error_code": 400}
258+
elif species == "selaginella":
259+
if not BARUtils.is_selaginella_gene_valid(upper_id):
260+
return {"success": False, "error": "Invalid Selaginella gene ID", "error_code": 400}
261+
elif species == "strawberry":
262+
if not BARUtils.is_strawberry_gene_valid(upper_id):
263+
return {"success": False, "error": "Invalid Strawberry gene ID", "error_code": 400}
264+
elif species == "striga":
265+
if not BARUtils.is_striga_gene_valid(upper_id):
266+
return {"success": False, "error": "Invalid Striga gene ID", "error_code": 400}
267+
elif species == "triphysaria":
268+
if not BARUtils.is_triphysaria_gene_valid(upper_id):
269+
return {"success": False, "error": "Invalid Triphysaria gene ID", "error_code": 400}
270+
271+
# Handle Arabidopsis-specific logic for AGI IDs
272+
if is_agi_id:
195273
if schema["identifier_type"] == "probeset":
196274
probset = agi_to_probset(upper_id)
197275
if not probset:
@@ -207,6 +285,10 @@ def query_efp_database_dynamic(
207285
query_id = upper_id
208286
gene_case_insensitive = True
209287
probset_display = upper_id
288+
else:
289+
# Non-AGI IDs: use as-is, typically already uppercase from validation
290+
query_id = upper_id if species else gene_id
291+
gene_case_insensitive = bool(species)
210292

211293
engine_candidates = list(_iter_engine_candidates(database))
212294
if not engine_candidates:
@@ -216,20 +298,35 @@ def query_efp_database_dynamic(
216298
"error_code": 404,
217299
}
218300

219-
gene_column_expr = (
220-
f"UPPER({schema['gene_column']})" if gene_case_insensitive else schema["gene_column"]
221-
)
301+
# Build SQL query using parameterized queries to prevent SQL injection
302+
# Column and table names come from the internal schema catalog, which is safe
303+
gene_col = schema["gene_column"]
304+
sample_col = schema["sample_column"]
305+
value_col = schema["value_column"]
306+
table_name = schema["table"]
307+
308+
# Validate identifiers contain only safe characters (alphanumeric and underscore)
309+
for identifier, name in [
310+
(gene_col, "gene_column"),
311+
(sample_col, "sample_column"),
312+
(value_col, "value_column"),
313+
(table_name, "table"),
314+
]:
315+
if not re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', identifier):
316+
return {
317+
"success": False,
318+
"error": f"Invalid schema identifier for {name}: {identifier}",
319+
"error_code": 500,
320+
}
321+
322+
gene_column_expr = f"UPPER({gene_col})" if gene_case_insensitive else gene_col
222323
params = {"gene_id": query_id.upper() if gene_case_insensitive else query_id}
223324
where_clauses = [f"{gene_column_expr} = :gene_id"]
224325

225326
if sample_ids:
226327
filtered = [s for s in sample_ids if s]
227328
if filtered:
228-
sample_column_expr = (
229-
f"UPPER({schema['sample_column']})"
230-
if sample_case_insensitive
231-
else schema["sample_column"]
232-
)
329+
sample_column_expr = f"UPPER({sample_col})" if sample_case_insensitive else sample_col
233330
sample_conditions = []
234331
for idx, sample in enumerate(filtered):
235332
key = f"sample_{idx}"
@@ -238,8 +335,8 @@ def query_efp_database_dynamic(
238335
where_clauses.append(f"({' OR '.join(sample_conditions)})")
239336

240337
query_sql = text(
241-
f"SELECT {schema['sample_column']} AS sample, {schema['value_column']} AS value "
242-
f"FROM {schema['table']} "
338+
f"SELECT {sample_col} AS sample, {value_col} AS value "
339+
f"FROM {table_name} "
243340
f"WHERE {' AND '.join(where_clauses)}"
244341
)
245342

0 commit comments

Comments
 (0)