Add BARUtils validation, Sphinx docstrings, and fix SQL injection

rmobmina · rmobmina · commit b3cdeaec0b4d · 2026-01-08T09:04:02.000-05:00
- Replace manual REGEX with BARUtils validators for all species
- Add descriptive error messages for each species (e.g., "Invalid Cannabis gene ID")
- Add comprehensive Sphinx/reST docstrings to all ORM files
- Improve validation logic to handle both AGI and probeset IDs correctly
- Fix SQL injection vulnerability by validating schema identifiers
- Add identifier validation (alphanumeric + underscore only) before SQL construction

Security: Addresses the CodeQL high-severity SQL injection alert by validating that
all database/table/column identifiers match a safe pattern before use in queries
diff --git a/api/models/efp_dynamic.py b/api/models/efp_dynamic.py
@@ -1,5 +1,8 @@
 """
-dynamic sqlalchemy model generation for simple efp databases
+Dynamic SQLAlchemy model generation for simple eFP databases.
+
+This module provides runtime generation of SQLAlchemy ORM models from schema
+definitions, enabling dynamic database access without hardcoded model classes.
 """
 
 from __future__ import annotations
@@ -14,7 +17,23 @@
 
 
 def _to_sqla_type(column_spec):
-    """map a simple column spec to a sqlalchemy column type"""
+    """
+    Map a column specification dictionary to a SQLAlchemy column type.
+
+    Converts the simple type descriptors used in schema definitions to the
+    appropriate SQLAlchemy type objects for ORM model generation.
+
+    :param column_spec: Column specification with 'type', 'length', and 'unsigned' keys
+    :type column_spec: Dict[str, Any]
+    :return: SQLAlchemy column type (String, Integer, Float, or Text)
+    :rtype: sqlalchemy.types.TypeEngine
+    :raises ValueError: If column type is not one of: string, integer, float, text
+
+    Example::
+
+        col_spec = {"type": "string", "length": 24}
+        sqla_type = _to_sqla_type(col_spec)  # Returns String(24)
+    """
     col_type = column_spec.get("type")
     if col_type == "string":
         return String(column_spec["length"])
@@ -30,7 +49,26 @@ def _to_sqla_type(column_spec):
 
 
 def _generate_model(bind_key: str, spec) -> db.Model:
-    """build a concrete sqlalchemy model for the given schema"""
+    """
+    Build a concrete SQLAlchemy model class for the given schema specification.
+
+    Dynamically creates an ORM model with the specified table name, bind key,
+    and columns based on the schema definition. The generated model class can
+    be used like any Flask-SQLAlchemy model.
+
+    :param bind_key: Database bind key (e.g., 'cannabis', 'embryo')
+    :type bind_key: str
+    :param spec: Database schema specification from SIMPLE_EFP_DATABASE_SCHEMAS
+    :type spec: Dict[str, Any]
+    :return: Dynamically generated SQLAlchemy model class
+    :rtype: db.Model
+
+    Example::
+
+        schema = SIMPLE_EFP_DATABASE_SCHEMAS['cannabis']
+        CannabisModel = _generate_model('cannabis', schema)
+        # Returns class: CannabisSampleData(db.Model)
+    """
     attrs = {"__bind_key__": bind_key, "__tablename__": spec["table_name"]}
 
     for column in spec["columns"]:
diff --git a/api/models/efp_schemas.py b/api/models/efp_schemas.py
@@ -1,5 +1,9 @@
 """
-simple schema definitions for efp databases that only expose a sample_data table
+Simple schema definitions for eFP databases that only expose a sample_data table.
+
+This module provides the single source of truth for all simple eFP database schemas,
+including column definitions, indexes, seed data, and metadata. These schemas are used
+by both the ORM model generator and the database bootstrap scripts.
 """
 
 from __future__ import annotations
@@ -20,6 +24,34 @@ def _column(
     default: Any | None = None,
     primary_key: bool = False,
 ) -> ColumnSpec:
+    """
+    Create a column specification dictionary for schema definitions.
+
+    Helper function to construct column metadata with type, constraints, and defaults
+    in a consistent format.
+
+    :param name: Column name (e.g., 'data_probeset_id', 'data_signal')
+    :type name: str
+    :param col_type: Column type ('string', 'integer', 'float', or 'text')
+    :type col_type: str
+    :param length: Maximum length for string types (required for 'string')
+    :type length: int or None
+    :param unsigned: Whether integer type is unsigned (MySQL-specific)
+    :type unsigned: bool
+    :param nullable: Whether column allows NULL values
+    :type nullable: bool
+    :param default: Default value for the column
+    :type default: Any or None
+    :param primary_key: Whether column is part of primary key
+    :type primary_key: bool
+    :return: Column specification dictionary
+    :rtype: ColumnSpec
+
+    Example::
+
+        col = _column("data_signal", "float", nullable=False, default=0)
+        # Returns: {"name": "data_signal", "type": "float", "nullable": False, "default": 0}
+    """
     column: ColumnSpec = {"name": name, "type": col_type, "nullable": nullable}
     if length is not None:
         column["length"] = length
@@ -57,6 +89,40 @@ def _build_schema(
     identifier_type: str = "agi",
     metadata: Dict[str, Any] | None = None,
 ) -> DatabaseSpec:
+    """
+    Build a complete database schema specification from base columns and customizations.
+
+    Constructs a schema by starting with the default BASE_COLUMNS, applying any
+    overrides, and adding extra columns. The resulting schema dictionary is used by
+    both the ORM generator and bootstrap scripts to ensure consistency.
+
+    :param charset: MySQL character set (e.g., 'latin1', 'utf8mb4')
+    :type charset: str
+    :param table_name: Name of the table to create (typically 'sample_data')
+    :type table_name: str
+    :param column_overrides: Dictionary of column names to property overrides
+    :type column_overrides: Dict[str, Dict[str, Any]] or None
+    :param extra_columns: Additional columns beyond the base set
+    :type extra_columns: List[ColumnSpec] or None
+    :param index: List of column names to include in the index
+    :type index: List[str] or None
+    :param seed_rows: Initial rows to insert if table is empty
+    :type seed_rows: List[Dict[str, Any]] or None
+    :param identifier_type: Gene ID format ('agi' or 'probeset')
+    :type identifier_type: str
+    :param metadata: Additional metadata (species, sample_regex, etc.)
+    :type metadata: Dict[str, Any] or None
+    :return: Complete database schema specification
+    :rtype: DatabaseSpec
+
+    Example::
+
+        schema = _build_schema(
+            charset="utf8mb4",
+            column_overrides={"proj_id": {"length": 5}},
+            metadata={"species": "arabidopsis"}
+        )
+    """
     overrides = column_overrides or {}
     columns: List[ColumnSpec] = []
 
diff --git a/api/services/efp_data.py b/api/services/efp_data.py
@@ -4,6 +4,7 @@
 
 from __future__ import annotations
 
+import re
 import traceback
 from pathlib import Path
 from typing import Any, Dict, Iterable, List, Optional, Tuple
@@ -91,7 +92,23 @@ def _build_schema_catalog() -> Dict[str, Dict[str, Any]]:
 
 
 def agi_to_probset(gene_id: str) -> Optional[str]:
-    """convert an arabidopsis agi to its probeset when needed"""
+    """
+    Convert an Arabidopsis AGI identifier to its corresponding probeset ID.
+
+    Looks up the most recent mapping in the AtAgiLookup table, ordered by date
+    descending. This ensures the newest array design mapping is used when multiple
+    mappings exist for the same AGI.
+
+    :param gene_id: Arabidopsis gene ID in AGI format (e.g., 'AT1G01010')
+    :type gene_id: str
+    :return: Probeset ID (e.g., '261585_at') if found, None otherwise
+    :rtype: Optional[str]
+
+    Example::
+
+        probeset = agi_to_probset('AT1G01010')
+        # Returns: '261585_at' (if mapping exists)
+    """
     try:
         subquery = (
             db.select(AtAgiLookup.probeset)
@@ -162,7 +179,34 @@ def query_efp_database_dynamic(
     allow_empty_results: bool = False,
     sample_case_insensitive: bool = False,
 ) -> Dict[str, object]:
-    """dynamically query any efp database using the shared schema catalog"""
+    """
+    Dynamically query any eFP database using the shared schema catalog.
+
+    This function provides a unified interface for querying expression data across
+    different eFP databases, handling species-specific gene ID validation and
+    automatic probeset conversion when needed.
+
+    :param database: Database name (e.g., 'cannabis', 'embryo', 'sample_data')
+    :type database: str
+    :param gene_id: Gene identifier (AGI format, probeset, or species-specific format)
+    :type gene_id: str
+    :param sample_ids: Optional list of sample IDs to filter results; if None, returns all samples
+    :type sample_ids: Optional[List[str]]
+    :param allow_empty_results: If True, return success even when no data found; if False, return 404 error
+    :type allow_empty_results: bool
+    :param sample_case_insensitive: If True, compare sample IDs case-insensitively
+    :type sample_case_insensitive: bool
+    :return: Dictionary with a 'success' boolean and either the result data or an 'error' message with an HTTP 'error_code'
+    :rtype: Dict[str, object]
+
+    Example::
+
+        result = query_efp_database_dynamic('embryo', 'AT1G01010')
+        # Returns: {'success': True, 'gene_id': 'AT1G01010', 'data': [...]}
+
+        result = query_efp_database_dynamic('sample_data', 'AT1G01010')
+        # Auto-converts to probeset, returns: {'probset_id': '261585_at', ...}
+    """
     try:
         database = str(database)
         gene_id = str(gene_id)
@@ -178,20 +222,54 @@ def query_efp_database_dynamic(
                 "error_code": 400,
             }
 
+        # Extract species information from schema metadata
+        species = schema.get("metadata", {}).get("species", "").lower()
+
         query_id = gene_id
         probset_display = None
         gene_case_insensitive = False
         upper_id = gene_id.upper()
         is_agi_id = upper_id.startswith("AT") and "G" in upper_id
 
+        # Validate the gene ID: AGI-looking IDs use the Arabidopsis validator; other IDs
+        # use the schema's species validator (only when the schema expects AGI-type IDs)
         if is_agi_id:
+            # This looks like an Arabidopsis AGI ID - validate it
             if not BARUtils.is_arabidopsis_gene_valid(upper_id):
-                return {
-                    "success": False,
-                    "error": "Invalid Arabidopsis gene ID format",
-                    "error_code": 400,
-                }
-
+                return {"success": False, "error": "Invalid Arabidopsis gene ID format", "error_code": 400}
+        elif species and schema["identifier_type"] == "agi":
+            # For non-AGI formatted IDs in species databases that expect AGI format,
+            # validate against the specific species validator
+            if species == "arachis":
+                if not BARUtils.is_arachis_gene_valid(upper_id):
+                    return {"success": False, "error": "Invalid Arachis gene ID", "error_code": 400}
+            elif species == "cannabis":
+                if not BARUtils.is_cannabis_gene_valid(upper_id):
+                    return {"success": False, "error": "Invalid Cannabis gene ID", "error_code": 400}
+            elif species == "kalanchoe":
+                if not BARUtils.is_kalanchoe_gene_valid(upper_id):
+                    return {"success": False, "error": "Invalid Kalanchoe gene ID", "error_code": 400}
+            elif species == "phelipanche":
+                if not BARUtils.is_phelipanche_gene_valid(upper_id):
+                    return {"success": False, "error": "Invalid Phelipanche gene ID", "error_code": 400}
+            elif species == "physcomitrella":
+                if not BARUtils.is_physcomitrella_gene_valid(upper_id):
+                    return {"success": False, "error": "Invalid Physcomitrella gene ID", "error_code": 400}
+            elif species == "selaginella":
+                if not BARUtils.is_selaginella_gene_valid(upper_id):
+                    return {"success": False, "error": "Invalid Selaginella gene ID", "error_code": 400}
+            elif species == "strawberry":
+                if not BARUtils.is_strawberry_gene_valid(upper_id):
+                    return {"success": False, "error": "Invalid Strawberry gene ID", "error_code": 400}
+            elif species == "striga":
+                if not BARUtils.is_striga_gene_valid(upper_id):
+                    return {"success": False, "error": "Invalid Striga gene ID", "error_code": 400}
+            elif species == "triphysaria":
+                if not BARUtils.is_triphysaria_gene_valid(upper_id):
+                    return {"success": False, "error": "Invalid Triphysaria gene ID", "error_code": 400}
+
+        # Handle Arabidopsis-specific logic for AGI IDs
+        if is_agi_id:
             if schema["identifier_type"] == "probeset":
                 probset = agi_to_probset(upper_id)
                 if not probset:
@@ -207,6 +285,10 @@ def query_efp_database_dynamic(
                 query_id = upper_id
                 gene_case_insensitive = True
                 probset_display = upper_id
+        else:
+            # Non-AGI IDs: uppercased + case-insensitive match if the schema has a species, else as-is
+            query_id = upper_id if species else gene_id
+            gene_case_insensitive = bool(species)
 
         engine_candidates = list(_iter_engine_candidates(database))
         if not engine_candidates:
@@ -216,20 +298,35 @@ def query_efp_database_dynamic(
                 "error_code": 404,
             }
 
-        gene_column_expr = (
-            f"UPPER({schema['gene_column']})" if gene_case_insensitive else schema["gene_column"]
-        )
+        # Build the SQL query with bound parameters for all values to prevent SQL injection
+        # Identifiers cannot be bound: they come from the internal schema catalog and are checked below
+        gene_col = schema["gene_column"]
+        sample_col = schema["sample_column"]
+        value_col = schema["value_column"]
+        table_name = schema["table"]
+
+        # Validate identifiers contain only safe characters (alphanumeric and underscore)
+        for identifier, name in [
+            (gene_col, "gene_column"),
+            (sample_col, "sample_column"),
+            (value_col, "value_column"),
+            (table_name, "table"),
+        ]:
+            if not re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', identifier):
+                return {
+                    "success": False,
+                    "error": f"Invalid schema identifier for {name}: {identifier}",
+                    "error_code": 500,
+                }
+
+        gene_column_expr = f"UPPER({gene_col})" if gene_case_insensitive else gene_col
         params = {"gene_id": query_id.upper() if gene_case_insensitive else query_id}
         where_clauses = [f"{gene_column_expr} = :gene_id"]
 
         if sample_ids:
             filtered = [s for s in sample_ids if s]
             if filtered:
-                sample_column_expr = (
-                    f"UPPER({schema['sample_column']})"
-                    if sample_case_insensitive
-                    else schema["sample_column"]
-                )
+                sample_column_expr = f"UPPER({sample_col})" if sample_case_insensitive else sample_col
                 sample_conditions = []
                 for idx, sample in enumerate(filtered):
                     key = f"sample_{idx}"
@@ -238,8 +335,8 @@ def query_efp_database_dynamic(
                 where_clauses.append(f"({' OR '.join(sample_conditions)})")
 
         query_sql = text(
-            f"SELECT {schema['sample_column']} AS sample, {schema['value_column']} AS value "
-            f"FROM {schema['table']} "
+            f"SELECT {sample_col} AS sample, {value_col} AS value "
+            f"FROM {table_name} "
             f"WHERE {' AND '.join(where_clauses)}"
         )