44
55from __future__ import annotations
66
7+ import re
78import traceback
89from pathlib import Path
910from typing import Any , Dict , Iterable , List , Optional , Tuple
@@ -91,7 +92,23 @@ def _build_schema_catalog() -> Dict[str, Dict[str, Any]]:
9192
9293
9394def agi_to_probset (gene_id : str ) -> Optional [str ]:
94- """convert an arabidopsis agi to its probeset when needed"""
95+ """
96+ Convert an Arabidopsis AGI identifier to its corresponding probeset ID.
97+
98+ Looks up the most recent mapping in the AtAgiLookup table, ordered by date
99+ descending. This ensures the newest array design mapping is used when multiple
100+ mappings exist for the same AGI.
101+
102+ :param gene_id: Arabidopsis gene ID in AGI format (e.g., 'AT1G01010')
103+ :type gene_id: str
104+ :return: Probeset ID (e.g., '261585_at') if found, None otherwise
105+ :rtype: Optional[str]
106+
107+ Example::
108+
109+ probeset = agi_to_probset('AT1G01010')
110+ # Returns: '261585_at' (if mapping exists)
111+ """
95112 try :
96113 subquery = (
97114 db .select (AtAgiLookup .probeset )
@@ -162,7 +179,34 @@ def query_efp_database_dynamic(
162179 allow_empty_results : bool = False ,
163180 sample_case_insensitive : bool = False ,
164181) -> Dict [str , object ]:
165- """dynamically query any efp database using the shared schema catalog"""
182+ """
183+ Dynamically query any eFP database using the shared schema catalog.
184+
185+ This function provides a unified interface for querying expression data across
186+ different eFP databases, handling species-specific gene ID validation and
187+ automatic probeset conversion when needed.
188+
189+ :param database: Database name (e.g., 'cannabis', 'embryo', 'sample_data')
190+ :type database: str
191+ :param gene_id: Gene identifier (AGI format, probeset, or species-specific format)
192+ :type gene_id: str
193+ :param sample_ids: Optional list of sample IDs to filter results; if None, returns all samples
194+ :type sample_ids: Optional[List[str]]
195+ :param allow_empty_results: If True, return success even when no data found; if False, return 404 error
196+ :type allow_empty_results: bool
197+ :param sample_case_insensitive: If True, compare sample IDs case-insensitively
198+ :type sample_case_insensitive: bool
199+ :return: Dictionary with 'success' boolean, data or error message, and HTTP status code
200+ :rtype: Dict[str, object]
201+
202+ Example::
203+
204+ result = query_efp_database_dynamic('embryo', 'AT1G01010')
205+ # Returns: {'success': True, 'gene_id': 'AT1G01010', 'data': [...]}
206+
207+ result = query_efp_database_dynamic('sample_data', 'AT1G01010')
208+ # Auto-converts to probeset, returns: {'probset_id': '261585_at', ...}
209+ """
166210 try :
167211 database = str (database )
168212 gene_id = str (gene_id )
@@ -178,20 +222,54 @@ def query_efp_database_dynamic(
178222 "error_code" : 400 ,
179223 }
180224
225+ # Extract species information from schema metadata
226+ species = schema .get ("metadata" , {}).get ("species" , "" ).lower ()
227+
181228 query_id = gene_id
182229 probset_display = None
183230 gene_case_insensitive = False
184231 upper_id = gene_id .upper ()
185232 is_agi_id = upper_id .startswith ("AT" ) and "G" in upper_id
186233
234+ # Validate gene ID format based on species and ID pattern
235+ # Only validate if the ID looks like it's in the species-specific format
187236 if is_agi_id :
237+ # This looks like an Arabidopsis AGI ID - validate it
188238 if not BARUtils .is_arabidopsis_gene_valid (upper_id ):
189- return {
190- "success" : False ,
191- "error" : "Invalid Arabidopsis gene ID format" ,
192- "error_code" : 400 ,
193- }
194-
239+ return {"success" : False , "error" : "Invalid Arabidopsis gene ID format" , "error_code" : 400 }
240+ elif species and schema ["identifier_type" ] == "agi" :
241+ # For non-AGI formatted IDs in species databases that expect AGI format,
242+ # validate against the specific species validator
243+ if species == "arachis" :
244+ if not BARUtils .is_arachis_gene_valid (upper_id ):
245+ return {"success" : False , "error" : "Invalid Arachis gene ID" , "error_code" : 400 }
246+ elif species == "cannabis" :
247+ if not BARUtils .is_cannabis_gene_valid (upper_id ):
248+ return {"success" : False , "error" : "Invalid Cannabis gene ID" , "error_code" : 400 }
249+ elif species == "kalanchoe" :
250+ if not BARUtils .is_kalanchoe_gene_valid (upper_id ):
251+ return {"success" : False , "error" : "Invalid Kalanchoe gene ID" , "error_code" : 400 }
252+ elif species == "phelipanche" :
253+ if not BARUtils .is_phelipanche_gene_valid (upper_id ):
254+ return {"success" : False , "error" : "Invalid Phelipanche gene ID" , "error_code" : 400 }
255+ elif species == "physcomitrella" :
256+ if not BARUtils .is_physcomitrella_gene_valid (upper_id ):
257+ return {"success" : False , "error" : "Invalid Physcomitrella gene ID" , "error_code" : 400 }
258+ elif species == "selaginella" :
259+ if not BARUtils .is_selaginella_gene_valid (upper_id ):
260+ return {"success" : False , "error" : "Invalid Selaginella gene ID" , "error_code" : 400 }
261+ elif species == "strawberry" :
262+ if not BARUtils .is_strawberry_gene_valid (upper_id ):
263+ return {"success" : False , "error" : "Invalid Strawberry gene ID" , "error_code" : 400 }
264+ elif species == "striga" :
265+ if not BARUtils .is_striga_gene_valid (upper_id ):
266+ return {"success" : False , "error" : "Invalid Striga gene ID" , "error_code" : 400 }
267+ elif species == "triphysaria" :
268+ if not BARUtils .is_triphysaria_gene_valid (upper_id ):
269+ return {"success" : False , "error" : "Invalid Triphysaria gene ID" , "error_code" : 400 }
270+
271+ # Handle Arabidopsis-specific logic for AGI IDs
272+ if is_agi_id :
195273 if schema ["identifier_type" ] == "probeset" :
196274 probset = agi_to_probset (upper_id )
197275 if not probset :
@@ -207,6 +285,10 @@ def query_efp_database_dynamic(
207285 query_id = upper_id
208286 gene_case_insensitive = True
209287 probset_display = upper_id
288+ else :
289+ # Non-AGI IDs: use as-is, typically already uppercase from validation
290+ query_id = upper_id if species else gene_id
291+ gene_case_insensitive = bool (species )
210292
211293 engine_candidates = list (_iter_engine_candidates (database ))
212294 if not engine_candidates :
@@ -216,20 +298,35 @@ def query_efp_database_dynamic(
216298 "error_code" : 404 ,
217299 }
218300
219- gene_column_expr = (
220- f"UPPER({ schema ['gene_column' ]} )" if gene_case_insensitive else schema ["gene_column" ]
221- )
301+ # Build SQL query using parameterized queries to prevent SQL injection
302+ # Column and table names come from the internal schema catalog, which is safe
303+ gene_col = schema ["gene_column" ]
304+ sample_col = schema ["sample_column" ]
305+ value_col = schema ["value_column" ]
306+ table_name = schema ["table" ]
307+
308+ # Validate identifiers contain only safe characters (alphanumeric and underscore)
309+ for identifier , name in [
310+ (gene_col , "gene_column" ),
311+ (sample_col , "sample_column" ),
312+ (value_col , "value_column" ),
313+ (table_name , "table" ),
314+ ]:
315+ if not re .match (r'^[a-zA-Z_][a-zA-Z0-9_]*$' , identifier ):
316+ return {
317+ "success" : False ,
318+ "error" : f"Invalid schema identifier for { name } : { identifier } " ,
319+ "error_code" : 500 ,
320+ }
321+
322+ gene_column_expr = f"UPPER({ gene_col } )" if gene_case_insensitive else gene_col
222323 params = {"gene_id" : query_id .upper () if gene_case_insensitive else query_id }
223324 where_clauses = [f"{ gene_column_expr } = :gene_id" ]
224325
225326 if sample_ids :
226327 filtered = [s for s in sample_ids if s ]
227328 if filtered :
228- sample_column_expr = (
229- f"UPPER({ schema ['sample_column' ]} )"
230- if sample_case_insensitive
231- else schema ["sample_column" ]
232- )
329+ sample_column_expr = f"UPPER({ sample_col } )" if sample_case_insensitive else sample_col
233330 sample_conditions = []
234331 for idx , sample in enumerate (filtered ):
235332 key = f"sample_{ idx } "
@@ -238,8 +335,8 @@ def query_efp_database_dynamic(
238335 where_clauses .append (f"({ ' OR ' .join (sample_conditions )} )" )
239336
240337 query_sql = text (
241- f"SELECT { schema [ 'sample_column' ] } AS sample, { schema [ 'value_column' ] } AS value "
242- f"FROM { schema [ 'table' ] } "
338+ f"SELECT { sample_col } AS sample, { value_col } AS value "
339+ f"FROM { table_name } "
243340 f"WHERE { ' AND ' .join (where_clauses )} "
244341 )
245342
0 commit comments