Skip to content

Commit 7a1b6c8

Browse files
committed
Add gene ID detection and probeset conversion utilities for the eFP expression endpoint
1 parent aef98a9 commit 7a1b6c8

File tree

3 files changed

+678
-27
lines changed

3 files changed

+678
-27
lines changed

.flake8

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
[flake8]
22
ignore = E501, E203, E121, E123, E126, W503, W504
3+
per-file-ignores =
4+
# DATABASE_SPECIES uses aligned dict values for readability (intentional)
5+
api/utils/gene_id_utils.py: E241
36
exclude =
47
.git,
58
.venv,

api/resources/gene_expression.py

Lines changed: 42 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,17 @@
11
from flask_restx import Namespace, Resource
22
from markupsafe import escape
33

4-
from api import db
5-
from api.models.annotations_lookup import AtAgiLookup
6-
from api.services.efp_data import query_efp_database_dynamic, DYNAMIC_DATABASE_SCHEMAS
4+
from api.services.efp_data import query_efp_database_dynamic
5+
from api.utils.bar_utils import BARUtils
6+
from api.utils.gene_id_utils import (
7+
CROSS_SPECIES_DATABASES,
8+
DATABASE_SPECIES,
9+
PROBESET_DATABASES,
10+
convert_gene_to_probeset,
11+
is_probeset_id,
12+
normalize_gene_id,
13+
validate_gene_id,
14+
)
715

816
gene_expression = Namespace(
917
'Gene Expression',
@@ -18,47 +26,54 @@
1826
)
1927
@gene_expression.param(
2028
"gene_id",
21-
"Gene ID (AGI format like AT1G01010 or probeset like 261585_at)",
29+
"Gene ID (e.g. AT1G01010 for Arabidopsis, or a probeset like 261585_at)",
2230
_in="path",
2331
default="AT1G01010",
2432
)
2533
@gene_expression.param(
2634
"database",
27-
"Database name (e.g., sample_data, klepikova, single_cell)",
35+
"Database name (e.g. klepikova, atgenexp, sample_data)",
2836
_in="path",
2937
default="klepikova",
3038
)
3139
class GeneExpression(Resource):
3240
def get(self, database, gene_id):
41+
database = str(escape(database))
42+
gene_id = str(escape(gene_id))
43+
44+
# 1. Resolve database species and expected input species.
45+
# Cross-species databases (e.g. phelipanche) accept an Arabidopsis AGI
46+
# even though the database itself belongs to a different species.
47+
species = DATABASE_SPECIES.get(database)
48+
if species is None:
49+
return BARUtils.error_exit(f"Unknown database '{database}'"), 400
50+
input_species = CROSS_SPECIES_DATABASES.get(database, species)
3351

34-
database = escape(database)
35-
gene_id = escape(gene_id)
52+
# 2. If the caller already supplied a probeset ID, use it directly
53+
if is_probeset_id(gene_id):
54+
query_id = gene_id
55+
else:
56+
# 3. Validate gene ID format against the expected input species regex
57+
if not validate_gene_id(gene_id, input_species):
58+
return BARUtils.error_exit(f"Invalid {input_species} gene ID: '{gene_id}'"), 400
3659

37-
upper_id = gene_id.upper()
38-
is_agi = upper_id.startswith("AT") and "G" in upper_id
60+
# 4. Normalise using the database's own species, not the input species (e.g. strip maize transcript suffix _T##)
61+
gene_id = normalize_gene_id(gene_id, species)
3962

40-
# for databases that store probeset IDs, convert AGI to probeset via at_agi_lookup
41-
schema = DYNAMIC_DATABASE_SCHEMAS.get(str(database))
42-
if schema and is_agi and schema.get("identifier_type") == "probeset":
43-
subquery = (
44-
db.select(AtAgiLookup.probeset)
45-
.where(AtAgiLookup.agi == upper_id)
46-
.order_by(AtAgiLookup.date.desc())
47-
.limit(1)
48-
.subquery()
49-
)
50-
sq_query = db.session.query(subquery)
51-
if sq_query.count() > 0:
52-
gene_id = sq_query[0][0]
63+
# 5. Microarray / non-direct databases need gene ID → probeset conversion
64+
if database in PROBESET_DATABASES:
65+
probeset, err = convert_gene_to_probeset(gene_id, species, database)
66+
if err:
67+
return BARUtils.error_exit(err), 404
68+
query_id = probeset
5369
else:
54-
return {"success": False, "error": f"No probeset found for {gene_id}", "error_code": 404}, 404
70+
query_id = gene_id
5571

56-
result = query_efp_database_dynamic(database, gene_id, sample_ids=None)
72+
result = query_efp_database_dynamic(database, query_id)
5773

5874
if result["success"]:
59-
return result
60-
else:
61-
return result, result.get("error_code", 500)
75+
return BARUtils.success_exit(result)
76+
return BARUtils.error_exit(result["error"]), result.get("error_code", 500)
6277

6378

6479
gene_expression.add_resource(GeneExpression, '/expression/<string:database>/<string:gene_id>')

0 commit comments

Comments
 (0)