Skip to content

Commit 426a7e3

Browse files
committed
Merge remote-tracking branch 'origin/release-2025.3.1' into modify/estelle/495/renameTaxIdToCode
2 parents 13dca05 + 9f0e5d7 commit 426a7e3

File tree

17 files changed

+447
-90
lines changed

17 files changed

+447
-90
lines changed
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
"""Make faf95_max and ancestry possibly nullable
2+
3+
Revision ID: 2b7a977e7e98
4+
Revises: a8e345cca190
5+
Create Date: 2025-08-21 10:08:58.565416
6+
7+
"""
8+
9+
from alembic import op
10+
import sqlalchemy as sa
11+
12+
13+
# revision identifiers, used by Alembic.
14+
revision = "2b7a977e7e98"
15+
down_revision = "a8e345cca190"
16+
branch_labels = None
17+
depends_on = None
18+
19+
20+
def upgrade():
    """Drop the NOT NULL requirement on the gnomAD FAF95 columns.

    Alters ``gnomad_variants.faf95_max`` and then
    ``gnomad_variants.faf95_max_ancestry`` so that both accept NULL values.
    """
    # Each entry pairs a column with its existing type so Alembic can emit
    # the ALTER statement without re-reflecting the table.
    columns_to_relax = (
        ("faf95_max", sa.DOUBLE_PRECISION(precision=53)),
        ("faf95_max_ancestry", sa.VARCHAR()),
    )
    for column_name, column_type in columns_to_relax:
        op.alter_column("gnomad_variants", column_name, existing_type=column_type, nullable=True)
25+
26+
27+
def downgrade():
    """Reinstate the NOT NULL requirement on the gnomAD FAF95 columns.

    Alters ``gnomad_variants.faf95_max_ancestry`` and then
    ``gnomad_variants.faf95_max`` (the reverse of the upgrade order) so that
    neither column accepts NULL values.

    NOTE(review): presumably this fails if any row already stores NULL in
    either column -- confirm, and backfill or remove such rows first.
    """
    columns_to_tighten = (
        ("faf95_max_ancestry", sa.VARCHAR()),
        ("faf95_max", sa.DOUBLE_PRECISION(precision=53)),
    )
    for column_name, column_type in columns_to_tighten:
        op.alter_column("gnomad_variants", column_name, existing_type=column_type, nullable=False)

src/mavedb/lib/annotation/annotate.py

Lines changed: 6 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,17 @@
1313
from ga4gh.va_spec.acmg_2015 import VariantPathogenicityEvidenceLine
1414
from ga4gh.va_spec.base.core import ExperimentalVariantFunctionalImpactStudyResult, Statement
1515

16-
from mavedb.lib.annotation.constants import FUNCTIONAL_RANGES, CLINICAL_RANGES
1716
from mavedb.lib.annotation.evidence_line import acmg_evidence_line, functional_evidence_line
1817
from mavedb.lib.annotation.proposition import (
1918
mapped_variant_to_experimental_variant_clinical_impact_proposition,
2019
mapped_variant_to_experimental_variant_functional_impact_proposition,
2120
)
2221
from mavedb.lib.annotation.statement import mapped_variant_to_functional_statement
2322
from mavedb.lib.annotation.study_result import mapped_variant_to_experimental_variant_impact_study_result
23+
from mavedb.lib.annotation.util import (
24+
can_annotate_variant_for_pathogenicity_evidence,
25+
can_annotate_variant_for_functional_statement,
26+
)
2427
from mavedb.models.mapped_variant import MappedVariant
2528

2629

@@ -29,14 +32,7 @@ def variant_study_result(mapped_variant: MappedVariant) -> ExperimentalVariantFu
2932

3033

3134
def variant_functional_impact_statement(mapped_variant: MappedVariant) -> Optional[Statement]:
32-
if mapped_variant.variant.score_set.score_ranges is None:
33-
return None
34-
35-
if not any(
36-
range_key in mapped_variant.variant.score_set.score_ranges
37-
and mapped_variant.variant.score_set.score_ranges[range_key] is not None
38-
for range_key in FUNCTIONAL_RANGES
39-
):
35+
if not can_annotate_variant_for_functional_statement(mapped_variant):
4036
return None
4137

4238
# TODO#494: Add support for multiple functional evidence lines. If a score set has multiple ranges
@@ -51,14 +47,7 @@ def variant_functional_impact_statement(mapped_variant: MappedVariant) -> Option
5147
def variant_pathogenicity_evidence(
5248
mapped_variant: MappedVariant,
5349
) -> Optional[VariantPathogenicityEvidenceLine]:
54-
if mapped_variant.variant.score_set.score_ranges is None:
55-
return None
56-
57-
if not any(
58-
range_key in mapped_variant.variant.score_set.score_ranges
59-
and mapped_variant.variant.score_set.score_ranges[range_key] is not None
60-
for range_key in CLINICAL_RANGES
61-
):
50+
if not can_annotate_variant_for_pathogenicity_evidence(mapped_variant):
6251
return None
6352

6453
study_result = mapped_variant_to_experimental_variant_impact_study_result(mapped_variant)

src/mavedb/lib/annotation/classification.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,13 @@ def functional_classification_of_variant(
4040
)
4141

4242
# This property of this column is guaranteed to be defined.
43-
functional_score: float = mapped_variant.variant.data["score_data"]["score"] # type: ignore
43+
functional_score: Optional[float] = mapped_variant.variant.data["score_data"]["score"] # type: ignore
44+
if functional_score is None:
45+
raise ValueError(
46+
f"Variant {mapped_variant.variant.urn} does not have a functional score."
47+
" Unable to classify functional impact."
48+
)
49+
4450
for range in score_ranges.ranges:
4551
lower_bound, upper_bound = inf_or_float(range.range[0], lower=True), inf_or_float(range.range[1], lower=False)
4652
if functional_score > lower_bound and functional_score <= upper_bound:
@@ -72,7 +78,12 @@ def pillar_project_clinical_classification_of_variant(
7278
)
7379

7480
# This property of this column is guaranteed to be defined.
75-
functional_score: float = mapped_variant.variant.data["score_data"]["score"] # type: ignore
81+
functional_score: Optional[float] = mapped_variant.variant.data["score_data"]["score"] # type: ignore
82+
if functional_score is None:
83+
raise ValueError(
84+
f"Variant {mapped_variant.variant.urn} does not have a functional score."
85+
" Unable to classify clinical impact."
86+
)
7687

7788
for range in score_ranges.ranges:
7889
lower_bound, upper_bound = inf_or_float(range.range[0], lower=True), inf_or_float(range.range[1], lower=False)

src/mavedb/lib/annotation/util.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
Expression,
99
LiteralSequenceExpression,
1010
)
11+
from mavedb.lib.annotation.constants import CLINICAL_RANGES, FUNCTIONAL_RANGES
1112
from mavedb.models.mapped_variant import MappedVariant
1213
from mavedb.lib.annotation.exceptions import MappingDataDoesntExistException
1314

@@ -137,3 +138,114 @@ def variation_from_mapped_variant(mapped_variant: MappedVariant) -> MolecularVar
137138
)
138139

139140
return vrs_object_from_mapped_variant(mapped_variant.post_mapped)
141+
142+
143+
def _can_annotate_variant_base_assumptions(mapped_variant: MappedVariant) -> bool:
    """
    Report whether a mapped variant meets the baseline requirement for annotation.

    The only condition checked here is that the underlying variant carries a
    score: the value stored at ``data["score_data"]["score"]`` must not be None.
    Score ranges are not inspected by this function.

    Args:
        mapped_variant (MappedVariant): The mapped variant whose underlying
            variant score is inspected.

    Returns:
        bool: True when the variant's score is not None, False otherwise.
    """
    # The score entry is assumed to exist for every variant; only its value
    # may be None, so plain indexing is used rather than a defensive lookup.
    variant_score = mapped_variant.variant.data["score_data"]["score"]  # type: ignore
    return variant_score is not None
163+
164+
165+
def _variant_score_ranges_have_required_keys_for_annotation(
    mapped_variant: MappedVariant, key_options: list[str]
) -> bool:
    """
    Report whether the variant's score set defines at least one of the given score range keys.

    A key counts only if it is present in the score set's score ranges and
    its associated value is not None.

    Args:
        mapped_variant (MappedVariant): The mapped variant whose score set's score ranges are inspected.
        key_options (list[str]): Candidate score range keys; any single one is sufficient.

    Returns:
        bool: True if at least one candidate key exists with a non-None value. False if the
            score set has no score ranges at all, or if no candidate key has a non-None value.
    """
    available_ranges = mapped_variant.variant.score_set.score_ranges
    if available_ranges is None:
        return False

    return any(
        candidate in available_ranges and available_ranges[candidate] is not None
        for candidate in key_options
    )
190+
191+
192+
def can_annotate_variant_for_pathogenicity_evidence(mapped_variant: MappedVariant) -> bool:
    """
    Decide whether a mapped variant is eligible for pathogenicity evidence annotation.

    Eligibility requires both of the following, evaluated in order:
        1. The baseline check in _can_annotate_variant_base_assumptions passes.
        2. _variant_score_ranges_have_required_keys_for_annotation finds at
           least one usable key from CLINICAL_RANGES.

    The second check is not evaluated when the first one fails.

    Args:
        mapped_variant (MappedVariant): The mapped variant to evaluate.

    Returns:
        bool: True if both checks pass, False otherwise.
    """
    return _can_annotate_variant_base_assumptions(
        mapped_variant
    ) and _variant_score_ranges_have_required_keys_for_annotation(mapped_variant, CLINICAL_RANGES)
223+
224+
225+
def can_annotate_variant_for_functional_statement(mapped_variant: MappedVariant) -> bool:
    """
    Decide whether a mapped variant is eligible for functional statement annotation.

    Eligibility requires both of the following, evaluated in order:
        1. The baseline check in _can_annotate_variant_base_assumptions passes.
        2. _variant_score_ranges_have_required_keys_for_annotation finds at
           least one usable key from FUNCTIONAL_RANGES.

    The second check is not evaluated when the first one fails.

    Args:
        mapped_variant (MappedVariant): The mapped variant to evaluate.

    Returns:
        bool: True if both checks pass, False otherwise.
    """
    return _can_annotate_variant_base_assumptions(
        mapped_variant
    ) and _variant_score_ranges_have_required_keys_for_annotation(mapped_variant, FUNCTIONAL_RANGES)

src/mavedb/lib/gnomad.py

Lines changed: 43 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from sqlalchemy.orm import Session
88

99
from mavedb.lib.logging.context import logging_context, save_to_logging_context
10+
from mavedb.lib.utils import batched
1011
from mavedb.db.athena import engine as athena_engine
1112
from mavedb.models.gnomad_variant import GnomADVariant
1213
from mavedb.models.mapped_variant import MappedVariant
@@ -56,7 +57,7 @@ def allele_list_from_list_like_string(alleles_string: str) -> list[str]:
5657
if not alleles_string:
5758
return []
5859

59-
if not re.match(r"^\"\[\s*[AGTC]+(?:\s*,\s*[AGTC]+)\s*\]\"$", alleles_string):
60+
if not re.match(r"^\[\s*[AGTC]+(?:\s*,\s*[AGTC]+)\s*\]$", alleles_string):
6061
raise ValueError("Invalid format for alleles string.")
6162

6263
alleles_string = alleles_string.strip().strip('"[]')
@@ -67,7 +68,9 @@ def allele_list_from_list_like_string(alleles_string: str) -> list[str]:
6768

6869
def gnomad_variant_data_for_caids(caids: Sequence[str]) -> Sequence[Row[Any]]: # pragma: no cover
6970
"""
70-
Fetches variant rows from the gnomAD table for a list of CAIDs.
71+
Fetches variant rows from the gnomAD table for a list of CAIDs. Athena has a maximum character limit of 262144
72+
in queries. CAIDs are about 12 characters long on average + 4 for two quotes, a comma and a space. Chunk our list
73+
into chunks of 260000/16=16250 so we are guaranteed to remain under the character limit.
7174
7275
Args:
7376
caids (list[str]): A list of CAIDs (Canonical Allele Identifiers) to query.
@@ -87,36 +90,45 @@ def gnomad_variant_data_for_caids(caids: Sequence[str]) -> Sequence[Row[Any]]:
8790
Raises:
8891
sqlalchemy.exc.SQLAlchemyError: If there is an error executing the query.
8992
"""
90-
91-
caid_str = ",".join(f"'{caid}'" for caid in caids)
92-
athena_query = f"""
93-
SELECT
94-
"locus.contig",
95-
"locus.position",
96-
"alleles",
97-
"caid",
98-
"joint.freq.all.ac",
99-
"joint.freq.all.an",
100-
"joint.fafmax.faf95_max_gen_anc",
101-
"joint.fafmax.faf95_max"
102-
FROM
103-
{gnomad_table_name()}
104-
WHERE
105-
caid IN ({caid_str})
106-
"""
107-
108-
save_to_logging_context({"num_caids": len(caids)})
109-
logger.debug(msg=f"Fetching gnomAD variants from Athena with query:\n{athena_query}", extra=logging_context())
93+
chunked_caids = batched(caids, 16250)
94+
caid_strs = [",".join(f"'{caid}'" for caid in chunk) for chunk in chunked_caids]
95+
save_to_logging_context({"num_caids": len(caids), "num_chunks": len(caid_strs)})
11096

11197
with athena_engine.connect() as athena_connection:
11298
logger.debug(msg="Connected to Athena", extra=logging_context())
113-
result = athena_connection.execute(text(athena_query))
114-
rows = result.fetchall()
11599

116-
save_to_logging_context({"num_gnomad_variant_rows_fetched": len(rows)})
117-
logger.debug(msg="Done fetching gnomAD variants from Athena", extra=logging_context())
100+
result_rows: list[Row[Any]] = []
101+
for chunk_index, caid_str in enumerate(caid_strs):
102+
athena_query = f"""
103+
SELECT
104+
"locus.contig",
105+
"locus.position",
106+
"alleles",
107+
"caid",
108+
"joint.freq.all.ac",
109+
"joint.freq.all.an",
110+
"joint.fafmax.faf95_max_gen_anc",
111+
"joint.fafmax.faf95_max"
112+
FROM
113+
{gnomad_table_name()}
114+
WHERE
115+
caid IN ({caid_str})
116+
"""
117+
logger.debug(
118+
msg=f"Fetching gnomAD variants from Athena (batch {chunk_index}) with query:\n{athena_query}",
119+
extra=logging_context(),
120+
)
121+
122+
result = athena_connection.execute(text(athena_query))
123+
rows = result.fetchall()
124+
result_rows.extend(rows)
125+
126+
logger.debug(f"Fetched {len(rows)} gnomAD variants from Athena (batch {chunk_index}).")
118127

119-
return rows
128+
save_to_logging_context({"num_gnomad_variant_rows_fetched": len(result_rows)})
129+
logger.debug(msg="Done fetching gnomAD variants from Athena", extra=logging_context())
130+
131+
return result_rows
120132

121133

122134
def link_gnomad_variants_to_mapped_variants(
@@ -153,7 +165,10 @@ def link_gnomad_variants_to_mapped_variants(
153165
allele_number = int(row.__getattribute__("joint.freq.all.an"))
154166
allele_frequency = float(allele_count) / float(allele_number)
155167
faf95_max_ancestry = row.__getattribute__("joint.fafmax.faf95_max_gen_anc")
156-
faf95_max = float(row.__getattribute__("joint.fafmax.faf95_max"))
168+
faf95_max = row.__getattribute__("joint.fafmax.faf95_max")
169+
170+
if faf95_max is not None:
171+
faf95_max = float(faf95_max)
157172

158173
for mapped_variant in mapped_variants_with_caids:
159174
# Remove any existing gnomAD variants for this mapped variant that match the current gnomAD data version to avoid data duplication.

src/mavedb/models/gnomad_variant.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@ class GnomADVariant(Base):
2424
allele_number = Column(Integer, nullable=False)
2525
allele_frequency = Column(Float, nullable=False)
2626

27-
faf95_max = Column(Float, nullable=False)
28-
faf95_max_ancestry = Column(String, nullable=False)
27+
faf95_max = Column(Float, nullable=True)
28+
faf95_max_ancestry = Column(String, nullable=True)
2929

3030
creation_date = Column(Date, nullable=False, default=date.today)
3131
modification_date = Column(Date, nullable=False, default=date.today, onupdate=date.today)

src/mavedb/routers/mapped_variant.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -139,12 +139,12 @@ async def show_mapped_variant_functional_impact_statement(
139139

140140
if not functional_impact:
141141
logger.info(
142-
msg="Could not construct a functional impact statement for this mapped variant; No score range evidence exists for this score set.",
142+
msg="Could not construct a functional impact statement for this mapped variant. Variant does not have sufficient evidence to evaluate its functional impact.",
143143
extra=logging_context(),
144144
)
145145
raise HTTPException(
146146
status_code=404,
147-
detail=f"Could not construct a functional impact statement for mapped variant {urn}: No score range evidence found",
147+
detail=f"Could not construct a functional impact statement for mapped variant {urn}. Variant does not have sufficient evidence to evaluate its functional impact.",
148148
)
149149

150150
return functional_impact
@@ -180,12 +180,12 @@ async def show_mapped_variant_acmg_evidence_line(
180180

181181
if not pathogenicity_evidence:
182182
logger.info(
183-
msg="Could not construct a pathogenicity evidence line for this mapped variant; No calibrations exist for this score set.",
183+
msg="Could not construct a pathogenicity evidence line for this mapped variant; Variant does not have sufficient evidence to evaluate its pathogenicity.",
184184
extra=logging_context(),
185185
)
186186
raise HTTPException(
187187
status_code=404,
188-
detail=f"Could not construct a pathogenicity evidence line for mapped variant {urn}; No calibrations exist for this score set",
188+
detail=f"Could not construct a pathogenicity evidence line for mapped variant {urn}; Variant does not have sufficient evidence to evaluate its pathogenicity.",
189189
)
190190

191191
return pathogenicity_evidence

0 commit comments

Comments
 (0)