Skip to content

Commit 36bc828

Browse files
committed
Enforce a 1 to 1 relationship between incoming score rows and outgoing annotated mappings
Prior to this change, it was possible for some score rows to generate valid mappings with other score rows not creating a mapped variant. This had some negative downstream consequences, which will be remedied by ensuring that if any variant receives a mapped variant, all variants receive a mapped variant.
1 parent a0b6970 commit 36bc828

File tree

4 files changed

+109
-55
lines changed

4 files changed

+109
-55
lines changed

src/api/routers/map.py

Lines changed: 39 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
6868
error_message="Score set contains no variants to map",
6969
).model_dump(exclude_none=True)
7070
)
71+
total_score_records = sum(len(v) for v in records.values())
7172

7273
try:
7374
alignment_results = build_alignment_result(metadata, True)
@@ -120,15 +121,27 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
120121
metadata=metadata, error_message=str(e).strip("'")
121122
).model_dump(exclude_none=True)
122123
)
123-
if not vrs_results or all(
124-
mapping_result is None for mapping_result in vrs_results.values()
125-
):
124+
125+
nonetype_vrs_results = [
126+
result is None
127+
for target_gene in vrs_results
128+
for result in vrs_results[target_gene]
129+
]
130+
131+
if not vrs_results or all(nonetype_vrs_results):
126132
return JSONResponse(
127133
content=ScoresetMapping(
128134
metadata=metadata,
129135
error_message="No variant mappings available for this score set",
130136
).model_dump(exclude_none=True)
131137
)
138+
if any(nonetype_vrs_results):
139+
return JSONResponse(
140+
content=ScoresetMapping(
141+
metadata=metadata,
142+
error_message="Some variants generated vrs results, but not all. If any variants were mapped, all should have been.",
143+
).model_dump(exclude_none=True)
144+
)
132145

133146
annotated_vrs_results = {}
134147
try:
@@ -146,15 +159,27 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
146159
metadata=metadata, error_message=str(e).strip("'")
147160
).model_dump(exclude_none=True)
148161
)
149-
if not annotated_vrs_results or all(
150-
mapping_result is None for mapping_result in annotated_vrs_results.values()
151-
):
162+
163+
nonetype_annotated_vrs_results = [
164+
result is None
165+
for target_gene in annotated_vrs_results
166+
for result in annotated_vrs_results[target_gene]
167+
]
168+
169+
if not annotated_vrs_results or all(nonetype_annotated_vrs_results):
152170
return JSONResponse(
153171
content=ScoresetMapping(
154172
metadata=metadata,
155173
error_message="No annotated variant mappings available for this score set",
156174
).model_dump(exclude_none=True)
157175
)
176+
if any(nonetype_annotated_vrs_results):
177+
return JSONResponse(
178+
content=ScoresetMapping(
179+
metadata=metadata,
180+
error_message="Some variants generated annotated vrs results, but not all. If any variants were annotated, all should have been.",
181+
).model_dump(exclude_none=True)
182+
)
158183

159184
try:
160185
raw_metadata = get_raw_scoreset_metadata(urn, store_path)
@@ -235,6 +260,14 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
235260
).model_dump(exclude_none=True)
236261
)
237262

263+
if len(mapped_scores) != total_score_records:
264+
return JSONResponse(
265+
content=ScoresetMapping(
266+
metadata=metadata,
267+
error_message=f"Mismatch between number of mapped scores ({len(mapped_scores)}) and total score records ({total_score_records}). This is unexpected and indicates an issue with the mapping process.",
268+
).model_dump(exclude_none=True)
269+
)
270+
238271
return JSONResponse(
239272
content=ScoresetMapping(
240273
metadata=raw_metadata,

src/dcd_mapping/annotate.py

Lines changed: 34 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -266,13 +266,16 @@ def _annotate_allele_mapping(
266266
sequence_id = f"ga4gh:{mapped_score.post_mapped.location.sequenceReference.refgetAccession}"
267267
accession = get_chromosome_identifier_from_vrs_id(sequence_id)
268268
if accession is None:
269-
raise ValueError
270-
if accession.startswith("refseq:"):
269+
accession = None
270+
mapped_score.error_message = "Could not determine accession for this annotation. No allele expression is available."
271+
elif accession.startswith("refseq:"):
271272
accession = accession[7:]
272273
else:
273274
if tx_results is None or isinstance(tx_results, TxSelectError):
274-
raise ValueError # impossible by definition
275-
accession = tx_results.np
275+
accession = None
276+
mapped_score.error_message = "Could not determine accession for this annotation. No allele expression is available."
277+
else:
278+
accession = tx_results.np
276279

277280
sr = get_seqrepo()
278281
loc = mapped_score.post_mapped.location
@@ -281,8 +284,9 @@ def _annotate_allele_mapping(
281284
post_mapped.extensions = [
282285
Extension(type="Extension", name="vrs_ref_allele_seq", value=ref)
283286
]
284-
hgvs_string, syntax = _get_hgvs_string(post_mapped, accession)
285-
post_mapped.expressions = [Expression(syntax=syntax, value=hgvs_string)]
287+
if accession:
288+
hgvs_string, syntax = _get_hgvs_string(post_mapped, accession)
289+
post_mapped.expressions = [Expression(syntax=syntax, value=hgvs_string)]
286290

287291
if vrs_version == VrsVersion.V_1_3:
288292
pre_mapped = _allele_to_vod(pre_mapped)
@@ -295,9 +299,7 @@ def _annotate_allele_mapping(
295299
mavedb_id=mapped_score.accession_id,
296300
score=float(mapped_score.score) if mapped_score.score else None,
297301
annotation_layer=mapped_score.annotation_layer,
298-
error_message=mapped_score.error_message
299-
if mapped_score.error_message
300-
else None, # TODO might not need if statement here
302+
error_message=mapped_score.error_message,
301303
)
302304

303305

@@ -321,13 +323,17 @@ def _annotate_haplotype_mapping(
321323
sequence_id = f"ga4gh:{post_mapped.members[0].location.sequenceReference.refgetAccession}"
322324
accession = get_chromosome_identifier_from_vrs_id(sequence_id)
323325
if accession is None:
324-
raise ValueError
325-
if accession.startswith("refseq:"):
326+
accession = None
327+
mapped_score.error_message = "Could not determine accession for this annotation. No allele expression is available."
328+
elif accession.startswith("refseq:"):
326329
accession = accession[7:]
327330
else:
328331
if tx_results is None or isinstance(tx_results, TxSelectError):
329-
raise ValueError # impossible by definition
330-
accession = tx_results.np
332+
# impossible by definition
333+
accession = None
334+
mapped_score.error_message = "Could not determine accession for this annotation. No allele expression is available."
335+
else:
336+
accession = tx_results.np
331337

332338
sr = get_seqrepo()
333339
for allele in post_mapped.members:
@@ -337,8 +343,9 @@ def _annotate_haplotype_mapping(
337343
allele.extensions = [
338344
Extension(type="Extension", name="vrs_ref_allele_seq", value=ref)
339345
]
340-
hgvs, syntax = _get_hgvs_string(allele, accession)
341-
allele.expressions = [Expression(syntax=syntax, value=hgvs)]
346+
if accession:
347+
hgvs, syntax = _get_hgvs_string(allele, accession)
348+
allele.expressions = [Expression(syntax=syntax, value=hgvs)]
342349

343350
if vrs_version == VrsVersion.V_1_3:
344351
pre_mapped = _haplotype_to_haplotype_1_3(pre_mapped)
@@ -351,9 +358,7 @@ def _annotate_haplotype_mapping(
351358
mavedb_id=mapped_score.accession_id,
352359
score=float(mapped_score.score) if mapped_score.score is not None else None,
353360
annotation_layer=mapped_score.annotation_layer,
354-
error_message=mapped_score.error_message
355-
if mapped_score.error_message
356-
else None, # TODO might not need if statement here
361+
error_message=mapped_score.error_message,
357362
)
358363

359364

@@ -388,6 +393,7 @@ def annotate(
388393
ScoreAnnotationWithLayer(
389394
mavedb_id=mapped_score.accession_id,
390395
score=float(mapped_score.score) if mapped_score.score else None,
396+
vrs_version=vrs_version,
391397
error_message=mapped_score.error_message,
392398
)
393399
)
@@ -410,8 +416,16 @@ def annotate(
410416
)
411417
)
412418
else:
413-
# TODO how to combine this error message with other potential error messages?
414-
ValueError("inconsistent variant structure")
419+
score_annotations.append(
420+
ScoreAnnotationWithLayer(
421+
pre_mapped=mapped_score.pre_mapped,
422+
post_mapped=mapped_score.post_mapped,
423+
vrs_version=vrs_version,
424+
mavedb_id=mapped_score.accession_id,
425+
score=float(mapped_score.score) if mapped_score.score else None,
426+
error_message=f"Multiple issues with annotation: Inconsistent variant structure (Allele and Haplotype mix).{' ' + mapped_score.error_message if mapped_score.error_message else ''}",
427+
)
428+
)
415429

416430
return score_annotations
417431

@@ -519,11 +533,6 @@ def _set_scoreset_layer(
519533
expressions. If genomic expressions are available, that's what we'd like to use.
520534
This function tells us how to filter the final annotation objects.
521535
"""
522-
if urn.startswith("urn:mavedb:00000097"):
523-
_logger.debug(
524-
"Manually selecting protein annotation for scores from urn:mavedb:00000097"
525-
)
526-
return AnnotationLayer.PROTEIN
527536
for mapping in mappings:
528537
if mapping.annotation_layer == AnnotationLayer.GENOMIC:
529538
return AnnotationLayer.GENOMIC

src/dcd_mapping/transcripts.py

Lines changed: 9 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,10 @@ async def _get_compatible_transcripts(
5454
chromosome = get_chromosome_identifier(aligned_chrom)
5555
gene_symbol = get_gene_symbol(target_gene)
5656
if not gene_symbol:
57-
raise TxSelectError
57+
msg = (
58+
f"Unable to find gene symbol for target gene {target_gene.target_gene_name}"
59+
)
60+
raise TxSelectError(msg)
5861
transcript_matches = []
5962
for hit_range in align_result.hit_subranges:
6063
matches_list = await get_transcripts(
@@ -179,7 +182,8 @@ async def _select_protein_reference(
179182
if not best_tx:
180183
best_tx = await _get_longest_compatible_transcript(common_transcripts)
181184
if not best_tx:
182-
raise TxSelectError
185+
msg = f"Unable to find matching MANE transcripts for target gene {target_gene.target_gene_name}"
186+
raise TxSelectError(msg)
183187
ref_sequence = get_sequence(best_tx.refseq_prot)
184188
nm_accession = best_tx.refseq_nuc
185189
np_accession = best_tx.refseq_prot
@@ -323,19 +327,6 @@ async def select_transcript(
323327
:param align_result: alignment results
324328
:return: Transcript description (accession ID, offset, selected sequence, etc)
325329
"""
326-
if scoreset_urn.startswith("urn:mavedb:00000097"):
327-
_logger.info(
328-
"Score sets in urn:mavedb:00000097 are already expressed in full HGVS strings -- using predefined results because additional hard-coding is unnecessary"
329-
)
330-
return TxSelectResult(
331-
nm="NM_007294.3",
332-
np="NP_009225.1",
333-
start=0,
334-
is_full_match=False,
335-
transcript_mode=TranscriptPriority.MANE_SELECT,
336-
sequence=_get_protein_sequence(target_gene.target_sequence),
337-
)
338-
339330
if target_gene.target_gene_category != TargetType.PROTEIN_CODING:
340331
_logger.debug(
341332
"%s is regulatory/noncoding -- skipping transcript selection",
@@ -366,7 +357,9 @@ async def select_transcripts(
366357
* Dict where keys are target labels and values are alignment result objects
367358
:return: dict where keys are target labels and values are objects describing selected transcript (accession ID, offset, selected sequence, etc)
368359
"""
369-
selected_transcripts = {}
360+
selected_transcripts: dict[
361+
str, TxSelectResult | TxSelectError | KeyError | None
362+
] = {}
370363
for target_gene in scoreset_metadata.target_genes:
371364
if scoreset_metadata.target_genes[target_gene].target_accession_id:
372365
# for accession-based targets, create tx select objects for protein sequence accessions only

src/dcd_mapping/vrs_map.py

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -453,8 +453,11 @@ def _map_genomic(
453453
# if the sequence id starts with SQ, it is a target sequence which is in the ga4gh namespace
454454
namespace = "ga4gh"
455455
else:
456-
msg = f"Namespace could not be inferred from sequence: {sequence_id}"
457-
raise ValueError(msg)
456+
return MappedScore(
457+
accession_id=row.accession,
458+
score=row.score,
459+
error_message=f"Namespace could not be inferred from sequence: {sequence_id}",
460+
)
458461

459462
if (
460463
row.hgvs_nt in {"_wt", "_sy", "="}
@@ -609,8 +612,11 @@ def _map_genomic(
609612
error_message=str(e),
610613
)
611614
else:
612-
msg = f"Reference sequence namespace not supported: {namespace}"
613-
raise ValueError(msg)
615+
return MappedScore(
616+
accession_id=row.accession,
617+
score=row.score,
618+
error_message=f"Reference sequence namespace not supported: {namespace}",
619+
)
614620

615621
return MappedScore(
616622
accession_id=row.accession,
@@ -783,7 +789,14 @@ def _map_accession(
783789
variations: list[MappedScore] = []
784790
sequence_id = metadata.target_accession_id
785791
if sequence_id is None:
786-
raise ValueError
792+
return [
793+
MappedScore(
794+
accession_id=row.accession,
795+
score=row.score,
796+
error_message="Could not generate mapped allele objects. No sequence id was provided.",
797+
)
798+
for row in records
799+
]
787800

788801
store_accession(sequence_id)
789802

@@ -802,8 +815,14 @@ def _map_accession(
802815
hgvs_nt_mappings = _map_genomic(row, sequence_id, align_result)
803816
variations.append(hgvs_nt_mappings)
804817
else:
805-
msg = f"Unrecognized accession prefix for accession id {metadata.target_accession_id}"
806-
raise ValueError(msg)
818+
[
819+
MappedScore(
820+
accession_id=row.accession,
821+
score=row.score,
822+
error_message=f"Unrecognized accession prefix for accession id {metadata.target_accession_id}",
823+
)
824+
for row in records
825+
]
807826

808827
return variations
809828

@@ -887,7 +906,7 @@ def vrs_map(
887906
records: list[ScoreRow],
888907
transcript: TxSelectResult | TxSelectError | None = None,
889908
silent: bool = True,
890-
) -> list[MappedScore] | None:
909+
) -> list[MappedScore]:
891910
"""Given a description of a MAVE scoreset and an aligned transcript, generate
892911
the corresponding VRS objects.
893912

0 commit comments

Comments
 (0)