Skip to content

Commit 5869536

Browse files
committed
Bug fixes for temporary non-multi-target staging deployment
1 parent d38f364 commit 5869536

File tree

4 files changed

+107
-31
lines changed

4 files changed

+107
-31
lines changed

src/api/routers/map.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,8 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp
158158
}
159159
for layer in AnnotationLayer
160160
}
161-
161+
# sometimes Nonetype layers show up in preferred layers dict; remove these
162+
preferred_layers.discard(None)
162163
for layer in preferred_layers:
163164
reference_sequences[layer][
164165
"computed_reference_sequence"
@@ -168,7 +169,10 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp
168169
reference_sequences[layer][
169170
"mapped_reference_sequence"
170171
] = _get_mapped_reference_sequence(
171-
layer, transcripts[target_gene], alignment_results[target_gene]
172+
metadata.target_genes[target_gene],
173+
layer,
174+
transcripts[target_gene],
175+
alignment_results[target_gene],
172176
)
173177

174178
mapped_scores: list[ScoreAnnotation] = []

src/dcd_mapping/align.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,13 @@ def align(
336336
# blat names the result id "query" if there is only one query; replace "query" with the target gene name for single-target score sets
337337
if target_label == "query" and len(scoreset_metadata.target_genes) == 1:
338338
target_label = list(scoreset_metadata.target_genes.keys())[0] # noqa: RUF015
339+
# NOTE this is a temporary fix that will not work for multi-target score sets!
340+
# blat automatically reformats query names.
341+
if (
342+
target_label not in scoreset_metadata.target_genes
343+
and len(scoreset_metadata.target_genes) == 1
344+
):
345+
target_label = list(scoreset_metadata.target_genes.keys())[0] # noqa: RUF015
339346
target_gene = scoreset_metadata.target_genes[target_label]
340347
alignment_results[target_label] = _get_best_match(blat_result, target_gene)
341348
return alignment_results

src/dcd_mapping/annotate.py

Lines changed: 57 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -469,12 +469,14 @@ def _get_computed_reference_sequence(
469469

470470

471471
def _get_mapped_reference_sequence(
472+
metadata: TargetGene,
472473
layer: AnnotationLayer,
473474
tx_output: TxSelectResult | TxSelectError | None = None,
474475
align_result: AlignmentResult | None = None,
475476
) -> MappedReferenceSequence | None:
476477
"""Report the mapped reference sequence for a score set
477478
479+
:param metadata: Target gene metadata from MaveDB API
478480
:param layer: AnnotationLayer
479481
:param tx_output: Transcript data for a score set
480482
:return A MappedReferenceSequence object
@@ -500,13 +502,21 @@ def _get_mapped_reference_sequence(
500502
sequence_id=vrs_id,
501503
sequence_accessions=[tx_output.np],
502504
)
503-
seq_id = get_chromosome_identifier(align_result.chrom)
505+
# accession-based score sets with genomic accession do not have alignment results
506+
if (
507+
align_result is None
508+
and metadata.target_accession_id
509+
and metadata.target_accession_id.startswith("NC")
510+
):
511+
seq_id = metadata.target_accession_id
512+
else:
513+
seq_id = get_chromosome_identifier(align_result.chrom)
504514
vrs_id = get_vrs_id_from_identifier(seq_id)
505515
if vrs_id is None:
506516
# TODO catch this error, don't fail whole job for one target
507-
# msg = "ID could not be acquired from Seqrepo for chromosome identifier"
508-
# raise ValueError(msg)
509-
return None
517+
msg = "ID could not be acquired from Seqrepo for chromosome identifier"
518+
raise ValueError(msg)
519+
# return None
510520
return MappedReferenceSequence(
511521
sequence_type=TargetSequenceType.DNA,
512522
sequence_id=vrs_id,
@@ -593,9 +603,11 @@ def save_mapped_output_json(
593603
"computed_reference_sequence": None,
594604
"mapped_reference_sequence": None,
595605
}
596-
for layer in preferred_layers
606+
# TODO change this back after reimplementing multi-target mapping
607+
for layer in AnnotationLayer
597608
}
598-
609+
# sometimes Nonetype layers show up in preferred layers dict; remove these
610+
preferred_layers.discard(None)
599611
for layer in preferred_layers:
600612
reference_sequences[target_gene][layer][
601613
"computed_reference_sequence"
@@ -605,7 +617,10 @@ def save_mapped_output_json(
605617
reference_sequences[target_gene][layer][
606618
"mapped_reference_sequence"
607619
] = _get_mapped_reference_sequence(
608-
layer, tx_output[target_gene], align_results[target_gene]
620+
metadata.target_genes[target_gene],
621+
layer,
622+
tx_output[target_gene],
623+
align_results[target_gene],
609624
)
610625

611626
for m in mappings[target_gene]:
@@ -615,21 +630,43 @@ def save_mapped_output_json(
615630
# drop annotation layer from mapping object
616631
mapped_scores.append(ScoreAnnotation(**m.model_dump()))
617632

618-
# drop Nonetype reference sequences
619-
for target_gene in reference_sequences:
620-
for layer in list(reference_sequences[target_gene].keys()):
621-
if (
622-
reference_sequences[target_gene][layer]["mapped_reference_sequence"]
623-
is None
624-
and reference_sequences[target_gene][layer][
625-
"computed_reference_sequence"
626-
]
627-
is None
628-
) or layer is None:
629-
del reference_sequences[target_gene][layer]
630-
633+
# TODO drop this "continue" after reimplementing multi-target mapping
634+
continue
635+
636+
# TODO add this back after reimplementing multi-target mapping
637+
# drop Nonetype reference sequences
638+
# for target_gene in reference_sequences:
639+
# for layer in list(reference_sequences[target_gene].keys()):
640+
# if (
641+
# reference_sequences[target_gene][layer]["mapped_reference_sequence"]
642+
# is None
643+
# and reference_sequences[target_gene][layer][
644+
# "computed_reference_sequence"
645+
# ]
646+
# is None
647+
# ) or layer is None:
648+
# del reference_sequences[target_gene][layer]
649+
650+
# TODO drop this "continue" after reimplementing multi-target mapping
651+
continue
652+
# TODO drop this after reimplementing multi-target mapping
653+
reference_sequences = reference_sequences.popitem()[1] # get only value in dict
654+
# TODO change this back after reimplementing multi-target mapping
655+
# this only works for --prefer_genomic right now, which is fine because we're going to change it back after reimplementing multi-target mapping
631656
output = ScoresetMapping(
632657
metadata=metadata.model_dump(),
658+
computed_protein_reference_sequence=reference_sequences[
659+
AnnotationLayer.PROTEIN
660+
]["computed_reference_sequence"],
661+
mapped_protein_reference_sequence=reference_sequences[AnnotationLayer.PROTEIN][
662+
"mapped_reference_sequence"
663+
],
664+
computed_genomic_reference_sequence=reference_sequences[
665+
AnnotationLayer.GENOMIC
666+
]["computed_reference_sequence"],
667+
mapped_genomic_reference_sequence=reference_sequences[AnnotationLayer.GENOMIC][
668+
"mapped_reference_sequence"
669+
],
633670
reference_sequences=reference_sequences,
634671
mapped_scores=mapped_scores,
635672
)

src/dcd_mapping/vrs_map.py

Lines changed: 37 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -665,6 +665,19 @@ def _hgvs_nt_is_valid(hgvs_nt: str) -> bool:
665665
)
666666

667667

668+
def _hgvs_pro_is_valid(hgvs_pro: str) -> bool:
669+
"""Check for invalid or unavailable protein MAVE-HGVS variation
670+
671+
:param hgvs_nt: MAVE_HGVS protein expression
672+
:return: True if expression appears populated and valid
673+
"""
674+
return (
675+
(hgvs_pro not in {"_wt", "_sy", "NA"})
676+
and (len(hgvs_pro) != 3)
677+
and ("fs" not in hgvs_pro)
678+
)
679+
680+
668681
def _map_protein_coding(
669682
metadata: TargetGene,
670683
records: list[ScoreRow],
@@ -691,24 +704,39 @@ def _map_protein_coding(
691704

692705
variations: list[MappedScore] = []
693706
for row in records:
694-
if isinstance(transcript, TxSelectError):
707+
hgvs_nt_mappings = None
708+
hgvs_pro_mappings = None
709+
if _hgvs_nt_is_valid(row.hgvs_nt):
710+
hgvs_nt_mappings = _map_genomic(row, gsequence_id, align_result)
711+
712+
if (
713+
isinstance(transcript, TxSelectError) and not hgvs_nt_mappings
714+
): # only create error message if there is not an hgvs nt mapping
695715
# TODO create pre mapped allele
696716
hgvs_pro_mappings = MappedScore(
697717
accession_id=row.accession,
698718
score=row.score,
699719
error_message=str(transcript).strip("'"),
700720
)
701721
else:
702-
hgvs_pro_mappings = _map_protein_coding_pro(row, psequence_id, transcript)
722+
if _hgvs_pro_is_valid(row.hgvs_pro):
723+
hgvs_pro_mappings = _map_protein_coding_pro(
724+
row, psequence_id, transcript
725+
)
726+
elif (
727+
not hgvs_nt_mappings
728+
): # only create error message if there is not an hgvs nt mapping
729+
hgvs_pro_mappings = MappedScore(
730+
accession_id=row.accession,
731+
score=row.score,
732+
error_message="Invalid protein variant syntax",
733+
)
734+
735+
# append both pro and nt mappings if both available
703736
if hgvs_pro_mappings:
704737
variations.append(hgvs_pro_mappings)
705-
706-
if _hgvs_nt_is_valid(row.hgvs_nt):
707-
hgvs_nt_mappings = _map_genomic(row, gsequence_id, align_result)
708-
709-
if hgvs_nt_mappings:
710-
variations.append(hgvs_nt_mappings)
711-
738+
if hgvs_nt_mappings:
739+
variations.append(hgvs_nt_mappings)
712740
return variations
713741

714742

0 commit comments

Comments
 (0)