Skip to content

Commit ccec5a5

Browse files
committed
Re-implement multi-target mapping
1 parent 4cdc7a3 commit ccec5a5

File tree

4 files changed

+75
-100
lines changed

4 files changed

+75
-100
lines changed

src/api/routers/map.py

Lines changed: 51 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
""""Provide mapping router"""
22
from pathlib import Path
33

4-
from cool_seq_tool.schemas import AnnotationLayer
54
from fastapi import APIRouter, HTTPException
65
from fastapi.responses import JSONResponse
76
from requests import HTTPError
@@ -140,46 +139,60 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp
140139
error_message="No annotated variant mappings available for this score set",
141140
)
142141

143-
# TODO this will need to be changed to support multi-target score sets.
144-
# This version works for accession based score sets.
145-
# Not implementing multi-target changes because this will require corresponding changes on mavedb-api and we want to get this on staging quickly right now.
146-
# For now, only accept single-target score sets so that we don't need to change structure of JSON output.
147-
target_gene = list(metadata.target_genes.keys())[0] # noqa: RUF015
148142
try:
149143
raw_metadata = get_raw_scoreset_metadata(urn, store_path)
150-
preferred_layers = {
151-
_set_scoreset_layer(urn, annotated_vrs_results[target_gene]),
152-
}
153-
154-
reference_sequences = {
155-
layer: {
156-
"computed_reference_sequence": None,
157-
"mapped_reference_sequence": None,
144+
reference_sequences: dict[str, dict] = {}
145+
mapped_scores: list[ScoreAnnotation] = []
146+
for target_gene in annotated_vrs_results:
147+
preferred_layers = {
148+
_set_scoreset_layer(urn, annotated_vrs_results[target_gene]),
158149
}
159-
for layer in AnnotationLayer
160-
}
161-
# sometimes Nonetype layers show up in preferred layers dict; remove these
162-
preferred_layers.discard(None)
163-
for layer in preferred_layers:
164-
reference_sequences[layer][
165-
"computed_reference_sequence"
166-
] = _get_computed_reference_sequence(
167-
metadata.target_genes[target_gene], layer, transcripts[target_gene]
168-
)
169-
reference_sequences[layer][
170-
"mapped_reference_sequence"
171-
] = _get_mapped_reference_sequence(
172-
metadata.target_genes[target_gene],
173-
layer,
174-
transcripts[target_gene],
175-
alignment_results[target_gene],
176-
)
150+
reference_sequences[target_gene] = {
151+
layer: {
152+
"computed_reference_sequence": None,
153+
"mapped_reference_sequence": None,
154+
}
155+
for layer in preferred_layers
156+
}
157+
# _set_scoreset_layer can yield None, so None may end up in the preferred_layers set; discard it
158+
preferred_layers.discard(None)
159+
for layer in preferred_layers:
160+
reference_sequences[target_gene][layer][
161+
"computed_reference_sequence"
162+
] = _get_computed_reference_sequence(
163+
metadata.target_genes[target_gene], layer, transcripts[target_gene]
164+
)
165+
reference_sequences[target_gene][layer][
166+
"mapped_reference_sequence"
167+
] = _get_mapped_reference_sequence(
168+
metadata.target_genes[target_gene],
169+
layer,
170+
transcripts[target_gene],
171+
alignment_results[target_gene],
172+
)
173+
174+
for m in annotated_vrs_results[target_gene]:
175+
if m.pre_mapped is None:
176+
mapped_scores.append(ScoreAnnotation(**m.model_dump()))
177+
elif m.annotation_layer in preferred_layers:
178+
# drop annotation layer from mapping object
179+
mapped_scores.append(ScoreAnnotation(**m.model_dump()))
180+
181+
# drop layer entries whose computed and mapped reference sequences are both None, and any entry keyed by a None layer
182+
for target_gene in reference_sequences:
183+
for layer in list(reference_sequences[target_gene].keys()):
184+
if (
185+
reference_sequences[target_gene][layer][
186+
"mapped_reference_sequence"
187+
]
188+
is None
189+
and reference_sequences[target_gene][layer][
190+
"computed_reference_sequence"
191+
]
192+
is None
193+
) or layer is None:
194+
del reference_sequences[target_gene][layer]
177195

178-
mapped_scores: list[ScoreAnnotation] = []
179-
for m in annotated_vrs_results[target_gene]:
180-
if m.annotation_layer in preferred_layers:
181-
# drop annotation layer from mapping object
182-
mapped_scores.append(ScoreAnnotation(**m.model_dump()))
183196
except Exception as e:
184197
return JSONResponse(
185198
content=ScoresetMapping(
@@ -190,18 +203,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp
190203
return JSONResponse(
191204
content=ScoresetMapping(
192205
metadata=raw_metadata,
193-
computed_protein_reference_sequence=reference_sequences[
194-
AnnotationLayer.PROTEIN
195-
]["computed_reference_sequence"],
196-
mapped_protein_reference_sequence=reference_sequences[
197-
AnnotationLayer.PROTEIN
198-
]["mapped_reference_sequence"],
199-
computed_genomic_reference_sequence=reference_sequences[
200-
AnnotationLayer.GENOMIC
201-
]["computed_reference_sequence"],
202-
mapped_genomic_reference_sequence=reference_sequences[
203-
AnnotationLayer.GENOMIC
204-
]["mapped_reference_sequence"],
206+
reference_sequences=reference_sequences,
205207
mapped_scores=mapped_scores,
206208
).model_dump(exclude_none=True)
207209
)

src/dcd_mapping/annotate.py

Lines changed: 13 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -603,8 +603,7 @@ def save_mapped_output_json(
603603
"computed_reference_sequence": None,
604604
"mapped_reference_sequence": None,
605605
}
606-
# TODO change this back after reimplementing multi-target mapping
607-
for layer in AnnotationLayer
606+
for layer in preferred_layers
608607
}
609608
# None sometimes shows up in the preferred_layers set; discard it
610609
preferred_layers.discard(None)
@@ -630,43 +629,21 @@ def save_mapped_output_json(
630629
# drop annotation layer from mapping object
631630
mapped_scores.append(ScoreAnnotation(**m.model_dump()))
632631

633-
# TODO drop this "continue" after reimplementing multi-target mapping
634-
continue
635-
636-
# TODO add this back after reimplementing multi-target mapping
637632
# drop layer entries whose computed and mapped reference sequences are both None, and any entry keyed by a None layer
638-
# for target_gene in reference_sequences:
639-
# for layer in list(reference_sequences[target_gene].keys()):
640-
# if (
641-
# reference_sequences[target_gene][layer]["mapped_reference_sequence"]
642-
# is None
643-
# and reference_sequences[target_gene][layer][
644-
# "computed_reference_sequence"
645-
# ]
646-
# is None
647-
# ) or layer is None:
648-
# del reference_sequences[target_gene][layer]
649-
650-
# TODO drop this "continue" after reimplementing multi-target mapping
651-
continue
652-
# TODO drop this after reimplementing multi-target mapping
653-
reference_sequences = reference_sequences.popitem()[1] # get only value in dict
654-
# TODO change this back after reimplementing multi-target mapping
655-
# this only works for --prefer_genomic right now, which is fine because we're going to change it back after reimplementing multi-target mapping
633+
for target_gene in reference_sequences:
634+
for layer in list(reference_sequences[target_gene].keys()):
635+
if (
636+
reference_sequences[target_gene][layer]["mapped_reference_sequence"]
637+
is None
638+
and reference_sequences[target_gene][layer][
639+
"computed_reference_sequence"
640+
]
641+
is None
642+
) or layer is None:
643+
del reference_sequences[target_gene][layer]
644+
656645
output = ScoresetMapping(
657646
metadata=metadata.model_dump(),
658-
computed_protein_reference_sequence=reference_sequences[
659-
AnnotationLayer.PROTEIN
660-
]["computed_reference_sequence"],
661-
mapped_protein_reference_sequence=reference_sequences[AnnotationLayer.PROTEIN][
662-
"mapped_reference_sequence"
663-
],
664-
computed_genomic_reference_sequence=reference_sequences[
665-
AnnotationLayer.GENOMIC
666-
]["computed_reference_sequence"],
667-
mapped_genomic_reference_sequence=reference_sequences[AnnotationLayer.GENOMIC][
668-
"mapped_reference_sequence"
669-
],
670647
reference_sequences=reference_sequences,
671648
mapped_scores=mapped_scores,
672649
)

src/dcd_mapping/mavedb_data.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -187,9 +187,6 @@ def get_scoreset_metadata(
187187
metadata = get_raw_scoreset_metadata(scoreset_urn, dcd_mapping_dir)
188188
target_genes = {}
189189
multi_target = len(metadata["targetGenes"]) > 1
190-
if multi_target:
191-
msg = f"Multiple target genes for {scoreset_urn}. Multi-target score sets are not currently supported."
192-
raise ScoresetNotSupportedError(msg)
193190

194191
for gene in metadata["targetGenes"]:
195192
if not _metadata_response_is_human(metadata):

src/dcd_mapping/schemas.py

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -205,17 +205,16 @@ class ScoresetMapping(BaseModel):
205205
mapped_date_utc: str = Field(
206206
default=datetime.datetime.now(tz=datetime.UTC).isoformat()
207207
)
208-
# TODO re-implement metadata change later to support multi-target score sets. will require corresponding changes in mavedb-api
209-
# reference_sequences: dict[
210-
# str,
211-
# dict[
212-
# AnnotationLayer,
213-
# dict[str, ComputedReferenceSequence | MappedReferenceSequence | None],
214-
# ],
215-
# ] | None = None
216-
computed_protein_reference_sequence: ComputedReferenceSequence | MappedReferenceSequence | None = None
217-
mapped_protein_reference_sequence: MappedReferenceSequence | None = None
218-
computed_genomic_reference_sequence: ComputedReferenceSequence | MappedReferenceSequence | None = None
219-
mapped_genomic_reference_sequence: MappedReferenceSequence | None = None
208+
reference_sequences: dict[
209+
str,
210+
dict[
211+
AnnotationLayer,
212+
dict[str, ComputedReferenceSequence | MappedReferenceSequence | None],
213+
],
214+
] | None = None
215+
# computed_protein_reference_sequence: ComputedReferenceSequence | MappedReferenceSequence | None = None
216+
# mapped_protein_reference_sequence: MappedReferenceSequence | None = None
217+
# computed_genomic_reference_sequence: ComputedReferenceSequence | MappedReferenceSequence | None = None
218+
# mapped_genomic_reference_sequence: MappedReferenceSequence | None = None
220219
mapped_scores: list[ScoreAnnotation] | None = None
221220
error_message: str | None = None

0 commit comments

Comments
 (0)