Skip to content

Commit a44bd2e

Browse files
committed
Add transcript information to mapped metadata for genomic score sets
1 parent ccec5a5 commit a44bd2e

File tree

3 files changed

+56
-12
lines changed

3 files changed

+56
-12
lines changed

src/api/routers/map.py

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
""""Provide mapping router"""
22
from pathlib import Path
33

4+
from cool_seq_tool.schemas import AnnotationLayer
45
from fastapi import APIRouter, HTTPException
56
from fastapi.responses import JSONResponse
67
from requests import HTTPError
@@ -21,7 +22,13 @@
2122
with_mavedb_score_set,
2223
)
2324
from dcd_mapping.resource_utils import ResourceAcquisitionError
24-
from dcd_mapping.schemas import ScoreAnnotation, ScoresetMapping, VrsVersion
25+
from dcd_mapping.schemas import (
26+
ScoreAnnotation,
27+
ScoresetMapping,
28+
TargetType,
29+
TxSelectResult,
30+
VrsVersion,
31+
)
2532
from dcd_mapping.transcripts import select_transcripts
2633
from dcd_mapping.vrs_map import VrsMapError, vrs_map
2734

@@ -147,7 +154,8 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp
147154
preferred_layers = {
148155
_set_scoreset_layer(urn, annotated_vrs_results[target_gene]),
149156
}
150-
reference_sequences[target_gene] = {
157+
target_gene_name = metadata.target_genes[target_gene].target_gene_name
158+
reference_sequences[target_gene_name] = {
151159
layer: {
152160
"computed_reference_sequence": None,
153161
"mapped_reference_sequence": None,
@@ -157,12 +165,12 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp
157165
# None sometimes ends up in the preferred_layers set; remove it
158166
preferred_layers.discard(None)
159167
for layer in preferred_layers:
160-
reference_sequences[target_gene][layer][
168+
reference_sequences[target_gene_name][layer][
161169
"computed_reference_sequence"
162170
] = _get_computed_reference_sequence(
163171
metadata.target_genes[target_gene], layer, transcripts[target_gene]
164172
)
165-
reference_sequences[target_gene][layer][
173+
reference_sequences[target_gene_name][layer][
166174
"mapped_reference_sequence"
167175
] = _get_mapped_reference_sequence(
168176
metadata.target_genes[target_gene],
@@ -193,6 +201,23 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp
193201
) or layer is None:
194202
del reference_sequences[target_gene][layer]
195203

204+
# if genomic layer, not accession-based, and target gene type is coding, add cdna entry (just the sequence accession) to reference_sequences dict
205+
if (
206+
AnnotationLayer.GENOMIC in reference_sequences[target_gene_name]
207+
and metadata.target_genes[target_gene].target_gene_category
208+
== TargetType.PROTEIN_CODING
209+
and metadata.target_genes[target_gene].target_accession_id is None
210+
and transcripts[target_gene] is not None
211+
and isinstance(transcripts[target_gene], TxSelectResult)
212+
and transcripts[target_gene].nm is not None
213+
):
214+
reference_sequences[target_gene_name][AnnotationLayer.CDNA] = {
215+
"computed_reference_sequence": None,
216+
"mapped_reference_sequence": {
217+
"sequence_accessions": [transcripts[target_gene].nm]
218+
},
219+
}
220+
196221
except Exception as e:
197222
return JSONResponse(
198223
content=ScoresetMapping(

src/dcd_mapping/annotate.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
ScoresetMetadata,
4242
TargetGene,
4343
TargetSequenceType,
44+
TargetType,
4445
TxSelectResult,
4546
VrsVersion,
4647
)
@@ -598,7 +599,10 @@ def save_mapped_output_json(
598599
mapping.annotation_layer for mapping in mappings[target_gene]
599600
}
600601

601-
reference_sequences[target_gene] = {
602+
# use target gene name in reference sequence dictionary, rather than the label, which differs between score sets
603+
target_gene_name = metadata.target_genes[target_gene].target_gene_name
604+
605+
reference_sequences[target_gene_name] = {
602606
layer: {
603607
"computed_reference_sequence": None,
604608
"mapped_reference_sequence": None,
@@ -608,12 +612,12 @@ def save_mapped_output_json(
608612
# None sometimes ends up in the preferred_layers set; remove it
609613
preferred_layers.discard(None)
610614
for layer in preferred_layers:
611-
reference_sequences[target_gene][layer][
615+
reference_sequences[target_gene_name][layer][
612616
"computed_reference_sequence"
613617
] = _get_computed_reference_sequence(
614618
metadata.target_genes[target_gene], layer, tx_output[target_gene]
615619
)
616-
reference_sequences[target_gene][layer][
620+
reference_sequences[target_gene_name][layer][
617621
"mapped_reference_sequence"
618622
] = _get_mapped_reference_sequence(
619623
metadata.target_genes[target_gene],
@@ -622,6 +626,23 @@ def save_mapped_output_json(
622626
align_results[target_gene],
623627
)
624628

629+
# if genomic layer, not accession-based, and target gene type is coding, add cdna entry (just the sequence accession) to reference_sequences dict
630+
if (
631+
AnnotationLayer.GENOMIC in reference_sequences[target_gene_name]
632+
and metadata.target_genes[target_gene].target_gene_category
633+
== TargetType.PROTEIN_CODING
634+
and metadata.target_genes[target_gene].target_accession_id is None
635+
and tx_output[target_gene] is not None
636+
and isinstance(tx_output[target_gene], TxSelectResult)
637+
and tx_output[target_gene].nm is not None
638+
):
639+
reference_sequences[target_gene_name][AnnotationLayer.CDNA] = {
640+
"computed_reference_sequence": None,
641+
"mapped_reference_sequence": {
642+
"sequence_accessions": [tx_output[target_gene].nm]
643+
},
644+
}
645+
625646
for m in mappings[target_gene]:
626647
if m.pre_mapped is None:
627648
mapped_scores.append(ScoreAnnotation(**m.model_dump()))

src/dcd_mapping/schemas.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -209,12 +209,10 @@ class ScoresetMapping(BaseModel):
209209
str,
210210
dict[
211211
AnnotationLayer,
212-
dict[str, ComputedReferenceSequence | MappedReferenceSequence | None],
212+
dict[
213+
str, ComputedReferenceSequence | MappedReferenceSequence | dict | None
214+
],
213215
],
214216
] | None = None
215-
# computed_protein_reference_sequence: ComputedReferenceSequence | MappedReferenceSequence | None = None
216-
# mapped_protein_reference_sequence: MappedReferenceSequence | None = None
217-
# computed_genomic_reference_sequence: ComputedReferenceSequence | MappedReferenceSequence | None = None
218-
# mapped_genomic_reference_sequence: MappedReferenceSequence | None = None
219217
mapped_scores: list[ScoreAnnotation] | None = None
220218
error_message: str | None = None

0 commit comments

Comments
 (0)