Skip to content

Commit 5d52e0c

Browse files
committed
feat: compute gene info for all mapped targets.
Computes a new `gene_info` property for all mapped targets. This property is defined by an `hgnc_symbol` and a `selection_method`. The hgnc symbol is the HGNC symbol of the gene to which this target relates. The selection method is the method by which this symbol was selected and may be: - `tx_selection`: via the selected transcript - `alignment_max_covered_bases`: based on the gene 'feature' (via Ensembl) which covered the most bases of the aligned target - `variants_max_covered_bases`: same as `alignment_max_covered_bases`, but based on variant bases rather than aligned bases - `target_metadata`: based on parsing the target metadata the user supplied - `target_category`: no gene info was selected because the target was not protein coding (see #66) Various helpers were added to `dcd_mapping.annotate` which support this calculation. Gene info selection should not cause job failures, and will simply fail to select gene info on failure.
1 parent 6598890 commit 5d52e0c

File tree

4 files changed

+681
-19
lines changed

4 files changed

+681
-19
lines changed

src/api/routers/map.py

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
_get_mapped_reference_sequence,
1313
_set_scoreset_layer,
1414
annotate,
15+
compute_target_gene_info,
1516
)
1617
from dcd_mapping.exceptions import (
1718
AlignmentError,
@@ -34,6 +35,7 @@
3435
from dcd_mapping.schemas import (
3536
ScoreAnnotation,
3637
ScoresetMapping,
38+
TargetAnnotation,
3739
TargetType,
3840
TxSelectResult,
3941
VrsVersion,
@@ -196,29 +198,41 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
196198

197199
try:
198200
raw_metadata = get_raw_scoreset_metadata(urn, store_path)
199-
reference_sequences: dict[str, dict] = {}
201+
reference_sequences: dict[str, TargetAnnotation] = {}
200202
mapped_scores: list[ScoreAnnotation] = []
201203
for target_gene in annotated_vrs_results:
202204
preferred_layers = {
203205
_set_scoreset_layer(urn, annotated_vrs_results[target_gene]),
204206
}
205207
target_gene_name = metadata.target_genes[target_gene].target_gene_name
206-
reference_sequences[target_gene_name] = {
208+
reference_sequences[target_gene_name] = TargetAnnotation()
209+
reference_sequences[target_gene_name].layers = {
207210
layer: {
208211
"computed_reference_sequence": None,
209212
"mapped_reference_sequence": None,
210213
}
211214
for layer in preferred_layers
212215
}
216+
213217
# sometimes Nonetype layers show up in preferred layers dict; remove these
214218
preferred_layers.discard(None)
219+
220+
# Determine one gene symbol per target and its selection method
221+
gene_info = await compute_target_gene_info(
222+
target_key=target_gene,
223+
transcripts=transcripts,
224+
alignment_results=alignment_results,
225+
metadata=metadata,
226+
mapped_scores=annotated_vrs_results[target_gene],
227+
)
228+
215229
for layer in preferred_layers:
216-
reference_sequences[target_gene_name][layer][
230+
reference_sequences[target_gene_name].layers[layer][
217231
"computed_reference_sequence"
218232
] = _get_computed_reference_sequence(
219233
metadata.target_genes[target_gene], layer, transcripts[target_gene]
220234
)
221-
reference_sequences[target_gene_name][layer][
235+
reference_sequences[target_gene_name].layers[layer][
222236
"mapped_reference_sequence"
223237
] = _get_mapped_reference_sequence(
224238
metadata.target_genes[target_gene],
@@ -227,6 +241,9 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
227241
alignment_results[target_gene],
228242
)
229243

244+
if gene_info is not None:
245+
reference_sequences[target_gene_name].gene_info = gene_info
246+
230247
for m in annotated_vrs_results[target_gene]:
231248
if m.pre_mapped is None:
232249
mapped_scores.append(ScoreAnnotation(**m.model_dump()))
@@ -236,15 +253,15 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
236253

237254
# if genomic layer, not accession-based, and target gene type is coding, add cdna entry (just the sequence accession) to reference_sequences dict
238255
if (
239-
AnnotationLayer.GENOMIC in reference_sequences[target_gene_name]
256+
AnnotationLayer.GENOMIC in reference_sequences[target_gene_name].layers
240257
and metadata.target_genes[target_gene].target_gene_category
241258
== TargetType.PROTEIN_CODING
242259
and metadata.target_genes[target_gene].target_accession_id is None
243260
and transcripts[target_gene] is not None
244261
and isinstance(transcripts[target_gene], TxSelectResult)
245262
and transcripts[target_gene].nm is not None
246263
):
247-
reference_sequences[target_gene_name][AnnotationLayer.CDNA] = {
264+
reference_sequences[target_gene_name].layers[AnnotationLayer.CDNA] = {
248265
"computed_reference_sequence": None,
249266
"mapped_reference_sequence": {
250267
"sequence_accessions": [transcripts[target_gene].nm]
@@ -253,18 +270,18 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
253270

254271
# drop Nonetype reference sequences
255272
for target_gene in reference_sequences:
256-
for layer in list(reference_sequences[target_gene].keys()):
273+
for layer in list(reference_sequences[target_gene].layers.keys()):
257274
if (
258-
reference_sequences[target_gene][layer][
275+
reference_sequences[target_gene].layers[layer][
259276
"mapped_reference_sequence"
260277
]
261278
is None
262-
and reference_sequences[target_gene][layer][
279+
and reference_sequences[target_gene].layers[layer][
263280
"computed_reference_sequence"
264281
]
265282
is None
266283
) or layer is None:
267-
del reference_sequences[target_gene][layer]
284+
del reference_sequences[target_gene].layers[layer]
268285

269286
except Exception as e:
270287
return JSONResponse(

0 commit comments

Comments
 (0)