Skip to content

Commit ac427d7

Browse files
committed
feat: update mapper job to use new target metadata (layer/gene_info) format
1 parent ba26e4c commit ac427d7

File tree

3 files changed

+55
-83
lines changed

3 files changed

+55
-83
lines changed

src/mavedb/scripts/populate_mapped_variants.py

Lines changed: 13 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
11
import logging
2-
import click
32
from datetime import date
4-
from typing import Sequence, Optional
3+
from typing import Optional, Sequence, Union
54

5+
import click
66
from sqlalchemy import cast, select
77
from sqlalchemy.dialects.postgresql import JSONB
88
from sqlalchemy.orm import Session
@@ -12,10 +12,9 @@
1212
from mavedb.lib.logging.context import format_raised_exception_info_as_dict
1313
from mavedb.lib.mapping import ANNOTATION_LAYERS
1414
from mavedb.models.enums.mapping_state import MappingState
15-
from mavedb.models.score_set import ScoreSet
1615
from mavedb.models.mapped_variant import MappedVariant
16+
from mavedb.models.score_set import ScoreSet
1717
from mavedb.models.variant import Variant
18-
1918
from mavedb.scripts.environment import script_environment, with_database_session
2019

2120
logger = logging.getLogger(__name__)
@@ -111,18 +110,24 @@ def populate_mapped_variant_data(db: Session, urns: Sequence[Optional[str]], all
111110
)
112111
# allow for multiple annotation layers
113112
pre_mapped_metadata = {}
114-
post_mapped_metadata = {}
113+
post_mapped_metadata: dict[str, Union[Optional[str], dict[str, dict[str, str | list[str]]]]] = {}
115114
excluded_pre_mapped_keys = {"sequence"}
116-
for annotation_layer in reference_metadata[target_gene_identifier]:
117-
layer_premapped = reference_metadata[target_gene_identifier][annotation_layer].get(
115+
116+
gene_info = reference_metadata[target_gene_identifier].get("gene_info")
117+
if gene_info:
118+
target_gene.mapped_hgnc_name = gene_info.get("hgnc_symbol")
119+
post_mapped_metadata["hgnc_name_selection_method"] = gene_info.get("selection_method")
120+
121+
for annotation_layer in reference_metadata[target_gene_identifier]["layers"]:
122+
layer_premapped = reference_metadata[target_gene_identifier]["layers"][annotation_layer].get(
118123
"computed_reference_sequence"
119124
)
120125
if layer_premapped:
121126
pre_mapped_metadata[ANNOTATION_LAYERS[annotation_layer]] = {
122127
k: layer_premapped[k]
123128
for k in set(list(layer_premapped.keys())) - excluded_pre_mapped_keys
124129
}
125-
layer_postmapped = reference_metadata[target_gene_identifier][annotation_layer].get(
130+
layer_postmapped = reference_metadata[target_gene_identifier]["layers"][annotation_layer].get(
126131
"mapped_reference_sequence"
127132
)
128133
if layer_postmapped:

src/mavedb/worker/jobs.py

Lines changed: 8 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -456,18 +456,19 @@ async def map_variants_for_score_set(
456456
pre_mapped_metadata = {}
457457
post_mapped_metadata = {}
458458
excluded_pre_mapped_keys = {"sequence"}
459-
for annotation_layer in reference_metadata[target_gene_identifier]:
460-
layer_premapped = reference_metadata[target_gene_identifier][annotation_layer].get(
461-
"computed_reference_sequence"
462-
)
459+
post_mapped_metadata["gene_info"] = reference_metadata[target_gene_identifier].get("gene_info")
460+
for annotation_layer in reference_metadata[target_gene_identifier]["layers"]:
461+
layer_premapped = reference_metadata[target_gene_identifier]["layers"][
462+
annotation_layer
463+
].get("computed_reference_sequence")
463464
if layer_premapped:
464465
pre_mapped_metadata[ANNOTATION_LAYERS[annotation_layer]] = {
465466
k: layer_premapped[k]
466467
for k in set(list(layer_premapped.keys())) - excluded_pre_mapped_keys
467468
}
468-
layer_postmapped = reference_metadata[target_gene_identifier][annotation_layer].get(
469-
"mapped_reference_sequence"
470-
)
469+
layer_postmapped = reference_metadata[target_gene_identifier]["layers"][
470+
annotation_layer
471+
].get("mapped_reference_sequence")
471472
if layer_postmapped:
472473
post_mapped_metadata[ANNOTATION_LAYERS[annotation_layer]] = layer_postmapped
473474
target_gene.pre_mapped_metadata = cast(pre_mapped_metadata, JSONB)

tests/helpers/constants.py

Lines changed: 34 additions & 68 deletions
Original file line number | Diff line number | Diff line change
@@ -1189,27 +1189,38 @@
11891189
}
11901190
}
11911191

1192+
1193+
TEST_GENE_INFO = {
1194+
"hgnc_symbol": VALID_GENE,
1195+
"selection_method": "tx_selection",
1196+
}
1197+
1198+
1199+
TEST_GENOMIC_LAYER = {
1200+
"computed_reference_sequence": {
1201+
"sequence_type": "dna",
1202+
"sequence_id": "ga4gh:SQ.ref_test",
1203+
"sequence": "ACGTTT",
1204+
},
1205+
"mapped_reference_sequence": {
1206+
"sequence_type": "dna",
1207+
"sequence_id": "ga4gh:SQ.map_test",
1208+
"sequence_accessions": [VALID_CHR_ACCESSION],
1209+
},
1210+
}
1211+
1212+
TEST_CODING_LAYER = {
1213+
"mapped_reference_sequence": {
1214+
"sequence_accessions": [VALID_NT_ACCESSION],
1215+
},
1216+
}
1217+
11921218
TEST_SEQ_SCORESET_VARIANT_MAPPING_SCAFFOLD = {
11931219
"metadata": {},
11941220
"reference_sequences": {
11951221
"TEST1": {
1196-
"g": {
1197-
"computed_reference_sequence": {
1198-
"sequence_type": "dna",
1199-
"sequence_id": "ga4gh:SQ.ref_test",
1200-
"sequence": "ACGTTT",
1201-
},
1202-
"mapped_reference_sequence": {
1203-
"sequence_type": "dna",
1204-
"sequence_id": "ga4gh:SQ.map_test",
1205-
"sequence_accessions": [VALID_CHR_ACCESSION],
1206-
},
1207-
},
1208-
"c": {
1209-
"mapped_reference_sequence": {
1210-
"sequence_accessions": [VALID_NT_ACCESSION],
1211-
},
1212-
},
1222+
"gene_info": TEST_GENE_INFO,
1223+
"layers": {"g": TEST_GENOMIC_LAYER, "c": TEST_CODING_LAYER},
12131224
}
12141225
},
12151226
"mapped_scores": [],
@@ -1222,23 +1233,8 @@
12221233
"metadata": {},
12231234
"reference_sequences": {
12241235
"TEST2": {
1225-
"g": {
1226-
"computed_reference_sequence": {
1227-
"sequence_type": "dna",
1228-
"sequence_id": "ga4gh:SQ.ref_test",
1229-
"sequence": "ACGTTT",
1230-
},
1231-
"mapped_reference_sequence": {
1232-
"sequence_type": "dna",
1233-
"sequence_id": "ga4gh:SQ.map_test",
1234-
"sequence_accessions": [VALID_CHR_ACCESSION],
1235-
},
1236-
},
1237-
"c": {
1238-
"mapped_reference_sequence": {
1239-
"sequence_accessions": [VALID_NT_ACCESSION],
1240-
},
1241-
},
1236+
"gene_info": TEST_GENE_INFO,
1237+
"layers": {"g": TEST_GENOMIC_LAYER, "c": TEST_CODING_LAYER},
12421238
}
12431239
},
12441240
"mapped_scores": [],
@@ -1251,42 +1247,12 @@
12511247
"metadata": {},
12521248
"reference_sequences": {
12531249
"TEST3": {
1254-
"g": {
1255-
"computed_reference_sequence": {
1256-
"sequence_type": "dna",
1257-
"sequence_id": "ga4gh:SQ.ref_test3",
1258-
"sequence": "ACGTTT",
1259-
},
1260-
"mapped_reference_sequence": {
1261-
"sequence_type": "dna",
1262-
"sequence_id": "ga4gh:SQ.map_test",
1263-
"sequence_accessions": [VALID_CHR_ACCESSION],
1264-
},
1265-
},
1266-
"c": {
1267-
"mapped_reference_sequence": {
1268-
"sequence_accessions": [VALID_NT_ACCESSION],
1269-
},
1270-
},
1250+
"gene_info": TEST_GENE_INFO,
1251+
"layers": {"g": TEST_GENOMIC_LAYER, "c": TEST_CODING_LAYER},
12711252
},
12721253
"TEST4": {
1273-
"g": {
1274-
"computed_reference_sequence": {
1275-
"sequence_type": "dna",
1276-
"sequence_id": "ga4gh:SQ.ref_test4",
1277-
"sequence": "TAATGCC",
1278-
},
1279-
"mapped_reference_sequence": {
1280-
"sequence_type": "dna",
1281-
"sequence_id": "ga4gh:SQ.map_test",
1282-
"sequence_accessions": [VALID_CHR_ACCESSION],
1283-
},
1284-
},
1285-
"c": {
1286-
"mapped_reference_sequence": {
1287-
"sequence_accessions": [VALID_NT_ACCESSION],
1288-
},
1289-
},
1254+
"gene_info": TEST_GENE_INFO,
1255+
"layers": {"g": TEST_GENOMIC_LAYER, "c": TEST_CODING_LAYER},
12901256
},
12911257
},
12921258
"mapped_scores": [],

0 commit comments

Comments
 (0)