Skip to content

Commit 0e52962

Browse files
authored
Merge pull request #436 from VariantEffect/mapper-multi-target
Support mapper update for multi-target score sets
2 parents 0a0ebaa + ccbe20f commit 0e52962

File tree

9 files changed

+415
-139
lines changed

9 files changed

+415
-139
lines changed
Lines changed: 3 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
import os
2-
from datetime import date
3-
from typing import Optional, TypedDict
2+
from typing import Optional
43

5-
import requests
64
from cdot.hgvs.dataproviders import SeqFetcher, ChainedSeqFetcher, FastaSeqFetcher, RESTDataProvider
75

6+
from mavedb.lib.mapping import VRSMap
7+
88
GENOMIC_FASTA_FILES = [
99
"/data/GCF_000001405.39_GRCh38.p13_genomic.fna.gz",
1010
"/data/GCF_000001405.25_GRCh37.p13_genomic.fna.gz",
@@ -21,29 +21,5 @@ def cdot_rest() -> RESTDataProvider:
2121
return RESTDataProvider(seqfetcher=seqfetcher())
2222

2323

24-
class VRSMap:
25-
url: str
26-
27-
class ScoreSetMappingResults(TypedDict):
28-
metadata: Optional[dict[str, str]]
29-
dcd_mapping_version: str
30-
mapped_date_utc: date
31-
computed_genomic_reference_sequence: Optional[dict[str, str]]
32-
mapped_genomic_reference_sequence: Optional[dict[str, str]]
33-
computed_protein_reference_sequence: Optional[dict[str, str]]
34-
mapped_protein_reference_sequence: Optional[dict[str, str]]
35-
mapped_scores: Optional[list[dict]]
36-
error_message: Optional[str]
37-
38-
def __init__(self, url: str) -> None:
39-
self.url = url
40-
41-
def map_score_set(self, score_set_urn: str) -> ScoreSetMappingResults:
42-
uri = f"{self.url}/api/v1/map/{score_set_urn}"
43-
response = requests.post(uri)
44-
response.raise_for_status()
45-
return response.json()
46-
47-
4824
def vrs_mapper(url: Optional[str] = None) -> VRSMap:
    """Build a ``VRSMap`` client for *url*, falling back to ``DCD_MAP_URL``.

    Any falsy *url* (``None`` or an empty string) selects ``DCD_MAP_URL``.
    """
    target_url = url or DCD_MAP_URL
    return VRSMap(target_url)

src/mavedb/lib/mapping.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
from datetime import date
2+
from typing import Optional, TypedDict, Union
3+
4+
import requests
5+
6+
# Short annotation-layer codes mapped to their spelled-out layer names.
ANNOTATION_LAYERS = {"g": "genomic", "p": "protein", "c": "cdna"}
11+
12+
13+
class VRSMap:
    """Minimal HTTP client for a mapping service's score-set mapping endpoint."""

    # Base URL of the mapping service; endpoint paths are appended to it.
    url: str

    class ScoreSetMappingResults(TypedDict):
        """Declared shape of the JSON document returned by ``map_score_set``."""

        metadata: Optional[dict[str, str]]
        dcd_mapping_version: str
        mapped_date_utc: date
        # NOTE(review): presumably keyed by target gene, then annotation layer,
        # then computed/mapped reference sequence -- confirm against the
        # mapping service's response schema.
        reference_sequences: Optional[dict[str, dict[str, dict[str, dict[str, Union[str, list[str]]]]]]]
        mapped_scores: Optional[list[dict]]
        error_message: Optional[str]

    def __init__(self, url: str) -> None:
        """Store the base *url* of the mapping service unchanged."""
        self.url = url

    def _mapping_uri(self, score_set_urn: str) -> str:
        """Return the mapping endpoint URI for *score_set_urn*."""
        # Strip trailing slashes so a base URL such as "http://host/" does not
        # produce a "//api/..." path.
        return f"{self.url.rstrip('/')}/api/v1/map/{score_set_urn}"

    def map_score_set(
        self,
        score_set_urn: str,
        timeout: Optional[Union[float, tuple[float, float]]] = None,
    ) -> ScoreSetMappingResults:
        """POST a mapping request for *score_set_urn* and return the decoded JSON.

        Args:
            score_set_urn: URN of the score set to map; it is placed in the
                request path as-is.
            timeout: Optional value passed through to ``requests.post``. The
                default ``None`` keeps the previous behaviour of waiting
                indefinitely for the service to answer.

        Raises:
            requests.HTTPError: If the service answers with an error status
                (raised by ``response.raise_for_status()``).
        """
        response = requests.post(self._mapping_uri(score_set_urn), timeout=timeout)
        response.raise_for_status()
        return response.json()

src/mavedb/scripts/populate_mapped_variants.py

Lines changed: 37 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,12 @@
88
from sqlalchemy.orm import Session
99

1010
from mavedb.data_providers.services import vrs_mapper
11+
from mavedb.lib.exceptions import NonexistentMappingReferenceError
1112
from mavedb.lib.logging.context import format_raised_exception_info_as_dict
13+
from mavedb.lib.mapping import ANNOTATION_LAYERS
1214
from mavedb.models.enums.mapping_state import MappingState
1315
from mavedb.models.score_set import ScoreSet
1416
from mavedb.models.mapped_variant import MappedVariant
15-
from mavedb.models.target_gene import TargetGene
1617
from mavedb.models.variant import Variant
1718

1819
from mavedb.scripts.environment import script_environment, with_database_session
@@ -91,47 +92,43 @@ def populate_mapped_variant_data(db: Session, urns: Sequence[Optional[str]], all
9192
db.commit()
9293
logger.info(f"No mapped variants available for {score_set.urn}.")
9394
else:
94-
computed_genomic_ref = mapped_scoreset.get("computed_genomic_reference_sequence")
95-
mapped_genomic_ref = mapped_scoreset.get("mapped_genomic_reference_sequence")
96-
computed_protein_ref = mapped_scoreset.get("computed_protein_reference_sequence")
97-
mapped_protein_ref = mapped_scoreset.get("mapped_protein_reference_sequence")
98-
99-
# assumes one target gene per score set, which is currently true in mavedb as of sept. 2024.
100-
target_gene = db.scalars(
101-
select(TargetGene)
102-
.join(ScoreSet)
103-
.where(
104-
ScoreSet.urn == str(score_set.urn),
95+
reference_metadata = mapped_scoreset.get("reference_sequences")
96+
if not reference_metadata:
97+
raise NonexistentMappingReferenceError()
98+
99+
for target_gene_identifier in reference_metadata:
100+
target_gene = next(
101+
(
102+
target_gene
103+
for target_gene in score_set.target_genes
104+
if target_gene.name == target_gene_identifier
105+
),
106+
None,
105107
)
106-
).one()
107-
108-
excluded_pre_mapped_keys = {"sequence"}
109-
if computed_genomic_ref and mapped_genomic_ref:
110-
pre_mapped_metadata = computed_genomic_ref
111-
target_gene.pre_mapped_metadata = cast(
112-
{
113-
"genomic": {
114-
k: pre_mapped_metadata[k]
115-
for k in set(list(pre_mapped_metadata.keys())) - excluded_pre_mapped_keys
108+
if not target_gene:
109+
raise ValueError(
110+
f"Target gene {target_gene_identifier} not found in database for score set {score_set.urn}."
111+
)
112+
# allow for multiple annotation layers
113+
pre_mapped_metadata = {}
114+
post_mapped_metadata = {}
115+
excluded_pre_mapped_keys = {"sequence"}
116+
for annotation_layer in reference_metadata[target_gene_identifier]:
117+
layer_premapped = reference_metadata[target_gene_identifier][annotation_layer].get(
118+
"computed_reference_sequence"
119+
)
120+
if layer_premapped:
121+
pre_mapped_metadata[ANNOTATION_LAYERS[annotation_layer]] = {
122+
k: layer_premapped[k]
123+
for k in set(list(layer_premapped.keys())) - excluded_pre_mapped_keys
116124
}
117-
},
118-
JSONB,
119-
)
120-
target_gene.post_mapped_metadata = cast({"genomic": mapped_genomic_ref}, JSONB)
121-
elif computed_protein_ref and mapped_protein_ref:
122-
pre_mapped_metadata = computed_protein_ref
123-
target_gene.pre_mapped_metadata = cast(
124-
{
125-
"protein": {
126-
k: pre_mapped_metadata[k]
127-
for k in set(list(pre_mapped_metadata.keys())) - excluded_pre_mapped_keys
128-
}
129-
},
130-
JSONB,
131-
)
132-
target_gene.post_mapped_metadata = cast({"protein": mapped_protein_ref}, JSONB)
133-
else:
134-
raise ValueError(f"incomplete or inconsistent metadata for score set {score_set.urn}")
125+
layer_postmapped = reference_metadata[target_gene_identifier][annotation_layer].get(
126+
"mapped_reference_sequence"
127+
)
128+
if layer_postmapped:
129+
post_mapped_metadata[ANNOTATION_LAYERS[annotation_layer]] = layer_postmapped
130+
target_gene.pre_mapped_metadata = cast(pre_mapped_metadata, JSONB)
131+
target_gene.post_mapped_metadata = cast(post_mapped_metadata, JSONB)
135132

136133
mapped_variants = [
137134
variant_from_mapping(db=db, mapping=mapped_score, dcd_mapping_version=dcd_mapping_version)

src/mavedb/worker/jobs.py

Lines changed: 34 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
NonexistentMappingResultsError,
3636
)
3737
from mavedb.lib.logging.context import format_raised_exception_info_as_dict
38+
from mavedb.lib.mapping import ANNOTATION_LAYERS
3839
from mavedb.lib.score_sets import (
3940
columns_for_dataset,
4041
create_variants,
@@ -390,55 +391,43 @@ async def map_variants_for_score_set(
390391
score_set.mapping_state = MappingState.failed
391392
score_set.mapping_errors = {"error_message": mapping_results.get("error_message")}
392393
else:
393-
# TODO(VariantEffect/dcd-mapping2#2) after adding multi target mapping support:
394-
# this assumes single-target mapping, will need to be changed to support multi-target mapping
395-
# just in case there are multiple target genes in the db for a score set (this point shouldn't be reached
396-
# while we only support single-target mapping), match up the target sequence with the one in the computed genomic reference sequence.
397-
# TODO(VariantEffect/dcd-mapping2#3) after adding accession-based score set mapping support:
398-
# this also assumes that the score set is based on a target sequence, not a target accession
399-
400-
computed_genomic_ref = mapping_results.get("computed_genomic_reference_sequence")
401-
mapped_genomic_ref = mapping_results.get("mapped_genomic_reference_sequence")
402-
computed_protein_ref = mapping_results.get("computed_protein_reference_sequence")
403-
mapped_protein_ref = mapping_results.get("mapped_protein_reference_sequence")
404-
405-
if computed_genomic_ref:
406-
target_sequence = computed_genomic_ref["sequence"] # noqa: F841
407-
elif computed_protein_ref:
408-
target_sequence = computed_protein_ref["sequence"] # noqa: F841
409-
else:
394+
reference_metadata = mapping_results.get("reference_sequences")
395+
if not reference_metadata:
410396
raise NonexistentMappingReferenceError()
411397

412-
# TODO(VariantEffect/dcd_mapping2#2): Handle variant mappings for score sets with more than 1 target.
413-
target_gene = score_set.target_genes[0]
414-
415-
excluded_pre_mapped_keys = {"sequence"}
416-
if computed_genomic_ref and mapped_genomic_ref:
417-
pre_mapped_metadata = computed_genomic_ref
418-
target_gene.pre_mapped_metadata = cast(
419-
{
420-
"genomic": {
421-
k: pre_mapped_metadata[k]
422-
for k in set(list(pre_mapped_metadata.keys())) - excluded_pre_mapped_keys
423-
}
424-
},
425-
JSONB,
398+
for target_gene_identifier in reference_metadata:
399+
target_gene = next(
400+
(
401+
target_gene
402+
for target_gene in score_set.target_genes
403+
if target_gene.name == target_gene_identifier
404+
),
405+
None,
426406
)
427-
target_gene.post_mapped_metadata = cast({"genomic": mapped_genomic_ref}, JSONB)
428-
elif computed_protein_ref and mapped_protein_ref:
429-
pre_mapped_metadata = computed_protein_ref
430-
target_gene.pre_mapped_metadata = cast(
431-
{
432-
"protein": {
433-
k: pre_mapped_metadata[k]
434-
for k in set(list(pre_mapped_metadata.keys())) - excluded_pre_mapped_keys
407+
if not target_gene:
408+
raise ValueError(
409+
f"Target gene {target_gene_identifier} not found in database for score set {score_set.urn}."
410+
)
411+
# allow for multiple annotation layers
412+
pre_mapped_metadata = {}
413+
post_mapped_metadata = {}
414+
excluded_pre_mapped_keys = {"sequence"}
415+
for annotation_layer in reference_metadata[target_gene_identifier]:
416+
layer_premapped = reference_metadata[target_gene_identifier][annotation_layer].get(
417+
"computed_reference_sequence"
418+
)
419+
if layer_premapped:
420+
pre_mapped_metadata[ANNOTATION_LAYERS[annotation_layer]] = {
421+
k: layer_premapped[k]
422+
for k in set(list(layer_premapped.keys())) - excluded_pre_mapped_keys
435423
}
436-
},
437-
JSONB,
438-
)
439-
target_gene.post_mapped_metadata = cast({"protein": mapped_protein_ref}, JSONB)
440-
else:
441-
raise NonexistentMappingReferenceError()
424+
layer_postmapped = reference_metadata[target_gene_identifier][annotation_layer].get(
425+
"mapped_reference_sequence"
426+
)
427+
if layer_postmapped:
428+
post_mapped_metadata[ANNOTATION_LAYERS[annotation_layer]] = layer_postmapped
429+
target_gene.pre_mapped_metadata = cast(pre_mapped_metadata, JSONB)
430+
target_gene.post_mapped_metadata = cast(post_mapped_metadata, JSONB)
442431

443432
total_variants = 0
444433
successful_mapped_variants = 0

0 commit comments

Comments
 (0)