Skip to content

Commit 3e165bc

Browse files
committed
Fix bugs in mapper job, and centralize some mapper resources
variant_mapper_manager previously failed for two reasons: an incorrectly placed 'where' function, and two related db queries (one for a score set and one for the target gene(s) within that score set), which resulted in an error when adding the score set changes to the db. Instead, run a single query to select the score set, and then loop through the score set's target genes rather than querying the target genes table in the db.
1 parent a68de0a commit 3e165bc

File tree

3 files changed

+47
-50
lines changed

3 files changed

+47
-50
lines changed
Lines changed: 3 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
import os
2-
from datetime import date
3-
from typing import Optional, TypedDict
2+
from typing import Optional
43

5-
import requests
64
from cdot.hgvs.dataproviders import SeqFetcher, ChainedSeqFetcher, FastaSeqFetcher, RESTDataProvider
75

6+
from mavedb.lib.mapping import VRSMap
7+
88
GENOMIC_FASTA_FILES = [
99
"/data/GCF_000001405.39_GRCh38.p13_genomic.fna.gz",
1010
"/data/GCF_000001405.25_GRCh37.p13_genomic.fna.gz",
@@ -21,29 +21,5 @@ def cdot_rest() -> RESTDataProvider:
2121
return RESTDataProvider(seqfetcher=seqfetcher())
2222

2323

24-
class VRSMap:
25-
url: str
26-
27-
class ScoreSetMappingResults(TypedDict):
28-
metadata: Optional[dict[str, str]]
29-
dcd_mapping_version: str
30-
mapped_date_utc: date
31-
computed_genomic_reference_sequence: Optional[dict[str, str]]
32-
mapped_genomic_reference_sequence: Optional[dict[str, str]]
33-
computed_protein_reference_sequence: Optional[dict[str, str]]
34-
mapped_protein_reference_sequence: Optional[dict[str, str]]
35-
mapped_scores: Optional[list[dict]]
36-
error_message: Optional[str]
37-
38-
def __init__(self, url: str) -> None:
39-
self.url = url
40-
41-
def map_score_set(self, score_set_urn: str) -> ScoreSetMappingResults:
42-
uri = f"{self.url}/api/v1/map/{score_set_urn}"
43-
response = requests.post(uri)
44-
response.raise_for_status()
45-
return response.json()
46-
47-
4824
def vrs_mapper(url: Optional[str] = None) -> VRSMap:
4925
return VRSMap(DCD_MAP_URL) if not url else VRSMap(url)

src/mavedb/lib/mapping.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
from datetime import date
2+
from typing import Optional, TypedDict
3+
4+
import requests
5+
6+
ANNOTATION_LAYERS = {
7+
"g": "genomic",
8+
"p": "protein",
9+
"c": "cdna",
10+
}
11+
12+
13+
class VRSMap:
14+
url: str
15+
16+
class ScoreSetMappingResults(TypedDict):
17+
metadata: Optional[dict[str, str]]
18+
dcd_mapping_version: str
19+
mapped_date_utc: date
20+
computed_genomic_reference_sequence: Optional[dict[str, str]]
21+
mapped_genomic_reference_sequence: Optional[dict[str, str]]
22+
computed_protein_reference_sequence: Optional[dict[str, str]]
23+
mapped_protein_reference_sequence: Optional[dict[str, str]]
24+
mapped_scores: Optional[list[dict]]
25+
error_message: Optional[str]
26+
27+
def __init__(self, url: str) -> None:
28+
self.url = url
29+
30+
def map_score_set(self, score_set_urn: str) -> ScoreSetMappingResults:
31+
uri = f"{self.url}/api/v1/map/{score_set_urn}"
32+
response = requests.post(uri)
33+
response.raise_for_status()
34+
return response.json()

src/mavedb/worker/jobs.py

Lines changed: 10 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
NonexistentMappingResultsError,
3636
)
3737
from mavedb.lib.logging.context import format_raised_exception_info_as_dict
38+
from mavedb.lib.mapping import ANNOTATION_LAYERS
3839
from mavedb.lib.score_sets import (
3940
columns_for_dataset,
4041
create_variants,
@@ -50,7 +51,6 @@
5051
from mavedb.models.enums.processing_state import ProcessingState
5152
from mavedb.models.mapped_variant import MappedVariant
5253
from mavedb.models.published_variant import PublishedVariantsMV
53-
from mavedb.models.target_gene import TargetGene
5454
from mavedb.models.score_set import ScoreSet
5555
from mavedb.models.user import User
5656
from mavedb.models.variant import Variant
@@ -248,12 +248,6 @@ async def create_variants_for_score_set(
248248
# Mapping variants
249249
####################################################################################################
250250

251-
ANNOTATION_LAYERS = {
252-
"g": "genomic",
253-
"p": "protein",
254-
"c": "cdna",
255-
}
256-
257251

258252
@asynccontextmanager
259253
async def mapping_in_execution(redis: ArqRedis, job_id: str):
@@ -397,25 +391,19 @@ async def map_variants_for_score_set(
397391
score_set.mapping_state = MappingState.failed
398392
score_set.mapping_errors = {"error_message": mapping_results.get("error_message")}
399393
else:
400-
# TODO(VariantEffect/dcd-mapping2#2) after adding multi target mapping support:
401-
# this assumes single-target mapping, will need to be changed to support multi-target mapping
402-
# just in case there are multiple target genes in the db for a score set (this point shouldn't be reached
403-
# while we only support single-target mapping), match up the target sequence with the one in the computed genomic reference sequence.
404-
# TODO(VariantEffect/dcd-mapping2#3) after adding accession-based score set mapping support:
405-
# this also assumes that the score set is based on a target sequence, not a target accession
406-
407394
reference_metadata = mapping_results.get("reference_sequences")
408395
if not reference_metadata:
409396
raise NonexistentMappingReferenceError()
410397

411398
for target_gene_identifier in reference_metadata:
412-
target_gene = db.scalars(
413-
select(
414-
TargetGene.where(
415-
TargetGene.name == target_gene_identifier, TargetGene.score_set_id == score_set.id
416-
)
417-
)
418-
).one_or_none()
399+
target_gene = next(
400+
(
401+
target_gene
402+
for target_gene in score_set.target_genes
403+
if target_gene.name == target_gene_identifier
404+
),
405+
None,
406+
)
419407
if not target_gene:
420408
raise ValueError(
421409
f"Target gene {target_gene_identifier} not found in database for score set {score_set.urn}."
@@ -431,8 +419,7 @@ async def map_variants_for_score_set(
431419
if layer_premapped:
432420
pre_mapped_metadata[ANNOTATION_LAYERS[annotation_layer]] = {
433421
k: layer_premapped[k]
434-
for k in set(list(layer_premapped.keys()))
435-
- excluded_pre_mapped_keys # TODO does this work if no 'sequence' key?
422+
for k in set(list(layer_premapped.keys())) - excluded_pre_mapped_keys
436423
}
437424
layer_postmapped = reference_metadata[target_gene_identifier][annotation_layer].get(
438425
"mapped_reference_sequence"

0 commit comments

Comments
 (0)