Skip to content

Commit 1333963

Browse files
authored
Merge pull request #56 from VariantEffect/mavedb-dev
Release 2025.2.0
2 parents 2bd58d2 + f037ce1 commit 1333963

File tree

14 files changed

+282
-122
lines changed

14 files changed

+282
-122
lines changed

docker-compose-dev.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ services:
3030
- vrs-mapping-data-dev:/var/lib/postgresql/data
3131

3232
seqrepo:
33-
image: biocommons/seqrepo:2021-01-29
33+
image: biocommons/seqrepo:2024-12-20
3434
volumes:
3535
- vrs-mapping-seqrepo-dev:/usr/local/share/seqrepo
3636

settings/.env.dev

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,4 +30,4 @@ MAVEDB_API_KEY=
3030
# Environment variables for seqrepo
3131
####################################################################################################
3232

33-
SEQREPO_ROOT_DIR=/usr/local/share/seqrepo/2021-01-29
33+
SEQREPO_ROOT_DIR=/usr/local/share/seqrepo/2024-12-20

src/api/routers/map.py

Lines changed: 60 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,22 +6,31 @@
66
from fastapi.responses import JSONResponse
77
from requests import HTTPError
88

9-
from dcd_mapping.align import AlignmentError, BlatNotFoundError, build_alignment_result
9+
from dcd_mapping.align import build_alignment_result
1010
from dcd_mapping.annotate import (
1111
_get_computed_reference_sequence,
1212
_get_mapped_reference_sequence,
1313
_set_scoreset_layer,
1414
annotate,
1515
)
16-
from dcd_mapping.lookup import DataLookupError
17-
from dcd_mapping.mavedb_data import (
16+
from dcd_mapping.exceptions import (
17+
AlignmentError,
18+
BlatNotFoundError,
19+
DataLookupError,
20+
MissingSequenceIdError,
21+
ResourceAcquisitionError,
1822
ScoresetNotSupportedError,
23+
UnsupportedReferenceSequenceNameSpaceError,
24+
UnsupportedReferenceSequencePrefixError,
25+
VrsMapError,
26+
)
27+
from dcd_mapping.mavedb_data import (
1928
get_raw_scoreset_metadata,
2029
get_scoreset_metadata,
2130
get_scoreset_records,
31+
patch_target_sequence_type,
2232
with_mavedb_score_set,
2333
)
24-
from dcd_mapping.resource_utils import ResourceAcquisitionError
2534
from dcd_mapping.schemas import (
2635
ScoreAnnotation,
2736
ScoresetMapping,
@@ -30,7 +39,7 @@
3039
VrsVersion,
3140
)
3241
from dcd_mapping.transcripts import select_transcripts
33-
from dcd_mapping.vrs_map import VrsMapError, vrs_map
42+
from dcd_mapping.vrs_map import vrs_map
3443

3544
router = APIRouter(
3645
prefix="/api/v1", tags=["mappings"], responses={404: {"description": "Not found"}}
@@ -48,6 +57,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
4857
try:
4958
metadata = get_scoreset_metadata(urn, store_path)
5059
records = get_scoreset_records(metadata, True, store_path)
60+
metadata = patch_target_sequence_type(metadata, records)
5161
except ScoresetNotSupportedError as e:
5262
return JSONResponse(
5363
content=ScoresetMapping(
@@ -66,6 +76,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
6676
error_message="Score set contains no variants to map",
6777
).model_dump(exclude_none=True)
6878
)
79+
total_score_records = sum(len(v) for v in records.values())
6980

7081
try:
7182
alignment_results = build_alignment_result(metadata, True)
@@ -112,21 +123,38 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
112123
transcript=transcripts[target_gene],
113124
silent=True,
114125
)
115-
except VrsMapError as e:
126+
except (
127+
UnsupportedReferenceSequenceNameSpaceError,
128+
VrsMapError,
129+
UnsupportedReferenceSequencePrefixError,
130+
MissingSequenceIdError,
131+
) as e:
116132
return JSONResponse(
117133
content=ScoresetMapping(
118134
metadata=metadata, error_message=str(e).strip("'")
119135
).model_dump(exclude_none=True)
120136
)
121-
if not vrs_results or all(
122-
mapping_result is None for mapping_result in vrs_results.values()
123-
):
137+
138+
nonetype_vrs_results = [
139+
result is None
140+
for target_gene in vrs_results
141+
for result in vrs_results[target_gene]
142+
]
143+
144+
if not vrs_results or all(nonetype_vrs_results):
124145
return JSONResponse(
125146
content=ScoresetMapping(
126147
metadata=metadata,
127148
error_message="No variant mappings available for this score set",
128149
).model_dump(exclude_none=True)
129150
)
151+
if any(nonetype_vrs_results):
152+
return JSONResponse(
153+
content=ScoresetMapping(
154+
metadata=metadata,
155+
error_message="Some variants generated vrs results, but not all. If any variants were mapped, all should have been.",
156+
).model_dump(exclude_none=True)
157+
)
130158

131159
annotated_vrs_results = {}
132160
try:
@@ -144,15 +172,27 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
144172
metadata=metadata, error_message=str(e).strip("'")
145173
).model_dump(exclude_none=True)
146174
)
147-
if not annotated_vrs_results or all(
148-
mapping_result is None for mapping_result in annotated_vrs_results.values()
149-
):
175+
176+
nonetype_annotated_vrs_results = [
177+
result is None
178+
for target_gene in annotated_vrs_results
179+
for result in annotated_vrs_results[target_gene]
180+
]
181+
182+
if not annotated_vrs_results or all(nonetype_annotated_vrs_results):
150183
return JSONResponse(
151184
content=ScoresetMapping(
152185
metadata=metadata,
153186
error_message="No annotated variant mappings available for this score set",
154187
).model_dump(exclude_none=True)
155188
)
189+
if any(nonetype_annotated_vrs_results):
190+
return JSONResponse(
191+
content=ScoresetMapping(
192+
metadata=metadata,
193+
error_message="Some variants generated annotated vrs results, but not all. If any variants were annotated, all should have been.",
194+
).model_dump(exclude_none=True)
195+
)
156196

157197
try:
158198
raw_metadata = get_raw_scoreset_metadata(urn, store_path)
@@ -233,6 +273,14 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
233273
).model_dump(exclude_none=True)
234274
)
235275

276+
if len(mapped_scores) != total_score_records:
277+
return JSONResponse(
278+
content=ScoresetMapping(
279+
metadata=metadata,
280+
error_message=f"Mismatch between number of mapped scores ({len(mapped_scores)}) and total score records ({total_score_records}). This is unexpected and indicates an issue with the mapping process.",
281+
).model_dump(exclude_none=True)
282+
)
283+
236284
return JSONResponse(
237285
content=ScoresetMapping(
238286
metadata=raw_metadata,

src/dcd_mapping/align.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,15 @@
1212
from Bio.SearchIO._model import Hit, QueryResult
1313
from cool_seq_tool.schemas import Strand
1414

15-
from dcd_mapping.lookup import get_chromosome_identifier, get_gene_location
16-
from dcd_mapping.mavedb_data import LOCAL_STORE_PATH, ScoresetNotSupportedError
17-
from dcd_mapping.resource_utils import (
15+
from dcd_mapping.exceptions import (
16+
AlignmentError,
17+
BlatNotFoundError,
1818
ResourceAcquisitionError,
19-
http_download,
19+
ScoresetNotSupportedError,
2020
)
21+
from dcd_mapping.lookup import get_chromosome_identifier, get_gene_location
22+
from dcd_mapping.mavedb_data import LOCAL_STORE_PATH
23+
from dcd_mapping.resource_utils import http_download
2124
from dcd_mapping.schemas import (
2225
AlignmentResult,
2326
GeneLocation,
@@ -32,14 +35,6 @@
3235
_logger = logging.getLogger(__name__)
3336

3437

35-
class AlignmentError(Exception):
36-
"""Raise when errors encountered during alignment."""
37-
38-
39-
class BlatNotFoundError(AlignmentError):
40-
"""Raise when BLAT binary appears to be missing."""
41-
42-
4338
def _write_query_file(file: Path, lines: list[str]) -> None:
4439
"""Write lines to query file. This method is broken out to enable easy mocking while
4540
testing.
@@ -363,6 +358,11 @@ def align(
363358
msg = f"BLAT result {target_label} matches multiple target gene names in scoreset {scoreset_metadata.urn}"
364359
target_gene = scoreset_metadata.target_genes[target_label]
365360
alignment_results[target_label] = _get_best_match(blat_result, target_gene)
361+
# confirm that there is an alignment result for each target gene
362+
for target_gene in scoreset_metadata.target_genes:
363+
if target_gene not in alignment_results:
364+
msg = f"No BLAT result found for target gene {target_gene} in scoreset {scoreset_metadata.urn}"
365+
raise AlignmentError(msg)
366366
return alignment_results
367367

368368

src/dcd_mapping/annotate.py

Lines changed: 52 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ def _get_vrs_ref_allele_seq(
140140
metadata: TargetGene,
141141
urn: str,
142142
tx_select_results: TxSelectResult | None,
143-
) -> Extension:
143+
) -> Extension | None:
144144
"""Create `vrs_ref_allele_seq` property."""
145145
start, end = _offset_allele_ref_seq(urn, allele.location.start, allele.location.end)
146146
if (
@@ -161,8 +161,12 @@ def _get_vrs_ref_allele_seq(
161161
seq = f"ga4gh:{allele.location.sequenceReference.refgetAccession}" # type: ignore
162162
sr = get_seqrepo()
163163
ref = sr.get_sequence(seq, start, end)
164-
if ref is None:
165-
raise ValueError
164+
165+
if not ref:
166+
msg = f"Could not retrieve reference sequence for allele {allele.id} in urn {urn} with start {start} and end {end}"
167+
_logger.warning(msg)
168+
return None
169+
166170
return Extension(type="Extension", name="vrs_ref_allele_seq", value=ref)
167171

168172

@@ -256,23 +260,28 @@ def _annotate_allele_mapping(
256260
post_mapped: Allele = mapped_score.post_mapped
257261

258262
# get vrs_ref_allele_seq for pre-mapped variants
259-
pre_mapped.extensions = [
260-
_get_vrs_ref_allele_seq(pre_mapped, metadata, urn, tx_results)
261-
]
263+
ref_allele_seq_extension = _get_vrs_ref_allele_seq(
264+
pre_mapped, metadata, urn, tx_results
265+
)
266+
if ref_allele_seq_extension is not None:
267+
pre_mapped.extensions = [ref_allele_seq_extension]
262268

263269
if post_mapped:
264270
# Determine reference sequence
265271
if mapped_score.annotation_layer == AnnotationLayer.GENOMIC:
266272
sequence_id = f"ga4gh:{mapped_score.post_mapped.location.sequenceReference.refgetAccession}"
267273
accession = get_chromosome_identifier_from_vrs_id(sequence_id)
268274
if accession is None:
269-
raise ValueError
270-
if accession.startswith("refseq:"):
275+
accession = None
276+
mapped_score.error_message = "Could not determine accession for this annotation. No allele expression is available."
277+
elif accession.startswith("refseq:"):
271278
accession = accession[7:]
272279
else:
273280
if tx_results is None or isinstance(tx_results, TxSelectError):
274-
raise ValueError # impossible by definition
275-
accession = tx_results.np
281+
accession = None
282+
mapped_score.error_message = "Could not determine accession for this annotation. No allele expression is available."
283+
else:
284+
accession = tx_results.np
276285

277286
sr = get_seqrepo()
278287
loc = mapped_score.post_mapped.location
@@ -281,8 +290,9 @@ def _annotate_allele_mapping(
281290
post_mapped.extensions = [
282291
Extension(type="Extension", name="vrs_ref_allele_seq", value=ref)
283292
]
284-
hgvs_string, syntax = _get_hgvs_string(post_mapped, accession)
285-
post_mapped.expressions = [Expression(syntax=syntax, value=hgvs_string)]
293+
if accession:
294+
hgvs_string, syntax = _get_hgvs_string(post_mapped, accession)
295+
post_mapped.expressions = [Expression(syntax=syntax, value=hgvs_string)]
286296

287297
if vrs_version == VrsVersion.V_1_3:
288298
pre_mapped = _allele_to_vod(pre_mapped)
@@ -295,9 +305,7 @@ def _annotate_allele_mapping(
295305
mavedb_id=mapped_score.accession_id,
296306
score=float(mapped_score.score) if mapped_score.score else None,
297307
annotation_layer=mapped_score.annotation_layer,
298-
error_message=mapped_score.error_message
299-
if mapped_score.error_message
300-
else None, # TODO might not need if statement here
308+
error_message=mapped_score.error_message,
301309
)
302310

303311

@@ -311,23 +319,32 @@ def _annotate_haplotype_mapping(
311319
"""Perform annotations and, if necessary, create VRS 1.3 equivalents for haplotype mappings."""
312320
pre_mapped: Haplotype = mapped_score.pre_mapped # type: ignore
313321
post_mapped: Haplotype = mapped_score.post_mapped # type: ignore
322+
314323
# get vrs_ref_allele_seq for pre-mapped variants
315324
for allele in pre_mapped.members:
316-
allele.extensions = [_get_vrs_ref_allele_seq(allele, metadata, urn, tx_results)]
325+
ref_allele_seq_extension = _get_vrs_ref_allele_seq(
326+
allele, metadata, urn, tx_results
327+
)
328+
if ref_allele_seq_extension is not None:
329+
allele.extensions = [ref_allele_seq_extension]
317330

318331
if post_mapped:
319332
# Determine reference sequence
320333
if mapped_score.annotation_layer == AnnotationLayer.GENOMIC:
321334
sequence_id = f"ga4gh:{post_mapped.members[0].location.sequenceReference.refgetAccession}"
322335
accession = get_chromosome_identifier_from_vrs_id(sequence_id)
323336
if accession is None:
324-
raise ValueError
325-
if accession.startswith("refseq:"):
337+
accession = None
338+
mapped_score.error_message = "Could not determine accession for this annotation. No allele expression is available."
339+
elif accession.startswith("refseq:"):
326340
accession = accession[7:]
327341
else:
328342
if tx_results is None or isinstance(tx_results, TxSelectError):
329-
raise ValueError # impossible by definition
330-
accession = tx_results.np
343+
# impossible by definition
344+
accession = None
345+
mapped_score.error_message = "Could not determine accession for this annotation. No allele expression is available."
346+
else:
347+
accession = tx_results.np
331348

332349
sr = get_seqrepo()
333350
for allele in post_mapped.members:
@@ -337,8 +354,9 @@ def _annotate_haplotype_mapping(
337354
allele.extensions = [
338355
Extension(type="Extension", name="vrs_ref_allele_seq", value=ref)
339356
]
340-
hgvs, syntax = _get_hgvs_string(allele, accession)
341-
allele.expressions = [Expression(syntax=syntax, value=hgvs)]
357+
if accession:
358+
hgvs, syntax = _get_hgvs_string(allele, accession)
359+
allele.expressions = [Expression(syntax=syntax, value=hgvs)]
342360

343361
if vrs_version == VrsVersion.V_1_3:
344362
pre_mapped = _haplotype_to_haplotype_1_3(pre_mapped)
@@ -351,9 +369,7 @@ def _annotate_haplotype_mapping(
351369
mavedb_id=mapped_score.accession_id,
352370
score=float(mapped_score.score) if mapped_score.score is not None else None,
353371
annotation_layer=mapped_score.annotation_layer,
354-
error_message=mapped_score.error_message
355-
if mapped_score.error_message
356-
else None, # TODO might not need if statement here
372+
error_message=mapped_score.error_message,
357373
)
358374

359375

@@ -388,6 +404,7 @@ def annotate(
388404
ScoreAnnotationWithLayer(
389405
mavedb_id=mapped_score.accession_id,
390406
score=float(mapped_score.score) if mapped_score.score else None,
407+
vrs_version=vrs_version,
391408
error_message=mapped_score.error_message,
392409
)
393410
)
@@ -410,8 +427,16 @@ def annotate(
410427
)
411428
)
412429
else:
413-
# TODO how to combine this error message with other potential error messages?
414-
ValueError("inconsistent variant structure")
430+
score_annotations.append(
431+
ScoreAnnotationWithLayer(
432+
pre_mapped=mapped_score.pre_mapped,
433+
post_mapped=mapped_score.post_mapped,
434+
vrs_version=vrs_version,
435+
mavedb_id=mapped_score.accession_id,
436+
score=float(mapped_score.score) if mapped_score.score else None,
437+
error_message=f"Multiple issues with annotation: Inconsistent variant structure (Allele and Haplotype mix).{' ' + mapped_score.error_message if mapped_score.error_message else ''}",
438+
)
439+
)
415440

416441
return score_annotations
417442

@@ -519,11 +544,6 @@ def _set_scoreset_layer(
519544
expressions. If genomic expressions are available, that's what we'd like to use.
520545
This function tells us how to filter the final annotation objects.
521546
"""
522-
if urn.startswith("urn:mavedb:00000097"):
523-
_logger.debug(
524-
"Manually selecting protein annotation for scores from urn:mavedb:00000097"
525-
)
526-
return AnnotationLayer.PROTEIN
527547
for mapping in mappings:
528548
if mapping.annotation_layer == AnnotationLayer.GENOMIC:
529549
return AnnotationLayer.GENOMIC

0 commit comments

Comments
 (0)