Skip to content

Commit b923f4e

Browse files
authored
Merge pull request #30 from VariantEffect/mavedb-dev
MaveDB Mapping v2024.1.2
2 parents 22ed17c + d28ce73 commit b923f4e

File tree

8 files changed

+89
-45
lines changed

8 files changed

+89
-45
lines changed

src/api/routers/map.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
""""Provide mapping router"""
2+
from pathlib import Path
3+
24
from cool_seq_tool.schemas import AnnotationLayer
35
from fastapi import APIRouter, HTTPException
46
from fastapi.responses import JSONResponse
@@ -17,6 +19,7 @@
1719
get_raw_scoreset_metadata,
1820
get_scoreset_metadata,
1921
get_scoreset_records,
22+
with_mavedb_score_set,
2023
)
2124
from dcd_mapping.resource_utils import ResourceAcquisitionError
2225
from dcd_mapping.schemas import ScoreAnnotation, ScoresetMapping, VrsVersion
@@ -29,7 +32,8 @@
2932

3033

3134
@router.post(path="/map/{urn}", status_code=200, response_model=ScoresetMapping)
32-
async def map_scoreset(urn: str) -> ScoresetMapping:
35+
@with_mavedb_score_set
36+
async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapping:
3337
"""Perform end-to-end mapping for a scoreset.
3438
3539
:param urn: identifier for a scoreset.
@@ -38,8 +42,8 @@ async def map_scoreset(urn: str) -> ScoresetMapping:
3842
:param silent: if True, suppress console information output
3943
"""
4044
try:
41-
metadata = get_scoreset_metadata(urn)
42-
records = get_scoreset_records(urn, True)
45+
metadata = get_scoreset_metadata(urn, store_path)
46+
records = get_scoreset_records(urn, True, store_path)
4347
except ScoresetNotSupportedError as e:
4448
return ScoresetMapping(
4549
metadata=None,
@@ -49,6 +53,14 @@ async def map_scoreset(urn: str) -> ScoresetMapping:
4953
msg = f"Unable to acquire resource from MaveDB: {e}"
5054
raise HTTPException(status_code=500, detail=msg) from e
5155

56+
if not records:
57+
return JSONResponse(
58+
content=ScoresetMapping(
59+
metadata=metadata,
60+
error_message="Score set contains no variants to map",
61+
).model_dump(exclude_none=True)
62+
)
63+
5264
try:
5365
alignment_result = align(metadata, True)
5466
except BlatNotFoundError as e:
@@ -108,7 +120,7 @@ async def map_scoreset(urn: str) -> ScoresetMapping:
108120
)
109121

110122
try:
111-
raw_metadata = get_raw_scoreset_metadata(urn)
123+
raw_metadata = get_raw_scoreset_metadata(urn, store_path)
112124
preferred_layers = {
113125
_set_scoreset_layer(urn, vrs_results),
114126
}
@@ -124,7 +136,7 @@ async def map_scoreset(urn: str) -> ScoresetMapping:
124136
for layer in preferred_layers:
125137
reference_sequences[layer][
126138
"computed_reference_sequence"
127-
] = _get_computed_reference_sequence(urn, layer, transcript)
139+
] = _get_computed_reference_sequence(metadata, layer, transcript)
128140
reference_sequences[layer][
129141
"mapped_reference_sequence"
130142
] = _get_mapped_reference_sequence(layer, transcript, alignment_result)

src/dcd_mapping/annotate.py

Lines changed: 6 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929
get_seqrepo,
3030
get_vrs_id_from_identifier,
3131
)
32-
from dcd_mapping.mavedb_data import get_raw_scoreset_metadata, get_scoreset_metadata
3332
from dcd_mapping.resource_utils import LOCAL_STORE_PATH
3433
from dcd_mapping.schemas import (
3534
AlignmentResult,
@@ -409,7 +408,7 @@ def annotate(
409408

410409

411410
def _get_computed_reference_sequence(
412-
ss: str,
411+
metadata: ScoresetMetadata,
413412
layer: AnnotationLayer,
414413
tx_output: TxSelectResult | None = None,
415414
) -> ComputedReferenceSequence:
@@ -429,7 +428,6 @@ def _get_computed_reference_sequence(
429428
sequence_type=TargetSequenceType.PROTEIN,
430429
sequence_id=seq_id,
431430
)
432-
metadata = get_scoreset_metadata(ss)
433431
seq_id = f"ga4gh:SQ.{sha512t24u(metadata.target_sequence.encode('ascii'))}"
434432
return ComputedReferenceSequence(
435433
sequence=metadata.target_sequence,
@@ -516,7 +514,7 @@ def write_scoreset_mapping_to_json(
516514

517515

518516
def save_mapped_output_json(
519-
urn: str,
517+
metadata: ScoresetMetadata,
520518
mappings: list[ScoreAnnotationWithLayer],
521519
align_result: AlignmentResult,
522520
tx_output: TxSelectResult | None,
@@ -533,10 +531,9 @@ def save_mapped_output_json(
533531
<dcd_mapping_data_dir>/urn:mavedb:00000XXX-X-X_mapping_<ISO8601 datetime>.json
534532
:return: output location
535533
"""
536-
metadata = get_raw_scoreset_metadata(urn)
537534
if preferred_layer_only:
538535
preferred_layers = {
539-
_set_scoreset_layer(urn, mappings),
536+
_set_scoreset_layer(metadata.urn, mappings),
540537
}
541538
else:
542539
preferred_layers = {mapping.annotation_layer for mapping in mappings}
@@ -549,20 +546,10 @@ def save_mapped_output_json(
549546
for layer in preferred_layers:
550547
reference_sequences[layer][
551548
"computed_reference_sequence"
552-
] = _get_computed_reference_sequence(urn, layer, tx_output)
549+
] = _get_computed_reference_sequence(metadata, layer, tx_output)
553550
reference_sequences[layer][
554551
"mapped_reference_sequence"
555552
] = _get_mapped_reference_sequence(layer, tx_output, align_result)
556-
# except Exception as e:
557-
# _logger.warning(
558-
# str(e)
559-
# )
560-
# output = ScoresetMapping(
561-
# metadata=metadata,
562-
# error_message = str(e).strip("'")
563-
# )
564-
565-
# return write_scoreset_mapping_to_json
566553

567554
mapped_scores: list[ScoreAnnotation] = []
568555
for m in mappings:
@@ -573,7 +560,7 @@ def save_mapped_output_json(
573560
mapped_scores.append(ScoreAnnotation(**m.model_dump()))
574561

575562
output = ScoresetMapping(
576-
metadata=metadata,
563+
metadata=metadata.model_dump(),
577564
computed_protein_reference_sequence=reference_sequences[
578565
AnnotationLayer.PROTEIN
579566
]["computed_reference_sequence"],
@@ -589,4 +576,4 @@ def save_mapped_output_json(
589576
mapped_scores=mapped_scores,
590577
)
591578

592-
return write_scoreset_mapping_to_json(urn, output, output_path)
579+
return write_scoreset_mapping_to_json(metadata.urn, output, output_path)

src/dcd_mapping/main.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
ScoresetNotSupportedError,
2525
get_scoreset_metadata,
2626
get_scoreset_records,
27+
with_mavedb_score_set,
2728
)
2829
from dcd_mapping.resource_utils import ResourceAcquisitionError
2930
from dcd_mapping.schemas import (
@@ -264,7 +265,7 @@ async def map_scoreset(
264265
return
265266
try:
266267
final_output = save_mapped_output_json(
267-
metadata.urn,
268+
metadata,
268269
vrs_results,
269270
alignment_result,
270271
transcript,
@@ -287,12 +288,14 @@ async def map_scoreset(
287288
_emit_info(f"Annotated scores saved to: {final_output}.", silent)
288289

289290

291+
@with_mavedb_score_set
290292
async def map_scoreset_urn(
291293
urn: str,
292294
output_path: Path | None = None,
293295
vrs_version: VrsVersion = VrsVersion.V_2,
294296
prefer_genomic: bool = False,
295297
silent: bool = True,
298+
store_path: Path | None = None,
296299
) -> None:
297300
"""Perform end-to-end mapping for a scoreset.
298301
@@ -302,8 +305,8 @@ async def map_scoreset_urn(
302305
:param silent: if True, suppress console information output
303306
"""
304307
try:
305-
metadata = get_scoreset_metadata(urn)
306-
records = get_scoreset_records(urn, silent)
308+
metadata = get_scoreset_metadata(urn, store_path)
309+
records = get_scoreset_records(urn, silent, store_path)
307310
except ScoresetNotSupportedError as e:
308311
_emit_info(f"Score set not supported: {e}", silent, logging.ERROR)
309312
final_output = write_scoreset_mapping_to_json(
@@ -321,6 +324,20 @@ async def map_scoreset_urn(
321324
_logger.critical(msg)
322325
click.echo(f"Error: {msg}")
323326
raise e
327+
328+
if not records:
329+
_emit_info("Score set contains no variants to map", silent, logging.ERROR)
330+
final_output = write_scoreset_mapping_to_json(
331+
urn,
332+
ScoresetMapping(
333+
metadata=metadata,
334+
error_message="Score set contains no variants to map",
335+
),
336+
output_path,
337+
)
338+
_emit_info(f"Score set mapping output saved to: {final_output}.", silent)
339+
return
340+
324341
await map_scoreset(
325342
metadata, records, output_path, vrs_version, prefer_genomic, silent
326343
)

src/dcd_mapping/mavedb_data.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,14 @@
22
33
Much of this can/should be replaced by the ``mavetools`` library? (and/or ``wags-tails``.)
44
"""
5+
56
import csv
67
import json
78
import logging
89
import tempfile
910
import zipfile
11+
from collections.abc import Callable
12+
from functools import wraps
1013
from pathlib import Path
1114
from typing import Any
1215

@@ -20,7 +23,7 @@
2023
authentication_header,
2124
http_download,
2225
)
23-
from dcd_mapping.schemas import ScoreRow, ScoresetMetadata, UniProtRef
26+
from dcd_mapping.schemas import ScoreRow, ScoresetMapping, ScoresetMetadata, UniProtRef
2427

2528
__all__ = [
2629
"get_scoreset_urns",
@@ -135,6 +138,7 @@ def get_raw_scoreset_metadata(
135138
"""
136139
if not dcd_mapping_dir:
137140
dcd_mapping_dir = LOCAL_STORE_PATH
141+
138142
metadata_file = dcd_mapping_dir / f"{scoreset_urn}_metadata.json"
139143
if not metadata_file.exists():
140144
url = f"{MAVEDB_BASE_URL}/api/v1/score-sets/{scoreset_urn}"
@@ -265,3 +269,27 @@ def get_scoreset_records(
265269
raise ResourceAcquisitionError(msg) from e
266270

267271
return _load_scoreset_records(scores_csv)
272+
273+
274+
def with_mavedb_score_set(fn: Callable) -> Callable:
275+
@wraps(fn)
276+
async def wrapper(*args, **kwargs) -> ScoresetMapping: # noqa: ANN002
277+
urn = args[0] if args else kwargs["urn"]
278+
silent = kwargs.get("silent", False)
279+
280+
with tempfile.TemporaryDirectory(
281+
prefix=f"{LOCAL_STORE_PATH.as_posix()}/"
282+
) as temp_dir:
283+
# Set up metadata and scores for the current run. Now they will be accessible by these functions
284+
# without the need to download the data again.
285+
temp_dir_as_path = Path(temp_dir)
286+
get_scoreset_metadata(urn, temp_dir_as_path)
287+
get_scoreset_records(urn, silent, temp_dir_as_path)
288+
289+
# Pass the storage path of the temp directory to the wrapped function as a kwarg.
290+
kwargs["store_path"] = temp_dir_as_path
291+
v: ScoresetMapping = await fn(*args, **kwargs)
292+
293+
return v
294+
295+
return wrapper

src/dcd_mapping/schemas.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,9 @@ class TargetSequenceType(str, Enum):
2121
class TargetType(str, Enum):
2222
"""Define target gene types."""
2323

24-
PROTEIN_CODING = "Protein coding"
25-
REGULATORY = "Regulatory"
26-
OTHER_NC = "Other noncoding"
24+
PROTEIN_CODING = "protein_coding"
25+
REGULATORY = "regulatory"
26+
OTHER_NC = "other_noncoding"
2727

2828

2929
class VrsVersion(str, Enum):

src/dcd_mapping/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
"""Provide dcd mapping version"""
22

3-
dcd_mapping_version = "2024.1.1"
3+
dcd_mapping_version = "2024.1.2"

0 commit comments

Comments
 (0)