Skip to content

Commit 21c75ab

Browse files
committed
Use Temporary Directory for MaveDB Score and Metadata Files
Adds a decorator for mapping routines that creates a temporary directory into which score set metadata and score files are downloaded. The directory path is passed to the wrapped mapping routine so that the mapper can use these temporary files. When the wrapped function exits, the temporary directory is removed.
1 parent 2813d84 commit 21c75ab

File tree

4 files changed

+49
-27
lines changed

4 files changed

+49
-27
lines changed

src/api/routers/map.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
""""Provide mapping router"""
2+
from pathlib import Path
3+
24
from cool_seq_tool.schemas import AnnotationLayer
35
from fastapi import APIRouter, HTTPException
46
from fastapi.responses import JSONResponse
@@ -17,6 +19,7 @@
1719
get_raw_scoreset_metadata,
1820
get_scoreset_metadata,
1921
get_scoreset_records,
22+
with_mavedb_score_set,
2023
)
2124
from dcd_mapping.resource_utils import ResourceAcquisitionError
2225
from dcd_mapping.schemas import ScoreAnnotation, ScoresetMapping, VrsVersion
@@ -29,7 +32,8 @@
2932

3033

3134
@router.post(path="/map/{urn}", status_code=200, response_model=ScoresetMapping)
32-
async def map_scoreset(urn: str) -> ScoresetMapping:
35+
@with_mavedb_score_set
36+
async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapping:
3337
"""Perform end-to-end mapping for a scoreset.
3438
3539
:param urn: identifier for a scoreset.
@@ -38,8 +42,8 @@ async def map_scoreset(urn: str) -> ScoresetMapping:
3842
:param silent: if True, suppress console information output
3943
"""
4044
try:
41-
metadata = get_scoreset_metadata(urn)
42-
records = get_scoreset_records(urn, True)
45+
metadata = get_scoreset_metadata(urn, store_path)
46+
records = get_scoreset_records(urn, True, store_path)
4347
except ScoresetNotSupportedError as e:
4448
return ScoresetMapping(
4549
metadata=None,
@@ -132,7 +136,7 @@ async def map_scoreset(urn: str) -> ScoresetMapping:
132136
for layer in preferred_layers:
133137
reference_sequences[layer][
134138
"computed_reference_sequence"
135-
] = _get_computed_reference_sequence(urn, layer, transcript)
139+
] = _get_computed_reference_sequence(metadata, layer, transcript)
136140
reference_sequences[layer][
137141
"mapped_reference_sequence"
138142
] = _get_mapped_reference_sequence(layer, transcript, alignment_result)

src/dcd_mapping/annotate.py

Lines changed: 6 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929
get_seqrepo,
3030
get_vrs_id_from_identifier,
3131
)
32-
from dcd_mapping.mavedb_data import get_raw_scoreset_metadata, get_scoreset_metadata
3332
from dcd_mapping.resource_utils import LOCAL_STORE_PATH
3433
from dcd_mapping.schemas import (
3534
AlignmentResult,
@@ -409,7 +408,7 @@ def annotate(
409408

410409

411410
def _get_computed_reference_sequence(
412-
ss: str,
411+
metadata: ScoresetMetadata,
413412
layer: AnnotationLayer,
414413
tx_output: TxSelectResult | None = None,
415414
) -> ComputedReferenceSequence:
@@ -429,7 +428,6 @@ def _get_computed_reference_sequence(
429428
sequence_type=TargetSequenceType.PROTEIN,
430429
sequence_id=seq_id,
431430
)
432-
metadata = get_scoreset_metadata(ss)
433431
seq_id = f"ga4gh:SQ.{sha512t24u(metadata.target_sequence.encode('ascii'))}"
434432
return ComputedReferenceSequence(
435433
sequence=metadata.target_sequence,
@@ -516,7 +514,7 @@ def write_scoreset_mapping_to_json(
516514

517515

518516
def save_mapped_output_json(
519-
urn: str,
517+
metadata: ScoresetMetadata,
520518
mappings: list[ScoreAnnotationWithLayer],
521519
align_result: AlignmentResult,
522520
tx_output: TxSelectResult | None,
@@ -533,10 +531,9 @@ def save_mapped_output_json(
533531
<dcd_mapping_data_dir>/urn:mavedb:00000XXX-X-X_mapping_<ISO8601 datetime>.json
534532
:return: output location
535533
"""
536-
metadata = get_raw_scoreset_metadata(urn)
537534
if preferred_layer_only:
538535
preferred_layers = {
539-
_set_scoreset_layer(urn, mappings),
536+
_set_scoreset_layer(metadata.urn, mappings),
540537
}
541538
else:
542539
preferred_layers = {mapping.annotation_layer for mapping in mappings}
@@ -549,20 +546,10 @@ def save_mapped_output_json(
549546
for layer in preferred_layers:
550547
reference_sequences[layer][
551548
"computed_reference_sequence"
552-
] = _get_computed_reference_sequence(urn, layer, tx_output)
549+
] = _get_computed_reference_sequence(metadata, layer, tx_output)
553550
reference_sequences[layer][
554551
"mapped_reference_sequence"
555552
] = _get_mapped_reference_sequence(layer, tx_output, align_result)
556-
# except Exception as e:
557-
# _logger.warning(
558-
# str(e)
559-
# )
560-
# output = ScoresetMapping(
561-
# metadata=metadata,
562-
# error_message = str(e).strip("'")
563-
# )
564-
565-
# return write_scoreset_mapping_to_json
566553

567554
mapped_scores: list[ScoreAnnotation] = []
568555
for m in mappings:
@@ -573,7 +560,7 @@ def save_mapped_output_json(
573560
mapped_scores.append(ScoreAnnotation(**m.model_dump()))
574561

575562
output = ScoresetMapping(
576-
metadata=metadata,
563+
metadata=metadata.model_dump(),
577564
computed_protein_reference_sequence=reference_sequences[
578565
AnnotationLayer.PROTEIN
579566
]["computed_reference_sequence"],
@@ -589,4 +576,4 @@ def save_mapped_output_json(
589576
mapped_scores=mapped_scores,
590577
)
591578

592-
return write_scoreset_mapping_to_json(urn, output, output_path)
579+
return write_scoreset_mapping_to_json(metadata.urn, output, output_path)

src/dcd_mapping/main.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
ScoresetNotSupportedError,
2525
get_scoreset_metadata,
2626
get_scoreset_records,
27+
with_mavedb_score_set,
2728
)
2829
from dcd_mapping.resource_utils import ResourceAcquisitionError
2930
from dcd_mapping.schemas import (
@@ -264,7 +265,7 @@ async def map_scoreset(
264265
return
265266
try:
266267
final_output = save_mapped_output_json(
267-
metadata.urn,
268+
metadata,
268269
vrs_results,
269270
alignment_result,
270271
transcript,
@@ -287,12 +288,14 @@ async def map_scoreset(
287288
_emit_info(f"Annotated scores saved to: {final_output}.", silent)
288289

289290

291+
@with_mavedb_score_set
290292
async def map_scoreset_urn(
291293
urn: str,
292294
output_path: Path | None = None,
293295
vrs_version: VrsVersion = VrsVersion.V_2,
294296
prefer_genomic: bool = False,
295297
silent: bool = True,
298+
store_path: Path | None = None,
296299
) -> None:
297300
"""Perform end-to-end mapping for a scoreset.
298301
@@ -302,8 +305,8 @@ async def map_scoreset_urn(
302305
:param silent: if True, suppress console information output
303306
"""
304307
try:
305-
metadata = get_scoreset_metadata(urn)
306-
records = get_scoreset_records(urn, silent)
308+
metadata = get_scoreset_metadata(urn, store_path)
309+
records = get_scoreset_records(urn, silent, store_path)
307310
except ScoresetNotSupportedError as e:
308311
_emit_info(f"Score set not supported: {e}", silent, logging.ERROR)
309312
final_output = write_scoreset_mapping_to_json(

src/dcd_mapping/mavedb_data.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,14 @@
22
33
Much of this can/should be replaced by the ``mavetools`` library? (and/or ``wags-tails``.)
44
"""
5+
56
import csv
67
import json
78
import logging
89
import tempfile
910
import zipfile
11+
from collections.abc import Callable
12+
from functools import wraps
1013
from pathlib import Path
1114
from typing import Any
1215

@@ -20,7 +23,7 @@
2023
authentication_header,
2124
http_download,
2225
)
23-
from dcd_mapping.schemas import ScoreRow, ScoresetMetadata, UniProtRef
26+
from dcd_mapping.schemas import ScoreRow, ScoresetMapping, ScoresetMetadata, UniProtRef
2427

2528
__all__ = [
2629
"get_scoreset_urns",
@@ -135,6 +138,7 @@ def get_raw_scoreset_metadata(
135138
"""
136139
if not dcd_mapping_dir:
137140
dcd_mapping_dir = LOCAL_STORE_PATH
141+
138142
metadata_file = dcd_mapping_dir / f"{scoreset_urn}_metadata.json"
139143
if not metadata_file.exists():
140144
url = f"{MAVEDB_BASE_URL}/api/v1/score-sets/{scoreset_urn}"
@@ -265,3 +269,27 @@ def get_scoreset_records(
265269
raise ResourceAcquisitionError(msg) from e
266270

267271
return _load_scoreset_records(scores_csv)
272+
273+
274+
def with_mavedb_score_set(fn: Callable) -> Callable:
    """Decorate an async mapping routine so it runs against a temporary score set store.

    The returned wrapper creates a temporary directory inside ``LOCAL_STORE_PATH``,
    calls ``get_scoreset_metadata`` and ``get_scoreset_records`` with that directory so
    the score set files are available there, and then awaits the wrapped routine with
    the directory supplied as the ``store_path`` keyword argument. The temporary
    directory is removed when the wrapped routine returns or raises.

    :param fn: async mapping routine. Its first positional argument (or its ``urn``
        keyword argument) must be the score set URN, and it must accept a
        ``store_path`` keyword argument.
    :return: async wrapper with the same call signature as ``fn``
    :raise KeyError: if the wrapper is called without positional arguments and without
        a ``urn`` keyword argument
    """
    # Function-scope import: only this decorator needs ``inspect``.
    import inspect

    signature = inspect.signature(fn)

    def _effective_silent(args: tuple, kwargs: dict) -> bool:
        """Return the ``silent`` value the wrapped routine will actually run with."""
        if "silent" in kwargs:
            return kwargs["silent"]
        try:
            bound = signature.bind_partial(*args, **kwargs)
        except TypeError:
            # The call does not match ``fn``; let ``fn`` raise the real error below.
            return False
        # Pick up a positional ``silent`` or the wrapped routine's own default so the
        # prefetch is not noisier than the routine it prepares data for.
        bound.apply_defaults()
        return bound.arguments.get("silent", False)

    @wraps(fn)
    async def wrapper(*args, **kwargs) -> ScoresetMapping | None:  # noqa: ANN002
        urn = args[0] if args else kwargs["urn"]
        silent = _effective_silent(args, kwargs)

        # ``dir`` is the documented way to choose the parent directory; the temporary
        # directory is deleted when this ``with`` block exits.
        with tempfile.TemporaryDirectory(dir=LOCAL_STORE_PATH) as temp_dir:
            # Set up metadata and scores for the current run so the wrapped routine
            # can read them from the temporary directory instead of fetching again.
            temp_dir_as_path = Path(temp_dir)
            get_scoreset_metadata(urn, temp_dir_as_path)
            get_scoreset_records(urn, silent, temp_dir_as_path)

            # Pass the temporary directory to the wrapped function as a kwarg.
            kwargs["store_path"] = temp_dir_as_path
            return await fn(*args, **kwargs)

    return wrapper

0 commit comments

Comments
 (0)