API support for accession-based mapping

sallybg · sallybg · commit 25f433274796 · 2025-04-09T11:15:09.000-07:00
Note that this is mostly structured to handle multi-target mapping,
but it does not yet contain every change that multi-target mapping requires
(specifically, the structure of the final output and reference sequences).
Those remaining changes would require corresponding changes in mavedb-api,
which we are not prepared to deploy yet.
diff --git a/src/api/routers/map.py b/src/api/routers/map.py
@@ -6,7 +6,7 @@
 from fastapi.responses import JSONResponse
 from requests import HTTPError
 
-from dcd_mapping.align import AlignmentError, BlatNotFoundError, align
+from dcd_mapping.align import AlignmentError, BlatNotFoundError, build_alignment_result
 from dcd_mapping.annotate import (
     _get_computed_reference_sequence,
     _get_mapped_reference_sequence,
@@ -23,7 +23,7 @@
 )
 from dcd_mapping.resource_utils import ResourceAcquisitionError
 from dcd_mapping.schemas import ScoreAnnotation, ScoresetMapping, VrsVersion
-from dcd_mapping.transcripts import TxSelectError, select_transcript
+from dcd_mapping.transcripts import select_transcripts
 from dcd_mapping.vrs_map import VrsMapError, vrs_map
 
 router = APIRouter(
@@ -37,9 +37,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp
     """Perform end-to-end mapping for a scoreset.
 
     :param urn: identifier for a scoreset.
-    :param output_path: optional path to save output at
-    :param vrs_version: version of VRS objects to output (1.3 or 2)
-    :param silent: if True, suppress console information output
+    :param store_path: optional path to save output at
     """
     try:
         metadata = get_scoreset_metadata(urn, store_path)
@@ -62,7 +60,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp
         )
 
     try:
-        alignment_result = align(metadata, True)
+        alignment_results = build_alignment_result(metadata, True)
     except BlatNotFoundError as e:
         msg = "BLAT command appears missing. Ensure it is available on the $PATH or use the environment variable BLAT_BIN_PATH to point to it. See instructions in the README prerequisites section for more."
         raise HTTPException(status_code=500, detail=msg) from e
@@ -75,54 +73,82 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp
                 metadata=metadata, error_message=str(e).strip("'")
             ).model_dump(exclude_none=True)
         )
-
-    try:
-        transcript = await select_transcript(metadata, records, alignment_result)
-    except (TxSelectError, KeyError, ValueError) as e:
+    except ScoresetNotSupportedError as e:
         return JSONResponse(
             content=ScoresetMapping(
                 metadata=metadata, error_message=str(e).strip("'")
             ).model_dump(exclude_none=True)
         )
+
+    try:
+        transcripts = await select_transcripts(metadata, records, alignment_results)
+    # NOTE: transcript selection errors are handled inside select_transcripts
+    # and do not cause the entire mapping process to exit. Instead, an error is reported
+    # at the target level and at the variant level for variants relative to that target.
+    # HTTPErrors and DataLookupErrors do cause the mapping process to exit, because they indicate
+    # underlying issues with data providers.
     except HTTPError as e:
         msg = f"HTTP error occurred during transcript selection: {e}"
         raise HTTPException(status_code=500, detail=msg) from e
     except DataLookupError as e:
         msg = f"Data lookup error occurred during transcript selection: {e}"
         raise HTTPException(status_code=500, detail=msg) from e
 
+    vrs_results = {}
     try:
-        vrs_results = vrs_map(metadata, alignment_result, records, transcript, True)
+        for target_gene in metadata.target_genes:
+            vrs_results[target_gene] = vrs_map(
+                metadata=metadata.target_genes[target_gene],
+                align_result=alignment_results[target_gene],
+                records=records[target_gene],
+                transcript=transcripts[target_gene],
+                silent=True,
+            )
     except VrsMapError as e:
         return JSONResponse(
             content=ScoresetMapping(
                 metadata=metadata, error_message=str(e).strip("'")
             ).model_dump(exclude_none=True)
         )
-    if vrs_results is None:
+    # TODO: this should instead check whether all values in vrs_results are None; the check may not be needed at all.
+    if vrs_results is None or len(vrs_results) == 0:
         return ScoresetMapping(
             metadata=metadata,
             error_message="No variant mappings available for this score set",
         )
 
+    annotated_vrs_results = {}
     try:
-        vrs_results = annotate(vrs_results, transcript, metadata, VrsVersion.V_2)
+        for target_gene in vrs_results:
+            annotated_vrs_results[target_gene] = annotate(
+                vrs_results[target_gene],
+                transcripts[target_gene],
+                metadata.target_genes[target_gene],
+                metadata.urn,
+                VrsVersion.V_2,
+            )
     except Exception as e:
         return JSONResponse(
             content=ScoresetMapping(
                 metadata=metadata, error_message=str(e).strip("'")
             ).model_dump(exclude_none=True)
         )
-    if vrs_results is None:
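+    # TODO: this should instead check whether all values in the dict are None; the check may not be needed at all. NOTE(review): this tests vrs_results rather than annotated_vrs_results -- confirm which is intended.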
+    if vrs_results is None or len(vrs_results) == 0:
         return ScoresetMapping(
             metadata=metadata,
             error_message="No annotated variant mappings available for this score set",
         )
 
+    # TODO: this will need to change to support multi-target score sets.
+    # This version works for accession-based score sets.
+    # Multi-target changes are not implemented here because they require corresponding changes in mavedb-api, and we want to get this onto staging quickly.
+    # For now, only accept single-target score sets so that the structure of the JSON output does not need to change.
+    target_gene = list(metadata["target_genes"].keys())[0]  # noqa: RUF015
     try:
         raw_metadata = get_raw_scoreset_metadata(urn, store_path)
         preferred_layers = {
-            _set_scoreset_layer(urn, vrs_results),
+            _set_scoreset_layer(urn, vrs_results[target_gene]),
         }
 
         reference_sequences = {
@@ -136,10 +162,14 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> ScoresetMapp
         for layer in preferred_layers:
             reference_sequences[layer][
                 "computed_reference_sequence"
-            ] = _get_computed_reference_sequence(metadata, layer, transcript)
+            ] = _get_computed_reference_sequence(
+                metadata.target_genes[target_gene], layer, transcripts[target_gene]
+            )
             reference_sequences[layer][
                 "mapped_reference_sequence"
-            ] = _get_mapped_reference_sequence(layer, transcript, alignment_result)
+            ] = _get_mapped_reference_sequence(
+                layer, transcripts[target_gene], alignment_results[target_gene]
+            )
 
         mapped_scores: list[ScoreAnnotation] = []
         for m in vrs_results: