Skip to content

Commit 97cfc58

Browse files
committed
wip: auto failover for protein level alignment
1 parent 36273a8 commit 97cfc58

File tree

6 files changed

+53
-14
lines changed

6 files changed

+53
-14
lines changed

src/api/routers/map.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
""""Provide mapping router"""
2+
import logging
23
from pathlib import Path
34

45
from cool_seq_tool.schemas import AnnotationLayer
@@ -45,6 +46,8 @@
4546
prefix="/api/v1", tags=["mappings"], responses={404: {"description": "Not found"}}
4647
)
4748

49+
_logger = logging.getLogger(__name__)
50+
4851

4952
@router.post(path="/map/{urn}", status_code=200, response_model=ScoresetMapping)
5053
@with_mavedb_score_set
@@ -57,7 +60,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
5760
try:
5861
metadata = get_scoreset_metadata(urn, store_path)
5962
records = get_scoreset_records(metadata, True, store_path)
60-
metadata = patch_target_sequence_type(metadata, records)
63+
metadata = patch_target_sequence_type(metadata, records, force=False)
6164
except ScoresetNotSupportedError as e:
6265
return JSONResponse(
6366
content=ScoresetMapping(

src/dcd_mapping/align.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import os
44
import subprocess
55
import tempfile
6+
from collections.abc import Mapping
67
from pathlib import Path
78
from urllib.parse import urlparse
89

@@ -19,7 +20,7 @@
1920
ScoresetNotSupportedError,
2021
)
2122
from dcd_mapping.lookup import get_chromosome_identifier, get_gene_location
22-
from dcd_mapping.mavedb_data import LOCAL_STORE_PATH
23+
from dcd_mapping.mavedb_data import LOCAL_STORE_PATH, patch_target_sequence_type
2324
from dcd_mapping.resource_utils import http_download
2425
from dcd_mapping.schemas import (
2526
AlignmentResult,
@@ -441,7 +442,7 @@ def parse_cdot_mapping(cdot_mapping: dict, silent: bool) -> AlignmentResult:
441442

442443
def build_alignment_result(
443444
metadata: ScoresetMetadata, silent: bool
444-
) -> dict[str, AlignmentResult | None]:
445+
) -> Mapping[str, AlignmentResult | None]:
445446
# NOTE: Score set must contain all accession-based target genes or all sequence-based target genes
446447
# This decision was made because it is most efficient to run BLAT all together, so the alignment function
447448
# works on an entire score set rather than per target gene.
@@ -462,7 +463,25 @@ def build_alignment_result(
462463
score_set_type = "sequence"
463464

464465
if score_set_type == "sequence":
465-
alignment_result = align(metadata, silent)
466+
try:
467+
alignment_result = align(metadata, silent)
468+
except AlignmentError as e:
469+
failed_at_nucleotide_level = any(
470+
target_gene.target_sequence_type == TargetSequenceType.DNA
471+
for target_gene in metadata.target_genes.values()
472+
)
473+
474+
if failed_at_nucleotide_level:
475+
msg = f"BLAT alignment failed for {metadata.urn} at the nucleotide level. This alignment will be retried at the protein level."
476+
_logger.warning(msg)
477+
else:
478+
raise AlignmentError from e
479+
480+
# So long as force=True, the content of the records dict is irrelevant.
481+
alignment_result = align(
482+
patch_target_sequence_type(metadata, {}, force=True), silent
483+
)
484+
466485
else:
467486
alignment_result = fetch_alignment(metadata, silent)
468487

src/dcd_mapping/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -346,7 +346,7 @@ async def map_scoreset_urn(
346346
try:
347347
metadata = get_scoreset_metadata(urn, store_path)
348348
records = get_scoreset_records(metadata, silent, store_path)
349-
metadata = patch_target_sequence_type(metadata, records)
349+
metadata = patch_target_sequence_type(metadata, records, force=False)
350350
except ScoresetNotSupportedError as e:
351351
_emit_info(f"Score set not supported: {e}", silent, logging.ERROR)
352352
final_output = write_scoreset_mapping_to_json(

src/dcd_mapping/mavedb_data.py

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -326,21 +326,36 @@ def get_scoreset_records(
326326

327327

328328
def patch_target_sequence_type(
329-
metadata: ScoresetMetadata, records: dict
329+
metadata: ScoresetMetadata, records: dict, force: bool = False
330330
) -> ScoresetMetadata:
331331
"""If target sequence type is DNA but no nucleotide variants are defined, treat the target as if
332332
it were a protein level target.
333333
This avoids BLAT errors in cases where the target sequence was codon-optimized
334334
for a non-human organism
335335
"""
336+
patch_sequence_type = force or any(
337+
target.target_sequence_type == TargetSequenceType.DNA
338+
and not any(record.hgvs_nt for record in records.get(target_label, []))
339+
for target_label, target in metadata.target_genes.items()
340+
)
341+
342+
if not patch_sequence_type:
343+
msg = f"Not patching target sequence type for {metadata.urn}. Either force=True (was {force}), or at least one target has nucleotide-level variants."
344+
_logger.debug(msg)
345+
return metadata
346+
336347
for target_label, target in metadata.target_genes.items():
337-
if target.target_sequence_type == TargetSequenceType.DNA and not any(
338-
record.hgvs_nt for record in records.get(target_label, [])
339-
):
340-
msg = f"Changing target sequence type for {metadata.urn} target {target_label} from DNA to protein because target only has protein-level variants"
341-
_logger.info(msg)
342-
target.target_sequence = _get_protein_sequence(target.target_sequence)
343-
target.target_sequence_type = TargetSequenceType.PROTEIN
348+
if not target.target_sequence:
349+
msg = f"Cannot patch target sequence type for {metadata.urn} target {target_label} because no target sequence is available."
350+
_logger.debug(msg)
351+
continue
352+
353+
msg = f"Changing target sequence type for {metadata.urn} target {target_label} from DNA to protein. (force was {force})."
354+
_logger.info(msg)
355+
356+
target.target_sequence = _get_protein_sequence(target.target_sequence)
357+
target.target_sequence_type = TargetSequenceType.PROTEIN
358+
344359
return metadata
345360

346361

src/dcd_mapping/transcripts.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Select best reference sequence."""
22
import logging
33
import re
4+
from collections.abc import Mapping
45

56
from Bio.Data.CodonTable import IUPACData
67
from Bio.Seq import Seq
@@ -345,7 +346,7 @@ async def select_transcript(
345346
async def select_transcripts(
346347
scoreset_metadata: ScoresetMetadata,
347348
records: dict[str, list[ScoreRow]],
348-
align_results: dict[str, AlignmentResult | None],
349+
align_results: Mapping[str, AlignmentResult | None],
349350
) -> dict[str, TxSelectResult | Exception | None]:
350351
"""Select appropriate human reference sequence for each target in a score set.
351352
:param scoreset_metadata: Metadata for score set from MaveDB API

src/dcd_mapping/vrs_map.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -842,6 +842,7 @@ def _construct_vrs_allele(
842842
) -> Allele | Haplotype:
843843
alleles: list[Allele] = []
844844
for hgvs_string in hgvs_strings:
845+
_logger.info("Processing HGVS string: %s", hgvs_string)
845846
allele = translate_hgvs_to_vrs(hgvs_string)
846847

847848
if pre_map:

0 commit comments

Comments
 (0)