Skip to content

Commit 377a8c4

Browse files
committed
Translate NT target sequence for targets with protein-level variants
If a target has only protein-level variants but the provided target sequence is a nucleotide sequence, translate the nucleotide sequence to an amino acid sequence immediately after metadata ingestion. This change avoids alignment errors that can occur when a target sequence has been codon-optimized for a non-human organism. Since we do not have sufficient metadata to determine whether a target sequence has been codon-optimized, always perform translation when there are no nucleotide-level variants for a target.
1 parent 6dc2781 commit 377a8c4

File tree

3 files changed

+28
-0
lines changed

3 files changed

+28
-0
lines changed

src/api/routers/map.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from dcd_mapping.lookup import DataLookupError
1717
from dcd_mapping.mavedb_data import (
1818
ScoresetNotSupportedError,
19+
correct_target_sequence_type,
1920
get_raw_scoreset_metadata,
2021
get_scoreset_metadata,
2122
get_scoreset_records,
@@ -48,6 +49,7 @@ async def map_scoreset(urn: str, store_path: Path | None = None) -> JSONResponse
4849
try:
4950
metadata = get_scoreset_metadata(urn, store_path)
5051
records = get_scoreset_records(metadata, True, store_path)
52+
metadata = correct_target_sequence_type(metadata, records)
5153
except ScoresetNotSupportedError as e:
5254
return JSONResponse(
5355
content=ScoresetMapping(

src/dcd_mapping/main.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
)
2323
from dcd_mapping.mavedb_data import (
2424
ScoresetNotSupportedError,
25+
correct_target_sequence_type,
2526
get_scoreset_metadata,
2627
get_scoreset_records,
2728
with_mavedb_score_set,
@@ -332,6 +333,7 @@ async def map_scoreset_urn(
332333
try:
333334
metadata = get_scoreset_metadata(urn, store_path)
334335
records = get_scoreset_records(metadata, silent, store_path)
336+
metadata = correct_target_sequence_type(metadata, records)
335337
except ScoresetNotSupportedError as e:
336338
_emit_info(f"Score set not supported: {e}", silent, logging.ERROR)
337339
final_output = write_scoreset_mapping_to_json(

src/dcd_mapping/mavedb_data.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,10 @@
3030
ScoresetMapping,
3131
ScoresetMetadata,
3232
TargetGene,
33+
TargetSequenceType,
3334
UniProtRef,
3435
)
36+
from dcd_mapping.transcripts import _get_protein_sequence
3537

3638
__all__ = [
3739
"get_scoreset_urns",
@@ -324,6 +326,28 @@ def get_scoreset_records(
324326
return _load_scoreset_records(scores_csv, metadata)
325327

326328

329+
def correct_target_sequence_type(
    metadata: ScoresetMetadata, records: dict
) -> ScoresetMetadata:
    """Translate DNA target sequences whose variants are all protein-level.

    Each target gene typed as DNA is checked against the records stored under
    its label. When none of those records is missing a protein HGVS string
    (a target with no records also qualifies), the target sequence is replaced
    by its translation and the target is re-typed as protein. This avoids BLAT
    errors in cases where the target sequence was codon-optimized for a
    non-human organism.

    NOTE(review): only ``hgvs_pro`` is consulted here, so a record that also
    describes a nucleotide-level change (if such records exist) would still be
    counted as protein-level -- confirm that this is intended.

    :param metadata: scoreset metadata; its target genes are updated in place
    :param records: mapping from target label to that target's records
    :return: the same ``metadata`` object that was passed in
    """
    for target_label, gene in metadata.target_genes.items():
        # Only nucleotide targets are candidates for translation.
        if gene.target_sequence_type != TargetSequenceType.DNA:
            continue

        # A record counts as protein-level when hgvs_pro is non-empty and is
        # not the "NA" placeholder.
        protein_only = all(
            row.hgvs_pro and row.hgvs_pro != "NA"
            for row in records.get(target_label, [])
        )
        if not protein_only:
            continue

        msg = f"Changing target sequence type for {metadata.urn} target {target_label} from DNA to protein because all variants are protein-level"
        _logger.info(msg)
        # Translate first, then flip the type, so a failed translation leaves
        # the target still marked as DNA.
        gene.target_sequence = _get_protein_sequence(gene.target_sequence)
        gene.target_sequence_type = TargetSequenceType.PROTEIN

    return metadata
349+
350+
327351
def with_mavedb_score_set(fn: Callable) -> Callable:
328352
@wraps(fn)
329353
async def wrapper(*args, **kwargs) -> ScoresetMapping: # noqa: ANN002

0 commit comments

Comments
 (0)