Skip to content

Commit b75c18a

Browse files
committed
Construct VRS parsable HGVS strings for transcript identical (p.=) HGVS protein strings
1 parent 36bc828 commit b75c18a

File tree

1 file changed

+38
-17
lines changed

1 file changed

+38
-17
lines changed

src/dcd_mapping/vrs_map.py

Lines changed: 38 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
)
1919
from ga4gh.vrs.normalize import normalize
2020
from mavehgvs.util import parse_variant_strings
21-
from mavehgvs.variant import Variant
21+
from mavehgvs.variant import AA_CODES, Variant
2222

2323
from dcd_mapping.lookup import (
2424
cdot_rest,
@@ -112,6 +112,9 @@ def _create_pre_mapped_hgvs_strings(
112112
msg = f"Alignment result or accession ID must be provided for {layer} annotations (Alignment was `{alignment}`)."
113113
raise ValueError(msg)
114114

115+
if raw_description == "p.=" and tx is not None:
116+
raw_description = _construct_transcript_identical_hgvs_string(tx)
117+
115118
raw_variant_strings = _parse_raw_variant_str(raw_description)
116119
variants, errors = parse_variant_strings(raw_variant_strings)
117120

@@ -123,7 +126,7 @@ def _create_pre_mapped_hgvs_strings(
123126

124127
# ga4gh hgvs_tools does not support intronic variants, so they will err out when vrs allele translator is called
125128
# therefore skip them there
126-
if is_intronic_variant(variant):
129+
if layer != AnnotationLayer.PROTEIN and is_intronic_variant(variant):
127130
msg = f"Variant is intronic and cannot be processed: {variant}"
128131
raise ValueError(msg)
129132

@@ -174,6 +177,9 @@ def _create_post_mapped_hgvs_strings(
174177
msg = f"Alignment result must be provided for {layer} annotations (Alignment was `{alignment}`)."
175178
raise ValueError(msg)
176179

180+
if raw_description == "p.=" and tx is not None:
181+
raw_description = _construct_transcript_identical_hgvs_string(tx)
182+
177183
raw_variants = _parse_raw_variant_str(raw_description)
178184
variants, errors = parse_variant_strings(raw_variants)
179185

@@ -185,7 +191,7 @@ def _create_post_mapped_hgvs_strings(
185191

186192
# ga4gh hgvs_tools does not support intronic variants, so they will err out when vrs allele translator is called
187193
# therefore skip them there
188-
if is_intronic_variant(variant):
194+
if layer != AnnotationLayer.PROTEIN and is_intronic_variant(variant):
189195
msg = f"Variant is intronic and cannot be processed: {variant}"
190196
raise ValueError(msg)
191197

@@ -326,6 +332,23 @@ def _parse_raw_variant_str(raw_description: str) -> list[str]:
326332
return [raw_description]
327333

328334

335+
def _construct_transcript_identical_hgvs_string(tx: TxSelectResult) -> str:
336+
"""Construct an HGVS string for a transcript identical variant.
337+
338+
:param tx: TxSelectResult object containing the transcript sequence and start position
339+
:return: The constructed HGVS string
340+
"""
341+
description = "p.["
342+
343+
for i, code in enumerate(tx.sequence, start=1):
344+
if i == 1:
345+
description += f"{AA_CODES[code]}{tx.start + i}="
346+
else:
347+
description += f";{AA_CODES[code]}{tx.start + i}="
348+
349+
return description + "]"
350+
351+
329352
def _map_protein_coding_pro(
330353
row: ScoreRow,
331354
sequence_id: str,
@@ -340,7 +363,7 @@ def _map_protein_coding_pro(
340363
:param transcript: The transcript selection information for a score set
341364
:return: VRS mapping object if mapping succeeds
342365
"""
343-
if row.hgvs_pro in {"_wt", "_sy", "NA"} or len(row.hgvs_pro) == 3:
366+
if row.hgvs_pro in {"_wt", "_sy", "NA"}:
344367
_logger.warning(
345368
"Can't process variant syntax %s for %s", row.hgvs_pro, row.accession
346369
)
@@ -677,11 +700,7 @@ def _hgvs_pro_is_valid(hgvs_pro: str) -> bool:
677700
:param hgvs_nt: MAVE_HGVS protein expression
678701
:return: True if expression appears populated and valid
679702
"""
680-
return (
681-
(hgvs_pro not in {"_wt", "_sy", "NA"})
682-
and (len(hgvs_pro) != 3)
683-
and ("fs" not in hgvs_pro)
684-
)
703+
return (hgvs_pro not in {"_wt", "_sy", "NA"}) and ("fs" not in hgvs_pro)
685704

686705

687706
def _map_protein_coding(
@@ -815,14 +834,16 @@ def _map_accession(
815834
hgvs_nt_mappings = _map_genomic(row, sequence_id, align_result)
816835
variations.append(hgvs_nt_mappings)
817836
else:
818-
[
819-
MappedScore(
820-
accession_id=row.accession,
821-
score=row.score,
822-
error_message=f"Unrecognized accession prefix for accession id {metadata.target_accession_id}",
823-
)
824-
for row in records
825-
]
837+
variations.append(
838+
[
839+
MappedScore(
840+
accession_id=row.accession,
841+
score=row.score,
842+
error_message=f"Unrecognized accession prefix for accession id {metadata.target_accession_id}",
843+
)
844+
for row in records
845+
]
846+
)
826847

827848
return variations
828849

0 commit comments

Comments
 (0)