Skip to content

Commit 8c32f6c

Browse files
Resolves merge conflict with main in variantmapper.py imports.
Merged ShiftOverBoundaryPreference and HGVSInvalidIntervalError imports from PR biocommons#719 with existing imports in main. Fixes biocommons#714 — ins/dup variants spanning intron/exon boundary where splice site & region remain completely intact. Co-Authored-By: Brendan ODonnell <brendan@odonnell.xyz>
2 parents 8a515ce + 6303c8f commit 8c32f6c

File tree

10 files changed

+296
-68
lines changed

10 files changed

+296
-68
lines changed

src/hgvs/_data/defaults.ini

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ normalize = True
77
prevalidation_level = EXTRINSIC
88
replace_reference = True
99
ins_at_boundary_is_intronic = True
10+
shift_over_boundary = False
11+
shift_over_boundary_preference = DEFAULT
1012

1113
# strict_bounds: require transcript variants to be within transcript sequence bounds
1214
strict_bounds = True

src/hgvs/assemblymapper.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ def __init__(
5959
in_par_assume=hgvs.global_config.mapping.in_par_assume,
6060
replace_reference=hgvs.global_config.mapping.replace_reference,
6161
add_gene_symbol=hgvs.global_config.mapping.add_gene_symbol,
62+
shift_over_boundary=hgvs.global_config.mapping.shift_over_boundary,
63+
shift_over_boundary_preference=hgvs.global_config.mapping.shift_over_boundary_preference,
6264
*args,
6365
**kwargs,
6466
):
@@ -80,6 +82,8 @@ def __init__(
8082
replace_reference=replace_reference,
8183
prevalidation_level=prevalidation_level,
8284
add_gene_symbol=add_gene_symbol,
85+
shift_over_boundary=shift_over_boundary,
86+
shift_over_boundary_preference=shift_over_boundary_preference,
8387
*args,
8488
**kwargs,
8589
)
@@ -94,6 +98,8 @@ def __init__(
9498
replace_reference=replace_reference,
9599
prevalidation_level=prevalidation_level,
96100
add_gene_symbol=add_gene_symbol,
101+
shift_over_boundary=shift_over_boundary,
102+
shift_over_boundary_preference=shift_over_boundary_preference,
97103
)
98104
self._norm = hgvs.normalizer.Normalizer(
99105
hdp,
@@ -307,6 +313,14 @@ def _maybe_normalize(self, var):
307313
# fall through to return unnormalized variant
308314
return var
309315

316+
def _var_c_shifts(self, var_c):
317+
"""Try to shift c. variants to find alternative representations."""
318+
alt_ac = self._alt_ac_for_tx_ac(var_c.ac)
319+
yield from super(AssemblyMapper, self)._var_c_shifts(
320+
var_c, alt_ac, alt_aln_method=self.alt_aln_method
321+
)
322+
323+
310324

311325
# <LICENSE>
312326
# Copyright 2018 HGVS Contributors (https://github.com/biocommons/hgvs)

src/hgvs/enums.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,5 @@
55
ValidationLevel = OrderedEnum("ValidationLevel", "VALID WARNING ERROR")
66

77
PrevalidationLevel = OrderedEnum("PrevalidationLevel", "NONE INTRINSIC EXTRINSIC")
8+
9+
ShiftOverBoundaryPreference = OrderedEnum("ShiftOverBoundaryPreference", "DEFAULT INTRON EXON")

src/hgvs/utils/altseqbuilder.py

Lines changed: 16 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ def build_altseq(self):
132132
# should loop over each allele rather than assume only 1 variant; return a list for now
133133
alt_data = []
134134

135-
variant_location = self._get_variant_region()
135+
variant_location = self.get_variant_region()
136136

137137
if variant_location == self.EXON:
138138
edit_type = type(self._var_c.posedit.edit)
@@ -176,7 +176,7 @@ def build_altseq(self):
176176

177177
return alt_data
178178

179-
def _get_variant_region(self):
179+
def get_variant_region(self):
180180
"""Categorize variant by location in transcript (5'utr, exon, intron, 3'utr)
181181
182182
:return "exon", "intron", "five_utr", "three_utr", "whole_gene"
@@ -200,41 +200,37 @@ def _get_variant_region(self):
200200
):
201201
result = self.WHOLE_GENE
202202
elif (
203-
global_config.mapping.ins_at_boundary_is_intronic
204-
and self._var_c.posedit.edit.type == "dup"
203+
self._var_c.posedit.edit.type == "dup"
205204
and self._var_c.posedit.pos.start.base in self._transcript_data.exon_start_positions
205+
and self._var_c.posedit.pos.start.offset == 0
206206
):
207-
result = self.INTRON
207+
result = self.INTRON if global_config.mapping.ins_at_boundary_is_intronic else self.EXON
208208
elif (
209-
global_config.mapping.ins_at_boundary_is_intronic
210-
and self._var_c.posedit.edit.type == "dup"
209+
self._var_c.posedit.edit.type == "dup"
211210
and self._var_c.posedit.pos.end.base in self._transcript_data.exon_end_positions
211+
and self._var_c.posedit.pos.end.offset == 0
212212
):
213-
result = self.INTRON
213+
result = self.INTRON if global_config.mapping.ins_at_boundary_is_intronic else self.EXON
214214
elif (
215-
not global_config.mapping.ins_at_boundary_is_intronic
216-
and self._var_c.posedit.edit.type == "ins"
215+
self._var_c.posedit.edit.type == "ins"
217216
and self._var_c.posedit.pos.start.offset == -1 and self._var_c.posedit.pos.end.offset == 0
218217
):
219-
result = self.EXON
218+
result = self.INTRON if global_config.mapping.ins_at_boundary_is_intronic else self.EXON
220219
elif (
221-
not global_config.mapping.ins_at_boundary_is_intronic
222-
and self._var_c.posedit.edit.type == "ins"
220+
self._var_c.posedit.edit.type == "ins"
223221
and self._var_c.posedit.pos.start.offset == 0 and self._var_c.posedit.pos.end.offset == 1
224222
):
225-
result = self.EXON
223+
result = self.INTRON if global_config.mapping.ins_at_boundary_is_intronic else self.EXON
226224
elif (
227-
not global_config.mapping.ins_at_boundary_is_intronic
228-
and self._var_c.posedit.edit.type == "dup"
225+
self._var_c.posedit.edit.type == "dup"
229226
and self._var_c.posedit.pos.end.offset == -1
230227
):
231-
result = self.EXON
228+
result = self.INTRON if global_config.mapping.ins_at_boundary_is_intronic else self.EXON
232229
elif (
233-
not global_config.mapping.ins_at_boundary_is_intronic
234-
and self._var_c.posedit.edit.type == "dup"
230+
self._var_c.posedit.edit.type == "dup"
235231
and self._var_c.posedit.pos.start.offset == 1
236232
):
237-
result = self.EXON
233+
result = self.INTRON if global_config.mapping.ins_at_boundary_is_intronic else self.EXON
238234
elif self._var_c.posedit.pos.start.offset != 0 or self._var_c.posedit.pos.end.offset != 0:
239235
# leave out anything else intronic for now
240236
result = self.INTRON

src/hgvs/variantmapper.py

Lines changed: 124 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,21 @@
1616
import hgvs.sequencevariant
1717
import hgvs.validator
1818
from hgvs.decorators.lru_cache import lru_cache
19-
from hgvs.enums import PrevalidationLevel
20-
from hgvs.exceptions import HGVSInvalidVariantError, HGVSUnsupportedOperationError
19+
from hgvs.enums import PrevalidationLevel, ShiftOverBoundaryPreference
20+
from hgvs.exceptions import (
21+
HGVSInvalidIntervalError,
22+
HGVSInvalidVariantError,
23+
HGVSUnsupportedOperationError,
24+
)
2125
from hgvs.utils import altseq_to_hgvsp, altseqbuilder
2226
from hgvs.utils.position import get_start_end, get_start_end_interbase
2327
from hgvs.utils.reftranscriptdata import RefTranscriptData
2428

2529
_logger = logging.getLogger(__name__)
2630

31+
_SHUFFLE_3PRIME = 3
32+
_SHUFFLE_5PRIME = 5
33+
2734

2835
class VariantMapper:
2936
r"""Maps SequenceVariant objects between g., n., r., c., and p. representations.
@@ -70,6 +77,8 @@ def __init__(
7077
replace_reference=hgvs.global_config.mapping.replace_reference,
7178
prevalidation_level=hgvs.global_config.mapping.prevalidation_level,
7279
add_gene_symbol=hgvs.global_config.mapping.add_gene_symbol,
80+
shift_over_boundary=hgvs.global_config.mapping.shift_over_boundary,
81+
shift_over_boundary_preference=hgvs.global_config.mapping.shift_over_boundary_preference,
7382
):
7483
"""
7584
:param bool replace_reference: replace reference (entails additional network access)
@@ -92,6 +101,13 @@ def __init__(
92101
self.left_normalizer = hgvs.normalizer.Normalizer(
93102
hdp, shuffle_direction=5, variantmapper=self
94103
)
104+
self.shift_over_boundary = shift_over_boundary
105+
if shift_over_boundary_preference is None:
106+
self.shift_over_boundary_preference = ShiftOverBoundaryPreference.DEFAULT
107+
else:
108+
self.shift_over_boundary_preference = ShiftOverBoundaryPreference[
109+
shift_over_boundary_preference.upper()
110+
]
95111

96112
# ############################################################################
97113
# g⟷t
@@ -456,15 +472,43 @@ def c_to_p(
456472
var_c, reference_data, translation_table=reference_data.translation_table
457473
)
458474

475+
# attempt to shift ins/dup variants from the intron into the exon or vice versa
476+
if self.shift_over_boundary:
477+
original_region = builder.get_variant_region()
478+
if var_c.posedit.edit.type in ["ins", "dup"] and original_region in [
479+
builder.INTRON,
480+
builder.EXON,
481+
]:
482+
if alt_ac is None:
483+
msg = f"mapping specific variant {var_c} requires alt_ac"
484+
raise HGVSUnsupportedOperationError(msg)
485+
for shifted_var_c in VariantMapper._var_c_shifts(
486+
self, var_c, alt_ac, alt_aln_method
487+
):
488+
shifted_reference_data = RefTranscriptData(self.hdp, shifted_var_c.ac, pro_ac)
489+
shifted_builder = altseqbuilder.AltSeqBuilder(
490+
shifted_var_c, shifted_reference_data
491+
)
492+
shifted_region = shifted_builder.get_variant_region()
493+
if shifted_region not in [shifted_builder.INTRON, shifted_builder.EXON]:
494+
continue
495+
if original_region != shifted_region:
496+
# a shift is possible
497+
if self.shift_over_boundary_preference.name.lower() == shifted_region:
498+
# and that shift is preferred
499+
reference_data = shifted_reference_data
500+
builder = shifted_builder
501+
break
502+
459503
# TODO: handle case where you get 2+ alt sequences back;
460504
# currently we get a list of 1 element; a loop structure is implemented
461505
# to handle this, but doesn't really do anything currently.
462506
all_alt_data = builder.build_altseq()
463507

464508
var_ps = []
465509
for alt_data in all_alt_data:
466-
builder = altseq_to_hgvsp.AltSeqToHgvsp(reference_data, alt_data)
467-
var_p = builder.build_hgvsp()
510+
hgvsp_builder = altseq_to_hgvsp.AltSeqToHgvsp(reference_data, alt_data)
511+
var_p = hgvsp_builder.build_hgvsp()
468512
var_ps.append(var_p)
469513

470514
var_p = var_ps[0]
@@ -645,6 +689,82 @@ def _update_gene_symbol(self, var, symbol):
645689
var.gene = symbol
646690
return var
647691

692+
def _var_c_shifts(self, var_c, alt_ac, alt_aln_method):
693+
"""Try to shift c. variants to find alternative representations."""
694+
if not var_c.posedit or var_c.posedit.edit.type not in ("ins", "dup"):
695+
return
696+
strand = self._fetch_AlignmentMapper(
697+
tx_ac=var_c.ac, alt_ac=alt_ac, alt_aln_method=alt_aln_method
698+
).strand
699+
var_g = VariantMapper.c_to_g(self, var_c, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
700+
for shifted_var_g in self._var_g_shifts(
701+
var_g, strand=strand, alt_aln_method=alt_aln_method
702+
):
703+
try:
704+
shifted_var_c = VariantMapper.g_to_c(
705+
self, shifted_var_g, tx_ac=var_c.ac, alt_aln_method=alt_aln_method
706+
)
707+
yield shifted_var_c
708+
except (
709+
HGVSInvalidVariantError,
710+
HGVSInvalidIntervalError,
711+
HGVSUnsupportedOperationError,
712+
):
713+
pass
714+
715+
def _var_g_shifts(self, var_g, strand, alt_aln_method):
716+
"""Try to shift g. variants to find alternative representations."""
717+
prev_var_g_strs = [str(var_g)]
718+
for shuffle_direction in [_SHUFFLE_3PRIME, _SHUFFLE_5PRIME]:
719+
try:
720+
shifted_var_g = self._var_g_shift_with_rewrite(
721+
var_g, shuffle_direction, strand, alt_aln_method
722+
)
723+
if str(shifted_var_g) in prev_var_g_strs:
724+
continue
725+
prev_var_g_strs.append(str(shifted_var_g))
726+
yield shifted_var_g
727+
except (
728+
HGVSInvalidVariantError,
729+
HGVSInvalidIntervalError,
730+
HGVSUnsupportedOperationError,
731+
):
732+
pass
733+
734+
def _var_g_shift_with_rewrite(self, var_g, shuffle_direction, strand, alt_aln_method):
735+
"""Attempt to shift a variant all the way left or right. Rewrite
736+
duplications as insertions so that the variant is shifted farther
737+
than would normally be possible using the HGVS notation."""
738+
var_g = copy.deepcopy(var_g)
739+
normalizer = hgvs.normalizer.Normalizer(
740+
self.hdp,
741+
alt_aln_method=alt_aln_method,
742+
validate=False,
743+
shuffle_direction=shuffle_direction,
744+
)
745+
var_g = normalizer.normalize(var_g)
746+
if var_g.posedit.edit.type == "dup":
747+
self._replace_reference(var_g)
748+
if (strand == 1 and shuffle_direction == _SHUFFLE_3PRIME) or (
749+
strand == -1 and shuffle_direction == _SHUFFLE_5PRIME
750+
):
751+
var_g.posedit = hgvs.posedit.PosEdit(
752+
pos=hgvs.location.Interval(
753+
start=hgvs.location.SimplePosition(base=var_g.posedit.pos.start.base - 1),
754+
end=hgvs.location.SimplePosition(base=var_g.posedit.pos.start.base),
755+
),
756+
edit=hgvs.edit.NARefAlt(ref=None, alt=var_g.posedit.edit.ref),
757+
)
758+
else:
759+
var_g.posedit = hgvs.posedit.PosEdit(
760+
pos=hgvs.location.Interval(
761+
start=hgvs.location.SimplePosition(base=var_g.posedit.pos.end.base),
762+
end=hgvs.location.SimplePosition(base=var_g.posedit.pos.end.base + 1),
763+
),
764+
edit=hgvs.edit.NARefAlt(ref=None, alt=var_g.posedit.edit.ref),
765+
)
766+
return var_g
767+
648768

649769
# <LICENSE>
650770
# Copyright 2018 HGVS Contributors (https://github.com/biocommons/hgvs)

tests/data/cache-py3.hdp

35.2 KB
Binary file not shown.

tests/data/gcp/real.tsv

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,14 +50,12 @@ ID00048 NC_000010.10:g.89692922T>C NM_000314.4:c.406T>C NP_000305.3:p.(Cys136Arg
5050
ID00049 NC_000010.10:g.89692921dupA NM_000314.4:c.405dupA NP_000305.3:p.(Cys136Metfs*44)
5151
ID00050 NC_000010.10:g.89692923_89692939delGTGCATATTTATTACAT NM_000314.4:c.407_423delGTGCATATTTATTACAT NP_000305.3:p.(Cys136Serfs*38)
5252
ID00051 NC_000010.10:g.89712015C>A NM_000314.4:c.633C>A NP_000305.3:p.(Cys211*)
53-
ID00052 NC_000010.10:g.89685314dupT NM_000314.4:c.209dupT NP_000305.3:p.?
5453
ID00053 NC_000010.10:g.89711893C>T NM_000314.4:c.511C>T NP_000305.3:p.(Gln171*)
5554
ID00054 NC_000010.10:g.89692963dupA NM_000314.4:c.447dupA NP_000305.3:p.(Glu150Argfs*30)
5655
ID00055 NC_000010.10:g.89685315G>A NM_000314.4:c.209+1G>A NP_000305.3:p.?
5756
ID00056 NC_000010.10:g.89693009delG NM_000314.4:c.492+1delG NP_000305.3:p.?
5857
ID00057 NC_000010.10:g.89711873A>C NM_000314.4:c.493-2A>C NP_000305.3:p.?
5958
ID00058 NC_000010.10:g.89717676G>A NM_000314.4:c.701G>A NP_000305.3:p.(Arg234Gln)
6059
ID00059 NC_000010.10:g.89717777G>A NM_000314.4:c.801+1G>A NP_000305.3:p.?
61-
ID00060 NC_000010.10:g.89720648dupT NM_000314.4:c.802-3dupT NP_000305.3:p.?
6260
ID00061 NC_000005.9:g.131705667G>T NM_003060.3:c.3G>T NP_003051.1:p.Met1?
6361
ID00062 NC_000005.9:g.131706014G>A NM_003060.3:c.350G>A NP_003051.1:p.(Trp117*)

0 commit comments

Comments
 (0)