Skip to content

Commit ded59ae

Browse files
committed
Fix ins/dups where splice region is preserved
1 parent 5f27045 commit ded59ae

File tree

10 files changed

+302
-69
lines changed

10 files changed

+302
-69
lines changed

src/hgvs/_data/defaults.ini

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ normalize = True
77
prevalidation_level = EXTRINSIC
88
replace_reference = True
99
ins_at_boundary_is_intronic = True
10+
shift_over_boundary = False
11+
shift_over_boundary_preference = DEFAULT
1012

1113
# strict_bounds: require transcript variants to be within transcript sequence bounds
1214
strict_bounds = True

src/hgvs/assemblymapper.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@ def __init__(
5858
in_par_assume=hgvs.global_config.mapping.in_par_assume,
5959
replace_reference=hgvs.global_config.mapping.replace_reference,
6060
add_gene_symbol=hgvs.global_config.mapping.add_gene_symbol,
61+
shift_over_boundary=hgvs.global_config.mapping.shift_over_boundary,
62+
shift_over_boundary_preference=hgvs.global_config.mapping.shift_over_boundary_preference,
6163
*args,
6264
**kwargs,
6365
):
@@ -79,6 +81,8 @@ def __init__(
7981
replace_reference=replace_reference,
8082
prevalidation_level=prevalidation_level,
8183
add_gene_symbol=add_gene_symbol,
84+
shift_over_boundary=shift_over_boundary,
85+
shift_over_boundary_preference=shift_over_boundary_preference,
8286
*args,
8387
**kwargs,
8488
)
@@ -90,7 +94,9 @@ def __init__(
9094
if self.normalize:
9195
vm = VariantMapper(hdp=hdp, replace_reference=replace_reference,
9296
prevalidation_level=prevalidation_level,
93-
add_gene_symbol=add_gene_symbol)
97+
add_gene_symbol=add_gene_symbol,
98+
shift_over_boundary=shift_over_boundary,
99+
shift_over_boundary_preference=shift_over_boundary_preference)
94100
self._norm = hgvs.normalizer.Normalizer(
95101
hdp, alt_aln_method=alt_aln_method, validate=False, variantmapper=vm,
96102
)

src/hgvs/enums.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,5 @@
55
ValidationLevel = OrderedEnum("ValidationLevel", "VALID WARNING ERROR")
66

77
PrevalidationLevel = OrderedEnum("PrevalidationLevel", "NONE INTRINSIC EXTRINSIC")
8+
9+
ShiftOverBoundaryPreference = OrderedEnum("ShiftOverBoundaryPreference", "DEFAULT INTRON EXON")

src/hgvs/sequencevariant.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ class SequenceVariant:
2020
type = attr.ib()
2121
posedit = attr.ib()
2222
gene = attr.ib(default=None)
23+
shifts_into_exon_and_intron = attr.ib(default=False)
24+
is_shifted = attr.ib(default=False)
25+
at_boundary = attr.ib(default=False)
2326

2427
def format(self, conf=None):
2528
"""Formatting the stringification of sequence variants

src/hgvs/utils/altseqbuilder.py

Lines changed: 23 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@ def __init__(self, var_c, transcript_data, translation_table=TranslationTable.st
106106
# check reference for special characteristics
107107
self._ref_has_multiple_stops = self._transcript_data.aa_sequence.count("*") > 1
108108
self._translation_table = translation_table
109+
self.at_boundary = False
109110

110111
def build_altseq(self):
111112
"""given a variant and a sequence, incorporate the variant and return the new sequence
@@ -132,7 +133,7 @@ def build_altseq(self):
132133
# should loop over each allele rather than assume only 1 variant; return a list for now
133134
alt_data = []
134135

135-
variant_location = self._get_variant_region()
136+
variant_location = self.get_variant_region()
136137

137138
if variant_location == self.EXON:
138139
edit_type = type(self._var_c.posedit.edit)
@@ -176,7 +177,7 @@ def build_altseq(self):
176177

177178
return alt_data
178179

179-
def _get_variant_region(self):
180+
def get_variant_region(self):
180181
"""Categorize variant by location in transcript (5'utr, exon, intron, 3'utr)
181182
182183
:return "exon", "intron", "five_utr", "three_utr", "whole_gene"
@@ -200,41 +201,43 @@ def _get_variant_region(self):
200201
):
201202
result = self.WHOLE_GENE
202203
elif (
203-
global_config.mapping.ins_at_boundary_is_intronic
204-
and self._var_c.posedit.edit.type == "dup"
204+
self._var_c.posedit.edit.type == "dup"
205205
and self._var_c.posedit.pos.start.base in self._transcript_data.exon_start_positions
206+
and self._var_c.posedit.pos.start.offset == 0
206207
):
207-
result = self.INTRON
208+
self.at_boundary = True
209+
result = self.INTRON if global_config.mapping.ins_at_boundary_is_intronic else self.EXON
208210
elif (
209-
global_config.mapping.ins_at_boundary_is_intronic
210-
and self._var_c.posedit.edit.type == "dup"
211+
self._var_c.posedit.edit.type == "dup"
211212
and self._var_c.posedit.pos.end.base in self._transcript_data.exon_end_positions
213+
and self._var_c.posedit.pos.end.offset == 0
212214
):
213-
result = self.INTRON
215+
self.at_boundary = True
216+
result = self.INTRON if global_config.mapping.ins_at_boundary_is_intronic else self.EXON
214217
elif (
215-
not global_config.mapping.ins_at_boundary_is_intronic
216-
and self._var_c.posedit.edit.type == "ins"
218+
self._var_c.posedit.edit.type == "ins"
217219
and self._var_c.posedit.pos.start.offset == -1 and self._var_c.posedit.pos.end.offset == 0
218220
):
219-
result = self.EXON
221+
self.at_boundary = True
222+
result = self.INTRON if global_config.mapping.ins_at_boundary_is_intronic else self.EXON
220223
elif (
221-
not global_config.mapping.ins_at_boundary_is_intronic
222-
and self._var_c.posedit.edit.type == "ins"
224+
self._var_c.posedit.edit.type == "ins"
223225
and self._var_c.posedit.pos.start.offset == 0 and self._var_c.posedit.pos.end.offset == 1
224226
):
225-
result = self.EXON
227+
self.at_boundary = True
228+
result = self.INTRON if global_config.mapping.ins_at_boundary_is_intronic else self.EXON
226229
elif (
227-
not global_config.mapping.ins_at_boundary_is_intronic
228-
and self._var_c.posedit.edit.type == "dup"
230+
self._var_c.posedit.edit.type == "dup"
229231
and self._var_c.posedit.pos.end.offset == -1
230232
):
231-
result = self.EXON
233+
self.at_boundary = True
234+
result = self.INTRON if global_config.mapping.ins_at_boundary_is_intronic else self.EXON
232235
elif (
233-
not global_config.mapping.ins_at_boundary_is_intronic
234-
and self._var_c.posedit.edit.type == "dup"
236+
self._var_c.posedit.edit.type == "dup"
235237
and self._var_c.posedit.pos.start.offset == 1
236238
):
237-
result = self.EXON
239+
self.at_boundary = True
240+
result = self.INTRON if global_config.mapping.ins_at_boundary_is_intronic else self.EXON
238241
elif self._var_c.posedit.pos.start.offset != 0 or self._var_c.posedit.pos.end.offset != 0:
239242
# leave out anything else intronic for now
240243
result = self.INTRON

src/hgvs/variantmapper.py

Lines changed: 95 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@
1919
import hgvs.utils.altseqbuilder as altseqbuilder
2020
import hgvs.validator
2121
from hgvs.decorators.lru_cache import lru_cache
22-
from hgvs.enums import PrevalidationLevel
23-
from hgvs.exceptions import HGVSInvalidVariantError, HGVSUnsupportedOperationError
22+
from hgvs.enums import PrevalidationLevel, ShiftOverBoundaryPreference
23+
from hgvs.exceptions import HGVSInvalidVariantError, HGVSUnsupportedOperationError, HGVSInvalidIntervalError
2424
from hgvs.utils.reftranscriptdata import RefTranscriptData
2525

2626
_logger = logging.getLogger(__name__)
@@ -71,6 +71,8 @@ def __init__(
7171
replace_reference=hgvs.global_config.mapping.replace_reference,
7272
prevalidation_level=hgvs.global_config.mapping.prevalidation_level,
7373
add_gene_symbol=hgvs.global_config.mapping.add_gene_symbol,
74+
shift_over_boundary=hgvs.global_config.mapping.shift_over_boundary,
75+
shift_over_boundary_preference=hgvs.global_config.mapping.shift_over_boundary_preference,
7476
):
7577
"""
7678
:param bool replace_reference: replace reference (entails additional network access)
@@ -93,6 +95,11 @@ def __init__(
9395
self.left_normalizer = hgvs.normalizer.Normalizer(
9496
hdp, shuffle_direction=5, variantmapper=self
9597
)
98+
self.shift_over_boundary = shift_over_boundary
99+
if shift_over_boundary_preference is None:
100+
self.shift_over_boundary_preference = ShiftOverBoundaryPreference.DEFAULT
101+
else:
102+
self.shift_over_boundary_preference = ShiftOverBoundaryPreference[shift_over_boundary_preference.upper()]
96103

97104
# ############################################################################
98105
# g⟷t
@@ -436,21 +443,52 @@ def c_to_p(self, var_c, pro_ac=None, alt_ac=None, alt_aln_method=hgvs.global_con
436443
reference_data = RefTranscriptData(self.hdp, var_c.ac, pro_ac, translation_table=translation_table)
437444
builder = altseqbuilder.AltSeqBuilder(var_c, reference_data, translation_table=translation_table)
438445

446+
# attempt to shift ins/dup variants from the intron into the exon or vice versa
447+
if self.shift_over_boundary:
448+
shifts_into_exon_and_intron = False
449+
is_shifted = False
450+
original_region = builder.get_variant_region()
451+
if (
452+
var_c.posedit.edit.type in ['ins', 'dup']
453+
and original_region in [builder.INTRON, builder.EXON]
454+
):
455+
if alt_ac is None:
456+
raise HGVSUnsupportedOperationError(f'mapping specific variant {var_c} requires alt_ac')
457+
for shifted_var_c in self._var_c_shifts(var_c, alt_ac, alt_aln_method):
458+
shifted_reference_data = RefTranscriptData(self.hdp, shifted_var_c.ac, pro_ac)
459+
shifted_builder = altseqbuilder.AltSeqBuilder(shifted_var_c, shifted_reference_data)
460+
shifted_region = shifted_builder.get_variant_region()
461+
if shifted_region not in [shifted_builder.INTRON, shifted_builder.EXON]:
462+
continue
463+
if original_region != shifted_region:
464+
# a shift is posible
465+
shifts_into_exon_and_intron = True
466+
if self.shift_over_boundary_preference.name.lower() == shifted_region:
467+
# and that shift is preferred
468+
is_shifted = True
469+
reference_data = shifted_reference_data
470+
builder = shifted_builder
471+
break
472+
439473
# TODO: handle case where you get 2+ alt sequences back;
440474
# currently get list of 1 element loop structure implemented
441475
# to handle this, but doesn't really do anything currently.
442476
all_alt_data = builder.build_altseq()
443477

444478
var_ps = []
445479
for alt_data in all_alt_data:
446-
builder = altseq_to_hgvsp.AltSeqToHgvsp(reference_data, alt_data)
447-
var_p = builder.build_hgvsp()
480+
hgvsp_builder = altseq_to_hgvsp.AltSeqToHgvsp(reference_data, alt_data)
481+
var_p = hgvsp_builder.build_hgvsp()
448482
var_ps.append(var_p)
449483

450484
var_p = var_ps[0]
451485

452486
if self.add_gene_symbol:
453487
self._update_gene_symbol(var_p, var_c.gene)
488+
var_p.at_boundary = builder.at_boundary
489+
if self.shift_over_boundary:
490+
var_p.shifts_into_exon_and_intron = shifts_into_exon_and_intron
491+
var_p.is_shifted = is_shifted
454492

455493
return var_p
456494

@@ -625,6 +663,59 @@ def _update_gene_symbol(self, var, symbol):
625663
var.gene = symbol
626664
return var
627665

666+
def _var_c_shifts(self, var_c, alt_ac, alt_aln_method):
667+
"""Try to shift c. variants to find alternative representations."""
668+
strand = self._fetch_AlignmentMapper(tx_ac=var_c.ac, alt_ac=alt_ac, alt_aln_method=alt_aln_method).strand
669+
var_g = VariantMapper.c_to_g(self, var_c, alt_ac=alt_ac, alt_aln_method=alt_aln_method)
670+
for shifted_var_g in self._var_g_shifts(var_g, strand=strand, alt_aln_method=alt_aln_method):
671+
try:
672+
shifted_var_c = VariantMapper.g_to_c(self, shifted_var_g, tx_ac=var_c.ac, alt_aln_method=alt_aln_method)
673+
yield shifted_var_c
674+
except (HGVSInvalidVariantError, HGVSInvalidIntervalError, HGVSUnsupportedOperationError):
675+
pass
676+
677+
def _var_g_shifts(self, var_g, strand, alt_aln_method):
678+
"""Try to shift g. variants to find alternative representations."""
679+
prev_var_g_strs = [str(var_g)]
680+
for shuffle_direction in [3, 5]:
681+
try:
682+
shifted_var_g = self._var_g_shift_with_rewrite(var_g, shuffle_direction, strand, alt_aln_method)
683+
if str(shifted_var_g) in prev_var_g_strs:
684+
continue
685+
prev_var_g_strs.append(str(shifted_var_g))
686+
yield shifted_var_g
687+
except (HGVSInvalidVariantError, HGVSInvalidIntervalError, HGVSUnsupportedOperationError):
688+
pass
689+
690+
def _var_g_shift_with_rewrite(self, var_g, shuffle_direction, strand, alt_aln_method):
691+
"""Attempt to shift a variant all the way left or right. Rewrite
692+
duplications as insertions so that the variant is shifted farther
693+
than would normally be possible using the HGVS notation."""
694+
var_g = copy.deepcopy(var_g)
695+
normalizer = hgvs.normalizer.Normalizer(
696+
self.hdp, alt_aln_method=alt_aln_method, validate=False, shuffle_direction=shuffle_direction
697+
)
698+
var_g = normalizer.normalize(var_g)
699+
if var_g.posedit.edit.type == 'dup':
700+
self._replace_reference(var_g)
701+
if (strand == 1 and shuffle_direction == 3) or (strand == -1 and shuffle_direction == 5):
702+
var_g.posedit = hgvs.posedit.PosEdit(
703+
pos=hgvs.location.Interval(
704+
start=hgvs.location.SimplePosition(base=var_g.posedit.pos.start.base-1),
705+
end=hgvs.location.SimplePosition(base=var_g.posedit.pos.start.base),
706+
),
707+
edit=hgvs.edit.NARefAlt(ref=None, alt=var_g.posedit.edit.ref)
708+
)
709+
else:
710+
var_g.posedit = hgvs.posedit.PosEdit(
711+
pos=hgvs.location.Interval(
712+
start=hgvs.location.SimplePosition(base=var_g.posedit.pos.end.base),
713+
end=hgvs.location.SimplePosition(base=var_g.posedit.pos.end.base+1),
714+
),
715+
edit=hgvs.edit.NARefAlt(ref=None, alt=var_g.posedit.edit.ref)
716+
)
717+
return var_g
718+
628719

629720
# <LICENSE>
630721
# Copyright 2018 HGVS Contributors (https://github.com/biocommons/hgvs)

tests/data/gcp/real.tsv

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,14 +50,12 @@ ID00048 NC_000010.10:g.89692922T>C NM_000314.4:c.406T>C NP_000305.3:p.(Cys136Arg
5050
ID00049 NC_000010.10:g.89692921dupA NM_000314.4:c.405dupA NP_000305.3:p.(Cys136Metfs*44)
5151
ID00050 NC_000010.10:g.89692923_89692939delGTGCATATTTATTACAT NM_000314.4:c.407_423delGTGCATATTTATTACAT NP_000305.3:p.(Cys136Serfs*38)
5252
ID00051 NC_000010.10:g.89712015C>A NM_000314.4:c.633C>A NP_000305.3:p.(Cys211*)
53-
ID00052 NC_000010.10:g.89685314dupT NM_000314.4:c.209dupT NP_000305.3:p.?
5453
ID00053 NC_000010.10:g.89711893C>T NM_000314.4:c.511C>T NP_000305.3:p.(Gln171*)
5554
ID00054 NC_000010.10:g.89692963dupA NM_000314.4:c.447dupA NP_000305.3:p.(Glu150Argfs*30)
5655
ID00055 NC_000010.10:g.89685315G>A NM_000314.4:c.209+1G>A NP_000305.3:p.?
5756
ID00056 NC_000010.10:g.89693009delG NM_000314.4:c.492+1delG NP_000305.3:p.?
5857
ID00057 NC_000010.10:g.89711873A>C NM_000314.4:c.493-2A>C NP_000305.3:p.?
5958
ID00058 NC_000010.10:g.89717676G>A NM_000314.4:c.701G>A NP_000305.3:p.(Arg234Gln)
6059
ID00059 NC_000010.10:g.89717777G>A NM_000314.4:c.801+1G>A NP_000305.3:p.?
61-
ID00060 NC_000010.10:g.89720648dupT NM_000314.4:c.802-3dupT NP_000305.3:p.?
6260
ID00061 NC_000005.9:g.131705667G>T NM_003060.3:c.3G>T NP_003051.1:p.Met1?
6361
ID00062 NC_000005.9:g.131706014G>A NM_003060.3:c.350G>A NP_003051.1:p.(Trp117*)

0 commit comments

Comments
 (0)