Skip to content

Commit a1b781d

Browse files
committed
Don't expand REF when symbolic <DEL> allele is present
Symbolic <DEL> alleles caused norm to expand REF to the full length of the deletion. This was not intended and was problematic for long deletions, the REF allele should list one base only. Resolves #2029
1 parent 3b24f36 commit a1b781d

File tree

6 files changed

+40
-9
lines changed

6 files changed

+40
-9
lines changed

NEWS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ Changes affecting specific commands:
4040

4141
- Allow combining -m and -a with --old-rec-tag (#2020)
4242

43+
- Symbolic <DEL> alleles caused norm to expand REF to the full length of the deletion.
44+
This was not intended and problematic for long deletions, the REF allele should list
45+
one base only (#2029)
46+
4347
* bcftools query
4448

4549
- Add new `-N, --disable-automatic-newline` option for pre-1.18 query formatting behavior

test/norm.symbolic.1.out

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,6 @@
66
##INFO=<ID=ORI,Number=1,Type=String,Description="Original variant. Format: CHR|POS|REF|ALT|USED_ALT_IDX">
77
#CHROM POS ID REF ALT QUAL FILTER INFO
88
20 15 . TAC T . . ORI=20|24|ACA|A
9-
20 15 . TAC <DEL> . . END=17;SVTYPE=DEL;ORI=20|24|A|<DEL>
9+
20 15 . T <DEL> . . END=17;SVTYPE=DEL;ORI=20|24|A|<DEL>
1010
20 93 . CAAA C . . ORI=20|98|AAAA|A
11-
20 93 . CAAA <DEL> . . END=96;SVTYPE=DEL;ORI=20|98|A|<DEL>
11+
20 93 . C <DEL> . . END=96;SVTYPE=DEL;ORI=20|98|A|<DEL>

test/norm.symbolic.2.out

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
##fileformat=VCFv4.2
2+
##FILTER=<ID=PASS,Description="All filters passed">
3+
##contig=<ID=20,length=2147483647>
4+
##INFO=<ID=END,Number=1,Type=Integer,Description="End position">
5+
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
6+
##INFO=<ID=ORI,Number=1,Type=String,Description="Original variant. Format: CHR|POS|REF|ALT|USED_ALT_IDX">
7+
#CHROM POS ID REF ALT QUAL FILTER INFO
8+
20 15 . TAC T,<DEL> . . END=17;SVTYPE=DEL;ORI=20|24|ACA|A,<DEL>
9+
20 93 . CAAA C . . ORI=20|98|AAAA|A
10+
20 93 . C <DEL> . . END=96;SVTYPE=DEL;ORI=20|98|A|<DEL>

test/norm.symbolic.2.vcf

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
##fileformat=VCFv4.2
2+
##contig=<ID=20,length=2147483647>
3+
##INFO=<ID=END,Number=1,Type=Integer,Description="End position">
4+
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
5+
#CHROM POS ID REF ALT QUAL FILTER INFO
6+
20 24 . ACA A,<DEL> . . END=26;SVTYPE=DEL
7+
20 98 . AAAA A . . .
8+
20 98 . A <DEL> . . END=101;SVTYPE=DEL

test/test.pl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,7 @@
289289
run_test(\&test_vcf_norm,$opts,in=>'norm.phased-split',out=>'norm.phased-split.1.out',args=>'-m -any');
290290
run_test(\&test_vcf_norm,$opts,in=>'norm.phased-join',out=>'norm.phased-join.1.out',args=>'-m +any');
291291
run_test(\&test_vcf_norm,$opts,in=>'norm.symbolic',fai=>'norm.symbolic',out=>'norm.symbolic.1.out',args=>'--old-rec-tag ORI');
292+
run_test(\&test_vcf_norm,$opts,in=>'norm.symbolic.2',fai=>'norm.symbolic',out=>'norm.symbolic.2.out',args=>'--old-rec-tag ORI');
292293
run_test(\&test_vcf_norm,$opts,in=>'norm.right-align',fai=>'norm.right-align',out=>'norm.right-align.1.out',args=>'--old-rec-tag ORI');
293294
run_test(\&test_vcf_norm,$opts,in=>'norm.right-align',fai=>'norm.right-align',out=>'norm.right-align.2.out',args=>'--old-rec-tag ORI -g {PATH}/norm.right-align.gff');
294295
run_test(\&test_vcf_view,$opts,in=>'weird-chr-names',out=>'weird-chr-names.1.out',args=>'',reg=>'-r 1');

vcfnorm.c

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -559,6 +559,7 @@ static int realign(args_t *args, bcf1_t *line)
559559
hts_expand0(kstring_t,line->n_allele,args->ntmp_del,args->tmp_del);
560560
kstring_t *als = args->tmp_als;
561561
kstring_t *del = args->tmp_del;
562+
int symbolic_alts = 1;
562563
for (i=0; i<line->n_allele; i++)
563564
{
564565
del[i].l = 0;
@@ -576,12 +577,13 @@ static int realign(args_t *args, bcf1_t *line)
576577
replace_iupac_codes(ref,nref); // any non-ACGT character in fasta ref is replaced with N
577578
als[0].l = 0;
578579
kputs(ref, &als[0]);
579-
als[i].l = 0;
580-
kputsn(ref,1,&als[i]);
581-
kputs(line->d.allele[i],&del[i]);
582-
continue;
583580
}
581+
als[i].l = 0;
582+
kputsn(als[0].s,1,&als[i]);
583+
kputs(line->d.allele[i],&del[i]);
584+
continue;
584585
}
586+
if ( i>0 ) symbolic_alts = 0;
585587
if ( line->d.allele[i][0]=='*' ) return ERR_SPANNING_DELETION; // spanning deletion
586588
if ( has_non_acgtn(line->d.allele[i],line->shared.l) )
587589
{
@@ -610,8 +612,15 @@ static int realign(args_t *args, bcf1_t *line)
610612
else
611613
new_pos = realign_right(args, line);
612614

613-
// Have the alleles changed?
614-
als[0].s[ als[0].l ] = 0; // in order for strcmp to work
615+
// Have the alleles changed? Consider <DEL> could have expanded the REF allele. In that
616+
// case it must be trimmed, however the new REF length must reflect the entire length.
617+
als[0].s[ als[0].l ] = 0; // for strcmp to work
618+
int new_reflen = strlen(als[0].s);
619+
if ( symbolic_alts )
620+
{
621+
als[0].l = 1;
622+
als[0].s[ als[0].l ] = 0;
623+
}
615624
if ( new_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK;
616625

617626
set_old_rec_tag(args, line, line, 0);
@@ -629,7 +638,6 @@ static int realign(args_t *args, bcf1_t *line)
629638
args->nchanged++;
630639

631640
// Update INFO/END if necessary
632-
int new_reflen = strlen(line->d.allele[0]);
633641
if ( (new_pos!=line->pos || reflen!=new_reflen) && bcf_get_info_int32(args->hdr, line, "END", &args->int32_arr, &args->nint32_arr)==1 )
634642
{
635643
// bcf_update_alleles_str() messed up rlen because line->pos changed. This will be fixed by bcf_update_info_int32()

0 commit comments

Comments
 (0)