Skip to content

Commit 7040c10

Browse files
committed
Support normalization of symbolic <DEL.*> alleles
Resolves #1919
1 parent 4fb7100 commit 7040c10

File tree

6 files changed

+56
-4
lines changed

6 files changed

+56
-4
lines changed

NEWS

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,8 @@ Changes affecting specific commands:
6565

6666
- The `-m, --multiallelics +` mode now preserves phasing (#1893)
6767

68+
- Symbolic <DEL.*> alleles are now normalized too (#1919)
69+
6870
* bcftools query
6971

7072
- Force newline character in formatting expression when not given explicitly

test/norm.symbolic.1.out

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
##fileformat=VCFv4.2
2+
##FILTER=<ID=PASS,Description="All filters passed">
3+
##contig=<ID=20,length=2147483647>
4+
##INFO=<ID=END,Number=1,Type=Integer,Description="End position">
5+
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
6+
##INFO=<ID=ORI,Number=1,Type=String,Description="Original variant. Format: CHR|POS|REF|ALT|USED_ALT_IDX">
7+
#CHROM POS ID REF ALT QUAL FILTER INFO
8+
20 15 . TAC T . . ORI=20|24|ACA|A
9+
20 15 . TAC <DEL> . . END=17;SVTYPE=DEL;ORI=20|24|A|<DEL>
10+
20 93 . CAAA C . . ORI=20|98|AAAA|A
11+
20 93 . CAAA <DEL> . . END=96;SVTYPE=DEL;ORI=20|98|A|<DEL>

test/norm.symbolic.fa

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
>20
2+
AGGATGGGGCTCATTACACACACACACCTTGTCTCCAGAATCACTGGTGAGGAAGGGGAG
3+
TGCAGCCTGGGAGACAGAGCAAGACTCCATCTCAAAAAAAAAAAAAAAAAAAAAGGCCAT

test/norm.symbolic.vcf

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
##fileformat=VCFv4.2
2+
##contig=<ID=20,length=2147483647>
3+
##INFO=<ID=END,Number=1,Type=Integer,Description="End position">
4+
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
5+
#CHROM POS ID REF ALT QUAL FILTER INFO
6+
20 24 . ACA A . . .
7+
20 24 . A <DEL> . . END=26;SVTYPE=DEL
8+
20 98 . AAAA A . . .
9+
20 98 . A <DEL> . . END=101;SVTYPE=DEL

test/test.pl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,7 @@
285285
run_test(\&test_vcf_norm,$opts,in=>'norm.m-any',out=>'norm.m-any.1.out',args=>'-m -any');
286286
run_test(\&test_vcf_norm,$opts,in=>'norm.phased-split',out=>'norm.phased-split.1.out',args=>'-m -any');
287287
run_test(\&test_vcf_norm,$opts,in=>'norm.phased-join',out=>'norm.phased-join.1.out',args=>'-m +any');
288+
run_test(\&test_vcf_norm,$opts,in=>'norm.symbolic',fai=>'norm.symbolic',out=>'norm.symbolic.1.out',args=>'--old-rec-tag ORI');
288289
run_test(\&test_vcf_view,$opts,in=>'view',out=>'view.1.out',args=>'-aUc1 -C1 -s NA00002 -v snps',reg=>'');
289290
run_test(\&test_vcf_view,$opts,in=>'view',out=>'view.2.out',args=>'-f PASS -Xks NA00003',reg=>'-r20,Y');
290291
run_test(\&test_vcf_view,$opts,in=>'view',out=>'view.3.out',args=>'-xs NA00003',reg=>'');

vcfnorm.c

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,8 @@ typedef struct
8686
int32_t *int32_arr;
8787
int ntmp_arr1, ntmp_arr2, nint32_arr;
8888
kstring_t *tmp_str;
89-
kstring_t *tmp_als, tmp_kstr;
90-
int ntmp_als;
89+
kstring_t *tmp_als, *tmp_del, tmp_kstr;
90+
int ntmp_als, ntmp_del;
9191
rbuf_t rbuf;
9292
int buf_win; // maximum distance between two records to consider
9393
int aln_win; // the realignment window size (maximum repeat size)
@@ -398,10 +398,32 @@ static int realign(args_t *args, bcf1_t *line)
398398

399399
// make a copy of each allele for trimming
400400
hts_expand0(kstring_t,line->n_allele,args->ntmp_als,args->tmp_als);
401+
hts_expand0(kstring_t,line->n_allele,args->ntmp_del,args->tmp_del);
401402
kstring_t *als = args->tmp_als;
403+
kstring_t *del = args->tmp_del;
402404
for (i=0; i<line->n_allele; i++)
403405
{
404-
if ( line->d.allele[i][0]=='<' ) return ERR_SYMBOLIC; // symbolic allele
406+
del[i].l = 0;
407+
if ( line->d.allele[i][0]=='<' )
408+
{
409+
// symbolic allele, only <DEL.*> will be realigned
410+
if ( strncmp("<DEL",line->d.allele[i],4) ) return ERR_SYMBOLIC;
411+
if ( nref < line->rlen )
412+
{
413+
free(ref);
414+
reflen = line->rlen;
415+
ref = faidx_fetch_seq(args->fai, (char*)args->hdr->id[BCF_DT_CTG][line->rid].key, line->pos, line->pos+reflen-1, &nref);
416+
if ( !ref ) error("faidx_fetch_seq failed at %s:%"PRId64"\n", args->hdr->id[BCF_DT_CTG][line->rid].key, (int64_t) line->pos+1);
417+
seq_to_upper(ref,0);
418+
replace_iupac_codes(ref,nref); // any non-ACGT character in fasta ref is replaced with N
419+
als[0].l = 0;
420+
kputs(ref, &als[0]);
421+
als[i].l = 0;
422+
kputsn(ref,1,&als[i]);
423+
kputs(line->d.allele[i],&del[i]);
424+
continue;
425+
}
426+
}
405427
if ( line->d.allele[i][0]=='*' ) return ERR_SPANNING_DELETION; // spanning deletion
406428
if ( has_non_acgtn(line->d.allele[i],line->shared.l) )
407429
{
@@ -493,7 +515,8 @@ static int realign(args_t *args, bcf1_t *line)
493515
for (i=0; i<line->n_allele; i++)
494516
{
495517
if (i>0) kputc(',',&args->tmp_kstr);
496-
kputsn(als[i].s,als[i].l,&args->tmp_kstr);
518+
if ( del[i].l ) kputs(del[i].s,&args->tmp_kstr);
519+
else kputsn(als[i].s,als[i].l,&args->tmp_kstr);
497520
}
498521
args->tmp_kstr.s[ args->tmp_kstr.l ] = 0;
499522
bcf_update_alleles_str(args->out_hdr,line,args->tmp_kstr.s);
@@ -1939,7 +1962,10 @@ static void destroy_data(args_t *args)
19391962
free(args->maps[i].map);
19401963
for (i=0; i<args->ntmp_als; i++)
19411964
free(args->tmp_als[i].s);
1965+
for (i=0; i<args->ntmp_del; i++)
1966+
free(args->tmp_del[i].s);
19421967
free(args->tmp_als);
1968+
free(args->tmp_del);
19431969
free(args->tmp_kstr.s);
19441970
if ( args->tmp_str )
19451971
{

0 commit comments

Comments
 (0)