Skip to content

Commit 19bb932

Browse files
committed
Arrays in Number=R tags can be now subscripted by alleles found in FORMAT/GT
For example FORMAT/AD[GT] > 10 .. require support of more than 10 reads for each allele FORMAT/AD[0:GT] > 10 .. same as above, but in the first sample sSUM(FORMAT/AD[GT]) > 20 .. require total sample depth bigger than 20 Resolves #1849
1 parent 5202e44 commit 19bb932

File tree

8 files changed

+147
-6
lines changed

8 files changed

+147
-6
lines changed

NEWS

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,12 @@ Changes affecting the whole of bcftools, or multiple commands:
1313
- evaluate to C_i = A_0 + B_i when length(A)=1 and set length(C)=length(B)
1414
- throw an error when length(A)!=length(B) AND length(A)!=1 AND length(B)!=1
1515

16+
- Arrays in Number=R tags can be now subscripted by alleles found in FORMAT/GT. For example,
17+
18+
FORMAT/AD[GT] > 10 .. require support of more than 10 reads for each allele
19+
FORMAT/AD[0:GT] > 10 .. same as above, but in the first sample
20+
sSUM(FORMAT/AD[GT]) > 20 .. require total sample depth bigger than 20
21+
1622

1723
Changes affecting specific commands:
1824

doc/bcftools.txt

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3505,7 +3505,7 @@ to require that all alleles are of the given type. Compare
35053505

35063506
* array subscripts (0-based), "*" for any element, "-" to indicate a range. Note that
35073507
for querying FORMAT vectors, the colon ":" can be used to select a sample and an
3508-
element of the vector, as shown in the examples below
3508+
element of the vector, as shown in the examples below.
35093509

35103510
INFO/AF[0] > 0.3 .. first AF value bigger than 0.3
35113511
FORMAT/AD[0:0] > 30 .. first AD value of the first sample bigger than 30
@@ -3522,6 +3522,14 @@ element of the vector, as shown in the examples below
35223522
(DP4[0]+DP4[1])/(DP4[2]+DP4[3]) > 0.3
35233523
CSQ[*] ~ "missense_variant.*deleterious"
35243524

3525+
* in addition to array subscripts shown above, it is possible to subscript arrays
3526+
of Number=R tags by alleles found in FORMAT/GT (starting with version 1.17).
3527+
For example
3528+
3529+
FORMAT/AD[GT] > 10 .. require support of more than 10 reads for each allele
3530+
FORMAT/AD[0:GT] > 10 .. same as above, but in the first sample
3531+
sSUM(FORMAT/AD[GT]) > 20 .. require total sample depth bigger than 20
3532+
35253533
* with many samples it can be more practical to provide a file with sample names,
35263534
one sample name per line
35273535

filter.c

Lines changed: 111 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ typedef struct _token_t
6969
int hdr_id, tag_type; // BCF header lookup ID and one of BCF_HL_* types
7070
int idx; // 0-based index to VCF vectors,
7171
// -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..])
72+
// -3: select indices on the fly based on values in GT
7273
int *idxs; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited; used by VCF retrievers only
7374
int nidxs, nuidxs; // size of idxs array and the number of elements set to 1
7475
uint8_t *usmpl; // bitmask of used samples as set by idx, set for FORMAT fields, NULL otherwise
@@ -100,6 +101,11 @@ struct _filter_t
100101
float *tmpf;
101102
kstring_t tmps;
102103
int max_unpack, mtmpi, mtmpf, nsamples;
104+
struct {
105+
bcf1_t *line;
106+
int32_t *buf, nbuf, mbuf; // GTs as obtained by bcf_get_genotypes()
107+
uint64_t *mask; // GTs as mask, e.g 0/0 is 1; 0/1 is 3, max 63 unique alleles
108+
} cached_GT;
103109
#if ENABLE_PERL_FILTERS
104110
PerlInterpreter *perl;
105111
#endif
@@ -350,6 +356,44 @@ char *expand_path(char *path)
350356

351357
return strdup(path);
352358
}
359+
static int filters_cache_genotypes(filter_t *flt, bcf1_t *line)
360+
{
361+
if ( flt->cached_GT.line==line ) return flt->cached_GT.nbuf > 0 ? 0 : -1;
362+
flt->cached_GT.line = line;
363+
flt->cached_GT.nbuf = bcf_get_genotypes(flt->hdr, line, &flt->cached_GT.buf, &flt->cached_GT.mbuf);
364+
if ( flt->cached_GT.nbuf<=0 ) return -1;
365+
if ( !flt->cached_GT.mask )
366+
{
367+
flt->cached_GT.mask = (uint64_t*) malloc(sizeof(*flt->cached_GT.mask)*flt->nsamples);
368+
if ( !flt->cached_GT.mask ) error("Could not alloc %zu bytes\n",sizeof(*flt->cached_GT.mask)*flt->nsamples);
369+
}
370+
int i,j, ngt1 = flt->cached_GT.nbuf / line->n_sample;
371+
for (i=0; i<line->n_sample; i++)
372+
{
373+
int32_t *ptr = flt->cached_GT.buf + i*ngt1;
374+
flt->cached_GT.mask[i] = 0;
375+
for (j=0; j<ngt1; j++)
376+
{
377+
if ( bcf_gt_is_missing(ptr[j]) ) continue;
378+
if ( ptr[j]==bcf_int32_vector_end ) break;
379+
int allele = bcf_gt_allele(ptr[j]);
380+
if ( allele > 63 )
381+
{
382+
static int warned = 0;
383+
if ( !warned )
384+
{
385+
fprintf(stderr,"Too many alleles, skipping GT filtering at this site %s:%"PRId64". "
386+
"(This warning is printed only once.)\n", bcf_seqname(flt->hdr,line),line->pos+1);
387+
warned = 1;
388+
}
389+
flt->cached_GT.nbuf = 0;
390+
return -1;
391+
}
392+
flt->cached_GT.mask[i] |= 1<<allele;
393+
}
394+
}
395+
return 0;
396+
}
353397

354398
static void filters_set_qual(filter_t *flt, bcf1_t *line, token_t *tok)
355399
{
@@ -762,6 +806,27 @@ static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok)
762806
tok->values[i] = ptr[tok->idx];
763807
}
764808
}
809+
else if ( tok->idx==-3 )
810+
{
811+
if ( filters_cache_genotypes(flt,line)!=0 )
812+
{
813+
tok->nvalues = 0;
814+
return;
815+
}
816+
for (i=0; i<tok->nsamples; i++)
817+
{
818+
if ( !tok->usmpl[i] ) continue;
819+
int32_t *src = flt->tmpi + i*nsrc1;
820+
double *dst = tok->values + i*tok->nval1;
821+
int k, j = 0;
822+
for (k=0; k<nsrc1; k++) // source values are AD[0..nsrc1]
823+
{
824+
if ( !(flt->cached_GT.mask[i] & (1<<k)) ) continue;
825+
dst[j++] = src[k];
826+
}
827+
for (; j<tok->nval1; j++) bcf_double_set_vector_end(dst[j]);
828+
}
829+
}
765830
else
766831
{
767832
int kend = tok->idxs[tok->nidxs-1] < 0 ? tok->nval1 : tok->nidxs;
@@ -825,6 +890,33 @@ static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok)
825890
tok->values[i] = ptr[tok->idx];
826891
}
827892
}
893+
else if ( tok->idx==-3 )
894+
{
895+
if ( filters_cache_genotypes(flt,line)!=0 )
896+
{
897+
tok->nvalues = 0;
898+
return;
899+
}
900+
for (i=0; i<tok->nsamples; i++)
901+
{
902+
if ( !tok->usmpl[i] ) continue;
903+
float *src = flt->tmpf + i*nsrc1;
904+
double *dst = tok->values + i*tok->nval1;
905+
int k, j = 0;
906+
for (k=0; k<nsrc1; k++) // source values are AF[0..nsrc1]
907+
{
908+
if ( !(flt->cached_GT.mask[i] & (1<<k)) ) continue;
909+
if ( bcf_float_is_missing(src[k]) )
910+
bcf_double_set_missing(dst[j]);
911+
else if ( bcf_float_is_vector_end(src[k]) )
912+
bcf_double_set_vector_end(dst[j]);
913+
else
914+
dst[j] = src[k];
915+
j++;
916+
}
917+
for (; j<tok->nval1; j++) bcf_double_set_vector_end(dst[j]);
918+
}
919+
}
828920
else
829921
{
830922
int kend = tok->idxs[tok->nidxs-1] < 0 ? tok->nval1 : tok->nidxs;
@@ -989,9 +1081,9 @@ static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int
9891081
tok->str_value.s[tok->str_value.l] = 0;
9901082
tok->nval1 = nvals1;
9911083
}
992-
static void filters_set_genotype2(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 2); }
993-
static void filters_set_genotype3(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 3); }
994-
static void filters_set_genotype4(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 4); }
1084+
static void filters_set_genotype2(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 2); } // rr, ra, aa, aA etc
1085+
static void filters_set_genotype3(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 3); } // hap, hom, het
1086+
static void filters_set_genotype4(filter_t *flt, bcf1_t *line, token_t *tok) { _filters_set_genotype(flt, line, tok, 4); } // mis, alt, ref
9951087

9961088
static void filters_set_genotype_string(filter_t *flt, bcf1_t *line, token_t *tok)
9971089
{
@@ -2480,6 +2572,14 @@ static int parse_idxs(char *tag_idx, int **idxs, int *nidxs, int *idx)
24802572
*idx = -2;
24812573
return 0;
24822574
}
2575+
if ( !strcmp("GT", tag_idx) )
2576+
{
2577+
*idxs = (int*) malloc(sizeof(int));
2578+
(*idxs)[0] = -1;
2579+
*nidxs = 1;
2580+
*idx = -3;
2581+
return 0;
2582+
}
24832583

24842584
// TAG[integer] .. one field; idx positive
24852585
char *end, *beg = tag_idx;
@@ -2595,7 +2695,7 @@ static void parse_tag_idx(bcf_hdr_t *hdr, int is_fmt, char *tag, char *tag_idx,
25952695
tok->idxs = (int*) malloc(sizeof(int));
25962696
tok->idxs[0] = -1;
25972697
tok->nidxs = 1;
2598-
tok->idx = -2;
2698+
tok->idx = idx1;
25992699
}
26002700
else if ( bcf_hdr_id2number(hdr,BCF_HL_FMT,tok->hdr_id)!=1 )
26012701
error("The FORMAT tag %s can have multiple subfields, run as %s[sample:subfield]\n", tag,tag);
@@ -2620,7 +2720,7 @@ static void parse_tag_idx(bcf_hdr_t *hdr, int is_fmt, char *tag, char *tag_idx,
26202720
if ( idx1 >= bcf_hdr_nsamples(hdr) ) error("The sample index is too large: %s\n", ori);
26212721
tok->usmpl[idx1] = 1;
26222722
}
2623-
else if ( idx1==-2 )
2723+
else if ( idx1==-2 || idx1==-3 )
26242724
{
26252725
for (i=0; i<nidxs1; i++)
26262726
{
@@ -2820,7 +2920,11 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
28202920
if ( is_fmt==-1 ) is_fmt = 0;
28212921
}
28222922
if ( is_array )
2923+
{
28232924
parse_tag_idx(filter->hdr, is_fmt, tmp.s, tmp.s+is_array, tok);
2925+
if ( tok->idx==-3 && bcf_hdr_id2length(filter->hdr,BCF_HL_FMT,tok->hdr_id)!=BCF_VL_R )
2926+
error("Error: GT subscripts can be used only with Number=R tags\n");
2927+
}
28242928
else if ( is_fmt && !tok->nsamples )
28252929
{
28262930
int i;
@@ -3525,6 +3629,8 @@ void filter_destroy(filter_t *filter)
35253629
free(filter->filters[i].regex);
35263630
}
35273631
}
3632+
free(filter->cached_GT.buf);
3633+
free(filter->cached_GT.mask);
35283634
free(filter->filters);
35293635
free(filter->flt_stack);
35303636
free(filter->str);

test/query.92.out

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
3000151 0/0 A 10,0,0
2+
3000151 0/0 B 10,0,0
3+
3000155 0/2 A 20,0,10
4+
3000155 0/2 B 200,0,10
5+
3000157 1/2 A 0,40,10
6+
3000157 1/2 B 0,400,10

test/query.93.out

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
3000153 2/2 B 0,0,210
2+
3000155 0/2 B 200,0,10

test/query.94.out

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
3000154 0/1 A 30,13,0
2+
3000156 1/0 A 30,14,0
3+
3000158 2/0 A 0,5,30

test/setGT.6.2.out

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
##fileformat=VCFv4.2
2+
##FILTER=<ID=PASS,Description="All filters passed">
3+
##contig=<ID=1,assembly=b37,length=249250621>
4+
##reference=file:ref.fa
5+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
6+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B C D
7+
1 1890 . A G . . . GT ./. 1|. ./. 1|1

test/test.pl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,9 @@
184184
run_test(\&test_vcf_query,$opts,in=>'query.filter.5',out=>'query.57.out',args=>q[-f'[%POS\\t%SAMPLE\\t%GT\\t%AD\\n]' -i'GT="het" & binom(FMT/AD)>0.01']);
185185
run_test(\&test_vcf_query,$opts,in=>'query.filter.5',out=>'query.58.out',args=>q[-f'[%POS\\t%SAMPLE\\t%GT\\t%AD\\n]' -i'GT="het" & binom(FMT/AD[:0],FMT/AD[:1])>0.01']);
186186
run_test(\&test_vcf_query,$opts,in=>'query.filter.5',out=>'query.59.out',args=>q[-f'%POS\\t%AD\\n' -i'binom(INFO/AD[0],INFO/AD[1])>0.01']);
187+
run_test(\&test_vcf_query,$opts,in=>'query.filter.5',out=>'query.92.out',args=>q[-f'[%POS\\t%GT\\t%SAMPLE\\t%AD\\n]' -i'FMT/AD[GT]==10']);
188+
run_test(\&test_vcf_query,$opts,in=>'query.filter.5',out=>'query.93.out',args=>q[-f'[%POS\\t%GT\\t%SAMPLE\\t%AD\\n]' -i'sSUM(FMT/AD[GT])==210']);
189+
run_test(\&test_vcf_query,$opts,in=>'query.filter.5',out=>'query.94.out',args=>q[-f'[%POS\\t%GT\\t%SAMPLE\\t%AD\\n]' -i'FMT/AD[0:GT]==30']);
187190
run_test(\&test_vcf_query,$opts,in=>'query',out=>'query.60.out',args=>q[-f'%CHROM %POS\\n' -i'CHROM="4"']);
188191
run_test(\&test_vcf_query,$opts,in=>'query.negative',out=>'query.61.out',args=>q[-f'%POS\\t%TAG1\\n' -i'(TAG1>=-129 && TAG1<=-120) || (TAG1>=-32769 && TAG1<=-32760)']);
189192
run_test(\&test_vcf_query,$opts,in=>'query.negative',out=>'query.61.out',args=>q[-f'%POS\\t%TAGV1\\n' -i'(TAGV1>=-129 && TAGV1<=-120) || (TAGV1>=-32769 && TAGV1<=-32760)']);

0 commit comments

Comments
 (0)