Skip to content

Commit c7628e8

Browse files
committed
Fix GT indexing
When setting value by determining index from the genotype, we face the problem of how to interpret truncating arrays. Say we have TAG defined as Number=. and GT:TAG 1/1:0,1,2 0/0:0 Then when querying we expect the following expression to evaluate for the second sample as -i 'TAG[1:1]="."' .. true -i 'TAG[1:GT]="."' .. false The problem is that the implementation truncates the number of fields, filling usually fewer than the original number of per-sample values. This is fixed by adding an exception that makes the code aware of this. Fixes #2133
1 parent 9b83399 commit c7628e8

File tree

5 files changed

+54
-9
lines changed

5 files changed

+54
-9
lines changed

filter.c

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -974,6 +974,7 @@ static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok)
974974
if ( !(flt->cached_GT.mask[i] & (1<<k)) ) continue;
975975
dst[j++] = src[k];
976976
}
977+
if ( !j ) { bcf_double_set_missing(dst[j]); j++; }
977978
for (; j<tok->nval1; j++) bcf_double_set_vector_end(dst[j]);
978979
}
979980
}
@@ -1064,6 +1065,7 @@ static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok)
10641065
dst[j] = src[k];
10651066
j++;
10661067
}
1068+
if ( !j ) { bcf_double_set_missing(dst[j]); j++; }
10671069
for (; j<tok->nval1; j++) bcf_double_set_vector_end(dst[j]);
10681070
}
10691071
}
@@ -1113,30 +1115,39 @@ static void filters_set_format_string(filter_t *flt, bcf1_t *line, token_t *tok)
11131115

11141116
if ( nstr<0 ) return;
11151117

1118+
if ( tok->idx==-3 && filters_cache_genotypes(flt,line)!=0 ) return;
1119+
11161120
tok->nvalues = tok->str_value.l = nstr;
11171121
tok->nval1 = nstr / tok->nsamples;
11181122
for (i=0; i<tok->nsamples; i++)
11191123
{
11201124
if ( !tok->usmpl[i] ) continue;
11211125
char *src = tok->str_value.s + i*tok->nval1, *dst = src;
1122-
int ibeg = 0, idx = 0;
1126+
int ibeg = 0;
1127+
int idx = 0; // idx-th field
11231128
while ( ibeg < tok->nval1 )
11241129
{
11251130
int iend = ibeg;
11261131
while ( iend < tok->nval1 && src[iend] && src[iend]!=',' ) iend++;
11271132

11281133
int keep = 0;
1129-
if ( tok->idx >= 0 )
1134+
if ( tok->idx >= 0 ) // single index, given explicitly, e.g. AD[:1]
11301135
{
11311136
if ( tok->idx==idx ) keep = 1;
11321137
}
1133-
else if ( idx < tok->nidxs )
1138+
else if ( tok->idx == -3 ) // given by GT index, e.g. AD[:GT]
11341139
{
1135-
if ( tok->idxs[idx] != 0 ) keep = 1;
1140+
if ( flt->cached_GT.mask[i] & (1<<idx) ) keep = 1;
1141+
}
1142+
else // given as a list, e.g. AD[:0,3]
1143+
{
1144+
if ( idx < tok->nidxs )
1145+
{
1146+
if ( tok->idxs[idx] != 0 ) keep = 1;
1147+
}
1148+
else if ( tok->idxs[tok->nidxs-1] < 0 )
1149+
keep = 1;
11361150
}
1137-
else if ( tok->idxs[tok->nidxs-1] < 0 )
1138-
keep = 1;
1139-
11401151
if ( keep )
11411152
{
11421153
if ( ibeg!=0 ) memmove(dst, src+ibeg, iend-ibeg+1);
@@ -2407,6 +2418,18 @@ static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token
24072418
return 2;
24082419
}
24092420

2421+
// A note about comparisons:
2422+
// When setting value by determining index from the genotype, we face the problem
2423+
// of how to interpret truncating arrays. Say we have TAG defined as Number=. and
2424+
// GT:TAG 1/1:0,1,2 0/0:0
2425+
// Then when querying we expect the following expression to evaluate for the second
2426+
// sample as
2427+
// -i 'TAG[1:1]="."' .. true
2428+
// -i 'TAG[1:GT]="."' .. false
2429+
// The problem is that the implementation truncates the number of fields, filling
2430+
// usually fewer than the original number of per-sample values. This is fixed by
2431+
// adding an exception that makes the code aware of this: the GT indexing can be
2432+
// recognised by haveing tok->idx==-3
24102433
#define CMP_VECTORS(atok,btok,_rtok,CMP_OP,missing_logic) \
24112434
{ \
24122435
token_t *rtok = _rtok; \
@@ -2494,6 +2517,8 @@ static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token
24942517
double *bptr = btok->values + i*btok->nval1; \
24952518
for (j=0; j<atok->nval1; j++) \
24962519
{ \
2520+
if ( atok->idx==-3 && bcf_double_is_vector_end(aptr[j]) ) break; /* explained above */ \
2521+
if ( btok->idx==-3 && bcf_double_is_vector_end(bptr[j]) ) break; /* explained above */ \
24972522
int nmiss = bcf_double_is_missing_or_vector_end(aptr[j]) ? 1 : 0; \
24982523
if ( nmiss && !missing_logic[0] ) continue; /* any is missing => result is false */ \
24992524
nmiss += (bcf_double_is_missing_or_vector_end(bptr[j]) ? 1 : 0); \
@@ -2515,9 +2540,10 @@ static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token
25152540
{ \
25162541
if ( !rtok->usmpl[i] ) continue; \
25172542
double *aptr = atok->values + i*atok->nval1; \
2518-
double *bptr = btok->values + i*btok->nval1; \
2543+
double *bptr = btok->values; \
25192544
for (j=0; j<atok->nval1; j++) \
25202545
{ \
2546+
if ( atok->idx==-3 && bcf_double_is_vector_end(aptr[j]) ) break; /* explained above */ \
25212547
int miss = bcf_double_is_missing_or_vector_end(aptr[j]) ? 1 : 0; \
25222548
if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \
25232549
for (k=0; k<btok->nvalues; k++) \
@@ -2541,10 +2567,11 @@ static int vector_logic_and(filter_t *filter, bcf1_t *line, token_t *rtok, token
25412567
for (i=0; i<btok->nsamples; i++) \
25422568
{ \
25432569
if ( !rtok->usmpl[i] ) continue; \
2544-
double *aptr = atok->values + i*atok->nval1; \
2570+
double *aptr = atok->values; \
25452571
double *bptr = btok->values + i*btok->nval1; \
25462572
for (j=0; j<btok->nval1; j++) \
25472573
{ \
2574+
if ( atok->idx==-3 && bcf_double_is_vector_end(bptr[j]) ) break; /* explained above */ \
25482575
int miss = bcf_double_is_missing_or_vector_end(bptr[j]) ? 1 : 0; \
25492576
if ( miss && !missing_logic[0] ) continue; /* any is missing => result is false */ \
25502577
for (k=0; k<atok->nvalues; k++) \

test/query.gtidx.1.out

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
1/2 A,B,C AA,BB,CC 0,1,2 0,1,2

test/query.gtidx.1.vcf

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
##fileformat=VCFv4.2
2+
##FILTER=<ID=PASS,Description="All filters passed">
3+
##contig=<ID=1,length=248956422>
4+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
5+
##FORMAT=<ID=TC,Number=R,Type=Character,Description=".">
6+
##FORMAT=<ID=TS,Number=R,Type=String,Description=".">
7+
##FORMAT=<ID=TI,Number=R,Type=Integer,Description=".">
8+
##FORMAT=<ID=TF,Number=R,Type=Float,Description=".">
9+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE
10+
1 1 . CAA C,CA . . . GT:TC:TS:TI:TF 1/2:A,B,C:AA,BB,CC:0,1,2:0,1,2
11+
1 2 . CAA C,CA . . . GT:TC:TS:TI:TF 2/2:A,B:AA,BB:0,1:0,1

test/query.gtidx.2.out

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
2/2 0,1

test/test.pl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,11 @@
196196
run_test(\&test_vcf_query,$opts,in=>'query.filter.5',out=>'query.92.out',args=>q[-f'[%POS\\t%GT\\t%SAMPLE\\t%AD\\n]' -i'FMT/AD[GT]==10']);
197197
run_test(\&test_vcf_query,$opts,in=>'query.filter.5',out=>'query.93.out',args=>q[-f'[%POS\\t%GT\\t%SAMPLE\\t%AD\\n]' -i'sSUM(FMT/AD[GT])==210']);
198198
run_test(\&test_vcf_query,$opts,in=>'query.filter.5',out=>'query.94.out',args=>q[-f'[%POS\\t%GT\\t%SAMPLE\\t%AD\\n]' -i'FMT/AD[0:GT]==30']);
199+
run_test(\&test_vcf_query,$opts,in=>'query.gtidx.1',out=>'query.gtidx.1.out',args=>q[-f'[%GT %TC %TS %TI %TF\\n]' -i'FMT/TF[:GT]==1 && FMT/TF[:GT]==2']);
200+
run_test(\&test_vcf_query,$opts,in=>'query.gtidx.1',out=>'query.gtidx.1.out',args=>q[-f'[%GT %TC %TS %TI %TF\\n]' -i'FMT/TI[:GT]==1 && FMT/TF[:GT]==2']);
201+
run_test(\&test_vcf_query,$opts,in=>'query.gtidx.1',out=>'query.gtidx.1.out',args=>q[-f'[%GT %TC %TS %TI %TF\\n]' -i'FMT/TS[:GT]=="BB" && FMT/TS[:GT]=="CC"']);
202+
run_test(\&test_vcf_query,$opts,in=>'query.gtidx.1',out=>'query.gtidx.1.out',args=>q[-f'[%GT %TC %TS %TI %TF\\n]' -i'FMT/TC[:GT]=="B" && FMT/TC[:GT]=="C"']);
203+
run_test(\&test_vcf_query,$opts,in=>'query.gtidx.1',out=>'query.gtidx.2.out',args=>q[-f'[%GT %TI\\n]' -i'FMT/TI[:GT]=="."']);
199204
run_test(\&test_vcf_query,$opts,in=>'query',out=>'query.60.out',args=>q[-f'%CHROM %POS\\n' -i'CHROM="4"']);
200205
run_test(\&test_vcf_query,$opts,in=>'query.negative',out=>'query.61.out',args=>q[-f'%POS\\t%TAG1\\n' -i'(TAG1>=-129 && TAG1<=-120) || (TAG1>=-32769 && TAG1<=-32760)']);
201206
run_test(\&test_vcf_query,$opts,in=>'query.negative',out=>'query.61.out',args=>q[-f'%POS\\t%TAGV1\\n' -i'(TAGV1>=-129 && TAGV1<=-120) || (TAGV1>=-32769 && TAGV1<=-32760)']);

0 commit comments

Comments
 (0)