Skip to content

Commit 381e693

Browse files
committed
Extend per-population filtering
- Support per-population annotation for embedded functions, such as in -t 'MAC:1=int(MAF*AN)' -S groups.txt - Fix custom functions to use rounding instead of truncation when converting to integer, so that -t 'MAC:1=float(MAF*AN)' and -t 'MAC:1=int(MAF*AN)' return consistent values. Resolves #2514
1 parent b64ebe7 commit 381e693

File tree

6 files changed

+180
-26
lines changed

6 files changed

+180
-26
lines changed

NEWS

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,15 @@ Changes affecting specific commands:
2323
- Make --gvcf2vcf work around malformed gVCF records where INFO/END overlaps
2424
the next record
2525

26+
* bcftools +fill-tags
27+
28+
- Fix custom functions to use rounding instead of truncation when converting to
29+
integer, so that -t 'MAC:1=float(MAF*AN)' and -t 'MAC:1=int(MAF*AN)' return
30+
consistent values.
31+
32+
- Support per-population annotation for embedded functions, such as in
33+
-t 'MAC:1=int(MAF*AN)' -S groups.txt (#2514)
34+
2635
* bcftools merge
2736

2837
- Fixes in memory access with localized alleles, it could result in printing

filter.c

Lines changed: 129 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* filter.c -- filter expressions.
22
3-
Copyright (C) 2013-2025 Genome Research Ltd.
3+
Copyright (C) 2013-2026 Genome Research Ltd.
44
55
Author: Petr Danecek <pd3@sanger.ac.uk>
66
@@ -109,11 +109,22 @@ struct _filter_t
109109
float *tmpf;
110110
kstring_t tmps;
111111
int max_unpack, mtmpi, mtmpf, nsamples;
112+
uint8_t *sample_mask; // bitmask restricting samples for FMT->INFO calculations
113+
int sample_mask_set; // number of samples after masking, 0: sample mask not used
112114
struct {
113115
bcf1_t *line;
114116
int32_t *buf, nbuf, mbuf; // GTs as obtained by bcf_get_genotypes()
115117
uint64_t *mask; // GTs as mask, e.g 0/0 is 1; 0/1 is 3, max 63 unique alleles
116118
} cached_GT;
119+
struct {
120+
bcf1_t *line;
121+
int n_missing;
122+
int n_samples; // to be able to calculate F_MISSING
123+
int AN;
124+
int m_als; // the size of allocated AC and AF
125+
int32_t *AC; // AC,AF are of size n_allele+1, idx=0 is for the REF allele, i.e. Number=R, not VCF's Number=A
126+
double *AF;
127+
} cached_site;
117128
#if ENABLE_PERL_FILTERS
118129
PerlInterpreter *perl;
119130
#endif
@@ -1443,26 +1454,81 @@ static void filters_set_nalt(filter_t *flt, bcf1_t *line, token_t *tok)
14431454
tok->nvalues = 1;
14441455
tok->values[0] = line->n_allele - 1;
14451456
}
1446-
static void filters_set_ac(filter_t *flt, bcf1_t *line, token_t *tok)
1457+
#define CACHED flt->cached_site
1458+
static int filters_cache_AC_stats(filter_t *flt, bcf1_t *line)
14471459
{
1448-
hts_expand(int32_t, line->n_allele, flt->mtmpi, flt->tmpi);
1449-
if ( !bcf_calc_ac(flt->hdr, line, flt->tmpi, BCF_UN_INFO|BCF_UN_FMT) )
1460+
if ( CACHED.line==line ) return CACHED.n_samples > 0 ? 0 : -1;
1461+
CACHED.line = line;
1462+
1463+
if ( CACHED.m_als < line->n_allele )
14501464
{
1451-
tok->nvalues = 0;
1452-
return;
1465+
CACHED.m_als = line->n_allele;
1466+
CACHED.AC = realloc(CACHED.AC,sizeof(*CACHED.AC)*CACHED.m_als);
1467+
CACHED.AF = realloc(CACHED.AF,sizeof(*CACHED.AF)*CACHED.m_als);
1468+
}
1469+
1470+
int i,j;
1471+
if ( !flt->sample_mask_set && bcf_calc_ac(flt->hdr, line, CACHED.AC, BCF_UN_INFO)>0 )
1472+
{
1473+
// all samples, can reuse INFO/AC,AN etc, when available
1474+
CACHED.n_samples = line->n_sample;
1475+
CACHED.n_missing = -1; // cannot determine n_missing from INFO
1476+
CACHED.AN = 0;
1477+
for (i=0; i<line->n_allele; i++) CACHED.AN += CACHED.AC[i];
1478+
for (i=0; i<line->n_allele; i++) CACHED.AF[i] = CACHED.AN ? (double)CACHED.AC[i]/CACHED.AN : 0;
1479+
return 0;
1480+
}
1481+
1482+
CACHED.n_samples = flt->sample_mask_set ? flt->sample_mask_set : line->n_sample;
1483+
CACHED.n_missing = 0;
1484+
CACHED.AN = 0;
1485+
for (i=0; i<line->n_allele; i++) CACHED.AC[i] = 0;
1486+
1487+
int ngt = bcf_get_genotypes(flt->hdr, line, &flt->tmpi, &flt->mtmpi);
1488+
int ngt1 = ngt/line->n_sample;
1489+
for (i=0; i<line->n_sample; i++)
1490+
{
1491+
if ( flt->sample_mask_set && !flt->sample_mask[i] ) continue;
1492+
int32_t *ptr = flt->tmpi + ngt1*i;
1493+
int is_missing = 1;
1494+
for (j=0; j<ngt1; j++)
1495+
{
1496+
if ( bcf_gt_is_missing(ptr[j]) ) continue;
1497+
if ( ptr[j]==bcf_int32_vector_end ) break;
1498+
int ial = bcf_gt_allele(ptr[j]);
1499+
if ( ial >= line->n_allele )
1500+
{
1501+
static int warned = 0;
1502+
if ( !warned )
1503+
{
1504+
fprintf(stderr,"The allele index too large (%d), skipping GT parsing at this site %s:%"PRId64". "
1505+
"(This warning is printed only once.)\n", ial,bcf_seqname(flt->hdr,line),line->pos+1);
1506+
warned = 1;
1507+
}
1508+
CACHED.n_samples = 0;
1509+
return -1;
1510+
}
1511+
CACHED.AC[ial]++;
1512+
CACHED.AN++;
1513+
is_missing = 0;
1514+
}
1515+
if ( is_missing ) CACHED.n_missing++;
14531516
}
1454-
int i, an = flt->tmpi[0];
1455-
for (i=1; i<line->n_allele; i++) an += flt->tmpi[i];
1456-
if ( !an )
1517+
for (i=0; i<line->n_allele; i++) CACHED.AF[i] = CACHED.AN ? (double)CACHED.AC[i]/CACHED.AN : 0;
1518+
return 0;
1519+
}
1520+
static void filters_set_ac(filter_t *flt, bcf1_t *line, token_t *tok)
1521+
{
1522+
if ( filters_cache_AC_stats(flt,line) || !CACHED.AN )
14571523
{
14581524
tok->nvalues = 0;
14591525
return;
14601526
}
1461-
flt->tmpi[0] = an; // for filters_set_[mac|af|maf]
1527+
// Here tok->idx is relative to ALT, i.e. interpreted as VCF's Number=A
14621528
if ( tok->idx>=0 )
14631529
{
14641530
tok->nvalues = 1;
1465-
tok->values[0] = tok->idx+1<line->n_allele ? flt->tmpi[tok->idx+1] : 0;
1531+
tok->values[0] = tok->idx+1<line->n_allele ? CACHED.AC[tok->idx+1] : 0;
14661532
}
14671533
else if ( line->n_allele==1 ) // no ALT
14681534
{
@@ -1471,45 +1537,68 @@ static void filters_set_ac(filter_t *flt, bcf1_t *line, token_t *tok)
14711537
}
14721538
else
14731539
{
1540+
int i;
14741541
hts_expand(double,line->n_allele,tok->mvalues,tok->values);
14751542
for (i=1; i<line->n_allele; i++)
1476-
tok->values[i-1] = flt->tmpi[i];
1543+
tok->values[i-1] = CACHED.AC[i];
14771544
tok->nvalues = line->n_allele - 1;
14781545
}
14791546
}
14801547
static void filters_set_an(filter_t *flt, bcf1_t *line, token_t *tok)
14811548
{
1482-
filters_set_ac(flt,line,tok);
1483-
tok->values[0] = tok->nvalues ? flt->tmpi[0] : 0;
14841549
tok->nvalues = 1;
1550+
if ( filters_cache_AC_stats(flt,line)==0 )
1551+
tok->values[0] = CACHED.AN;
1552+
else
1553+
tok->values[0] = 0;
14851554
}
14861555
static void filters_set_mac(filter_t *flt, bcf1_t *line, token_t *tok)
14871556
{
1557+
// Note: this one does not make much sense if nvalues!=1
14881558
filters_set_ac(flt,line,tok);
14891559
if ( !tok->nvalues ) return;
1490-
int i, an = flt->tmpi[0];
1560+
int i;
14911561
for (i=0; i<tok->nvalues; i++)
1492-
if ( tok->values[i] > an*0.5 ) tok->values[i] = an - tok->values[i];
1562+
if ( tok->values[i] > 0.5*CACHED.AN ) tok->values[i] = CACHED.AN - tok->values[i];
14931563
}
14941564
static void filters_set_af(filter_t *flt, bcf1_t *line, token_t *tok)
14951565
{
1496-
filters_set_ac(flt,line,tok);
1497-
if ( !tok->nvalues ) return;
1498-
int i, an = flt->tmpi[0];
1499-
for (i=0; i<tok->nvalues; i++)
1500-
tok->values[i] /= (double)an;
1566+
if ( filters_cache_AC_stats(flt,line) || !CACHED.AN )
1567+
{
1568+
tok->nvalues = 0;
1569+
return;
1570+
}
1571+
// Here tok->idx is relative to ALT, i.e. interpreted as VCF's Number=A
1572+
if ( tok->idx>=0 )
1573+
{
1574+
tok->nvalues = 1;
1575+
tok->values[0] = tok->idx+1<line->n_allele ? CACHED.AF[tok->idx+1] : 0;
1576+
}
1577+
else if ( line->n_allele==1 ) // no ALT
1578+
{
1579+
tok->nvalues = 1;
1580+
tok->values[0] = 0;
1581+
}
1582+
else
1583+
{
1584+
int i;
1585+
hts_expand(double,line->n_allele,tok->mvalues,tok->values);
1586+
for (i=1; i<line->n_allele; i++)
1587+
tok->values[i-1] = CACHED.AF[i];
1588+
tok->nvalues = line->n_allele - 1;
1589+
}
15011590
}
15021591
static void filters_set_maf(filter_t *flt, bcf1_t *line, token_t *tok)
15031592
{
1504-
filters_set_ac(flt,line,tok);
1593+
filters_set_af(flt,line,tok);
15051594
if ( !tok->nvalues ) return;
1506-
int i, an = flt->tmpi[0];
1595+
int i;
15071596
for (i=0; i<tok->nvalues; i++)
15081597
{
1509-
tok->values[i] /= (double)an;
15101598
if ( tok->values[i] > 0.5 ) tok->values[i] = 1 - tok->values[i];
15111599
}
15121600
}
1601+
#undef CACHED
15131602

15141603
static int func_max(filter_t *flt, bcf1_t *line, token_t *rtok, token_t **stack, int nstack)
15151604
{
@@ -4117,6 +4206,11 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error
41174206
}
41184207
}
41194208
filter->nsamples = filter->max_unpack&BCF_UN_FMT ? bcf_hdr_nsamples(filter->hdr) : 0;
4209+
if ( filter->nsamples )
4210+
{
4211+
filter->sample_mask = (uint8_t*) malloc(filter->nsamples);
4212+
if ( !filter->sample_mask ) error("Could not allocate %d bytes\n",filter->nsamples);
4213+
}
41204214
for (i=0; i<nout; i++)
41214215
{
41224216
if ( out[i].tok_type==TOK_MAX ) { out[i].func = func_max; out[i].tok_type = TOK_FUNC; }
@@ -4193,12 +4287,15 @@ void filter_destroy(filter_t *filter)
41934287
free(filter->used_tag);
41944288
free(filter->cached_GT.buf);
41954289
free(filter->cached_GT.mask);
4290+
free(filter->cached_site.AC);
4291+
free(filter->cached_site.AF);
41964292
free(filter->filters);
41974293
free(filter->flt_stack);
41984294
free(filter->str);
41994295
free(filter->tmpi);
42004296
free(filter->tmpf);
42014297
free(filter->tmps.s);
4298+
free(filter->sample_mask);
42024299
free(filter);
42034300
}
42044301

@@ -4389,12 +4486,20 @@ const double *filter_get_doubles(filter_t *filter, int *nval, int *nval1)
43894486

43904487
void filter_set_samples(filter_t *filter, const uint8_t *samples)
43914488
{
4489+
filter->sample_mask_set = 0;
4490+
if ( !filter->nsamples ) return;
4491+
43924492
int i,j;
43934493
for (i=0; i<filter->nfilters; i++)
43944494
{
43954495
if ( !filter->filters[i].nsamples ) continue;
43964496
for (j=0; j<filter->filters[i].nsamples; j++) filter->filters[i].usmpl[j] = samples[j];
43974497
}
4498+
for (i=0; i<filter->nsamples; i++)
4499+
{
4500+
filter->sample_mask[i] = samples[i];
4501+
if ( samples[i] ) filter->sample_mask_set++;
4502+
}
43984503
}
43994504

44004505
int filter_status(filter_t *filter)

plugins/fill-tags.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* The MIT License
22
3-
Copyright (c) 2015-2025 Genome Research Ltd.
3+
Copyright (c) 2015-2026 Genome Research Ltd.
44
55
Author: Petr Danecek <pd3@sanger.ac.uk>
66
@@ -283,7 +283,7 @@ void ftf_destroy(pop_t *pop)
283283
static inline int32_t int32_from_double(double src)
284284
{
285285
if (bcf_double_is_missing_or_vector_end(src)) return bcf_int32_missing;
286-
else return (int)src;
286+
else return (int)round(src);
287287
}
288288

289289
int ftf_filter_expr(args_t *args, bcf1_t *rec, pop_t *pop, ftf_t *ftf)

test/fill-tags.MAC-MAF.1.out

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
11 2343543 . A . . . MAC_AB=0;MAC_BC=0;MAC=0 GT 0/0 0/0 0/0
2+
11 5464562 . C T . . MAC_AB=.;MAC_BC=.;MAC=. GT ./. ./. ./.
3+
20 76962 . T C . . MAC_AB=1;MAC_BC=0;MAC=1 GT 0/1 1/1 1/1
4+
20 126310 . ACC A . . MAC_AB=2;MAC_BC=1;MAC=2 GT 0/1 0/1 1/1
5+
20 138125 . G T . . MAC_AB=2;MAC_BC=1;MAC=2 GT 0/1 0/1 1/1
6+
20 138148 . C T . . MAC_AB=2;MAC_BC=1;MAC=2 GT 0/1 0/1 1/1
7+
20 271225 . T TTTA,TA . . MAC_AB=1;MAC_BC=2;MAC=2 GT 0/2 0/1 1/2
8+
20 304568 . C T . . MAC_AB=2;MAC_BC=1;MAC=2 GT 0|1 0|1 1|1
9+
20 326891 . A AC . . MAC_AB=2;MAC_BC=1;MAC=2 GT 0|1 0|1 ./.
10+
X 2928329 . C T . . MAC_AB=0;MAC_BC=1;MAC=1 GT 0 0 0/1
11+
X 2933066 . G C . . MAC_AB=0;MAC_BC=1;MAC=1 GT 0 0 0/1
12+
X 2942109 . T C . . MAC_AB=1;MAC_BC=0;MAC=1 GT 0 1 1/1
13+
X 3048719 . T C . . MAC_AB=1;MAC_BC=1;MAC=2 GT 0 1 0|1
14+
Y 8657215 . C A . . MAC_AB=1;MAC_BC=0;MAC=1 GT 0 1 .
15+
Y 10011673 . G A . . MAC_AB=0;MAC_BC=0;MAC=0 GT 1 1 .

test/fill-tags.MAC-MAF.vcf

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
##fileformat=VCFv4.2
2+
##reference=file:///ref.fa
3+
##contig=<ID=11,length=135006516>
4+
##contig=<ID=20,length=63025520>
5+
##contig=<ID=X,length=155270560>
6+
##contig=<ID=Y,length=59373566>
7+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
8+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003
9+
11 2343543 . A . . . . GT 0/0 0/0 0/0
10+
11 5464562 . C T . . . GT ./. ./. ./.
11+
20 76962 . T C . . . GT 0/1 1/1 1/1
12+
20 126310 . ACC A . . . GT 0/1 0/1 1/1
13+
20 138125 . G T . . . GT 0/1 0/1 1/1
14+
20 138148 . C T . . . GT 0/1 0/1 1/1
15+
20 271225 . T TTTA,TA . . . GT 0/2 0/1 1/2
16+
20 304568 . C T . . . GT 0|1 0|1 1|1
17+
20 326891 . A AC . . . GT 0|1 0|1 ./.
18+
X 2928329 . C T . . . GT 0 0 0/1
19+
X 2933066 . G C . . . GT 0 0 0/1
20+
X 2942109 . T C . . . GT 0 1 1/1
21+
X 3048719 . T C . . . GT 0 1 0|1
22+
Y 8657215 . C A . . . GT 0 1 .
23+
Y 10011673 . G A . . . GT 1 1 .

test/test.pl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -694,6 +694,8 @@
694694
run_test(\&test_vcf_plugin,$opts,in=>'query.variantkey',out=>'query.add-variantkey.vcf',cmd=>'+add-variantkey',args=>'');
695695
run_test(\&test_vcf_plugin,$opts,in=>'query.variantkey',out=>'variantkey-hex.out',cmd=>'+variantkey-hex',args=>'test/');
696696
run_test(\&test_vcf_plugin,$opts,in=>'query.nucleotide',out=>'query.allele-length.tsv',cmd=>'+allele-length',args=>'');
697+
run_test(\&test_vcf_plugin,$opts,in=>'fill-tags.MAC-MAF',out=>'fill-tags.MAC-MAF.1.out',cmd=>'+fill-tags --no-version',args=>q[-- -S {PATH}/fill-tags.3.smpl -t 'MAC:1=MAC' | grep -v ^#]);
698+
run_test(\&test_vcf_plugin,$opts,in=>'fill-tags.MAC-MAF',out=>'fill-tags.MAC-MAF.1.out',cmd=>'+fill-tags --no-version',args=>q[-- -S {PATH}/fill-tags.3.smpl -t 'MAC:1=int(MAF*AN)' | grep -v ^#]);
697699
run_test(\&test_vcf_plugin,$opts,in=>'query.func.1',out=>'fill-tags.func.1.out',cmd=>'+fill-tags --no-version',args=>q[-- -t 'FMT/AB:1=float(FMT/AD[*:0] / ssum(FMT/AD[*]))' | grep -v ^#]);
698700
run_test(\&test_vcf_plugin,$opts,in=>'fmissing',out=>'fmissing.1.out',cmd=>'+fill-tags --no-version',args=>q[-- -S {PATH}/fmissing.txt -t 'F_MISSING' | grep -v ^#]);
699701
run_test(\&test_vcf_plugin,$opts,in=>'fmissing',out=>'fmissing.1.out',cmd=>'+fill-tags --no-version',args=>q[-- -S {PATH}/fmissing.txt -t 'F_MISSING:1=F_PASS(GT="mis")' | grep -v ^#]);

0 commit comments

Comments
 (0)