11/* filter.c -- filter expressions.
22
3- Copyright (C) 2013-2025 Genome Research Ltd.
3+ Copyright (C) 2013-2026 Genome Research Ltd.
44
55 Author: Petr Danecek <pd3@sanger.ac.uk>
66
@@ -109,11 +109,22 @@ struct _filter_t
109109 float * tmpf ;
110110 kstring_t tmps ;
111111 int max_unpack , mtmpi , mtmpf , nsamples ;
112+ uint8_t * sample_mask ; // bitmask restricting samples for FMT->INFO calculations
113+ int sample_mask_set ; // number of samples after masking, 0: sample mask not used
112114 struct {
113115 bcf1_t * line ;
114116 int32_t * buf , nbuf , mbuf ; // GTs as obtained by bcf_get_genotypes()
115117 uint64_t * mask ; // GTs as mask, e.g 0/0 is 1; 0/1 is 3, max 63 unique alleles
116118 } cached_GT ;
119+ struct {
120+ bcf1_t * line ;
121+ int n_missing ;
122+ int n_samples ; // to be able to calculate F_MISSING
123+ int AN ;
124+ int m_als ; // the size of allocated AC and AF
125+ int32_t * AC ; // AC,AF are of size n_allele+1, idx=0 is for the REF allele, i.e. Number=R, not VCF's Number=A
126+ double * AF ;
127+ } cached_site ;
117128#if ENABLE_PERL_FILTERS
118129 PerlInterpreter * perl ;
119130#endif
@@ -1443,26 +1454,81 @@ static void filters_set_nalt(filter_t *flt, bcf1_t *line, token_t *tok)
14431454 tok -> nvalues = 1 ;
14441455 tok -> values [0 ] = line -> n_allele - 1 ;
14451456}
1446- static void filters_set_ac (filter_t * flt , bcf1_t * line , token_t * tok )
1457+ #define CACHED flt->cached_site
1458+ static int filters_cache_AC_stats (filter_t * flt , bcf1_t * line )
14471459{
1448- hts_expand (int32_t , line -> n_allele , flt -> mtmpi , flt -> tmpi );
1449- if ( !bcf_calc_ac (flt -> hdr , line , flt -> tmpi , BCF_UN_INFO |BCF_UN_FMT ) )
1460+ if ( CACHED .line == line ) return CACHED .n_samples > 0 ? 0 : -1 ;
1461+ CACHED .line = line ;
1462+
1463+ if ( CACHED .m_als < line -> n_allele )
14501464 {
1451- tok -> nvalues = 0 ;
1452- return ;
1465+ CACHED .m_als = line -> n_allele ;
1466+ CACHED .AC = realloc (CACHED .AC ,sizeof (* CACHED .AC )* CACHED .m_als );
1467+ CACHED .AF = realloc (CACHED .AF ,sizeof (* CACHED .AF )* CACHED .m_als );
1468+ }
1469+
1470+ int i ,j ;
1471+ if ( !flt -> sample_mask_set && bcf_calc_ac (flt -> hdr , line , CACHED .AC , BCF_UN_INFO )> 0 )
1472+ {
1473+ // all samples, can reuse INFO/AC,AN etc, when available
1474+ CACHED .n_samples = line -> n_sample ;
1475+ CACHED .n_missing = -1 ; // cannot determine n_missing from INFO
1476+ CACHED .AN = 0 ;
1477+ for (i = 0 ; i < line -> n_allele ; i ++ ) CACHED .AN += CACHED .AC [i ];
1478+ for (i = 0 ; i < line -> n_allele ; i ++ ) CACHED .AF [i ] = CACHED .AN ? (double )CACHED .AC [i ]/CACHED .AN : 0 ;
1479+ return 0 ;
1480+ }
1481+
1482+ CACHED .n_samples = flt -> sample_mask_set ? flt -> sample_mask_set : line -> n_sample ;
1483+ CACHED .n_missing = 0 ;
1484+ CACHED .AN = 0 ;
1485+ for (i = 0 ; i < line -> n_allele ; i ++ ) CACHED .AC [i ] = 0 ;
1486+
1487+ int ngt = bcf_get_genotypes (flt -> hdr , line , & flt -> tmpi , & flt -> mtmpi );
1488+ int ngt1 = ngt /line -> n_sample ;
1489+ for (i = 0 ; i < line -> n_sample ; i ++ )
1490+ {
1491+ if ( flt -> sample_mask_set && !flt -> sample_mask [i ] ) continue ;
1492+ int32_t * ptr = flt -> tmpi + ngt1 * i ;
1493+ int is_missing = 1 ;
1494+ for (j = 0 ; j < ngt1 ; j ++ )
1495+ {
1496+ if ( bcf_gt_is_missing (ptr [j ]) ) continue ;
1497+ if ( ptr [j ]== bcf_int32_vector_end ) break ;
1498+ int ial = bcf_gt_allele (ptr [j ]);
1499+ if ( ial >= line -> n_allele )
1500+ {
1501+ static int warned = 0 ;
1502+ if ( !warned )
1503+ {
1504+ fprintf (stderr ,"The allele index too large (%d), skipping GT parsing at this site %s:%" PRId64 ". "
1505+ "(This warning is printed only once.)\n" , ial ,bcf_seqname (flt -> hdr ,line ),line -> pos + 1 );
1506+ warned = 1 ;
1507+ }
1508+ CACHED .n_samples = 0 ;
1509+ return -1 ;
1510+ }
1511+ CACHED .AC [ial ]++ ;
1512+ CACHED .AN ++ ;
1513+ is_missing = 0 ;
1514+ }
1515+ if ( is_missing ) CACHED .n_missing ++ ;
14531516 }
1454- int i , an = flt -> tmpi [0 ];
1455- for (i = 1 ; i < line -> n_allele ; i ++ ) an += flt -> tmpi [i ];
1456- if ( !an )
1517+ for (i = 0 ; i < line -> n_allele ; i ++ ) CACHED .AF [i ] = CACHED .AN ? (double )CACHED .AC [i ]/CACHED .AN : 0 ;
1518+ return 0 ;
1519+ }
1520+ static void filters_set_ac (filter_t * flt , bcf1_t * line , token_t * tok )
1521+ {
1522+ if ( filters_cache_AC_stats (flt ,line ) || !CACHED .AN )
14571523 {
14581524 tok -> nvalues = 0 ;
14591525 return ;
14601526 }
1461- flt -> tmpi [ 0 ] = an ; // for filters_set_[mac|af|maf]
1527+ // Here tok->idx is relative to ALT, i.e. interpreted as VCF's Number=A
14621528 if ( tok -> idx >=0 )
14631529 {
14641530 tok -> nvalues = 1 ;
1465- tok -> values [0 ] = tok -> idx + 1 < line -> n_allele ? flt -> tmpi [tok -> idx + 1 ] : 0 ;
1531+ tok -> values [0 ] = tok -> idx + 1 < line -> n_allele ? CACHED . AC [tok -> idx + 1 ] : 0 ;
14661532 }
14671533 else if ( line -> n_allele == 1 ) // no ALT
14681534 {
@@ -1471,45 +1537,68 @@ static void filters_set_ac(filter_t *flt, bcf1_t *line, token_t *tok)
14711537 }
14721538 else
14731539 {
1540+ int i ;
14741541 hts_expand (double ,line -> n_allele ,tok -> mvalues ,tok -> values );
14751542 for (i = 1 ; i < line -> n_allele ; i ++ )
1476- tok -> values [i - 1 ] = flt -> tmpi [i ];
1543+ tok -> values [i - 1 ] = CACHED . AC [i ];
14771544 tok -> nvalues = line -> n_allele - 1 ;
14781545 }
14791546}
14801547static void filters_set_an (filter_t * flt , bcf1_t * line , token_t * tok )
14811548{
1482- filters_set_ac (flt ,line ,tok );
1483- tok -> values [0 ] = tok -> nvalues ? flt -> tmpi [0 ] : 0 ;
14841549 tok -> nvalues = 1 ;
1550+ if ( filters_cache_AC_stats (flt ,line )== 0 )
1551+ tok -> values [0 ] = CACHED .AN ;
1552+ else
1553+ tok -> values [0 ] = 0 ;
14851554}
14861555static void filters_set_mac (filter_t * flt , bcf1_t * line , token_t * tok )
14871556{
1557+ // Note: this one does not make much sense if nvalues!=1
14881558 filters_set_ac (flt ,line ,tok );
14891559 if ( !tok -> nvalues ) return ;
1490- int i , an = flt -> tmpi [ 0 ] ;
1560+ int i ;
14911561 for (i = 0 ; i < tok -> nvalues ; i ++ )
1492- if ( tok -> values [i ] > an * 0.5 ) tok -> values [i ] = an - tok -> values [i ];
1562+ if ( tok -> values [i ] > 0.5 * CACHED . AN ) tok -> values [i ] = CACHED . AN - tok -> values [i ];
14931563}
14941564static void filters_set_af (filter_t * flt , bcf1_t * line , token_t * tok )
14951565{
1496- filters_set_ac (flt ,line ,tok );
1497- if ( !tok -> nvalues ) return ;
1498- int i , an = flt -> tmpi [0 ];
1499- for (i = 0 ; i < tok -> nvalues ; i ++ )
1500- tok -> values [i ] /= (double )an ;
1566+ if ( filters_cache_AC_stats (flt ,line ) || !CACHED .AN )
1567+ {
1568+ tok -> nvalues = 0 ;
1569+ return ;
1570+ }
1571+ // Here tok->idx is relative to ALT, i.e. interpreted as VCF's Number=A
1572+ if ( tok -> idx >=0 )
1573+ {
1574+ tok -> nvalues = 1 ;
1575+ tok -> values [0 ] = tok -> idx + 1 < line -> n_allele ? CACHED .AF [tok -> idx + 1 ] : 0 ;
1576+ }
1577+ else if ( line -> n_allele == 1 ) // no ALT
1578+ {
1579+ tok -> nvalues = 1 ;
1580+ tok -> values [0 ] = 0 ;
1581+ }
1582+ else
1583+ {
1584+ int i ;
1585+ hts_expand (double ,line -> n_allele ,tok -> mvalues ,tok -> values );
1586+ for (i = 1 ; i < line -> n_allele ; i ++ )
1587+ tok -> values [i - 1 ] = CACHED .AF [i ];
1588+ tok -> nvalues = line -> n_allele - 1 ;
1589+ }
15011590}
15021591static void filters_set_maf (filter_t * flt , bcf1_t * line , token_t * tok )
15031592{
1504- filters_set_ac (flt ,line ,tok );
1593+ filters_set_af (flt ,line ,tok );
15051594 if ( !tok -> nvalues ) return ;
1506- int i , an = flt -> tmpi [ 0 ] ;
1595+ int i ;
15071596 for (i = 0 ; i < tok -> nvalues ; i ++ )
15081597 {
1509- tok -> values [i ] /= (double )an ;
15101598 if ( tok -> values [i ] > 0.5 ) tok -> values [i ] = 1 - tok -> values [i ];
15111599 }
15121600}
1601+ #undef CACHED
15131602
15141603static int func_max (filter_t * flt , bcf1_t * line , token_t * rtok , token_t * * stack , int nstack )
15151604{
@@ -4117,6 +4206,11 @@ static filter_t *filter_init_(bcf_hdr_t *hdr, const char *str, int exit_on_error
41174206 }
41184207 }
41194208 filter -> nsamples = filter -> max_unpack & BCF_UN_FMT ? bcf_hdr_nsamples (filter -> hdr ) : 0 ;
4209+ if ( filter -> nsamples )
4210+ {
4211+ filter -> sample_mask = (uint8_t * ) malloc (filter -> nsamples );
4212+ if ( !filter -> sample_mask ) error ("Could not allocate %d bytes\n" ,filter -> nsamples );
4213+ }
41204214 for (i = 0 ; i < nout ; i ++ )
41214215 {
41224216 if ( out [i ].tok_type == TOK_MAX ) { out [i ].func = func_max ; out [i ].tok_type = TOK_FUNC ; }
@@ -4193,12 +4287,15 @@ void filter_destroy(filter_t *filter)
41934287 free (filter -> used_tag );
41944288 free (filter -> cached_GT .buf );
41954289 free (filter -> cached_GT .mask );
4290+ free (filter -> cached_site .AC );
4291+ free (filter -> cached_site .AF );
41964292 free (filter -> filters );
41974293 free (filter -> flt_stack );
41984294 free (filter -> str );
41994295 free (filter -> tmpi );
42004296 free (filter -> tmpf );
42014297 free (filter -> tmps .s );
4298+ free (filter -> sample_mask );
42024299 free (filter );
42034300}
42044301
@@ -4389,12 +4486,20 @@ const double *filter_get_doubles(filter_t *filter, int *nval, int *nval1)
43894486
43904487void filter_set_samples (filter_t * filter , const uint8_t * samples )
43914488{
4489+ filter -> sample_mask_set = 0 ;
4490+ if ( !filter -> nsamples ) return ;
4491+
43924492 int i ,j ;
43934493 for (i = 0 ; i < filter -> nfilters ; i ++ )
43944494 {
43954495 if ( !filter -> filters [i ].nsamples ) continue ;
43964496 for (j = 0 ; j < filter -> filters [i ].nsamples ; j ++ ) filter -> filters [i ].usmpl [j ] = samples [j ];
43974497 }
4498+ for (i = 0 ; i < filter -> nsamples ; i ++ )
4499+ {
4500+ filter -> sample_mask [i ] = samples [i ];
4501+ if ( samples [i ] ) filter -> sample_mask_set ++ ;
4502+ }
43984503}
43994504
44004505int filter_status (filter_t * filter )
0 commit comments