@@ -69,6 +69,7 @@ typedef struct _token_t
6969 int hdr_id , tag_type ; // BCF header lookup ID and one of BCF_HL_* types
7070 int idx ; // 0-based index to VCF vectors,
7171 // -2: list (e.g. [0,1,2] or [1..3] or [1..] or any field[*], which is equivalent to [0..])
72+ // -3: select indices on the fly based on values in GT
7273 int * idxs ; // set indexes to 0 to exclude, to 1 to include, and last element negative if unlimited; used by VCF retrievers only
7374 int nidxs , nuidxs ; // size of idxs array and the number of elements set to 1
7475 uint8_t * usmpl ; // bitmask of used samples as set by idx, set for FORMAT fields, NULL otherwise
@@ -100,6 +101,11 @@ struct _filter_t
100101 float * tmpf ;
101102 kstring_t tmps ;
102103 int max_unpack , mtmpi , mtmpf , nsamples ;
104+ struct {
105+ bcf1_t * line ;
106+ int32_t * buf , nbuf , mbuf ; // GTs as obtained by bcf_get_genotypes()
107+ uint64_t * mask ; // GTs as mask, e.g 0/0 is 1; 0/1 is 3, max 63 unique alleles
108+ } cached_GT ;
103109#if ENABLE_PERL_FILTERS
104110 PerlInterpreter * perl ;
105111#endif
@@ -350,6 +356,44 @@ char *expand_path(char *path)
350356
351357 return strdup (path );
352358}
359+ static int filters_cache_genotypes (filter_t * flt , bcf1_t * line )
360+ {
361+ if ( flt -> cached_GT .line == line ) return flt -> cached_GT .nbuf > 0 ? 0 : -1 ;
362+ flt -> cached_GT .line = line ;
363+ flt -> cached_GT .nbuf = bcf_get_genotypes (flt -> hdr , line , & flt -> cached_GT .buf , & flt -> cached_GT .mbuf );
364+ if ( flt -> cached_GT .nbuf <=0 ) return -1 ;
365+ if ( !flt -> cached_GT .mask )
366+ {
367+ flt -> cached_GT .mask = (uint64_t * ) malloc (sizeof (* flt -> cached_GT .mask )* flt -> nsamples );
368+ if ( !flt -> cached_GT .mask ) error ("Could not alloc %zu bytes\n" ,sizeof (* flt -> cached_GT .mask )* flt -> nsamples );
369+ }
370+ int i ,j , ngt1 = flt -> cached_GT .nbuf / line -> n_sample ;
371+ for (i = 0 ; i < line -> n_sample ; i ++ )
372+ {
373+ int32_t * ptr = flt -> cached_GT .buf + i * ngt1 ;
374+ flt -> cached_GT .mask [i ] = 0 ;
375+ for (j = 0 ; j < ngt1 ; j ++ )
376+ {
377+ if ( bcf_gt_is_missing (ptr [j ]) ) continue ;
378+ if ( ptr [j ]== bcf_int32_vector_end ) break ;
379+ int allele = bcf_gt_allele (ptr [j ]);
380+ if ( allele > 63 )
381+ {
382+ static int warned = 0 ;
383+ if ( !warned )
384+ {
385+ fprintf (stderr ,"Too many alleles, skipping GT filtering at this site %s:%" PRId64 ". "
386+ "(This warning is printed only once.)\n" , bcf_seqname (flt -> hdr ,line ),line -> pos + 1 );
387+ warned = 1 ;
388+ }
389+ flt -> cached_GT .nbuf = 0 ;
390+ return -1 ;
391+ }
392+ flt -> cached_GT .mask [i ] |= 1 <<allele ;
393+ }
394+ }
395+ return 0 ;
396+ }
353397
354398static void filters_set_qual (filter_t * flt , bcf1_t * line , token_t * tok )
355399{
@@ -762,6 +806,27 @@ static void filters_set_format_int(filter_t *flt, bcf1_t *line, token_t *tok)
762806 tok -> values [i ] = ptr [tok -> idx ];
763807 }
764808 }
809+ else if ( tok -> idx == -3 )
810+ {
811+ if ( filters_cache_genotypes (flt ,line )!= 0 )
812+ {
813+ tok -> nvalues = 0 ;
814+ return ;
815+ }
816+ for (i = 0 ; i < tok -> nsamples ; i ++ )
817+ {
818+ if ( !tok -> usmpl [i ] ) continue ;
819+ int32_t * src = flt -> tmpi + i * nsrc1 ;
820+ double * dst = tok -> values + i * tok -> nval1 ;
821+ int k , j = 0 ;
822+ for (k = 0 ; k < nsrc1 ; k ++ ) // source values are AD[0..nsrc1]
823+ {
824+ if ( !(flt -> cached_GT .mask [i ] & (1 <<k )) ) continue ;
825+ dst [j ++ ] = src [k ];
826+ }
827+ for (; j < tok -> nval1 ; j ++ ) bcf_double_set_vector_end (dst [j ]);
828+ }
829+ }
765830 else
766831 {
767832 int kend = tok -> idxs [tok -> nidxs - 1 ] < 0 ? tok -> nval1 : tok -> nidxs ;
@@ -825,6 +890,33 @@ static void filters_set_format_float(filter_t *flt, bcf1_t *line, token_t *tok)
825890 tok -> values [i ] = ptr [tok -> idx ];
826891 }
827892 }
893+ else if ( tok -> idx == -3 )
894+ {
895+ if ( filters_cache_genotypes (flt ,line )!= 0 )
896+ {
897+ tok -> nvalues = 0 ;
898+ return ;
899+ }
900+ for (i = 0 ; i < tok -> nsamples ; i ++ )
901+ {
902+ if ( !tok -> usmpl [i ] ) continue ;
903+ float * src = flt -> tmpf + i * nsrc1 ;
904+ double * dst = tok -> values + i * tok -> nval1 ;
905+ int k , j = 0 ;
906+ for (k = 0 ; k < nsrc1 ; k ++ ) // source values are AF[0..nsrc1]
907+ {
908+ if ( !(flt -> cached_GT .mask [i ] & (1 <<k )) ) continue ;
909+ if ( bcf_float_is_missing (src [k ]) )
910+ bcf_double_set_missing (dst [j ]);
911+ else if ( bcf_float_is_vector_end (src [k ]) )
912+ bcf_double_set_vector_end (dst [j ]);
913+ else
914+ dst [j ] = src [k ];
915+ j ++ ;
916+ }
917+ for (; j < tok -> nval1 ; j ++ ) bcf_double_set_vector_end (dst [j ]);
918+ }
919+ }
828920 else
829921 {
830922 int kend = tok -> idxs [tok -> nidxs - 1 ] < 0 ? tok -> nval1 : tok -> nidxs ;
@@ -989,9 +1081,9 @@ static void _filters_set_genotype(filter_t *flt, bcf1_t *line, token_t *tok, int
9891081 tok -> str_value .s [tok -> str_value .l ] = 0 ;
9901082 tok -> nval1 = nvals1 ;
9911083}
992- static void filters_set_genotype2 (filter_t * flt , bcf1_t * line , token_t * tok ) { _filters_set_genotype (flt , line , tok , 2 ); }
993- static void filters_set_genotype3 (filter_t * flt , bcf1_t * line , token_t * tok ) { _filters_set_genotype (flt , line , tok , 3 ); }
994- static void filters_set_genotype4 (filter_t * flt , bcf1_t * line , token_t * tok ) { _filters_set_genotype (flt , line , tok , 4 ); }
1084+ static void filters_set_genotype2 (filter_t * flt , bcf1_t * line , token_t * tok ) { _filters_set_genotype (flt , line , tok , 2 ); } // rr, ra, aa, aA etc
1085+ static void filters_set_genotype3 (filter_t * flt , bcf1_t * line , token_t * tok ) { _filters_set_genotype (flt , line , tok , 3 ); } // hap, hom, het
1086+ static void filters_set_genotype4 (filter_t * flt , bcf1_t * line , token_t * tok ) { _filters_set_genotype (flt , line , tok , 4 ); } // mis, alt, ref
9951087
9961088static void filters_set_genotype_string (filter_t * flt , bcf1_t * line , token_t * tok )
9971089{
@@ -2480,6 +2572,14 @@ static int parse_idxs(char *tag_idx, int **idxs, int *nidxs, int *idx)
24802572 * idx = -2 ;
24812573 return 0 ;
24822574 }
2575+ if ( !strcmp ("GT" , tag_idx ) )
2576+ {
2577+ * idxs = (int * ) malloc (sizeof (int ));
2578+ (* idxs )[0 ] = -1 ;
2579+ * nidxs = 1 ;
2580+ * idx = -3 ;
2581+ return 0 ;
2582+ }
24832583
24842584 // TAG[integer] .. one field; idx positive
24852585 char * end , * beg = tag_idx ;
@@ -2595,7 +2695,7 @@ static void parse_tag_idx(bcf_hdr_t *hdr, int is_fmt, char *tag, char *tag_idx,
25952695 tok -> idxs = (int * ) malloc (sizeof (int ));
25962696 tok -> idxs [0 ] = -1 ;
25972697 tok -> nidxs = 1 ;
2598- tok -> idx = -2 ;
2698+ tok -> idx = idx1 ;
25992699 }
26002700 else if ( bcf_hdr_id2number (hdr ,BCF_HL_FMT ,tok -> hdr_id )!= 1 )
26012701 error ("The FORMAT tag %s can have multiple subfields, run as %s[sample:subfield]\n" , tag ,tag );
@@ -2620,7 +2720,7 @@ static void parse_tag_idx(bcf_hdr_t *hdr, int is_fmt, char *tag, char *tag_idx,
26202720 if ( idx1 >= bcf_hdr_nsamples (hdr ) ) error ("The sample index is too large: %s\n" , ori );
26212721 tok -> usmpl [idx1 ] = 1 ;
26222722 }
2623- else if ( idx1 == -2 )
2723+ else if ( idx1 == -2 || idx1 == -3 )
26242724 {
26252725 for (i = 0 ; i < nidxs1 ; i ++ )
26262726 {
@@ -2820,7 +2920,11 @@ static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
28202920 if ( is_fmt == -1 ) is_fmt = 0 ;
28212921 }
28222922 if ( is_array )
2923+ {
28232924 parse_tag_idx (filter -> hdr , is_fmt , tmp .s , tmp .s + is_array , tok );
2925+ if ( tok -> idx == -3 && bcf_hdr_id2length (filter -> hdr ,BCF_HL_FMT ,tok -> hdr_id )!= BCF_VL_R )
2926+ error ("Error: GT subscripts can be used only with Number=R tags\n" );
2927+ }
28242928 else if ( is_fmt && !tok -> nsamples )
28252929 {
28262930 int i ;
@@ -3525,6 +3629,8 @@ void filter_destroy(filter_t *filter)
35253629 free (filter -> filters [i ].regex );
35263630 }
35273631 }
3632+ free (filter -> cached_GT .buf );
3633+ free (filter -> cached_GT .mask );
35283634 free (filter -> filters );
35293635 free (filter -> flt_stack );
35303636 free (filter -> str );
0 commit comments