@@ -32,11 +32,6 @@ class UnsupportedMissingDataError(UnsupportedFilteringFeatureError):
32
32
feature = "Missing data"
33
33
34
34
35
- class UnsupportedFilterFieldError (UnsupportedFilteringFeatureError ):
36
- issue = "164"
37
- feature = "FILTER field"
38
-
39
-
40
35
class UnsupportedGenotypeValuesError (UnsupportedFilteringFeatureError ):
41
36
issue = "165"
42
37
feature = "Genotype values"
@@ -131,16 +126,18 @@ def __init__(self, mapper, tokens):
131
126
token = tokens [0 ]
132
127
if token == "CHROM" :
133
128
raise UnsupportedChromFieldError ()
134
- elif token == "FILTER" :
135
- raise UnsupportedFilterFieldError ()
136
129
elif token == "GT" :
137
130
raise UnsupportedGenotypeValuesError ()
138
131
self .field_name = mapper (token )
139
132
logger .debug (f"Mapped { token } to { self .field_name } " )
140
133
141
134
def eval(self, data):
    """Look up this identifier's field in ``data`` and return it as an array.

    The value stored under ``self.field_name`` is converted with
    ``np.asarray``. Multi-dimensional values are only accepted for fields
    whose name starts with ``"call_"`` and for ``"variant_filter"``; any
    other field with more than one dimension raises
    ``Unsupported2DFieldsError``.
    """
    name = self.field_name
    value = np.asarray(data[name])
    # Fields that are allowed to carry more than one dimension.
    multidim_allowed = name.startswith("call_") or name == "variant_filter"
    if value.ndim > 1 and not multidim_allowed:
        raise Unsupported2DFieldsError()
    return value
146
143
@@ -301,6 +298,69 @@ def referenced_fields(self):
301
298
return op1 .referenced_fields () | op2 .referenced_fields ()
302
299
303
300
301
+ # FILTER field expressions have special set-like semantics
302
+ # so they are handled by dedicated operators.
303
+
304
+
305
class FilterString(Constant):
    """Quoted string operand used on the right-hand side of a FILTER test.

    Evaluating the node turns the string into a 1D boolean array with one
    element per entry of ``data["filter_id"]``.
    """

    def __init__(self, tokens):
        # NOTE(review): ``self.tokens`` is presumably set to the string
        # by the base-class constructor -- confirm against ``Constant``.
        super().__init__(tokens)

    def eval(self, data):
        """Return a boolean mask over ``data["filter_id"]``.

        The string ``"."`` yields an all-False mask. Any other string is
        split on ``";"`` and each filter ID is flagged True when it appears
        among the resulting names.
        """
        filter_ids = data["filter_id"]
        if self.tokens == ".":
            return np.zeros_like(filter_ids, dtype=bool)
        wanted = self.tokens.split(";")
        return np.isin(filter_ids, wanted)

    def referenced_fields(self):
        """Return the fields read by :meth:`eval` (only ``filter_id``)."""
        return frozenset({"filter_id"})
318
+
319
+
320
+ # 'a' is a 2D boolean array with shape (variants, filters)
321
+ # 'b' is a 1D boolean array with shape (filters)
322
+
323
+
324
def filter_eq(a, b):
    """Return, per row of ``a``, whether the whole row equals ``b``.

    ``a`` is a 2D boolean array with shape (variants, filters) and ``b`` is
    a 1D boolean array with shape (filters). The result has one boolean per
    variant.
    """
    elementwise = a == b
    return np.all(elementwise, axis=1)
326
+
327
+
328
def filter_ne(a, b):
    """Return, per row of ``a``, whether the row differs from ``b`` anywhere.

    ``a`` is a 2D boolean array with shape (variants, filters) and ``b`` is
    a 1D boolean array with shape (filters). This is the element-wise
    negation of an exact row match.
    """
    # "not all equal" is the same as "any not equal" (De Morgan).
    return np.any(a != b, axis=1)
330
+
331
+
332
def filter_subset_match(a, b):
    """Return, per row of ``a``, whether every column selected by ``b`` is set.

    ``a`` is a 2D boolean array with shape (variants, filters) and ``b`` is
    a 1D boolean mask with shape (filters). Columns where ``b`` is False are
    ignored. When ``b`` selects no columns, every row evaluates to True.
    """
    selected_columns = a[:, b]
    return np.all(selected_columns, axis=1)
334
+
335
+
336
def filter_complement_match(a, b):
    """Return, per row of ``a``, the negation of the subset match against ``b``.

    ``a`` is a 2D boolean array with shape (variants, filters) and ``b`` is
    a 1D boolean mask with shape (filters). A row is True when at least one
    of the columns selected by ``b`` is not set.
    """
    all_selected_set = np.all(a[:, b], axis=1)
    return np.logical_not(all_selected_set)
338
+
339
+
340
class FilterFieldOperator(EvaluationNode):
    """Comparison of a FILTER operand against a filter string.

    The node is built from three parse tokens -- left operand, operator
    symbol, right operand -- and dispatches to the comparison function
    registered for that symbol in ``op_map``.
    """

    # Operator symbol -> comparison function taking (a, b).
    op_map = {
        "=": filter_eq,
        "==": filter_eq,
        "!=": filter_ne,
        "~": filter_subset_match,
        "!~": filter_complement_match,
    }

    def __init__(self, tokens):
        super().__init__(tokens)
        # Unpack the raw parse tokens passed in (not self.tokens).
        lhs, symbol, rhs = tokens
        self.op1 = lhs
        self.op = symbol
        self.op2 = rhs
        # An unknown operator symbol raises KeyError here.
        self.comparison_fn = self.op_map[symbol]

    def eval(self, data):
        """Evaluate both operands on ``data`` and compare them."""
        left = self.op1.eval(data)
        right = self.op2.eval(data)
        return self.comparison_fn(left, right)

    def __repr__(self):
        return f"({self.op1!r}){self.op}({self.op2!r})"

    def referenced_fields(self):
        """Return the union of the fields referenced by both operands."""
        left_fields = self.op1.referenced_fields()
        right_fields = self.op2.referenced_fields()
        return left_fields | right_fields
362
+
363
+
304
364
def _identity(x):
    """Return ``x`` unchanged."""
    return x
306
366
@@ -321,6 +381,18 @@ def make_bcftools_filter_parser(all_fields=None, map_vcf_identifiers=True):
321
381
vcf_prefixes = pp .Literal ("INFO/" ) | pp .Literal ("FORMAT/" ) | pp .Literal ("FMT/" )
322
382
vcf_identifier = pp .Combine (vcf_prefixes + identifier ) | identifier
323
383
384
+ name_mapper = _identity
385
+ if map_vcf_identifiers :
386
+ name_mapper = functools .partial (vcf_name_to_vcz_name , all_fields )
387
+
388
+ filter_field_identifier = pp .Literal ("FILTER" )
389
+ filter_field_identifier = filter_field_identifier .set_parse_action (
390
+ functools .partial (Identifier , name_mapper )
391
+ )
392
+ filter_string = pp .QuotedString ('"' ).set_parse_action (FilterString )
393
+ filter_field_expr = filter_field_identifier + pp .one_of ("= != ~ !~" ) + filter_string
394
+ filter_field_expr = filter_field_expr .set_parse_action (FilterFieldOperator )
395
+
324
396
lbracket , rbracket = map (pp .Suppress , "[]" )
325
397
# TODO we need to define the indexing grammar more carefully, but
326
398
# this at least lets us match correct strings and raise an informative
@@ -334,9 +406,6 @@ def make_bcftools_filter_parser(all_fields=None, map_vcf_identifiers=True):
334
406
)
335
407
indexed_identifier = pp .Group (vcf_identifier + (lbracket + index_expr + rbracket ))
336
408
337
- name_mapper = _identity
338
- if map_vcf_identifiers :
339
- name_mapper = functools .partial (vcf_name_to_vcz_name , all_fields )
340
409
identifier = vcf_identifier .set_parse_action (
341
410
functools .partial (Identifier , name_mapper )
342
411
)
@@ -350,7 +419,12 @@ def make_bcftools_filter_parser(all_fields=None, map_vcf_identifiers=True):
350
419
351
420
comp_op = pp .oneOf ("< = == > >= <= !=" )
352
421
filter_expression = pp .infix_notation (
353
- function | constant | indexed_identifier | identifier | file_expr ,
422
+ filter_field_expr
423
+ | function
424
+ | constant
425
+ | indexed_identifier
426
+ | identifier
427
+ | file_expr ,
354
428
[
355
429
("-" , 1 , pp .OpAssoc .RIGHT , UnaryMinus ),
356
430
(pp .one_of ("* /" ), 2 , pp .OpAssoc .LEFT , BinaryOperator ),
0 commit comments