Skip to content

Commit 2fb90a5

Browse files
tomwhitejeromekelleher
authored andcommitted
Support filtering on FILTER field
1 parent f25bfe2 commit 2fb90a5

File tree

5 files changed

+130
-16
lines changed

5 files changed

+130
-16
lines changed

lib/vcf_encoder.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -729,6 +729,7 @@ vcz_variant_encoder_write_filter(const vcz_variant_encoder_t *self, size_t varia
729729
{
730730
const vcz_field_t filter_id = self->filter_id;
731731
bool all_missing = true;
732+
bool first = true;
732733
const int8_t *restrict data = self->filter_data + (variant * filter_id.num_columns);
733734
const char *filter_id_data = (const char *) self->filter_id.data;
734735
size_t j, k, source_offset;
@@ -747,7 +748,10 @@ vcz_variant_encoder_write_filter(const vcz_variant_encoder_t *self, size_t varia
747748
source_offset = 0;
748749
for (j = 0; j < filter_id.num_columns; j++) {
749750
if (data[j]) {
750-
source_offset = j * filter_id.item_size;
751+
if (!first) {
752+
offset = append_char(buf, ';', offset, buflen);
753+
}
754+
source_offset = j * filter_id.item_size;
751755
for (k = 0; k < filter_id.item_size; k++) {
752756
if (filter_id_data[source_offset] == VCZ_STRING_FILL) {
753757
break;
@@ -758,6 +762,7 @@ vcz_variant_encoder_write_filter(const vcz_variant_encoder_t *self, size_t varia
758762
goto out;
759763
}
760764
source_offset++;
765+
first = false;
761766
}
762767
}
763768
}

tests/test_bcftools_validation.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ def run_vcztools(args: str, expect_error=False) -> tuple[str, str]:
5151
# ("view --no-version -i 'ID == \"rs6054257\"'", "sample.vcf.gz"),
5252
("view --no-version -i 'DB=0'", "sample.vcf.gz"),
5353
("view --no-version -i 'DB=1'", "sample.vcf.gz"),
54+
("view --no-version -i 'FILTER=\"PASS\"'", "sample.vcf.gz"),
5455
("view --no-version -i 'INFO/DP > 10'", "sample.vcf.gz"),
5556
("view --no-version -i 'FMT/DP >= 5'", "sample.vcf.gz"),
5657
("view --no-version -i 'FMT/DP >= 5 && FMT/GQ > 10'", "sample.vcf.gz"),
@@ -91,7 +92,7 @@ def run_vcztools(args: str, expect_error=False) -> tuple[str, str]:
9192
(
9293
"view --no-version -r '20:1230236-' -i 'FMT/DP>3' -s 'NA00002,NA00003'",
9394
"sample.vcf.gz"
94-
)
95+
),
9596
],
9697
# This is necessary when trying to run individual tests, as the arguments above
9798
# make for unworkable command lines
@@ -182,6 +183,7 @@ def test_vcf_output_with_output_option(tmp_path, args, vcf_file):
182183
# (r"query -f '%AC{1}\n' -i 'AC[1]>10' ", "sample.vcf.gz"),
183184
# TODO fill-out more of these when supported for more stuff is available
184185
# in filtering
186+
("query -f '%CHROM %POS %FILTER\n' -i 'FILTER=\"PASS\"'", "sample.vcf.gz"),
185187
# Per-sample query tests
186188
(
187189
r"query -f '[%CHROM %POS %SAMPLE %GT %DP %GQ\n]' -i 'FMT/DP>3'",

tests/test_filter.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@ def test_invalid_expressions(self, parser, expression):
3737
("INFO/HAYSTACK ~ 0", filter_mod.UnsupportedRegexError),
3838
('CHROM="1"', filter_mod.UnsupportedChromFieldError),
3939
('DP="."', filter_mod.UnsupportedMissingDataError),
40-
('FILTER="PASS"', filter_mod.UnsupportedFilterFieldError),
4140
("ID!=@~/file", filter_mod.UnsupportedFileReferenceError),
4241
("INFO/TAG=@file", filter_mod.UnsupportedFileReferenceError),
4342
("INFO/X[0] == 1", filter_mod.UnsupportedArraySubscriptError),
@@ -199,6 +198,38 @@ def test_evaluate(self, expression, data, expected):
199198
result = fee.evaluate(numpify_values(data))
200199
nt.assert_array_equal(result, expected)
201200

201+
@pytest.mark.parametrize(
202+
("expression", "expected"),
203+
[
204+
('FILTER="PASS"', [False, True, False, False, False, False]),
205+
('FILTER="."', [True, False, False, False, False, False]),
206+
('FILTER="A"', [False, False, True, False, False, False]),
207+
('FILTER!="A"', [True, True, False, True, True, True]),
208+
('FILTER~"A"', [False, False, True, False, True, True]),
209+
('FILTER="A;B"', [False, False, False, False, True, False]),
210+
('FILTER="B;A"', [False, False, False, False, True, False]),
211+
('FILTER!="A;B"', [True, True, True, True, False, True]),
212+
('FILTER~"A;B"', [False, False, False, False, True, True]),
213+
('FILTER~"B;A"', [False, False, False, False, True, True]),
214+
('FILTER!~"A;B"', [True, True, True, True, False, False]),
215+
],
216+
)
217+
def test_evaluate_filter_comparison(self, expression, expected):
218+
data = {
219+
"variant_filter": [
220+
[False, False, False, False],
221+
[True, False, False, False],
222+
[False, True, False, False],
223+
[False, False, True, False],
224+
[False, True, True, False],
225+
[False, True, True, True],
226+
],
227+
"filter_id": ["PASS", "A", "B", "C"],
228+
}
229+
fee = filter_mod.FilterExpression(include=expression)
230+
result = fee.evaluate(numpify_values(data))
231+
nt.assert_array_equal(result, expected)
232+
202233
@pytest.mark.parametrize(
203234
("expr", "expected"),
204235
[

vcztools/filter.py

Lines changed: 86 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,6 @@ class UnsupportedMissingDataError(UnsupportedFilteringFeatureError):
3232
feature = "Missing data"
3333

3434

35-
class UnsupportedFilterFieldError(UnsupportedFilteringFeatureError):
36-
issue = "164"
37-
feature = "FILTER field"
38-
39-
4035
class UnsupportedGenotypeValuesError(UnsupportedFilteringFeatureError):
4136
issue = "165"
4237
feature = "Genotype values"
@@ -131,16 +126,18 @@ def __init__(self, mapper, tokens):
131126
token = tokens[0]
132127
if token == "CHROM":
133128
raise UnsupportedChromFieldError()
134-
elif token == "FILTER":
135-
raise UnsupportedFilterFieldError()
136129
elif token == "GT":
137130
raise UnsupportedGenotypeValuesError()
138131
self.field_name = mapper(token)
139132
logger.debug(f"Mapped {token} to {self.field_name}")
140133

141134
def eval(self, data):
142135
value = np.asarray(data[self.field_name])
143-
if not self.field_name.startswith("call_") and len(value.shape) > 1:
136+
if (
137+
not self.field_name.startswith("call_")
138+
and self.field_name != "variant_filter"
139+
and len(value.shape) > 1
140+
):
144141
raise Unsupported2DFieldsError()
145142
return value
146143

@@ -301,6 +298,69 @@ def referenced_fields(self):
301298
return op1.referenced_fields() | op2.referenced_fields()
302299

303300

301+
# FILTER field expressions have special set-like semantics
302+
# so they are handled by dedicated operators.
303+
304+
305+
class FilterString(Constant):
306+
def __init__(self, tokens):
307+
super().__init__(tokens)
308+
309+
def eval(self, data):
310+
# convert string to a 1D boolean array (one element per filter)
311+
if self.tokens == ".":
312+
return np.zeros_like(data["filter_id"], dtype=bool)
313+
filters = self.tokens.split(";")
314+
return np.isin(data["filter_id"], filters)
315+
316+
def referenced_fields(self):
317+
return frozenset(["filter_id"])
318+
319+
320+
# 'a' is a 2D boolean array with shape (variants, filters)
321+
# 'b' is a 1D boolean array with shape (filters)
322+
323+
324+
def filter_eq(a, b):
325+
return np.all(a == b, axis=1)
326+
327+
328+
def filter_ne(a, b):
329+
return ~filter_eq(a, b)
330+
331+
332+
def filter_subset_match(a, b):
333+
return np.all(a[:, b], axis=1)
334+
335+
336+
def filter_complement_match(a, b):
337+
return ~filter_subset_match(a, b)
338+
339+
340+
class FilterFieldOperator(EvaluationNode):
341+
op_map = {
342+
"=": filter_eq,
343+
"==": filter_eq,
344+
"!=": filter_ne,
345+
"~": filter_subset_match,
346+
"!~": filter_complement_match,
347+
}
348+
349+
def __init__(self, tokens):
350+
super().__init__(tokens)
351+
self.op1, self.op, self.op2 = tokens # not self.tokens
352+
self.comparison_fn = self.op_map[self.op]
353+
354+
def eval(self, data):
355+
return self.comparison_fn(self.op1.eval(data), self.op2.eval(data))
356+
357+
def __repr__(self):
358+
return f"({repr(self.op1)}){self.op}({repr(self.op2)})"
359+
360+
def referenced_fields(self):
361+
return self.op1.referenced_fields() | self.op2.referenced_fields()
362+
363+
304364
def _identity(x):
305365
return x
306366

@@ -321,6 +381,18 @@ def make_bcftools_filter_parser(all_fields=None, map_vcf_identifiers=True):
321381
vcf_prefixes = pp.Literal("INFO/") | pp.Literal("FORMAT/") | pp.Literal("FMT/")
322382
vcf_identifier = pp.Combine(vcf_prefixes + identifier) | identifier
323383

384+
name_mapper = _identity
385+
if map_vcf_identifiers:
386+
name_mapper = functools.partial(vcf_name_to_vcz_name, all_fields)
387+
388+
filter_field_identifier = pp.Literal("FILTER")
389+
filter_field_identifier = filter_field_identifier.set_parse_action(
390+
functools.partial(Identifier, name_mapper)
391+
)
392+
filter_string = pp.QuotedString('"').set_parse_action(FilterString)
393+
filter_field_expr = filter_field_identifier + pp.one_of("= != ~ !~") + filter_string
394+
filter_field_expr = filter_field_expr.set_parse_action(FilterFieldOperator)
395+
324396
lbracket, rbracket = map(pp.Suppress, "[]")
325397
# TODO we need to define the indexing grammar more carefully, but
326398
# this at least let's us match correct strings and raise an informative
@@ -334,9 +406,6 @@ def make_bcftools_filter_parser(all_fields=None, map_vcf_identifiers=True):
334406
)
335407
indexed_identifier = pp.Group(vcf_identifier + (lbracket + index_expr + rbracket))
336408

337-
name_mapper = _identity
338-
if map_vcf_identifiers:
339-
name_mapper = functools.partial(vcf_name_to_vcz_name, all_fields)
340409
identifier = vcf_identifier.set_parse_action(
341410
functools.partial(Identifier, name_mapper)
342411
)
@@ -350,7 +419,12 @@ def make_bcftools_filter_parser(all_fields=None, map_vcf_identifiers=True):
350419

351420
comp_op = pp.oneOf("< = == > >= <= !=")
352421
filter_expression = pp.infix_notation(
353-
function | constant | indexed_identifier | identifier | file_expr,
422+
filter_field_expr
423+
| function
424+
| constant
425+
| indexed_identifier
426+
| identifier
427+
| file_expr,
354428
[
355429
("-", 1, pp.OpAssoc.RIGHT, UnaryMinus),
356430
(pp.one_of("* /"), 2, pp.OpAssoc.LEFT, BinaryOperator),

vcztools/query.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ def generate(chunk_data):
132132
array = chunk_data[vcz_name]
133133
for row in array:
134134
is_missing = np.any(row == -1)
135+
sep = ","
135136

136137
if tag == "CHROM":
137138
row = self.contig_ids[row]
@@ -144,6 +145,7 @@ def generate(chunk_data):
144145
row = self.filter_ids[row]
145146
else:
146147
row = "."
148+
sep = ";"
147149
if tag == "QUAL":
148150
if math.isnan(row):
149151
row = "."
@@ -154,7 +156,7 @@ def generate(chunk_data):
154156
and not sample_loop
155157
and (isinstance(row, np.ndarray) or isinstance(row, list))
156158
):
157-
row = ",".join(map(str, row))
159+
row = sep.join(map(str, row))
158160

159161
if sample_loop:
160162
if isinstance(row, np.ndarray):

0 commit comments

Comments
 (0)