Per-sample filtering

tomwhite · jeromekelleher · commit fde9abafd7eb · 2025-05-07T16:10:17.000+01:00
diff --git a/tests/test_bcftools_validation.py b/tests/test_bcftools_validation.py
@@ -52,20 +52,23 @@ def run_vcztools(args: str, expect_error=False) -> tuple[str, str]:
         ("view --no-version -i 'INFO/DP > 10'", "sample.vcf.gz"),
         # Filters based on FMT values are currently disabled.
         # https://github.com/sgkit-dev/vcztools/issues/180
-        # ("view --no-version -i 'FMT/DP >= 5 && FMT/GQ > 10'", "sample.vcf.gz"),
-        # ("view --no-version -i 'FMT/DP >= 5 & FMT/GQ>10'", "sample.vcf.gz"),
-        # (
-        #         "view --no-version -i '(QUAL > 10 || FMT/GQ>10) && POS > 100000'",
-        #         "sample.vcf.gz"
-        # ),
-        # (
-        #         "view --no-version -i '(FMT/DP >= 8 | FMT/GQ>40) && POS > 100000'",
-        #         "sample.vcf.gz"
-        # ),
-        # (
-        #         "view --no-version -e '(FMT/DP >= 8 | FMT/GQ>40) && POS > 100000'",
-        #         "sample.vcf.gz"
-        # ),
+        ("view --no-version -i 'FMT/DP >= 5'", "sample.vcf.gz"),
+        ("view --no-version -i 'FMT/DP >= 5 && FMT/GQ > 10'", "sample.vcf.gz"),
+        ("view --no-version -i 'FMT/DP >= 5 & FMT/GQ>10'", "sample.vcf.gz"),
+        ("view --no-version -i 'FMT/DP>5 && FMT/GQ<45'", "sample.vcf.gz"),
+        ("view --no-version -i 'FMT/DP>5 & FMT/GQ<45'", "sample.vcf.gz"),
+        (
+                "view --no-version -i '(QUAL > 10 || FMT/GQ>10) && POS > 100000'",
+                "sample.vcf.gz"
+        ),
+        (
+                "view --no-version -i '(FMT/DP >= 8 | FMT/GQ>40) && POS > 100000'",
+                "sample.vcf.gz"
+        ),
+        (
+                "view --no-version -e '(FMT/DP >= 8 | FMT/GQ>40) && POS > 100000'",
+                "sample.vcf.gz"
+        ),
         ("view --no-version -G", "sample.vcf.gz"),
         (
                 "view --no-update --no-version --samples-file "
@@ -88,7 +91,7 @@ def run_vcztools(args: str, expect_error=False) -> tuple[str, str]:
     ],
     # This is necessary when trying to run individual tests, as the arguments above
     # make for unworkable command lines
-    # ids=range(26),
+    # ids=range(28),
 )
 # fmt: on
 def test_vcf_output(tmp_path, args, vcf_file):
@@ -175,6 +178,23 @@ def test_vcf_output_with_output_option(tmp_path, args, vcf_file):
         # (r"query  -f '%AC{1}\n' -i 'AC[1]>10' ", "sample.vcf.gz"),
         # TODO fill-out more of these when supported for more stuff is available
         # in filtering
+        # Per-sample query tests
+        (
+            r"query -f '[%CHROM %POS %SAMPLE %GT %DP %GQ\n]' -i 'FMT/DP>3'",
+            "sample.vcf.gz"
+        ),
+        (
+            r"query -f '[%CHROM %POS %SAMPLE %GT %DP %GQ\n]' -i 'FMT/GQ>30'",
+            "sample.vcf.gz"
+        ),
+        (
+            r"query -f '[%CHROM %POS %SAMPLE %GT %DP %GQ\n]' -i 'FMT/DP>3 & FMT/GQ>30'",
+            "sample.vcf.gz"
+        ),
+        (
+            r"query -f '[%CHROM %POS %SAMPLE %GT %DP %GQ\n]' -i 'FMT/DP>3 && FMT/GQ>30'",
+            "sample.vcf.gz"
+        ),
     ],
 )
 def test_output(tmp_path, args, vcf_name):
diff --git a/tests/test_filter.py b/tests/test_filter.py
@@ -71,7 +71,20 @@ class TestFilterExpressionSample:
             ("POS < 1000", [1, 1, 0, 0, 0, 0, 0, 0, 1]),
             ("INFO/DP > 10", [0, 0, 1, 1, 0, 1, 0, 0, 0]),
             # Not supporting format fields for now: #180
-            # ("FMT/GQ > 20", [0, 0, 1, 1, 1, 1, 1, 0, 0]),
+            (
+                "FMT/GQ > 20",
+                [
+                    [0, 0, 0],
+                    [0, 0, 0],
+                    [1, 1, 1],
+                    [1, 0, 1],
+                    [1, 0, 1],
+                    [1, 1, 1],
+                    [0, 0, 1],
+                    [0, 0, 0],
+                    [0, 0, 0],
+                ],
+            ),
             # ("FMT/DP >= 5 && FMT/GQ > 10", [0, 0, 1, 1, 1, 0, 0, 0, 0]),
             # ("GT > 0", [1, 1, 1, 1, 1, 0, 1, 0, 1]),
             # ("GT > 0 & FMT/HQ >= 10", [0, 0, 1, 1, 1, 0, 0, 0, 0]),
@@ -124,18 +137,6 @@ def test_evaluate(self, expression, data, expected):
         result = fee.evaluate(numpify_values(data))
         nt.assert_array_equal(result, expected)
 
-    @pytest.mark.parametrize(
-        "expression",
-        [
-            "FORMAT/AD > 30",
-            "FMT/AD > 30",
-            "GT > 30",
-        ],
-    )
-    def test_sample_evaluation_unsupported(self, expression):
-        with pytest.raises(filter_mod.UnsupportedSampleFilteringError):
-            filter_mod.FilterExpression(include=expression)
-
     @pytest.mark.parametrize(
         ("expr", "expected"),
         [
@@ -300,13 +301,53 @@ def test_boolean_operator_expressions(self, expr, expected):
             ("a == b", {"a": [0, 1], "b": [1, 1]}, [False, True]),
             ("a = b", {"a": [0, 1], "b": [1, 1]}, [False, True]),
             ("a & b", {"a": [0, 1], "b": [1, 1]}, [False, True]),
-            ("a && b", {"a": [0, 1], "b": [1, 1]}, [False, True]),
             ("a | b", {"a": [0, 1], "b": [1, 1]}, [True, True]),
-            ("a || b", {"a": [0, 1], "b": [1, 1]}, [True, True]),
             ("(a < 2) & (b > 1)", {"a": [0, 1], "b": [1, 2]}, [False, True]),
             # AND has precedence over OR
             ("t | f & f", {"t": [1], "f": [0]}, [True or False and False]),
             ("(t | f) & f", {"t": [1], "f": [0]}, [(True or False) and False]),
+            (
+                "call_a && call_b",
+                {
+                    "call_a": [
+                        [0, 0, 0, 0],
+                        [0, 0, 1, 1],
+                        [0, 0, 0, 0],
+                    ],
+                    "call_b": [
+                        [0, 0, 0, 0],
+                        [0, 1, 0, 1],
+                        [1, 1, 1, 1],
+                    ],
+                },
+                [
+                    [False, False, False, False],
+                    [False, True, True, True],
+                    # all False since condition a is not met (all 0)
+                    [False, False, False, False],
+                ],
+            ),
+            (
+                "call_a || call_b",
+                {
+                    "call_a": [
+                        [0, 0, 0, 0],
+                        [0, 0, 1, 1],
+                        [0, 0, 0, 0],
+                    ],
+                    "call_b": [
+                        [0, 0, 0, 0],
+                        [0, 1, 0, 1],
+                        [1, 1, 1, 1],
+                    ],
+                },
+                [
+                    [False, False, False, False],
+                    # all True since variant site is included
+                    [True, True, True, True],
+                    [True, True, True, True],
+                ],
+            ),
         ],
     )
     def test_boolean_operator_expressions_data(self, expr, data, expected):
diff --git a/vcztools/filter.py b/vcztools/filter.py
@@ -47,11 +47,6 @@ class UnsupportedFileReferenceError(UnsupportedFilteringFeatureError):
     feature = "File references"
 
 
-class UnsupportedSampleFilteringError(UnsupportedFilteringFeatureError):
-    issue = "180"
-    feature = "Per-sample filter expressions"
-
-
 class UnsupportedFunctionsError(UnsupportedFilteringFeatureError):
     issue = "190"
     feature = "Function evaluation"
@@ -110,13 +105,11 @@ def __init__(self, tokens):
 class Identifier(EvaluationNode):
     def __init__(self, mapper, tokens):
         self.field_name = mapper(tokens[0])
-        if self.field_name.startswith("call_"):
-            raise UnsupportedSampleFilteringError()
         logger.debug(f"Mapped {tokens[0]} to {self.field_name}")
 
     def eval(self, data):
         value = np.asarray(data[self.field_name])
-        if len(value.shape) > 1:
+        if not self.field_name.startswith("call_") and len(value.shape) > 1:
             raise Unsupported2DFieldsError()
         return value
 
@@ -160,6 +153,57 @@ def referenced_fields(self):
         return operand.referenced_fields()
 
 
+def double_and(a, b):
+    # both operands 1D: plain per-variant masks, so combine them elementwise
+    if a.ndim == 1 and b.ndim == 1:
+        return np.logical_and(a, b)
+
+    # exactly one operand 1D: give it a trailing length-1 axis so it broadcasts
+    if a.ndim == 1 and b.ndim == 2:
+        a = np.expand_dims(a, axis=1)
+    elif a.ndim == 2 and b.ndim == 1:
+        b = np.expand_dims(b, axis=1)
+
+    if a.ndim == 2 and b.ndim == 2:
+        # a variant site passes only if each condition holds in at least one
+        # sample; the two conditions need not hold in the same sample
+        variant_mask = np.logical_and(np.any(a, axis=1), np.any(b, axis=1))
+        variant_mask = np.expand_dims(variant_mask, axis=1)
+        # within a site, a sample is kept if it meets either condition
+        sample_mask = np.logical_or(a, b)
+        # clear every sample of a site that failed the variant-level test
+        return np.logical_and(variant_mask, sample_mask)
+    else:  # reached only when an operand is neither 1D nor 2D
+        raise NotImplementedError(
+            f"&& not implemented for dimensions {a.ndim} and {b.ndim}"
+        )
+
+
+def double_or(a, b):
+    # both operands 1D: plain per-variant masks, so combine them elementwise
+    if a.ndim == 1 and b.ndim == 1:
+        return np.logical_or(a, b)
+
+    # exactly one operand 1D: give it a trailing length-1 axis so it broadcasts
+    if a.ndim == 1 and b.ndim == 2:
+        a = np.expand_dims(a, axis=1)
+    elif a.ndim == 2 and b.ndim == 1:
+        b = np.expand_dims(b, axis=1)
+
+    if a.ndim == 2 and b.ndim == 2:
+        # a variant site passes if either condition holds in any of its samples
+        variant_mask = np.logical_or(np.any(a, axis=1), np.any(b, axis=1))
+        variant_mask = np.expand_dims(variant_mask, axis=1)
+        # per-sample OR of the two conditions; also carries the full 2D shape
+        sample_mask = np.logical_or(a, b)
+        # a passing site keeps all of its samples (variant_mask broadcast over them)
+        return np.logical_or(variant_mask, sample_mask)
+    else:  # reached only when an operand is neither 1D nor 2D
+        raise NotImplementedError(
+            f"|| not implemented for dimensions {a.ndim} and {b.ndim}"
+        )
+
+
 class BinaryOperator(EvaluationNode):
     op_map = {
         "*": operator.mul,
@@ -170,9 +214,8 @@ class BinaryOperator(EvaluationNode):
         # circuit optimisations
         "&": np.logical_and,
         "|": np.logical_or,
-        # As we're only supporting 1D values for now, these are the same thing
-        "&&": np.logical_and,
-        "||": np.logical_or,
+        "&&": double_and,
+        "||": double_or,
     }
 
     def eval(self, data):