diff --git a/tests/test_bcftools_validation.py b/tests/test_bcftools_validation.py index e62a089..a27ece1 100644 --- a/tests/test_bcftools_validation.py +++ b/tests/test_bcftools_validation.py @@ -117,7 +117,10 @@ def run_vcztools(args: str, expect_error=False) -> tuple[str, str]: "view --no-version -i 'FILTER~\"VQSRTrancheINDEL99.00to100.00\"'", "1kg_2020_chrM.vcf.gz" ), - ("view --no-version -i 'INFO/AC>2'", "chr22.vcf.gz") + ("view --no-version -i 'INFO/AC>2'", "chr22.vcf.gz"), + ("view --no-version -i 'F_MISSING<0.05'", "sample.vcf.gz"), + # GT is not in annotations file + ("view --no-version -i 'N_MISSING=0'", "1kg_2020_chr20_annotations.bcf"), ], # This is necessary when trying to run individual tests, as the arguments above # make for unworkable command lines diff --git a/tests/test_calculate.py b/tests/test_calculate.py index e7a4485..260a163 100644 --- a/tests/test_calculate.py +++ b/tests/test_calculate.py @@ -1,6 +1,14 @@ +import numpy as np import pytest -from vcztools.calculate import REF, SNP, UNCLASSIFIED, get_variant_type +from vcztools.calculate import ( + REF, + SNP, + UNCLASSIFIED, + calculate_if_absent, + get_variant_type, +) +from vcztools.constants import INT_FILL, INT_MISSING @pytest.mark.parametrize( @@ -20,3 +28,27 @@ ) def test_get_variant_type(ref, alt, expected_type): assert get_variant_type(ref, alt) == expected_type + + +def test_calculate_if_absent(): + gt = np.array( + [ + [[0, 0], [0, 1], [1, 1]], + [[0, 0], [0, 2], [2, 2]], + [[0, 1], [1, 2], [2, 2]], + [ + [INT_MISSING, INT_MISSING], + [INT_MISSING, INT_MISSING], + [INT_FILL, INT_FILL], + ], + [[INT_MISSING, INT_MISSING], [0, 3], [INT_FILL, INT_FILL]], + ] + ) + data = {"call_genotype": gt} + + calculate_if_absent(data, "variant_F_MISSING") + + np.testing.assert_array_equal(data["variant_N_MISSING"], np.array([0, 0, 0, 3, 2])) + np.testing.assert_array_equal( + data["variant_F_MISSING"], np.array([0, 0, 0, 1, 2 / 3]) + ) diff --git a/tests/test_filter.py b/tests/test_filter.py index 83be0b6..83c396f 100644 --- a/tests/test_filter.py +++ b/tests/test_filter.py @@ -56,6 +56,7 @@ def test_invalid_expressions(self, parser, expression): ("fisher(INFO/DP4)", filter_mod.UnsupportedFunctionsError), ("fisher(FMT/ADF,FMT/ADR)", filter_mod.UnsupportedFunctionsError), ("N_PASS(GQ>90)", filter_mod.UnsupportedFunctionsError), + ("ILEN > 0", filter_mod.UnsupportedCalculatedVariableError), ('TYPE="bnd"', filter_mod.UnsupportedTypeFieldError), ], ) diff --git a/vcztools/calculate.py b/vcztools/calculate.py index 77c5342..a5f1388 100644 --- a/vcztools/calculate.py +++ b/vcztools/calculate.py @@ -38,3 +38,45 @@ def calculate_variant_type(variant_allele: np.ndarray) -> np.ndarray: for j in range(alt.shape[1]): variant_type[i, j] = get_variant_type(ref[i], alt[i, j]) return variant_type + + +SUPPORTED_CALCULATED_VARIABLES = ["N_MISSING", "F_MISSING"] +UNSUPPORTED_CALCULATED_VARIABLES = [ + "N_ALT", + "N_SAMPLES", + "AC", + "MAC", + "AF", + "MAF", + "AN", + "ILEN", +] + + +def _n_missing(data: dict[str, np.ndarray]) -> np.ndarray: + gt = data.get("call_genotype", None) + if gt is None: + return np.zeros_like(data["variant_position"]) + else: + return np.sum(np.all(gt < 0, axis=-1), axis=1) + + +def _f_missing(data: dict[str, np.ndarray]) -> np.ndarray: + calculate_if_absent(data, "variant_N_MISSING") + gt = data["call_genotype"] + n_samples = gt.shape[1] + n_missing = data["variant_N_MISSING"] + return n_missing / n_samples + + +field_to_function = { + "variant_N_MISSING": _n_missing, + "variant_F_MISSING": _f_missing, +} + + +def calculate_if_absent(data: dict[str, np.ndarray], vcz_name: str) -> None: + """Populates the array for 'vcz_name' in 'data' if not already present.""" + if vcz_name not in data: + func = field_to_function[vcz_name] + data[vcz_name] = func(data) diff --git a/vcztools/filter.py b/vcztools/filter.py index b5f0baa..2a29521 100644 --- a/vcztools/filter.py +++ b/vcztools/filter.py @@ -5,7 +5,13 @@ import numpy as np import pyparsing as pp -from vcztools.calculate import SNP, calculate_variant_type +from vcztools.calculate import ( + SNP, + SUPPORTED_CALCULATED_VARIABLES, + UNSUPPORTED_CALCULATED_VARIABLES, + calculate_if_absent, + calculate_variant_type, +) from .utils import vcf_name_to_vcz_names @@ -49,6 +55,11 @@ class UnsupportedArraySubscriptError(UnsupportedFilteringFeatureError): feature = "Array subscripts" +class UnsupportedCalculatedVariableError(UnsupportedFilteringFeatureError): + issue = "171" + feature = "Calculated variables" + + class UnsupportedRegexError(UnsupportedFilteringFeatureError): issue = "174" feature = "Regular expressions" @@ -121,12 +132,19 @@ def __init__(self, mapper, tokens): token = tokens[0] if token == "GT": raise UnsupportedGenotypeValuesError() + elif token in UNSUPPORTED_CALCULATED_VARIABLES: + raise UnsupportedCalculatedVariableError() field_names = mapper(token) if len(field_names) == 0: - raise ValueError(f'the tag "{token}" is not defined') + if token in SUPPORTED_CALCULATED_VARIABLES: + self.field_name = f"variant_{token}" + self.refs = ["variant_position", "call_genotype"] + else: + raise ValueError(f'the tag "{token}" is not defined') elif len(field_names) == 1: self.field_name = field_names[0] logger.debug(f"Mapped {token} to {self.field_name}") + self.refs = [self.field_name] else: raise ValueError( f'ambiguous filtering expression: "{token}", ' @@ -134,6 +152,7 @@ def __init__(self, mapper, tokens): ) def eval(self, data): + calculate_if_absent(data, self.field_name) value = np.asarray(data[self.field_name]) if self.field_name.startswith("call_") and len(value.shape) > 2: raise UnsupportedHigherDimensionalFormatFieldsError() @@ -143,7 +162,7 @@ def __repr__(self): return self.field_name def referenced_fields(self): - return frozenset([self.field_name]) + return frozenset(self.refs) class IndexedIdentifier(EvaluationNode): diff --git a/vcztools/retrieval.py b/vcztools/retrieval.py index 75b06e1..f1375c5 100644 --- a/vcztools/retrieval.py +++ b/vcztools/retrieval.py @@ -156,7 +156,10 @@ def variant_chunk_index_iter_with_filtering( if filter_expr.parse_result is None: filter_expr = None else: - filter_fields = list(filter_expr.referenced_fields) + # only return referenced fields that are actually present + filter_fields = list( + set.intersection(set(filter_expr.referenced_fields), set(root)) + ) filter_fields_reader = VariantChunkReader(root, fields=filter_fields) for v_chunk, v_mask_chunk in variant_chunk_index_iter(