Patch summary. Text that precedes the first "diff --git" header is skipped
by git apply and patch(1), so these notes do not affect how it applies.

- bio2zarr/vcf2zarr/icf.py: VcfField becomes an ordered dataclass
  (order=True), and IcfMetadata gains an explicit __eq__. The new __eq__
  compares samples, contigs and filters directly and compares the two field
  lists after sorting them, so the order of the fields does not affect
  equality. No other attributes take part in the comparison.
- pyproject.toml: adds "packaging" to the dependencies.
  NOTE(review): nothing in this patch imports packaging; confirm it is needed.
- tests/data/vcf/out_of_order_fields/: adds two .bcf files and their .bcf.csi
  companions (binary; their contents are not carried by this text patch).
- tests/test_core.py: bumps the expected numbers for tests/data and
  tests/data/vcf, which the surrounding comment says must be updated whenever
  a data file is added.
- tests/test_vcf_examples.py: adds TestOutOfOrderFields, which converts
  input2.bcf and input1.bcf together and checks the resulting dataset.

NOTE(review): the line structure below was restored from a copy in which all
newlines and indentation had been collapsed. Hunk line counts match the @@
headers, but the indentation of context lines is reconstructed; confirm it
against the target files before applying.

diff --git a/bio2zarr/vcf2zarr/icf.py b/bio2zarr/vcf2zarr/icf.py
index dd0bbc91..0f3f8566 100644
--- a/bio2zarr/vcf2zarr/icf.py
+++ b/bio2zarr/vcf2zarr/icf.py
@@ -41,7 +41,7 @@ def fromdict(d):
         return VcfFieldSummary(**d)
 
 
-@dataclasses.dataclass
+@dataclasses.dataclass(order=True)
 class VcfField:
     category: str
     name: str
@@ -192,6 +192,16 @@ def fromdict(d):
         d["contigs"] = [Contig(**cd) for cd in d["contigs"]]
         return IcfMetadata(**d)
 
+    def __eq__(self, other):
+        if not isinstance(other, IcfMetadata):
+            return NotImplemented
+        return (
+            self.samples == other.samples
+            and self.contigs == other.contigs
+            and self.filters == other.filters
+            and sorted(self.fields) == sorted(other.fields)
+        )
+
 
 def fixed_vcf_field_definitions():
     def make_field_def(name, vcf_type, vcf_number):
diff --git a/pyproject.toml b/pyproject.toml
index 0b72ca1c..ecb3f60c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,6 +23,7 @@ dependencies = [
   # colouredlogs pulls in humanfriendly",
   "cyvcf2",
   "bed_reader",
+  "packaging",
 ]
 requires-python = ">=3.9"
 classifiers = [
diff --git a/tests/data/vcf/out_of_order_fields/input1.bcf b/tests/data/vcf/out_of_order_fields/input1.bcf
new file mode 100644
index 00000000..5d0f42de
Binary files /dev/null and b/tests/data/vcf/out_of_order_fields/input1.bcf differ
diff --git a/tests/data/vcf/out_of_order_fields/input1.bcf.csi b/tests/data/vcf/out_of_order_fields/input1.bcf.csi
new file mode 100644
index 00000000..b10a1267
Binary files /dev/null and b/tests/data/vcf/out_of_order_fields/input1.bcf.csi differ
diff --git a/tests/data/vcf/out_of_order_fields/input2.bcf b/tests/data/vcf/out_of_order_fields/input2.bcf
new file mode 100644
index 00000000..ee8c4a32
Binary files /dev/null and b/tests/data/vcf/out_of_order_fields/input2.bcf differ
diff --git a/tests/data/vcf/out_of_order_fields/input2.bcf.csi b/tests/data/vcf/out_of_order_fields/input2.bcf.csi
new file mode 100644
index 00000000..51503336
Binary files /dev/null and b/tests/data/vcf/out_of_order_fields/input2.bcf.csi differ
diff --git a/tests/test_core.py b/tests/test_core.py
index 3607578f..62f76b88 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -237,8 +237,8 @@ def test_examples(self, chunk_size, size, start, stop):
         # It works in CI on Linux, but it'll probably break at some point.
         # It's also necessary to update these numbers each time a new data
         # file gets added
-        ("tests/data", 4976329),
-        ("tests/data/vcf", 4964192),
+        ("tests/data", 4981734),
+        ("tests/data/vcf", 4969597),
         ("tests/data/vcf/sample.vcf.gz", 1089),
     ],
 )
diff --git a/tests/test_vcf_examples.py b/tests/test_vcf_examples.py
index 64e3b26e..f5609153 100644
--- a/tests/test_vcf_examples.py
+++ b/tests/test_vcf_examples.py
@@ -1100,3 +1100,58 @@ def test_missing_filter(tmp_path):
     zarr_path = tmp_path / "zarr"
     with pytest.raises(ValueError, match="Filter 'q10' was not defined in the header"):
         vcf2zarr.convert([path], zarr_path)
+
+
+class TestOutOfOrderFields:
+    # Mixing on purpose
+    data_path1 = "tests/data/vcf/out_of_order_fields/input2.bcf"
+    data_path2 = "tests/data/vcf/out_of_order_fields/input1.bcf"
+
+    @pytest.fixture(scope="class")
+    def ds(self, tmp_path_factory):
+        out = tmp_path_factory.mktemp("data") / "ooo_example.vcf.zarr"
+        vcf2zarr.convert([self.data_path1, self.data_path2], out)
+        return sg.load_dataset(out)
+
+    def test_filters(self, ds):
+        nt.assert_array_equal(ds["filter_id"], ["PASS", "FAIL"])
+        nt.assert_array_equal(
+            ds["variant_filter"],
+            [
+                [True, False],
+                [False, True],
+                [True, False],
+            ],
+        )
+
+    def test_source(self, ds):
+        assert ds.attrs["source"] == f"bio2zarr-{provenance.__version__}"
+
+    def test_contigs(self, ds):
+        nt.assert_array_equal(ds["contig_id"], ["chr20", "chr21"])
+        nt.assert_array_equal(ds["contig_length"], [64444167.0, 46709983.0])
+        nt.assert_array_equal(ds["variant_contig"], [0, 1, 1])
+
+    def test_position(self, ds):
+        nt.assert_array_equal(ds["variant_position"], [63971, 64506, 64507])
+
+    def test_length(self, ds):
+        nt.assert_array_equal(ds["variant_length"], [11, 1, 1])
+
+    def test_info_fields(self, ds):
+        nt.assert_array_equal(
+            ds["variant_QNAME"],
+            ["cluster19_000000F", ".", "cluster19_000000F"],
+        )
+        nt.assert_array_equal(ds["variant_QSTART"], [25698928, 25698928, -1])
+
+    def test_allele(self, ds):
+        nt.assert_array_equal(
+            ds["variant_allele"].values.tolist(),
+            [["TTCCATTCCAC", "T"], ["C", "CTCCAT"], ["G", "A"]],
+        )
+        assert ds["variant_allele"].dtype == "O"
+
+    def test_call_DPs(self, ds):
+        nt.assert_array_equal(ds["call_DP"], [[5], [-1], [5]])
+        nt.assert_array_equal(ds["call_DP2"], [[1], [1], [-1]])