sgkit-dev
diff --git a/‎bio2zarr/vcf.py‎
Lines changed: 364 additions & 241 deletions b/‎bio2zarr/vcf.py‎
Lines changed: 364 additions & 241 deletions
diff --git a/‎tests/data/vcf/1kg_2020_chr20_annotations.bcf‎
8.45 KB b/‎tests/data/vcf/1kg_2020_chr20_annotations.bcf‎
8.45 KB
diff --git a/‎tests/data/vcf/1kg_2020_chr20_annotations.bcf.csi‎
101 Bytes b/‎tests/data/vcf/1kg_2020_chr20_annotations.bcf.csi‎
101 Bytes
diff --git a/‎tests/data/vcf/1kg_2020_chrM.vcf.gz‎
4.15 KB b/‎tests/data/vcf/1kg_2020_chrM.vcf.gz‎
4.15 KB
diff --git a/‎tests/data/vcf/1kg_2020_chrM.vcf.gz.csi‎
116 Bytes b/‎tests/data/vcf/1kg_2020_chrM.vcf.gz.csi‎
116 Bytes
diff --git a/‎tests/data/vcf/field_type_combos.vcf.gz‎
2.53 KB b/‎tests/data/vcf/field_type_combos.vcf.gz‎
2.53 KB
diff --git a/‎tests/data/vcf/field_type_combos.vcf.gz.csi‎
133 Bytes b/‎tests/data/vcf/field_type_combos.vcf.gz.csi‎
133 Bytes
diff --git a/‎tests/test_cli.py‎
Lines changed: 11 additions & 0 deletions b/‎tests/test_cli.py‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎tests/test_pcvcf.py‎
Lines changed: 139 additions & 0 deletions b/‎tests/test_pcvcf.py‎
Lines changed: 139 additions & 0 deletions
diff --git a/‎tests/test_simulated_data.py‎
Lines changed: 47 additions & 0 deletions b/‎tests/test_simulated_data.py‎
Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,11 @@
+import click.testing as ct
+
+from bio2zarr import cli
+
+# NOTE just putting things together here to see what works.
+# Probably want to mock the module functions here to
+# avoid testing any real functionality.
+def test_vcf_summarise():
+    runner = ct.CliRunner()
+    result = runner.invoke(cli.vcf2zarr, "summarise", "filename")
+    # FIXME not testing anything!
@@ -0,0 +1,139 @@
+import pytest
+import numpy as np
+import numpy.testing as nt
+
+from bio2zarr import vcf
+
+
+class TestSmallExample:
+    data_path = "tests/data/vcf/sample.vcf.gz"
+
+    # fmt: off
+    columns = [
+        'ALT', 'CHROM', 'FILTERS', 'FORMAT/DP', 'FORMAT/GQ',
+        'FORMAT/GT', 'FORMAT/HQ', 'ID', 'INFO/AA', 'INFO/AC',
+        'INFO/AF', 'INFO/AN', 'INFO/DB', 'INFO/DP', 'INFO/H2',
+        'INFO/NS', 'POS', 'QUAL', 'REF'
+    ]
+    # fmt: on
+
+    @pytest.fixture(scope="class")
+    def pcvcf(self, tmp_path_factory):
+        out = tmp_path_factory.mktemp("data") / "example.exploded"
+        return vcf.explode([self.data_path], out)
+
+    def test_summary_table(self, pcvcf):
+        data = pcvcf.summary_table()
+        cols = [d["name"] for d in data]
+        assert sorted(cols) == self.columns
+
+    def test_mapping_methods(self, pcvcf):
+        assert len(pcvcf) == len(self.columns)
+        assert pcvcf["ALT"] is pcvcf.columns["ALT"]
+        assert list(iter(pcvcf)) == list(iter(pcvcf))
+
+    def test_num_partitions(self, pcvcf):
+        assert pcvcf.num_partitions == 1
+
+    def test_num_records(self, pcvcf):
+        assert pcvcf.num_records == 9
+
+    def test_POS(self, pcvcf):
+        nt.assert_array_equal(
+            [v[0] for v in pcvcf["POS"].values],
+            [111, 112, 14370, 17330, 1110696, 1230237, 1234567, 1235237, 10],
+        )
+
+    def test_REF(self, pcvcf):
+        ref = ["A", "A", "G", "T", "A", "T", "G", "T", "AC"]
+        assert pcvcf["REF"].values == ref
+
+    def test_ALT(self, pcvcf):
+        alt = [
+            ["C"],
+            ["G"],
+            ["A"],
+            ["A"],
+            ["G", "T"],
+            [],
+            ["GA", "GAC"],
+            [],
+            ["A", "ATG", "C"],
+        ]
+        assert [list(v) for v in pcvcf["ALT"].values] == alt
+
+    def test_INFO_NS(self, pcvcf):
+        assert pcvcf["INFO/NS"].values == [None, None, 3, 3, 2, 3, 3, None, None]
+
+
+class TestGeneratedFieldsExample:
+    data_path = "tests/data/vcf/field_type_combos.vcf.gz"
+
+    @pytest.fixture(scope="class")
+    def pcvcf(self, tmp_path_factory):
+        out = tmp_path_factory.mktemp("data") / "example.exploded"
+        # import sgkit
+        # from sgkit.io.vcf import vcf_to_zarr
+        # vcf_to_zarr(self.data_path, "tmp/fields.vcf.sg", fields=
+        #         ["INFO/IS1", "INFO/IC2", "INFO/IS2", "INFO/ISR", "FORMAT/FS2"])
+        # df = sgkit.load_dataset("tmp/fields.vcf.sg")
+        # print(df["variant_IC2"])
+        # print(df["variant_IC2"].values)
+        return vcf.explode([self.data_path], out)
+
+    @pytest.fixture(scope="class")
+    def schema(self, pcvcf):
+        return vcf.ZarrConversionSpec.generate(pcvcf)
+
+    @pytest.mark.parametrize(
+        ("name", "dtype", "shape"),
+        [
+            ("variant_II1", "i1", (208,)),
+            ("variant_II2", "i2", (208, 2)),
+            ("variant_IIA", "i2", (208, 2)),
+            ("variant_IIR", "i2", (208, 3)),
+            ("variant_IID", "i2", (208, 7)),
+            ("variant_IF1", "f4", (208,)),
+            ("variant_IF2", "f4", (208, 2)),
+            ("variant_IFA", "f4", (208, 2)),
+            ("variant_IFR", "f4", (208, 3)),
+            ("variant_IFD", "f4", (208, 9)),
+            ("variant_IC1", "U1", (208,)),
+            ("variant_IC2", "U1", (208, 2)),
+            ("variant_IS1", "O", (208,)),
+            ("variant_IS2", "O", (208, 2)),
+            ("call_FS2", "O", (208, 2, 2)),
+            ("call_FC2", "U1", (208, 2, 2)),
+        ],
+    )
+    def test_info_schemas(self, schema, name, dtype, shape):
+        variables = [v for v in schema.variables if v.name == name]
+        v = variables[0]
+        assert v.dtype == dtype
+        assert tuple(v.shape) == shape
+
+    def test_info_string1(self, pcvcf):
+        non_missing = [v for v in pcvcf["INFO/IS1"].values if v is not None]
+        assert non_missing[0] == "bc"
+        assert non_missing[1] == "."
+
+    def test_info_char1(self, pcvcf):
+        non_missing = [v for v in pcvcf["INFO/IC1"].values if v is not None]
+        assert non_missing[0] == "f"
+        assert non_missing[1] == "."
+
+    def test_info_string2(self, pcvcf):
+        non_missing = [v for v in pcvcf["INFO/IS2"].values if v is not None]
+        nt.assert_array_equal(non_missing[0], ["hij", "d"])
+        nt.assert_array_equal(non_missing[1], [".", "d"])
+        nt.assert_array_equal(non_missing[2], ["hij", "."])
+        nt.assert_array_equal(non_missing[3], [".", "."])
+
+    def test_format_string1(self, pcvcf):
+        non_missing = [v for v in pcvcf["FORMAT/FS1"].values if v is not None]
+        nt.assert_array_equal(non_missing[0], [["bc"], ["."]])
+
+    def test_format_string2(self, pcvcf):
+        non_missing = [v for v in pcvcf["FORMAT/FS2"].values if v is not None]
+        nt.assert_array_equal(non_missing[0], [["bc", "op"], [".", "op"]])
+        nt.assert_array_equal(non_missing[1], [["bc", "."], [".", "."]])
@@ -0,0 +1,47 @@
+import pytest
+import msprime
+import pysam
+import sgkit as sg
+import numpy.testing as nt
+
+from bio2zarr import vcf
+
+
+class TestTskitRoundTripVcf:
+    @pytest.mark.parametrize("ploidy", [1, 2, 3, 4])
+    def test_ploidy(self, ploidy, tmp_path):
+        ts = msprime.sim_ancestry(
+            2,
+            population_size=10**4,
+            ploidy=ploidy,
+            sequence_length=100_000,
+            random_seed=42,
+        )
+        tables = ts.dump_tables()
+        for u in ts.samples():
+            site = tables.sites.add_row(u + 1, "A")
+            tables.mutations.add_row(site, derived_state="T", node=u)
+        ts = tables.tree_sequence()
+        vcf_file = tmp_path / "sim.vcf"
+        with open(vcf_file, "w") as f:
+            ts.write_vcf(f)
+        # This also compresses the input file
+        pysam.tabix_index(str(vcf_file), preset="vcf")
+        out = tmp_path / "example.vcf.zarr"
+        vcf.convert_vcf([tmp_path / "sim.vcf.gz"], out)
+        ds = sg.load_dataset(out)
+        assert ds.sizes["ploidy"] == ploidy
+        assert ds.sizes["variants"] == ts.num_sites
+        assert ds.sizes["samples"] == ts.num_individuals
+        # Msprime guarantees that this will be true.
+        nt.assert_array_equal(
+            ts.genotype_matrix().reshape((ts.num_sites, ts.num_individuals, ploidy)),
+            ds.call_genotype.values,
+        )
+        nt.assert_equal(ds.variant_allele[:, 0].values, "A")
+        nt.assert_equal(ds.variant_allele[:, 1].values, "T")
+        nt.assert_equal(ds.variant_position, ts.sites_position)
+
+
+# TODO add a plink equivalent if we can find a way of programmatically
+# generating plink data?