diff --git a/bio2zarr/icf.py b/bio2zarr/icf.py index 887b8533..9b7a2c1d 100644 --- a/bio2zarr/icf.py +++ b/bio2zarr/icf.py @@ -79,6 +79,14 @@ def full_name(self): return self.name return f"{self.category}/{self.name}" + @property + def max_number(self): + if self.vcf_number in ("R", "A", "G", "."): + return self.summary.max_number + else: + # use declared number if larger than max found + return max(self.summary.max_number, int(self.vcf_number)) + def smallest_dtype(self): """ Returns the smallest dtype suitable for this field based diff --git a/bio2zarr/vcz.py b/bio2zarr/vcz.py index 94022327..8d262c0c 100644 --- a/bio2zarr/vcz.py +++ b/bio2zarr/vcz.py @@ -136,10 +136,14 @@ def from_field( dimensions.append("samples") if array_name is None: array_name = prefix + vcf_field.name + # TODO make an option to add in the empty extra dimension - if vcf_field.summary.max_number > 1 or vcf_field.full_name == "FORMAT/LAA": - shape.append(vcf_field.summary.max_number) - chunks.append(vcf_field.summary.max_number) + max_number = vcf_field.max_number + if (max_number > 0 and vcf_field.vcf_number in ("R", "A", "G")) or ( + max_number > 1 or vcf_field.full_name == "FORMAT/LAA" + ): + shape.append(max_number) + chunks.append(max_number) # TODO we should really be checking this to see if the named dimensions # are actually correct. if vcf_field.vcf_number == "R": diff --git a/tests/data/vcf/chr22.vcf.gz b/tests/data/vcf/chr22.vcf.gz new file mode 100644 index 00000000..0bd64cba Binary files /dev/null and b/tests/data/vcf/chr22.vcf.gz differ diff --git a/tests/data/vcf/chr22.vcf.gz.csi b/tests/data/vcf/chr22.vcf.gz.csi new file mode 100644 index 00000000..930ed67a Binary files /dev/null and b/tests/data/vcf/chr22.vcf.gz.csi differ diff --git a/tests/test_core.py b/tests/test_core.py index 2f4aa838..03f380f5 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -237,8 +237,8 @@ def test_examples(self, chunk_size, size, start, stop): # It works in CI on Linux, but it'll probably break at some point. # It's also necessary to update these numbers each time a new data # file gets added - ("tests/data", 4983044), - ("tests/data/vcf", 4970907), + ("tests/data", 5030777), + ("tests/data/vcf", 5018640), ("tests/data/vcf/sample.vcf.gz", 1089), ], ) diff --git a/tests/test_vcf_examples.py b/tests/test_vcf_examples.py index fa333f36..193dfa2a 100644 --- a/tests/test_vcf_examples.py +++ b/tests/test_vcf_examples.py @@ -563,6 +563,21 @@ def test_gts(self, ds): assert "call_genotype" not in ds +class TestChr22Example: + data_path = "tests/data/vcf/chr22.vcf.gz" + + @pytest.fixture(scope="class") + def ds(self, tmp_path_factory): + out = tmp_path_factory.mktemp("data") / "example.vcf.zarr" + icf.convert([self.data_path], out, worker_processes=0) + return sg.load_dataset(out) + + def test_call_SB(self, ds): + # fixes https://github.com/sgkit-dev/bio2zarr/issues/355 + assert ds.call_SB.dims == ("variants", "samples", "FORMAT_SB_dim") + assert ds.call_SB.shape == (100, 100, 4) + + class Test1000G2020Example: data_path = "tests/data/vcf/1kg_2020_chrM.vcf.gz" @@ -997,6 +1012,11 @@ def test_variant_ANN(self, ds): ] nt.assert_array_equal(ds.variant_ANN.values, variant_ANN) + def test_variant_MLEAF(self, ds): + # fixes https://github.com/sgkit-dev/bio2zarr/issues/353 + assert ds.variant_MLEAF.dims == ("variants", "alt_alleles") + assert ds.variant_MLEAF.shape == (21, 1) + class TestGeneratedFieldsExample: data_path = "tests/data/vcf/field_type_combos.vcf.gz"