From 6dbe3968f1a13dcb789343a13c81721fc4bff642 Mon Sep 17 00:00:00 2001 From: Tom White Date: Wed, 23 Apr 2025 11:38:47 +0100 Subject: [PATCH 1/3] Check dimension sizes for named VCF Number fields --- bio2zarr/icf.py | 6 +++--- bio2zarr/vcz.py | 32 +++++++++++++++++++++----------- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/bio2zarr/icf.py b/bio2zarr/icf.py index f828989d..a7ce2bae 100644 --- a/bio2zarr/icf.py +++ b/bio2zarr/icf.py @@ -1056,6 +1056,7 @@ def generate_schema( if local_alleles is None: local_alleles = False + max_alleles = max(self.fields["ALT"].vcf_field.summary.max_number + 1, 2) dimensions = { "variants": vcz.VcfZarrDimension( size=m, chunk_size=variants_chunk_size or vcz.DEFAULT_VARIANT_CHUNK_SIZE @@ -1064,9 +1065,8 @@ def generate_schema( size=n, chunk_size=samples_chunk_size or vcz.DEFAULT_SAMPLE_CHUNK_SIZE ), # ploidy added conditionally below - "alleles": vcz.VcfZarrDimension( - size=max(self.fields["ALT"].vcf_field.summary.max_number + 1, 2) - ), + "alleles": vcz.VcfZarrDimension(size=max_alleles), + "alt_alleles": vcz.VcfZarrDimension(size=max_alleles - 1), "filters": vcz.VcfZarrDimension(size=self.metadata.num_filters), } diff --git a/bio2zarr/vcz.py b/bio2zarr/vcz.py index 85cf550c..93d75c46 100644 --- a/bio2zarr/vcz.py +++ b/bio2zarr/vcz.py @@ -170,19 +170,29 @@ def from_field( array_name = prefix + vcf_field.name max_number = vcf_field.max_number - if (max_number > 0 and vcf_field.vcf_number in ("R", "A", "G")) or ( - max_number > 1 or vcf_field.full_name == "FORMAT/LAA" - ): - # TODO we should really be checking this to see if the named dimensions - # are actually correct. - if vcf_field.vcf_number == "R": + if vcf_field.vcf_number == "R": + max_alleles = schema.dimensions["alleles"].size + if max_number > max_alleles: + raise ValueError( + f"Max number of values {max_number} exceeds max alleles " + f"{max_alleles} for {vcf_field.full_name}" + ) + if max_alleles > 0: dimensions.append("alleles") - elif vcf_field.vcf_number == "A": + elif vcf_field.vcf_number == "A": + max_alt_alleles = schema.dimensions["alt_alleles"].size + if max_number > max_alt_alleles: + raise ValueError( + f"Max number of values {max_number} exceeds max alt alleles " + f"{max_alt_alleles} for {vcf_field.full_name}" + ) + if max_alt_alleles > 0: dimensions.append("alt_alleles") - elif vcf_field.vcf_number == "G": - dimensions.append("genotypes") - else: - dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim") + elif max_number > 0 and vcf_field.vcf_number == "G": + # TODO: need max_genotypes + dimensions.append("genotypes") + elif max_number > 1 or vcf_field.full_name == "FORMAT/LAA": + dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim") if dimensions[-1] not in schema.dimensions: schema.dimensions[dimensions[-1]] = VcfZarrDimension( size=vcf_field.max_number From 8100bfabaabc800c618f8bd70da97a94a65f7af7 Mon Sep 17 00:00:00 2001 From: Tom White Date: Mon, 28 Apr 2025 11:08:52 +0100 Subject: [PATCH 2/3] Add genotypes dimension to schema --- bio2zarr/icf.py | 22 +++++++++++++++++----- bio2zarr/vcz.py | 12 +++++++++--- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/bio2zarr/icf.py b/bio2zarr/icf.py index a7ce2bae..0f12de64 100644 --- a/bio2zarr/icf.py +++ b/bio2zarr/icf.py @@ -1064,12 +1064,28 @@ def generate_schema( "samples": vcz.VcfZarrDimension( size=n, chunk_size=samples_chunk_size or vcz.DEFAULT_SAMPLE_CHUNK_SIZE ), - # ploidy added conditionally below + # ploidy and genotypes added conditionally below "alleles": vcz.VcfZarrDimension(size=max_alleles), "alt_alleles": vcz.VcfZarrDimension(size=max_alleles - 1), "filters": vcz.VcfZarrDimension(size=self.metadata.num_filters), } + # Add ploidy and genotypes dimensions only when needed + max_genotypes = 0 + for field in self.metadata.format_fields: + if field.vcf_number == "G": + max_genotypes = max(max_genotypes, field.summary.max_number) + if self.gt_field is not None: + ploidy = max(self.gt_field.summary.max_number - 1, 1) + dimensions["ploidy"] = vcz.VcfZarrDimension(size=ploidy) + max_genotypes = math.comb(max_alleles + ploidy - 1, ploidy) + dimensions["genotypes"] = vcz.VcfZarrDimension(size=max_genotypes) + else: + if max_genotypes > 0: + # there is no GT field, but there is at least one Number=G field, + # so need to define genotypes dimension + dimensions["genotypes"] = vcz.VcfZarrDimension(size=max_genotypes) + schema_instance = vcz.VcfZarrSchema( format_version=vcz.ZARR_SCHEMA_FORMAT_VERSION, dimensions=dimensions, @@ -1148,10 +1164,6 @@ def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)): array_specs.append(spec_from_field(field)) if self.gt_field is not None and n > 0: - ploidy = max(self.gt_field.summary.max_number - 1, 1) - # Add ploidy dimension only when needed - schema_instance.dimensions["ploidy"] = vcz.VcfZarrDimension(size=ploidy) - array_specs.append( vcz.ZarrArraySpec( name="call_genotype_phased", diff --git a/bio2zarr/vcz.py b/bio2zarr/vcz.py index 93d75c46..452ef5e2 100644 --- a/bio2zarr/vcz.py +++ b/bio2zarr/vcz.py @@ -188,9 +188,15 @@ def from_field( ) if max_alt_alleles > 0: dimensions.append("alt_alleles") - elif max_number > 0 and vcf_field.vcf_number == "G": - # TODO: need max_genotypes - dimensions.append("genotypes") + elif vcf_field.vcf_number == "G": + max_genotypes = schema.dimensions["genotypes"].size + if max_number > max_genotypes: + raise ValueError( + f"Max number of values {max_number} exceeds max genotypes " + f"{max_genotypes} for {vcf_field.full_name}" + ) + if max_genotypes > 0: + dimensions.append("genotypes") elif max_number > 1 or vcf_field.full_name == "FORMAT/LAA": dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim") if dimensions[-1] not in schema.dimensions: From e042e9f39db2337c7e0ec8c055a5de59a375f5ee Mon Sep 17 00:00:00 2001 From: Tom White Date: Mon, 5 May 2025 14:46:01 +0100 Subject: [PATCH 3/3] Add unit test --- tests/test_vcz.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tests/test_vcz.py b/tests/test_vcz.py index 63d6194e..8522d0d3 100644 --- a/tests/test_vcz.py +++ b/tests/test_vcz.py @@ -857,3 +857,42 @@ def test_json_serialization(self, icf_path): assert isinstance(schema2.dimensions["variants"], vcz.VcfZarrDimension) assert isinstance(schema2.dimensions["samples"], vcz.VcfZarrDimension) + + +class TestDimensionSizes: + data_path = "tests/data/vcf/field_type_combos.vcf.gz" + + @pytest.fixture(scope="class") + def icf(self, tmp_path_factory): + out = tmp_path_factory.mktemp("data") / "example.exploded" + return icf_mod.explode(out, [self.data_path]) + + @pytest.fixture(scope="class") + def schema(self, icf): + return icf.generate_schema() + + @pytest.mark.parametrize( + ("vcf_number", "dimensions", "field"), + [ + ("A", "alt_alleles", "FORMAT/FIA"), + ("R", "alleles", "FORMAT/FIR"), + ("G", "genotypes", "FORMAT/FIG"), + ], + ) + def test_max_number_exceeds_dimension_size( + self, icf, schema, vcf_number, dimensions, field + ): + vcf_field = icf.fields[field].vcf_field + assert vcf_field.vcf_number == vcf_number + # this should not fail + vcz.ZarrArraySpec.from_field(vcf_field, schema) + + # change max number to be bigger than that allowed by vcf number + max_number = schema.dimensions[dimensions].size + 1 + vcf_field.summary.max_number = max_number + + # creating an array spec should now fail + with pytest.raises( + ValueError, match=f"Max number of values {max_number} exceeds max" + ): + vcz.ZarrArraySpec.from_field(vcf_field, schema)