From b86628570875cec3b1e4f5460fa35b4d1ad553c8 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Wed, 23 Oct 2024 13:48:48 +0100 Subject: [PATCH 1/5] Run on zarr-python v3 with zarr-format v2 --- .github/workflows/ci.yml | 22 +++++++++++++++++++ bio2zarr/plink.py | 8 +++---- bio2zarr/vcf2zarr/vcz.py | 35 +++++++++++++++++++------------ bio2zarr/vcf2zarr/verification.py | 2 +- tests/test_vcz.py | 4 ++++ 5 files changed, 53 insertions(+), 18 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index deeebf5e..4c405b9d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -132,3 +132,25 @@ jobs: # We just run the CLI tests here because it doesn't require other upstream # packages like sgkit (which are tangled up with the numpy 2 dependency) python -m pytest tests/test_cli.py + + test-zarr-version: + name: Test Zarr versions + runs-on: ubuntu-latest + strategy: + matrix: + zarr: ["zarr==2.18.3", "git+https://github.com/zarr-developers/zarr-python.git"] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.11' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install '.[dev]' + - name: Install ${{ matrix.zarr }} + run: | + python -m pip install --pre '${{ matrix.zarr }}' + - name: Run tests + run: | + python -m pytest diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py index 88843db1..3aa9bcf0 100644 --- a/bio2zarr/plink.py +++ b/bio2zarr/plink.py @@ -86,7 +86,7 @@ def convert( # we're not using the best Blosc settings for genotypes here. default_compressor = numcodecs.Blosc(cname="zstd", clevel=7) - a = root.array( + a = root.create_dataset( "sample_id", data=bed.iid, shape=bed.iid.shape, @@ -99,7 +99,7 @@ def convert( # TODO encode these in slices - but read them in one go to avoid # fetching repeatedly from bim file - a = root.array( + a = root.create_dataset( "variant_position", data=bed.bp_position, shape=bed.bp_position.shape, @@ -111,13 +111,13 @@ def convert( logger.debug("encoded variant_position") alleles = np.stack([bed.allele_1, bed.allele_2], axis=1) - a = root.array( + a = root.create_dataset( "variant_allele", data=alleles, shape=alleles.shape, dtype="str", compressor=default_compressor, - chunks=(variants_chunk_size,), + chunks=(variants_chunk_size, alleles.shape[1]), ) a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"] logger.debug("encoded variant_allele") diff --git a/bio2zarr/vcf2zarr/vcz.py b/bio2zarr/vcf2zarr/vcz.py index 3094b233..e9bdbd01 100644 --- a/bio2zarr/vcf2zarr/vcz.py +++ b/bio2zarr/vcf2zarr/vcz.py @@ -13,7 +13,7 @@ import numpy as np import zarr -from bio2zarr.zarr_utils import ZARR_FORMAT_KWARGS +from bio2zarr.zarr_utils import ZARR_FORMAT_KWARGS, zarr_v3 from .. import constants, core, provenance from . import icf @@ -572,7 +572,7 @@ def init( def encode_samples(self, root): if self.schema.samples != self.icf.metadata.samples: raise ValueError("Subsetting or reordering samples not supported currently") - array = root.array( + array = root.create_dataset( "sample_id", data=[sample.id for sample in self.schema.samples], shape=len(self.schema.samples), @@ -584,7 +584,7 @@ def encode_samples(self, root): logger.debug("Samples done") def encode_contig_id(self, root): - array = root.array( + array = root.create_dataset( "contig_id", data=[contig.id for contig in self.schema.contigs], shape=len(self.schema.contigs), @@ -593,7 +593,7 @@ def encode_contig_id(self, root): ) array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"] if all(contig.length is not None for contig in self.schema.contigs): - array = root.array( + array = root.create_dataset( "contig_length", data=[contig.length for contig in self.schema.contigs], shape=len(self.schema.contigs), @@ -605,7 +605,7 @@ def encode_contig_id(self, root): def encode_filter_id(self, root): # TODO need a way to store description also # https://github.com/sgkit-dev/vcf-zarr-spec/issues/19 - array = root.array( + array = root.create_dataset( "filter_id", data=[filt.id for filt in self.schema.filters], shape=len(self.schema.filters), @@ -615,9 +615,17 @@ def encode_filter_id(self, root): array.attrs["_ARRAY_DIMENSIONS"] = ["filters"] def init_array(self, root, array_spec, variants_dim_size): - object_codec = None + kwargs = dict(ZARR_FORMAT_KWARGS) + filters = [numcodecs.get_codec(filt) for filt in array_spec.filters] if array_spec.dtype == "O": - object_codec = numcodecs.VLenUTF8() + if zarr_v3(): + filters = [*list(filters), numcodecs.VLenUTF8()] + else: + kwargs["object_codec"] = numcodecs.VLenUTF8() + + if not zarr_v3(): + kwargs["dimension_separator"] = self.metadata.dimension_separator + shape = list(array_spec.shape) # Truncate the variants dimension is max_variant_chunks was specified shape[0] = variants_dim_size @@ -627,10 +635,8 @@ def init_array(self, root, array_spec, variants_dim_size): chunks=array_spec.chunks, dtype=array_spec.dtype, compressor=numcodecs.get_codec(array_spec.compressor), - filters=[numcodecs.get_codec(filt) for filt in array_spec.filters], - object_codec=object_codec, - dimension_separator=self.metadata.dimension_separator, - **ZARR_FORMAT_KWARGS, + filters=filters, + **kwargs, ) a.attrs.update( { @@ -946,13 +952,16 @@ def create_index(self): c_start_idx = c_end_idx + 1 index = np.array(index, dtype=np.int32) - array = root.array( + kwargs = {} + if not zarr_v3(): + kwargs["dimension_separator"] = self.metadata.dimension_separator + array = root.create_dataset( "region_index", data=index, shape=index.shape, dtype=index.dtype, compressor=numcodecs.Blosc("zstd", clevel=9, shuffle=0), - dimension_separator=self.metadata.dimension_separator, + **kwargs, ) array.attrs["_ARRAY_DIMENSIONS"] = [ "region_index_values", diff --git a/bio2zarr/vcf2zarr/verification.py b/bio2zarr/vcf2zarr/verification.py index 7d1d91c1..c62cf84e 100644 --- a/bio2zarr/vcf2zarr/verification.py +++ b/bio2zarr/vcf2zarr/verification.py @@ -77,7 +77,7 @@ def assert_info_val_equal(vcf_val, zarr_val, vcf_type): if vcf_type in ("String", "Character"): split = list(vcf_val.split(",")) k = len(split) - if isinstance(zarr_val, str): + if isinstance(zarr_val, str) or zarr_val.ndim == 0: assert k == 1 # Scalar assert vcf_val == zarr_val diff --git a/tests/test_vcz.py b/tests/test_vcz.py index 6787461f..d7be275b 100644 --- a/tests/test_vcz.py +++ b/tests/test_vcz.py @@ -9,6 +9,7 @@ from bio2zarr import core, vcf2zarr from bio2zarr.vcf2zarr import icf as icf_mod from bio2zarr.vcf2zarr import vcz as vcz_mod +from bio2zarr.zarr_utils import zarr_v3 @pytest.fixture(scope="module") @@ -112,6 +113,9 @@ def test_encode_metadata_mismatch(self, tmpdir, icf_path, version): vcz_mod.VcfZarrWriterMetadata.fromdict(d) +@pytest.mark.skipif( + zarr_v3(), reason="Zarr-python v3 does not support dimension_separator" +) class TestEncodeDimensionSeparator: @pytest.mark.parametrize("dimension_separator", [None, "/"]) def test_directories(self, tmp_path, icf_path, dimension_separator): From d8f72c95f2437825a69edc448615f6368f95c40c Mon Sep 17 00:00:00 2001 From: Tom White Date: Tue, 7 Jan 2025 12:18:41 +0000 Subject: [PATCH 2/5] Add `first_dim_iter` utility to workaround slow array iteration in zarr-python v3 --- bio2zarr/vcf2zarr/verification.py | 12 +++++++++--- bio2zarr/zarr_utils.py | 6 ++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/bio2zarr/vcf2zarr/verification.py b/bio2zarr/vcf2zarr/verification.py index c62cf84e..d7a5291e 100644 --- a/bio2zarr/vcf2zarr/verification.py +++ b/bio2zarr/vcf2zarr/verification.py @@ -4,6 +4,8 @@ import tqdm import zarr +from bio2zarr.zarr_utils import first_dim_iter + from .. import constants @@ -152,7 +154,7 @@ def verify(vcf_path, zarr_path, show_progress=False): vid = root["variant_id"][:] call_genotype = None if "call_genotype" in root and root["call_genotype"].size > 0: - call_genotype = iter(root["call_genotype"]) + call_genotype = first_dim_iter(root["call_genotype"]) vcf = cyvcf2.VCF(vcf_path) format_headers = {} @@ -170,12 +172,16 @@ def verify(vcf_path, zarr_path, show_progress=False): vcf_name = colname.split("_", 1)[1] vcf_type = format_headers[vcf_name]["Type"] vcf_number = format_headers[vcf_name]["Number"] - format_fields[vcf_name] = vcf_type, vcf_number, iter(root[colname]) + format_fields[vcf_name] = ( + vcf_type, + vcf_number, + first_dim_iter(root[colname]), + ) if colname.startswith("variant"): name = colname.split("_", 1)[1] if name.isupper(): vcf_type = info_headers[name]["Type"] - info_fields[name] = vcf_type, iter(root[colname]) + info_fields[name] = vcf_type, first_dim_iter(root[colname]) first_pos = next(vcf).POS start_index = np.searchsorted(pos, first_pos) diff --git a/bio2zarr/zarr_utils.py b/bio2zarr/zarr_utils.py index 16d36e02..11c6b374 100644 --- a/bio2zarr/zarr_utils.py +++ b/bio2zarr/zarr_utils.py @@ -11,3 +11,9 @@ def zarr_v3() -> bool: ZARR_FORMAT_KWARGS = dict(zarr_format=2) else: ZARR_FORMAT_KWARGS = dict() + + +# See discussion in https://github.com/zarr-developers/zarr-python/issues/2529 +def first_dim_iter(z): + for chunk in range(z.cdata_shape[0]): + yield from z.blocks[chunk] From 4a6ed6b4534cacc01dfe76c9509bb3f573d4bc83 Mon Sep 17 00:00:00 2001 From: Tom White Date: Wed, 8 Jan 2025 13:40:39 +0000 Subject: [PATCH 3/5] Go back to `Group.array()` following https://github.com/zarr-developers/zarr-python/issues/2667 --- bio2zarr/plink.py | 6 +++--- bio2zarr/vcf2zarr/vcz.py | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py index 3aa9bcf0..11e45b2d 100644 --- a/bio2zarr/plink.py +++ b/bio2zarr/plink.py @@ -86,7 +86,7 @@ def convert( # we're not using the best Blosc settings for genotypes here. default_compressor = numcodecs.Blosc(cname="zstd", clevel=7) - a = root.create_dataset( + a = root.array( "sample_id", data=bed.iid, shape=bed.iid.shape, @@ -99,7 +99,7 @@ def convert( # TODO encode these in slices - but read them in one go to avoid # fetching repeatedly from bim file - a = root.create_dataset( + a = root.array( "variant_position", data=bed.bp_position, shape=bed.bp_position.shape, @@ -111,7 +111,7 @@ def convert( logger.debug("encoded variant_position") alleles = np.stack([bed.allele_1, bed.allele_2], axis=1) - a = root.create_dataset( + a = root.array( "variant_allele", data=alleles, shape=alleles.shape, diff --git a/bio2zarr/vcf2zarr/vcz.py b/bio2zarr/vcf2zarr/vcz.py index e9bdbd01..53b25924 100644 --- a/bio2zarr/vcf2zarr/vcz.py +++ b/bio2zarr/vcf2zarr/vcz.py @@ -572,7 +572,7 @@ def init( def encode_samples(self, root): if self.schema.samples != self.icf.metadata.samples: raise ValueError("Subsetting or reordering samples not supported currently") - array = root.create_dataset( + array = root.array( "sample_id", data=[sample.id for sample in self.schema.samples], shape=len(self.schema.samples), @@ -584,7 +584,7 @@ def encode_samples(self, root): logger.debug("Samples done") def encode_contig_id(self, root): - array = root.create_dataset( + array = root.array( "contig_id", data=[contig.id for contig in self.schema.contigs], shape=len(self.schema.contigs), @@ -593,7 +593,7 @@ def encode_contig_id(self, root): ) array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"] if all(contig.length is not None for contig in self.schema.contigs): - array = root.create_dataset( + array = root.array( "contig_length", data=[contig.length for contig in self.schema.contigs], shape=len(self.schema.contigs), @@ -605,7 +605,7 @@ def encode_contig_id(self, root): def encode_filter_id(self, root): # TODO need a way to store description also # https://github.com/sgkit-dev/vcf-zarr-spec/issues/19 - array = root.create_dataset( + array = root.array( "filter_id", data=[filt.id for filt in self.schema.filters], shape=len(self.schema.filters), @@ -955,7 +955,7 @@ def create_index(self): kwargs = {} if not zarr_v3(): kwargs["dimension_separator"] = self.metadata.dimension_separator - array = root.create_dataset( + array = root.array( "region_index", data=index, shape=index.shape, From fa4640f804863cf29d141389f0ab7898fe9673cb Mon Sep 17 00:00:00 2001 From: Tom White Date: Mon, 13 Jan 2025 11:39:05 +0000 Subject: [PATCH 4/5] Use released version of Zarr Python 3 --- .github/workflows/ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4c405b9d..4b581f3b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -138,7 +138,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - zarr: ["zarr==2.18.3", "git+https://github.com/zarr-developers/zarr-python.git"] + zarr: ["==2.18.3", ">=3"] steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 @@ -148,9 +148,9 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install '.[dev]' - - name: Install ${{ matrix.zarr }} + - name: Install zarr${{ matrix.zarr }} run: | - python -m pip install --pre '${{ matrix.zarr }}' + python -m pip install 'zarr${{ matrix.zarr }}' - name: Run tests run: | python -m pytest From b82fdc4d64b50401cae5995ea10c2bb2d84db3b0 Mon Sep 17 00:00:00 2001 From: Tom White Date: Mon, 13 Jan 2025 16:46:33 +0000 Subject: [PATCH 5/5] Temporarily exclude test_double_encode_partition for Zarr v3 testing --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4b581f3b..fcfea19e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -153,4 +153,4 @@ jobs: python -m pip install 'zarr${{ matrix.zarr }}' - name: Run tests run: | - python -m pytest + python -m pytest -k "not test_double_encode_partition"