diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index deeebf5e..fcfea19e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -132,3 +132,25 @@ jobs: # We just run the CLI tests here because it doesn't require other upstream # packages like sgkit (which are tangled up with the numpy 2 dependency) python -m pytest tests/test_cli.py + + test-zarr-version: + name: Test Zarr versions + runs-on: ubuntu-latest + strategy: + matrix: + zarr: ["==2.18.3", ">=3"] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.11' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install '.[dev]' + - name: Install zarr${{ matrix.zarr }} + run: | + python -m pip install 'zarr${{ matrix.zarr }}' + - name: Run tests + run: | + python -m pytest -k "not test_double_encode_partition" diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py index 88843db1..11e45b2d 100644 --- a/bio2zarr/plink.py +++ b/bio2zarr/plink.py @@ -117,7 +117,7 @@ def convert( shape=alleles.shape, dtype="str", compressor=default_compressor, - chunks=(variants_chunk_size,), + chunks=(variants_chunk_size, alleles.shape[1]), ) a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"] logger.debug("encoded variant_allele") diff --git a/bio2zarr/vcf2zarr/vcz.py b/bio2zarr/vcf2zarr/vcz.py index 3094b233..53b25924 100644 --- a/bio2zarr/vcf2zarr/vcz.py +++ b/bio2zarr/vcf2zarr/vcz.py @@ -13,7 +13,7 @@ import numpy as np import zarr -from bio2zarr.zarr_utils import ZARR_FORMAT_KWARGS +from bio2zarr.zarr_utils import ZARR_FORMAT_KWARGS, zarr_v3 from .. import constants, core, provenance from . import icf @@ -615,9 +615,17 @@ def encode_filter_id(self, root): array.attrs["_ARRAY_DIMENSIONS"] = ["filters"] def init_array(self, root, array_spec, variants_dim_size): - object_codec = None + kwargs = dict(ZARR_FORMAT_KWARGS) + filters = [numcodecs.get_codec(filt) for filt in array_spec.filters] if array_spec.dtype == "O": - object_codec = numcodecs.VLenUTF8() + if zarr_v3(): + filters = [*list(filters), numcodecs.VLenUTF8()] + else: + kwargs["object_codec"] = numcodecs.VLenUTF8() + + if not zarr_v3(): + kwargs["dimension_separator"] = self.metadata.dimension_separator + shape = list(array_spec.shape) # Truncate the variants dimension is max_variant_chunks was specified shape[0] = variants_dim_size @@ -627,10 +635,8 @@ def init_array(self, root, array_spec, variants_dim_size): chunks=array_spec.chunks, dtype=array_spec.dtype, compressor=numcodecs.get_codec(array_spec.compressor), - filters=[numcodecs.get_codec(filt) for filt in array_spec.filters], - object_codec=object_codec, - dimension_separator=self.metadata.dimension_separator, - **ZARR_FORMAT_KWARGS, + filters=filters, + **kwargs, ) a.attrs.update( { @@ -946,13 +952,16 @@ def create_index(self): c_start_idx = c_end_idx + 1 index = np.array(index, dtype=np.int32) + kwargs = {} + if not zarr_v3(): + kwargs["dimension_separator"] = self.metadata.dimension_separator array = root.array( "region_index", data=index, shape=index.shape, dtype=index.dtype, compressor=numcodecs.Blosc("zstd", clevel=9, shuffle=0), - dimension_separator=self.metadata.dimension_separator, + **kwargs, ) array.attrs["_ARRAY_DIMENSIONS"] = [ "region_index_values", diff --git a/bio2zarr/vcf2zarr/verification.py b/bio2zarr/vcf2zarr/verification.py index 7d1d91c1..d7a5291e 100644 --- a/bio2zarr/vcf2zarr/verification.py +++ b/bio2zarr/vcf2zarr/verification.py @@ -4,6 +4,8 @@ import tqdm import zarr +from bio2zarr.zarr_utils import first_dim_iter + from .. import constants @@ -77,7 +79,7 @@ def assert_info_val_equal(vcf_val, zarr_val, vcf_type): if vcf_type in ("String", "Character"): split = list(vcf_val.split(",")) k = len(split) - if isinstance(zarr_val, str): + if isinstance(zarr_val, str) or zarr_val.ndim == 0: assert k == 1 # Scalar assert vcf_val == zarr_val @@ -152,7 +154,7 @@ def verify(vcf_path, zarr_path, show_progress=False): vid = root["variant_id"][:] call_genotype = None if "call_genotype" in root and root["call_genotype"].size > 0: - call_genotype = iter(root["call_genotype"]) + call_genotype = first_dim_iter(root["call_genotype"]) vcf = cyvcf2.VCF(vcf_path) format_headers = {} @@ -170,12 +172,16 @@ def verify(vcf_path, zarr_path, show_progress=False): vcf_name = colname.split("_", 1)[1] vcf_type = format_headers[vcf_name]["Type"] vcf_number = format_headers[vcf_name]["Number"] - format_fields[vcf_name] = vcf_type, vcf_number, iter(root[colname]) + format_fields[vcf_name] = ( + vcf_type, + vcf_number, + first_dim_iter(root[colname]), + ) if colname.startswith("variant"): name = colname.split("_", 1)[1] if name.isupper(): vcf_type = info_headers[name]["Type"] - info_fields[name] = vcf_type, iter(root[colname]) + info_fields[name] = vcf_type, first_dim_iter(root[colname]) first_pos = next(vcf).POS start_index = np.searchsorted(pos, first_pos) diff --git a/bio2zarr/zarr_utils.py b/bio2zarr/zarr_utils.py index 16d36e02..11c6b374 100644 --- a/bio2zarr/zarr_utils.py +++ b/bio2zarr/zarr_utils.py @@ -11,3 +11,9 @@ def zarr_v3() -> bool: ZARR_FORMAT_KWARGS = dict(zarr_format=2) else: ZARR_FORMAT_KWARGS = dict() + + +# See discussion in https://github.com/zarr-developers/zarr-python/issues/2529 +def first_dim_iter(z): + for chunk in range(z.cdata_shape[0]): + yield from z.blocks[chunk] diff --git a/tests/test_vcz.py b/tests/test_vcz.py index 6787461f..d7be275b 100644 --- a/tests/test_vcz.py +++ b/tests/test_vcz.py @@ -9,6 +9,7 @@ from bio2zarr import core, vcf2zarr from bio2zarr.vcf2zarr import icf as icf_mod from bio2zarr.vcf2zarr import vcz as vcz_mod +from bio2zarr.zarr_utils import zarr_v3 @pytest.fixture(scope="module") @@ -112,6 +113,9 @@ def test_encode_metadata_mismatch(self, tmpdir, icf_path, version): vcz_mod.VcfZarrWriterMetadata.fromdict(d) +@pytest.mark.skipif( + zarr_v3(), reason="Zarr-python v3 does not support dimension_separator" +) class TestEncodeDimensionSeparator: @pytest.mark.parametrize("dimension_separator", [None, "/"]) def test_directories(self, tmp_path, icf_path, dimension_separator):