Add defaults to schema

benjeffery · jeromekelleher · commit 17db733bef54 · 2025-04-22T16:01:25.000+01:00
diff --git a/bio2zarr/icf.py b/bio2zarr/icf.py
@@ -843,7 +843,7 @@ def convert_local_allele_field_types(fields):
     chunks = gt.chunks[:-1]
     dimensions = gt.dimensions[:-1]
 
-    la = vcz.ZarrArraySpec.new(
+    la = vcz.ZarrArraySpec(
         name="call_LA",
         dtype="i1",
         shape=gt.shape,
@@ -1064,14 +1064,20 @@ def fixed_field_spec(
             dimensions=("variants",),
             chunks=None,
         ):
-            return vcz.ZarrArraySpec.new(
+            compressor = (
+                vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config()
+                if dtype == "bool"
+                else None
+            )
+            return vcz.ZarrArraySpec(
                 source=source,
                 name=name,
                 dtype=dtype,
                 shape=shape,
                 description="",
                 dimensions=dimensions,
                 chunks=chunks or [schema_instance.variants_chunk_size],
+                compressor=compressor,
             )
 
         alt_field = self.fields["ALT"]
@@ -1135,7 +1141,7 @@ def fixed_field_spec(
             ]
             dimensions = ["variants", "samples"]
             array_specs.append(
-                vcz.ZarrArraySpec.new(
+                vcz.ZarrArraySpec(
                     name="call_genotype_phased",
                     dtype="bool",
                     shape=list(shape),
@@ -1148,23 +1154,25 @@ def fixed_field_spec(
             chunks += [ploidy]
             dimensions += ["ploidy"]
             array_specs.append(
-                vcz.ZarrArraySpec.new(
+                vcz.ZarrArraySpec(
                     name="call_genotype",
                     dtype=gt_field.smallest_dtype(),
                     shape=list(shape),
                     chunks=list(chunks),
                     dimensions=list(dimensions),
                     description="",
+                    compressor=vcz.DEFAULT_ZARR_COMPRESSOR_GENOTYPES.get_config(),
                 )
             )
             array_specs.append(
-                vcz.ZarrArraySpec.new(
+                vcz.ZarrArraySpec(
                     name="call_genotype_mask",
                     dtype="bool",
                     shape=list(shape),
                     chunks=list(chunks),
                     dimensions=list(dimensions),
                     description="",
+                    compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
                 )
             )
 
diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py
@@ -82,7 +82,7 @@ def generate_schema(
         )
 
         array_specs = [
-            vcz.ZarrArraySpec.new(
+            vcz.ZarrArraySpec(
                 source="position",
                 name="variant_position",
                 dtype="i4",
@@ -91,15 +91,15 @@ def generate_schema(
                 chunks=[schema_instance.variants_chunk_size],
                 description=None,
             ),
-            vcz.ZarrArraySpec.new(
+            vcz.ZarrArraySpec(
                 name="variant_allele",
                 dtype="O",
                 shape=[m, 2],
                 dimensions=["variants", "alleles"],
                 chunks=[schema_instance.variants_chunk_size, 2],
                 description=None,
             ),
-            vcz.ZarrArraySpec.new(
+            vcz.ZarrArraySpec(
                 name="call_genotype_phased",
                 dtype="bool",
                 shape=[m, n],
@@ -109,8 +109,9 @@ def generate_schema(
                     schema_instance.samples_chunk_size,
                 ],
                 description=None,
+                compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
             ),
-            vcz.ZarrArraySpec.new(
+            vcz.ZarrArraySpec(
                 name="call_genotype",
                 dtype="i1",
                 shape=[m, n, 2],
@@ -121,8 +122,9 @@ def generate_schema(
                     2,
                 ],
                 description=None,
+                compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
             ),
-            vcz.ZarrArraySpec.new(
+            vcz.ZarrArraySpec(
                 name="call_genotype_mask",
                 dtype="bool",
                 shape=[m, n, 2],
@@ -133,6 +135,7 @@ def generate_schema(
                     2,
                 ],
                 description=None,
+                compressor=vcz.DEFAULT_ZARR_COMPRESSOR_BOOL.get_config(),
             ),
         ]
         schema_instance.fields = array_specs
diff --git a/bio2zarr/vcz.py b/bio2zarr/vcz.py
@@ -16,6 +16,12 @@
 
 ZARR_SCHEMA_FORMAT_VERSION = "0.5"
 DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)
+DEFAULT_ZARR_COMPRESSOR_GENOTYPES = numcodecs.Blosc(
+    cname="zstd", clevel=7, shuffle=numcodecs.Blosc.BITSHUFFLE
+)
+DEFAULT_ZARR_COMPRESSOR_BOOL = numcodecs.Blosc(
+    cname="zstd", clevel=7, shuffle=numcodecs.Blosc.BITSHUFFLE
+)
 
 _fixed_field_descriptions = {
     "variant_contig": "An identifier from the reference genome or an angle-bracketed ID"
@@ -93,8 +99,8 @@ class ZarrArraySpec:
     chunks: tuple
     dimensions: tuple
     description: str
-    compressor: dict
-    filters: list
+    compressor: dict = None
+    filters: list = None
     source: str = None
 
     def __post_init__(self):
@@ -105,15 +111,7 @@ def __post_init__(self):
         self.shape = tuple(self.shape)
         self.chunks = tuple(self.chunks)
         self.dimensions = tuple(self.dimensions)
-        self.filters = tuple(self.filters)
-
-    @staticmethod
-    def new(**kwargs):
-        spec = ZarrArraySpec(
-            **kwargs, compressor=DEFAULT_ZARR_COMPRESSOR.get_config(), filters=[]
-        )
-        spec._choose_compressor_settings()
-        return spec
+        self.filters = tuple(self.filters) if self.filters is not None else None
 
     @staticmethod
     def from_field(
@@ -124,6 +122,8 @@ def from_field(
         variants_chunk_size,
         samples_chunk_size,
         array_name=None,
+        compressor=None,
+        filters=None,
     ):
         shape = [num_variants]
         prefix = "variant_"
@@ -150,39 +150,18 @@ def from_field(
                 dimensions.append("genotypes")
             else:
                 dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim")
-        return ZarrArraySpec.new(
+        return ZarrArraySpec(
             source=vcf_field.full_name,
             name=array_name,
             dtype=vcf_field.smallest_dtype(),
             shape=shape,
             chunks=chunks,
             dimensions=dimensions,
             description=vcf_field.description,
+            compressor=compressor,
+            filters=filters,
         )
 
-    def _choose_compressor_settings(self):
-        """
-        Choose compressor and filter settings based on the size and
-        type of the array, plus some hueristics from observed properties
-        of VCFs.
-
-        See https://github.com/pystatgen/bio2zarr/discussions/74
-        """
-        # Default is to not shuffle, because autoshuffle isn't recognised
-        # by many Zarr implementations, and shuffling can lead to worse
-        # performance in some cases anyway. Turning on shuffle should be a
-        # deliberate choice.
-        shuffle = numcodecs.Blosc.NOSHUFFLE
-        if self.name == "call_genotype" and self.dtype == "i1":
-            # call_genotype gets BITSHUFFLE by default as it gets
-            # significantly better compression (at a cost of slower
-            # decoding)
-            shuffle = numcodecs.Blosc.BITSHUFFLE
-        elif self.dtype == "bool":
-            shuffle = numcodecs.Blosc.BITSHUFFLE
-
-        self.compressor["shuffle"] = shuffle
-
     @property
     def chunk_nbytes(self):
         """
@@ -240,16 +219,24 @@ class VcfZarrSchema(core.JsonDataclass):
     samples_chunk_size: int
     variants_chunk_size: int
     fields: list
+    defaults: dict
 
     def __init__(
         self,
         format_version: str,
         fields: list,
         variants_chunk_size: int = None,
         samples_chunk_size: int = None,
+        defaults: dict = None,
     ):
         self.format_version = format_version
         self.fields = fields
+        defaults = defaults.copy() if defaults is not None else {}
+        if defaults.get("compressor", None) is None:
+            defaults["compressor"] = DEFAULT_ZARR_COMPRESSOR.get_config()
+        if defaults.get("filters", None) is None:
+            defaults["filters"] = []
+        self.defaults = defaults
         if variants_chunk_size is None:
             variants_chunk_size = 1000
         self.variants_chunk_size = variants_chunk_size
@@ -533,7 +520,7 @@ def init(
 
         total_chunks = 0
         for field in self.schema.fields:
-            a = self.init_array(root, field, partitions[-1].stop)
+            a = self.init_array(root, self.metadata.schema, field, partitions[-1].stop)
             total_chunks += a.nchunks
 
         logger.info("Writing WIP metadata")
@@ -600,9 +587,20 @@ def encode_filters(self, root):
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
 
-    def init_array(self, root, array_spec, variants_dim_size):
+    def init_array(self, root, schema, array_spec, variants_dim_size):
         kwargs = dict(zarr_utils.ZARR_FORMAT_KWARGS)
-        filters = [numcodecs.get_codec(filt) for filt in array_spec.filters]
+        filters = (
+            array_spec.filters
+            if array_spec.filters is not None
+            else schema.defaults["filters"]
+        )
+        filters = [numcodecs.get_codec(filt) for filt in filters]
+        compressor = (
+            array_spec.compressor
+            if array_spec.compressor is not None
+            else schema.defaults["compressor"]
+        )
+        compressor = numcodecs.get_codec(compressor)
         if array_spec.dtype == "O":
             if zarr_utils.zarr_v3():
                 filters = [*list(filters), numcodecs.VLenUTF8()]
@@ -620,7 +618,7 @@ def init_array(self, root, array_spec, variants_dim_size):
             shape=shape,
             chunks=array_spec.chunks,
             dtype=array_spec.dtype,
-            compressor=numcodecs.get_codec(array_spec.compressor),
+            compressor=compressor,
             filters=filters,
             **kwargs,
         )
diff --git a/tests/test_vcz.py b/tests/test_vcz.py