Skip to content

Commit 8874dea

Browse files
Merge pull request #86 from jeromekelleher/improved-schema-defaults
Improved schema defaults
2 parents 1c7faff + f2796b1 commit 8874dea

File tree

5 files changed

+173
-39
lines changed

5 files changed

+173
-39
lines changed

bio2zarr/core.py

Lines changed: 0 additions & 6 deletions
Original file line number · Diff line number · Diff line change
@@ -16,12 +16,6 @@
1616

1717
numcodecs.blosc.use_threads = False
1818

19-
# TODO this should probably go in another module where we abstract
20-
# out the zarr defaults
21-
default_compressor = numcodecs.Blosc(
22-
cname="zstd", clevel=7, shuffle=numcodecs.Blosc.AUTOSHUFFLE
23-
)
24-
2519

2620
def chunk_aligned_slices(z, n, max_chunks=None):
2721
"""

bio2zarr/plink.py

Lines changed: 12 additions & 6 deletions
Original file line number · Diff line number · Diff line change
@@ -4,6 +4,7 @@
44
import numpy as np
55
import zarr
66
import bed_reader
7+
import numcodecs
78

89
from . import core
910

@@ -82,11 +83,16 @@ def convert(
8283
chunks = [variants_chunk_size, samples_chunk_size]
8384
dimensions = ["variants", "samples"]
8485

86+
# TODO we should be reusing some logic from vcfzarr here on laying
87+
# out the basic dataset, and using the schema generator. Currently
88+
# we're not using the best Blosc settings for genotypes here.
89+
default_compressor = numcodecs.Blosc(cname="zstd", clevel=7)
90+
8591
a = root.array(
8692
"sample_id",
8793
bed.iid,
8894
dtype="str",
89-
compressor=core.default_compressor,
95+
compressor=default_compressor,
9096
chunks=(samples_chunk_size,),
9197
)
9298
a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
@@ -98,7 +104,7 @@ def convert(
98104
"variant_position",
99105
bed.bp_position,
100106
dtype=np.int32,
101-
compressor=core.default_compressor,
107+
compressor=default_compressor,
102108
chunks=(variants_chunk_size,),
103109
)
104110
a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
@@ -109,7 +115,7 @@ def convert(
109115
"variant_allele",
110116
alleles,
111117
dtype="str",
112-
compressor=core.default_compressor,
118+
compressor=default_compressor,
113119
chunks=(variants_chunk_size,),
114120
)
115121
a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
@@ -121,7 +127,7 @@ def convert(
121127
dtype="bool",
122128
shape=list(shape),
123129
chunks=list(chunks),
124-
compressor=core.default_compressor,
130+
compressor=default_compressor,
125131
)
126132
a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
127133

@@ -132,7 +138,7 @@ def convert(
132138
dtype="i1",
133139
shape=list(shape),
134140
chunks=list(chunks),
135-
compressor=core.default_compressor,
141+
compressor=default_compressor,
136142
)
137143
a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
138144

@@ -141,7 +147,7 @@ def convert(
141147
dtype="bool",
142148
shape=list(shape),
143149
chunks=list(chunks),
144-
compressor=core.default_compressor,
150+
compressor=default_compressor,
145151
)
146152
a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
147153

bio2zarr/vcf.py

Lines changed: 41 additions & 17 deletions
Original file line number · Diff line number · Diff line change
@@ -1049,6 +1049,9 @@ def inspect(path):
10491049
return obj.summary_table()
10501050

10511051

1052+
DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)
1053+
1054+
10521055
@dataclasses.dataclass
10531056
class ZarrColumnSpec:
10541057
name: str
@@ -1058,20 +1061,46 @@ class ZarrColumnSpec:
10581061
dimensions: list
10591062
description: str
10601063
vcf_field: str
1061-
compressor: dict
1064+
compressor: dict = None
1065+
filters: list = None
10621066
# TODO add filters
10631067

10641068
def __post_init__(self):
10651069
self.shape = tuple(self.shape)
10661070
self.chunks = tuple(self.chunks)
10671071
self.dimensions = tuple(self.dimensions)
1072+
self.compressor = DEFAULT_ZARR_COMPRESSOR.get_config()
1073+
self.filters = []
1074+
self._choose_compressor_settings()
1075+
1076+
def _choose_compressor_settings(self):
1077+
"""
1078+
Choose compressor and filter settings based on the size and
1079+
type of the array, plus some hueristics from observed properties
1080+
of VCFs.
1081+
1082+
See https://github.com/pystatgen/bio2zarr/discussions/74
1083+
"""
1084+
dt = np.dtype(self.dtype)
1085+
# Default is to not shuffle, because autoshuffle isn't recognised
1086+
# by many Zarr implementations, and shuffling can lead to worse
1087+
# performance in some cases anyway. Turning on shuffle should be a
1088+
# deliberate choice.
1089+
shuffle = numcodecs.Blosc.NOSHUFFLE
1090+
if dt.itemsize == 1:
1091+
# Any 1 byte field gets BITSHUFFLE by default
1092+
shuffle = numcodecs.Blosc.BITSHUFFLE
1093+
self.compressor["shuffle"] = shuffle
1094+
1095+
if dt.name == "bool":
1096+
self.filters.append(numcodecs.PackBits().get_config())
10681097

10691098

10701099
ZARR_SCHEMA_FORMAT_VERSION = "0.2"
10711100

10721101

10731102
@dataclasses.dataclass
1074-
class ZarrConversionSpec:
1103+
class VcfZarrSchema:
10751104
format_version: str
10761105
samples_chunk_size: int
10771106
variants_chunk_size: int
@@ -1095,15 +1124,15 @@ def fromdict(d):
10951124
"Zarr schema format version mismatch: "
10961125
f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}"
10971126
)
1098-
ret = ZarrConversionSpec(**d)
1127+
ret = VcfZarrSchema(**d)
10991128
ret.columns = {
11001129
key: ZarrColumnSpec(**value) for key, value in d["columns"].items()
11011130
}
11021131
return ret
11031132

11041133
@staticmethod
11051134
def fromjson(s):
1106-
return ZarrConversionSpec.fromdict(json.loads(s))
1135+
return VcfZarrSchema.fromdict(json.loads(s))
11071136

11081137
@staticmethod
11091138
def generate(pcvcf, variants_chunk_size=None, samples_chunk_size=None):
@@ -1117,7 +1146,6 @@ def generate(pcvcf, variants_chunk_size=None, samples_chunk_size=None):
11171146
logger.info(
11181147
f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
11191148
)
1120-
compressor = core.default_compressor.get_config()
11211149

11221150
def fixed_field_spec(
11231151
name, dtype, vcf_field=None, shape=(m,), dimensions=("variants",)
@@ -1130,7 +1158,6 @@ def fixed_field_spec(
11301158
description="",
11311159
dimensions=dimensions,
11321160
chunks=[variants_chunk_size],
1133-
compressor=compressor,
11341161
)
11351162

11361163
alt_col = pcvcf.columns["ALT"]
@@ -1206,7 +1233,6 @@ def fixed_field_spec(
12061233
chunks=chunks,
12071234
dimensions=dimensions,
12081235
description=field.description,
1209-
compressor=compressor,
12101236
)
12111237
colspecs.append(colspec)
12121238

@@ -1225,7 +1251,6 @@ def fixed_field_spec(
12251251
chunks=list(chunks),
12261252
dimensions=list(dimensions),
12271253
description="",
1228-
compressor=compressor,
12291254
)
12301255
)
12311256
shape += [ploidy]
@@ -1239,7 +1264,6 @@ def fixed_field_spec(
12391264
chunks=list(chunks),
12401265
dimensions=list(dimensions),
12411266
description="",
1242-
compressor=compressor,
12431267
)
12441268
)
12451269
colspecs.append(
@@ -1251,11 +1275,10 @@ def fixed_field_spec(
12511275
chunks=list(chunks),
12521276
dimensions=list(dimensions),
12531277
description="",
1254-
compressor=compressor,
12551278
)
12561279
)
12571280

1258-
return ZarrConversionSpec(
1281+
return VcfZarrSchema(
12591282
format_version=ZARR_SCHEMA_FORMAT_VERSION,
12601283
samples_chunk_size=samples_chunk_size,
12611284
variants_chunk_size=variants_chunk_size,
@@ -1328,6 +1351,7 @@ def init_array(self, variable):
13281351
chunks=variable.chunks,
13291352
dtype=variable.dtype,
13301353
compressor=numcodecs.get_codec(variable.compressor),
1354+
filters=[numcodecs.get_codec(filt) for filt in variable.filters],
13311355
object_codec=object_codec,
13321356
)
13331357
a.attrs["_ARRAY_DIMENSIONS"] = variable.dimensions
@@ -1446,7 +1470,7 @@ def encode_samples(self):
14461470
"sample_id",
14471471
self.schema.sample_id,
14481472
dtype="str",
1449-
compressor=core.default_compressor,
1473+
compressor=DEFAULT_ZARR_COMPRESSOR,
14501474
chunks=(self.schema.samples_chunk_size,),
14511475
)
14521476
array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
@@ -1457,7 +1481,7 @@ def encode_contig_id(self):
14571481
"contig_id",
14581482
self.schema.contig_id,
14591483
dtype="str",
1460-
compressor=core.default_compressor,
1484+
compressor=DEFAULT_ZARR_COMPRESSOR,
14611485
)
14621486
array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
14631487
if self.schema.contig_length is not None:
@@ -1474,7 +1498,7 @@ def encode_filter_id(self):
14741498
"filter_id",
14751499
self.schema.filter_id,
14761500
dtype="str",
1477-
compressor=core.default_compressor,
1501+
compressor=DEFAULT_ZARR_COMPRESSOR,
14781502
)
14791503
array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
14801504
return {v: j for j, v in enumerate(self.schema.filter_id)}
@@ -1647,7 +1671,7 @@ def service_completed_futures():
16471671

16481672
def mkschema(if_path, out):
16491673
pcvcf = PickleChunkedVcf.load(if_path)
1650-
spec = ZarrConversionSpec.generate(pcvcf)
1674+
spec = VcfZarrSchema.generate(pcvcf)
16511675
out.write(spec.asjson())
16521676

16531677

@@ -1664,7 +1688,7 @@ def encode(
16641688
):
16651689
pcvcf = PickleChunkedVcf.load(if_path)
16661690
if schema_path is None:
1667-
schema = ZarrConversionSpec.generate(
1691+
schema = VcfZarrSchema.generate(
16681692
pcvcf,
16691693
variants_chunk_size=variants_chunk_size,
16701694
samples_chunk_size=samples_chunk_size,
@@ -1674,7 +1698,7 @@ def encode(
16741698
if variants_chunk_size is not None or samples_chunk_size is not None:
16751699
raise ValueError("Cannot specify schema along with chunk sizes")
16761700
with open(schema_path, "r") as f:
1677-
schema = ZarrConversionSpec.fromjson(f.read())
1701+
schema = VcfZarrSchema.fromjson(f.read())
16781702
zarr_path = pathlib.Path(zarr_path)
16791703
if zarr_path.exists():
16801704
logger.warning(f"Deleting existing {zarr_path}")

tests/test_pcvcf.py

Lines changed: 5 additions & 5 deletions
Original file line number · Diff line number · Diff line change
@@ -27,8 +27,8 @@ def test_mkschema(self, tmp_path, pcvcf):
2727
with open(schema_file, "w") as f:
2828
vcf.mkschema(pcvcf.path, f)
2929
with open(schema_file, "r") as f:
30-
schema1 = vcf.ZarrConversionSpec.fromjson(f.read())
31-
schema2 = vcf.ZarrConversionSpec.generate(pcvcf)
30+
schema1 = vcf.VcfZarrSchema.fromjson(f.read())
31+
schema2 = vcf.VcfZarrSchema.generate(pcvcf)
3232
assert schema1 == schema2
3333

3434
def test_summary_table(self, pcvcf):
@@ -95,7 +95,7 @@ def pcvcf(self, tmp_path_factory):
9595

9696
@pytest.fixture(scope="class")
9797
def schema(self, pcvcf):
98-
return vcf.ZarrConversionSpec.generate(pcvcf)
98+
return vcf.VcfZarrSchema.generate(pcvcf)
9999

100100
@pytest.mark.parametrize(
101101
("name", "dtype", "shape"),
@@ -165,8 +165,8 @@ def test_repr(self, pcvcf):
165165

166166
def test_pos_repr(self, pcvcf):
167167
assert repr(pcvcf["POS"]).startswith(
168-
"PickleChunkedVcfField(name=POS, partition_chunks=[8, 8, 8, 8, 8], path=")
169-
168+
"PickleChunkedVcfField(name=POS, partition_chunks=[8, 8, 8, 8, 8], path="
169+
)
170170

171171
def test_partition_record_index(self, pcvcf):
172172
nt.assert_array_equal(

0 commit comments

Comments (0)