
Commit ac4aaad

Implement basic defaults for schema
Closes #6
1 parent 1c7faff commit ac4aaad

File tree

3 files changed (+150, -20 lines)


bio2zarr/core.py

Lines changed: 0 additions & 6 deletions
@@ -16,12 +16,6 @@
 
 numcodecs.blosc.use_threads = False
 
-# TODO this should probably go in another module where we abstract
-# out the zarr defaults
-default_compressor = numcodecs.Blosc(
-    cname="zstd", clevel=7, shuffle=numcodecs.Blosc.AUTOSHUFFLE
-)
-
 
 def chunk_aligned_slices(z, n, max_chunks=None):
     """

bio2zarr/vcf.py

Lines changed: 36 additions & 10 deletions
@@ -1049,6 +1049,9 @@ def inspect(path):
     return obj.summary_table()
 
 
+DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)
+
+
 @dataclasses.dataclass
 class ZarrColumnSpec:
     name: str
@@ -1058,17 +1061,45 @@ class ZarrColumnSpec:
     dimensions: list
     description: str
     vcf_field: str
-    compressor: dict
+    compressor: dict = None
+    filters: list = None
     # TODO add filters
 
     def __post_init__(self):
         self.shape = tuple(self.shape)
         self.chunks = tuple(self.chunks)
         self.dimensions = tuple(self.dimensions)
+        self.compressor = DEFAULT_ZARR_COMPRESSOR.get_config()
+        self.filters = []
+        self._choose_compressor_settings()
+
+    def _choose_compressor_settings(self):
+        """
+        Choose compressor and filter settings based on the size and
+        type of the array, plus some heuristics from observed properties
+        of VCFs.
+
+        See https://github.com/pystatgen/bio2zarr/discussions/74
+        """
+        dt = np.dtype(self.dtype)
+        # Default is to not shuffle, because autoshuffle isn't recognised
+        # by many Zarr implementations, and shuffling can lead to worse
+        # performance in some cases anyway. Turning on shuffle should be a
+        # deliberate choice.
+        shuffle = numcodecs.Blosc.NOSHUFFLE
+        if dt.itemsize == 1:
+            # Any 1 byte field gets BITSHUFFLE by default
+            shuffle = numcodecs.Blosc.BITSHUFFLE
+        self.compressor["shuffle"] = shuffle
+
+        if dt.name == "bool":
+            self.filters.append(numcodecs.PackBits().get_config())
 
 
 ZARR_SCHEMA_FORMAT_VERSION = "0.2"
 
+# RENAME to ZarrSchema
+
 
 @dataclasses.dataclass
 class ZarrConversionSpec:
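The new per-column defaults can be exercised standalone. The sketch below uses a hypothetical `choose_settings` helper (not part of the commit) to mirror the `_choose_compressor_settings` heuristic above for a few dtypes, using only numcodecs and numpy calls that already appear in the diff:

```python
import numcodecs
import numpy as np

# Assumed constant mirroring the one added in this commit.
DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)


def choose_settings(dtype):
    # Start from the shared Blosc default, then adjust shuffle per dtype.
    compressor = DEFAULT_ZARR_COMPRESSOR.get_config()
    filters = []
    dt = np.dtype(dtype)
    # 1-byte values (int8, bool, ...) get bit-shuffling; wider types are
    # left unshuffled, matching the comment in the diff above.
    compressor["shuffle"] = (
        numcodecs.Blosc.BITSHUFFLE if dt.itemsize == 1 else numcodecs.Blosc.NOSHUFFLE
    )
    if dt.name == "bool":
        # Booleans additionally get a packbits filter (1 bit per value).
        filters.append(numcodecs.PackBits().get_config())
    return compressor, filters


print(choose_settings("i2"))    # shuffle=0 (NOSHUFFLE), no filters
print(choose_settings("i1"))    # shuffle=2 (BITSHUFFLE), no filters
print(choose_settings("bool"))  # shuffle=2 plus [{'id': 'packbits'}]
```

Any 1-byte dtype (including bool) ends up with shuffle=2 (BITSHUFFLE) in the stored config, and bool columns additionally carry a packbits filter, which is what the new tests below assert.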
@@ -1117,7 +1148,6 @@ def generate(pcvcf, variants_chunk_size=None, samples_chunk_size=None):
         logger.info(
             f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
         )
-        compressor = core.default_compressor.get_config()
 
         def fixed_field_spec(
             name, dtype, vcf_field=None, shape=(m,), dimensions=("variants",)
@@ -1130,7 +1160,6 @@ def fixed_field_spec(
                 description="",
                 dimensions=dimensions,
                 chunks=[variants_chunk_size],
-                compressor=compressor,
             )
 
         alt_col = pcvcf.columns["ALT"]
@@ -1206,7 +1235,6 @@ def fixed_field_spec(
                 chunks=chunks,
                 dimensions=dimensions,
                 description=field.description,
-                compressor=compressor,
             )
             colspecs.append(colspec)
 
@@ -1225,7 +1253,6 @@ def fixed_field_spec(
                     chunks=list(chunks),
                     dimensions=list(dimensions),
                     description="",
-                    compressor=compressor,
                 )
             )
             shape += [ploidy]
@@ -1239,7 +1266,6 @@ def fixed_field_spec(
                     chunks=list(chunks),
                     dimensions=list(dimensions),
                     description="",
-                    compressor=compressor,
                 )
             )
             colspecs.append(
@@ -1251,7 +1277,6 @@ def fixed_field_spec(
                     chunks=list(chunks),
                     dimensions=list(dimensions),
                     description="",
-                    compressor=compressor,
                 )
             )
 
@@ -1328,6 +1353,7 @@ def init_array(self, variable):
             chunks=variable.chunks,
             dtype=variable.dtype,
             compressor=numcodecs.get_codec(variable.compressor),
+            filters=[numcodecs.get_codec(filt) for filt in variable.filters],
             object_codec=object_codec,
         )
         a.attrs["_ARRAY_DIMENSIONS"] = variable.dimensions
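Because the schema stores codecs as plain config dicts, `init_array` rebuilds them with `numcodecs.get_codec` before creating each array. A self-contained sketch of that round trip, assuming zarr-python 2.x and an in-memory group (the array name, shape, and chunks here are illustrative only):

```python
# Sketch: turn stored codec configs back into codec objects for Zarr.
import numcodecs
import numpy as np
import zarr

compressor_config = {
    "id": "blosc", "cname": "zstd", "clevel": 7, "shuffle": 2, "blocksize": 0
}
filter_configs = [{"id": "packbits"}]  # as chosen for bool columns above

root = zarr.group()  # in-memory store, for illustration only
a = root.empty(
    "call_genotype_mask",
    shape=(9, 3, 2),
    chunks=(9, 3, 2),
    dtype="bool",
    compressor=numcodecs.get_codec(compressor_config),
    filters=[numcodecs.get_codec(config) for config in filter_configs],
)
a[...] = np.zeros(a.shape, dtype=bool)
print(a.compressor, a.filters)
```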
@@ -1446,7 +1472,7 @@ def encode_samples(self):
             "sample_id",
             self.schema.sample_id,
             dtype="str",
-            compressor=core.default_compressor,
+            compressor=DEFAULT_ZARR_COMPRESSOR,
             chunks=(self.schema.samples_chunk_size,),
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
@@ -1457,7 +1483,7 @@ def encode_contig_id(self):
             "contig_id",
             self.schema.contig_id,
             dtype="str",
-            compressor=core.default_compressor,
+            compressor=DEFAULT_ZARR_COMPRESSOR,
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
         if self.schema.contig_length is not None:
@@ -1474,7 +1500,7 @@ def encode_filter_id(self):
             "filter_id",
             self.schema.filter_id,
             dtype="str",
-            compressor=core.default_compressor,
+            compressor=DEFAULT_ZARR_COMPRESSOR,
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
         return {v: j for j, v in enumerate(self.schema.filter_id)}

tests/test_vcf.py

Lines changed: 114 additions & 4 deletions
@@ -26,6 +26,12 @@ def schema_path(exploded_path, tmp_path_factory):
     return out
 
 
+@pytest.fixture(scope="module")
+def schema(schema_path):
+    with open(schema_path) as f:
+        return json.load(f)
+
+
 @pytest.fixture(scope="module")
 def zarr_path(exploded_path, tmp_path_factory):
     out = tmp_path_factory.mktemp("data") / "example.zarr"
@@ -35,10 +41,8 @@ def zarr_path(exploded_path, tmp_path_factory):
 
 class TestJsonVersions:
     @pytest.mark.parametrize("version", ["0.1", "1.0", "xxxxx", 0.2])
-    def test_zarr_schema_mismatch(self, schema_path, version):
-        with open(schema_path) as f:
-            d = json.load(f)
-
+    def test_zarr_schema_mismatch(self, schema, version):
+        d = dict(schema)
         d["format_version"] = version
         with pytest.raises(ValueError, match="Zarr schema format version mismatch"):
             vcf.ZarrConversionSpec.fromdict(d)
@@ -53,3 +57,109 @@ def test_exploded_metadata_mismatch(self, tmpdir, exploded_path, version):
             ValueError, match="Exploded metadata format version mismatch"
         ):
             vcf.VcfMetadata.fromdict(d)
+
+
+class TestDefaultSchema:
+    def test_format_version(self, schema):
+        assert schema["format_version"] == vcf.ZARR_SCHEMA_FORMAT_VERSION
+
+    def test_chunk_size(self, schema):
+        assert schema["samples_chunk_size"] == 1000
+        assert schema["variants_chunk_size"] == 10000
+
+    def test_dimensions(self, schema):
+        assert schema["dimensions"] == [
+            "variants",
+            "samples",
+            "ploidy",
+            "alleles",
+            "filters",
+        ]
+
+    def test_sample_id(self, schema):
+        assert schema["sample_id"] == ["NA00001", "NA00002", "NA00003"]
+
+    def test_contig_id(self, schema):
+        assert schema["contig_id"] == ["19", "20", "X"]
+
+    def test_contig_length(self, schema):
+        assert schema["contig_length"] is None
+
+    def test_filter_id(self, schema):
+        assert schema["filter_id"] == ["PASS", "s50", "q10"]
+
+    def test_variant_contig(self, schema):
+        assert schema["columns"]["variant_contig"] == {
+            "name": "variant_contig",
+            "dtype": "i2",
+            "shape": [9],
+            "chunks": [10000],
+            "dimensions": ["variants"],
+            "description": "",
+            "vcf_field": None,
+            "compressor": {
+                "id": "blosc",
+                "cname": "zstd",
+                "clevel": 7,
+                "shuffle": 0,
+                "blocksize": 0,
+            },
+            "filters": [],
+        }
+
+    def test_call_genotype(self, schema):
+        assert schema["columns"]["call_genotype"] == {
+            "name": "call_genotype",
+            "dtype": "i1",
+            "shape": [9, 3, 2],
+            "chunks": [10000, 1000],
+            "dimensions": ["variants", "samples", "ploidy"],
+            "description": "",
+            "vcf_field": None,
+            "compressor": {
+                "id": "blosc",
+                "cname": "zstd",
+                "clevel": 7,
+                "shuffle": 2,
+                "blocksize": 0,
+            },
+            "filters": [],
+        }
+
+    def test_call_genotype_mask(self, schema):
+        assert schema["columns"]["call_genotype_mask"] == {
+            "name": "call_genotype_mask",
+            "dtype": "bool",
+            "shape": [9, 3, 2],
+            "chunks": [10000, 1000],
+            "dimensions": ["variants", "samples", "ploidy"],
+            "description": "",
+            "vcf_field": None,
+            "compressor": {
+                "id": "blosc",
+                "cname": "zstd",
+                "clevel": 7,
+                "shuffle": 2,
+                "blocksize": 0,
+            },
+            "filters": [{"id": "packbits"}],
+        }
+
+    def test_call_genotype_phased(self, schema):
+        assert schema["columns"]["call_genotype_mask"] == {
+            "name": "call_genotype_mask",
+            "dtype": "bool",
+            "shape": [9, 3, 2],
+            "chunks": [10000, 1000],
+            "dimensions": ["variants", "samples", "ploidy"],
+            "description": "",
+            "vcf_field": None,
+            "compressor": {
+                "id": "blosc",
+                "cname": "zstd",
+                "clevel": 7,
+                "shuffle": 2,
+                "blocksize": 0,
+            },
+            "filters": [{"id": "packbits"}],
+        }
