Merge pull request #110 from jeromekelleher/update-defaults

jeromekelleher · web-flow · commit 9840d49ff217 · 2024-04-08T13:35:04.000+01:00
Update defaults
diff --git a/bio2zarr/cli.py b/bio2zarr/cli.py
@@ -75,7 +75,7 @@ def list_commands(self, ctx):
     "--compressor",
     type=click.Choice(["lz4", "zstd"]),
     default=None,
-    help="Codec to use for compressing column chunks",
+    help="Codec to use for compressing column chunks (Default=zstd)."
 )
 
 # Note: -l and -w were chosen when these were called "width" and "length".
@@ -282,7 +282,7 @@ def encode(
     worker_processes,
 ):
     """
-    Encode intermediate columnar format (see explode) to vcfzarr.
+    Convert intermediate columnar format to vcfzarr.
     """
     setup_logging(verbose)
     check_overwrite_dir(zarr_path, force)
diff --git a/bio2zarr/core.py b/bio2zarr/core.py
@@ -50,7 +50,8 @@ def wait_on_futures(futures):
             cancel_futures(futures)
             if isinstance(exception, cf.process.BrokenProcessPool):
                 raise RuntimeError(
-                    "Worker process died: you may have run out of memory") from exception
+                    "Worker process died: you may have run out of memory"
+                ) from exception
             else:
                 raise exception
 
diff --git a/bio2zarr/vcf.py b/bio2zarr/vcf.py
@@ -151,7 +151,7 @@ class VcfPartition:
 
 ICF_METADATA_FORMAT_VERSION = "0.2"
 ICF_DEFAULT_COMPRESSOR = numcodecs.Blosc(
-    cname="lz4", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
+    cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
 )
 
 
@@ -890,6 +890,15 @@ def num_columns(self):
         return len(self.columns)
 
 
+
+def mkdir_with_progress(path):
+    """Create ``path`` (with any missing parents), then call update_progress(1)."""
+    logger.debug(f"mkdir {path}")
+    # NOTE sibling tasks may race on shared parents; parents=True should cope.
+    path.mkdir(parents=True)
+    core.update_progress(1)
+
+
 class IntermediateColumnarFormatWriter:
     def __init__(self, path):
         self.path = pathlib.Path(path)
@@ -932,7 +941,7 @@ def init(
         # dependencies as well.
         self.metadata.provenance = {"source": f"bio2zarr-{provenance.__version__}"}
 
-        self.mkdirs(worker_processes)
+        self.mkdirs(worker_processes, show_progress=show_progress)
 
         # Note: this is needed for the current version of the vcfzarr spec, but it's
         # probably going to be dropped.
@@ -947,30 +956,30 @@ def init(
             json.dump(self.metadata.asdict(), f, indent=4)
         return self.num_partitions
 
-    def mkdirs(self, worker_processes=1):
-        logger.info(
-            f"Creating {len(self.metadata.fields) * self.num_partitions} directories"
-        )
+    def mkdirs(self, worker_processes=1, show_progress=False):
+        num_dirs = len(self.metadata.fields) * self.num_partitions
+        logger.info(f"Creating {num_dirs} directories")
         self.path.mkdir()
         self.wip_path.mkdir()
         # Due to high latency batch system filesystems, we create all the directories in
         # parallel
         progress_config = core.ProgressConfig(
-            total=len(self.metadata.fields) * self.num_partitions,
-            units="dir",
-            title="Creating directories",
-            show=True
+            total=num_dirs,
+            units="dirs",
+            title="Mkdirs",
+            show=show_progress,
         )
         with core.ParallelWorkManager(
-                worker_processes=worker_processes,
-                progress_config=progress_config
+            worker_processes=worker_processes, progress_config=progress_config
         ) as manager:
             for field in self.metadata.fields:
                 col_path = get_vcf_field_path(self.path, field)
+                # Intermediate directories are not counted towards progress.
+                # exist_ok: a partition task may already have created col_path.
-                manager.submit(col_path.mkdir, parents=True)
+                manager.submit(col_path.mkdir, parents=True, exist_ok=True)
                 for j in range(self.num_partitions):
                     part_path = col_path / f"p{j}"
-                    manager.submit(part_path.mkdir, parents=True)
+                    manager.submit(mkdir_with_progress, part_path)
 
     def load_partition_summaries(self):
         summaries = []
@@ -1499,15 +1508,17 @@ def parse_max_memory(max_memory):
 
 
 class VcfZarrWriter:
-    def __init__(self, path, icf, schema):
+    def __init__(self, path, icf, schema, dimension_separator=None):
         self.path = pathlib.Path(path)
         self.icf = icf
         self.schema = schema
+        # Default to nested chunk directories ("/"), following the Zarr v3 default.
+        # This seems to require zarr version 2.17+ to work properly
+        self.dimension_separator = "/" if dimension_separator is None else dimension_separator
         store = zarr.DirectoryStore(self.path)
         self.root = zarr.group(store=store)
 
     def init_array(self, variable):
-        # print("CREATE", variable)
         object_codec = None
         if variable.dtype == "O":
             object_codec = numcodecs.VLenUTF8()
@@ -1519,7 +1530,9 @@ def init_array(self, variable):
             compressor=numcodecs.get_codec(variable.compressor),
             filters=[numcodecs.get_codec(filt) for filt in variable.filters],
             object_codec=object_codec,
+            dimension_separator=self.dimension_separator,
         )
+        # Dimension names are part of the spec in Zarr v3
         a.attrs["_ARRAY_DIMENSIONS"] = variable.dimensions
 
     def get_array(self, name):
@@ -1657,6 +1670,7 @@ def encode_contig_id(self):
                 "contig_length",
                 self.schema.contig_length,
                 dtype=np.int64,
+                compressor=DEFAULT_ZARR_COMPRESSOR,
             )
             array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
         return {v: j for j, v in enumerate(self.schema.contig_id)}
@@ -1849,6 +1863,7 @@ def encode(
     variants_chunk_size=None,
     samples_chunk_size=None,
     max_v_chunks=None,
+    dimension_separator=None,
     max_memory=None,
     worker_processes=1,
     show_progress=False,
@@ -1872,7 +1887,7 @@ def encode(
     if zarr_path.exists():
         logger.warning(f"Deleting existing {zarr_path}")
         shutil.rmtree(zarr_path)
-    vzw = VcfZarrWriter(zarr_path, icf, schema)
+    vzw = VcfZarrWriter(zarr_path, icf, schema, dimension_separator=dimension_separator)
     vzw.init()
     vzw.encode(
         max_v_chunks=max_v_chunks,
diff --git a/requirements/development.txt b/requirements/development.txt
@@ -8,4 +8,5 @@ pytest-coverage
 pytest-xdist
 sgkit
 tabulate
-tqdm
+tqdm
+zarr>=2.17
diff --git a/setup.cfg b/setup.cfg
@@ -28,7 +28,7 @@ include_package_data = True
 python_requires = >=3.9
 install_requires =
     numpy
-    zarr >= 2.10.0, != 2.11.0, != 2.11.1, != 2.11.2
+    zarr >= 2.17
     click
     tabulate
     tqdm
diff --git a/tests/test_icf.py b/tests/test_icf.py
@@ -275,6 +275,16 @@ def test_compressor_explode(self, tmp_path, compressor):
         icf = self.run_explode(tmp_path, compressor=compressor)
         assert icf.metadata.compressor == compressor.get_config()
 
+    def test_default_compressor_explode(self, tmp_path):
+        icf = self.run_explode(tmp_path)
+        assert icf.metadata.compressor == vcf.ICF_DEFAULT_COMPRESSOR.get_config()
+        assert icf.metadata.compressor["cname"] == "zstd"
+
+    def test_default_compressor_dexplode(self, tmp_path):
+        icf = self.run_dexplode(tmp_path)
+        assert icf.metadata.compressor == vcf.ICF_DEFAULT_COMPRESSOR.get_config()
+        assert icf.metadata.compressor["cname"] == "zstd"
+
     @pytest.mark.parametrize(
         "compressor",
         [
@@ -340,7 +350,8 @@ def test_empty_chunk_file(self, tmp_path):
         with pytest.raises(RuntimeError, match="blosc"):
             icf["POS"].values
 
-    @pytest.mark.parametrize("length", [10, 100, 200, 210])
+    # Chunk file is 195 long
+    @pytest.mark.parametrize("length", [10, 100, 190, 194])
     def test_truncated_chunk_file(self, tmp_path, length):
         icf_path = tmp_path / "icf"
         vcf.explode(icf_path, [self.data_path])
@@ -359,7 +370,7 @@ def test_chunk_incorrect_length(self, tmp_path):
         icf_path = tmp_path / "icf"
         vcf.explode(icf_path, [self.data_path])
         chunk_file = icf_path / "POS" / "p0" / "2"
-        compressor = numcodecs.Blosc(cname="lz4")
+        compressor = numcodecs.Blosc(cname="zstd")
         with open(chunk_file, "rb") as f:
             pkl = compressor.decode(f.read())
         x = pickle.loads(pkl)
diff --git a/tests/test_vcf.py b/tests/test_vcf.py
@@ -99,6 +99,29 @@ def test_exploded_metadata_mismatch(self, tmpdir, icf_path, version):
             vcf.IcfMetadata.fromdict(d)
 
 
+class TestEncodeDimensionSeparator:
+
+    @pytest.mark.parametrize("dimension_separator", [None, "/"])
+    def test_directories(self, tmp_path, icf_path, dimension_separator):
+        zarr_path = tmp_path / "zarr"
+        vcf.encode(icf_path, zarr_path, dimension_separator=dimension_separator)
+        # None (the default) and "/" should both store chunks in nested directories
+        chunk_file = zarr_path / "call_genotype" / "0" / "0" / "0"
+        assert chunk_file.exists()
+
+    def test_files(self, tmp_path, icf_path):
+        zarr_path = tmp_path / "zarr"
+        vcf.encode(icf_path, zarr_path, dimension_separator=".")
+        chunk_file = zarr_path / "call_genotype" / "0.0.0"
+        assert chunk_file.exists()
+
+    @pytest.mark.parametrize("dimension_separator", ["\\", "X", []])
+    def test_bad_value(self, tmp_path, icf_path, dimension_separator):
+        zarr_path = tmp_path / "zarr"
+        with pytest.raises(ValueError):
+            vcf.encode(icf_path, zarr_path, dimension_separator=dimension_separator)
+
+
 class TestDefaultSchema:
     def test_format_version(self, schema):
         assert schema["format_version"] == vcf.ZARR_SCHEMA_FORMAT_VERSION
diff --git a/tests/test_vcf_examples.py b/tests/test_vcf_examples.py
@@ -841,7 +841,7 @@ def test_split_explode(tmp_path):
     pcvcf = vcf.IntermediateColumnarFormat(out)
     assert pcvcf.columns["POS"].vcf_field.summary.asdict() == {
         "num_chunks": 3,
-        "compressed_size": 630,
+        "compressed_size": 587,
         "uncompressed_size": 1008,
         "max_number": 1,
         "max_value": 1235237,