Commit 63712de

normanrz, philippotto, and markbader authored
Add a CLI for copy_dataset (#1259)
* make Zarr3 default DataFormat
* compress=True
* remove deprecations
* test fixes
* down to 16
* changelog
* test fixes
* fix /test_dataset_add_remote_mag_and_layer.py
* stuff
* ci
* ci
* error on fork
* ci
* ci
* less alignment checks
* allow_unaligned
* update ci.yml
* ci testing
* ci
* sequential tests
* ci
* ci
* ci
* ci
* parameterize python for kubernetes dockerfile
* test
* change defaults
* mirrored test images
* mp logging
* mp debugging
* debug
* debug
* debug
* debugging
* pyproject.toml
* py3.12
* debugging
* wip
* all python versions
* revert debug changes in cluster_tools
* fixes
* larger ci runner
* default ci runner
* rm pytest-timeout
* test
* Revert "rm pytest-timeout" (reverts commit 6bc2185)
* Revert "test" (reverts commit 8d57971)
* ci
* ci
* ci
* ci
* ci
* ci
* properly implement SequentialExecutor
* ci
* changelog
* allow_unaligned wip
* ci
* wip
* fix tests
* fix test
* examples
* longer sleep in slurm test
* format
* longer sleep in slurm test
* Apply suggestions from code review (co-authored by Philipp Otto)
* add methods
* more robust patch
* comment
* derive_nd_bounding_box_from_shape
* refactor Dataset.open to not require an additional IO op
* cassettes
* format
* lint
* docs
* deprecate chunks_per_shard
* deprecate dtype_per_layer
* type
* fixes for add_layer_from_images
* Vec3Int.from_vec_or_int
* export defaults
* write_layer args
* MagLike + mag in write_layer
* doc
* docstring
* adds copy-dataset CLI tool
* test
* change default data_format in cli
* docs
* changelog
* changelog
* changelog
* better progress descriptor
* remove leading slash
* fix TensorStoreArray.open
* Update webknossos/webknossos/cli/copy_dataset.py (co-authored by Mark Bader)
* Update webknossos/Changelog.md (co-authored by Mark Bader)
* docstring
* more kwargs
* tests
* add exists-ok flag
* -vv

Co-authored-by: Philipp Otto <[email protected]>
Co-authored-by: Mark Bader <[email protected]>
1 parent 74c0288 commit 63712de

File tree

10 files changed: +235 −31 lines


docs/src/cli/index.md

Lines changed: 7 additions & 1 deletion
````diff
@@ -11,6 +11,7 @@ The WEBKNOSSOS CLI offers many useful commands to work with WEBKNOSSOS datasets:
 - `webknossos convert-knossos`: Converts a KNOSSOS dataset to a WEBKNOSSOS dataset
 - `webknossos convert-raw`: Converts a RAW image file to a WEBKNOSSOS dataset
 - `webknossos convert-zarr`: Converts a Zarr dataset to a WEBKNOSSOS dataset
+- `webknossos copy-dataset`: Makes a copy of a WEBKNOSSOS dataset
 - `webknossos download`: Download a dataset from a WEBKNOSSOS server
 - `webknossos downsample`: Downsample a WEBKNOSSOS dataset
 - `webknossos merge-fallback`: Merge a volume layer of a WEBKNOSSOS dataset with an annotation
@@ -61,7 +62,12 @@ webknossos convert-knossos --layer-name color --voxel-size 11.24,11.24,25 data/s
 # Convert RAW file to wkw file
 webknossos convert-raw --layer-name color --voxel-size 10,10,30 --dtype uint8 --shape 2048,2048,1024 data/source/raw_file.raw data/target
 
-
+# Copy a local dataset to a remote storage
+AWS_ACCESS_KEY_ID=XXX AWS_SECRET_ACCESS_KEY=XXX \
+webknossos copy-dataset \
+  --data-format zarr3 \
+  --jobs 4 \
+  data/source s3://webknossos-bucket/target
 ```
 
 ### Parallelization
````
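For reference, the same copy can be expressed through the Python API that this commit extends. A minimal sketch, reusing the placeholder paths from the docs snippet above:

```python
from webknossos import DataFormat, Dataset

# Credentials are picked up from the AWS_ACCESS_KEY_ID /
# AWS_SECRET_ACCESS_KEY environment variables, as in the CLI example.
ds = Dataset.open("data/source")
ds.copy_dataset(
    "s3://webknossos-bucket/target",
    data_format=DataFormat.Zarr3,
)
```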

webknossos/Changelog.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -154,6 +154,7 @@ For upgrade instructions, please check the respective _Breaking Changes_ section
 
 
 ### Added
+- Added the `webknossos copy-dataset` CLI command. [#1259](https://github.com/scalableminds/webknossos-libs/pull/1259)
 - Added `Dataset.write_layer` method for writing entire layers in one go. [#1242](https://github.com/scalableminds/webknossos-libs/pull/1242)
 
 ### Changed
```

webknossos/test.sh

Lines changed: 1 addition & 1 deletion
```diff
@@ -10,7 +10,7 @@ export MULTIPROCESSING_DEFAULT_START_METHOD=forkserver
 # this will ensure that the current directory is added to sys.path
 # (which is standard python behavior). This is necessary so that the imports
 # refer to the checked out (and potentially modified) code.
-PYTEST="uv run --all-extras --python ${PYTHON_VERSION:-3.13} -m pytest --suppress-no-test-exit-code"
+PYTEST="uv run --all-extras --python ${PYTHON_VERSION:-3.13} -m pytest --suppress-no-test-exit-code -vv"
 
 # Within the tests folder is a binaryData folder of the local running webknossos instance. This folder is cleaned up before running the tests.
 # This find command gets all directories in binaryData/Organization_X except for the l4_sample and e2006_knossos directories and deletes them.
```

webknossos/tests/dataset/test_dataset.py

Lines changed: 10 additions & 0 deletions
```diff
@@ -3060,6 +3060,16 @@ def test_wkw_copy_to_remote_dataset() -> None:
     )
 
 
+def test_copy_dataset_exists_ok() -> None:
+    ds_path = prepare_dataset_path(DataFormat.WKW, REMOTE_TESTOUTPUT_DIR, "copied")
+    wkw_ds = Dataset.open(TESTDATA_DIR / "simple_wkw_dataset")
+
+    wkw_ds.copy_dataset(ds_path, data_format=DataFormat.Zarr3)
+    with pytest.raises(RuntimeError):
+        wkw_ds.copy_dataset(ds_path, data_format=DataFormat.Zarr3)
+    wkw_ds.copy_dataset(ds_path, data_format=DataFormat.Zarr3, exists_ok=True)
+
+
 @pytest.mark.use_proxay
 def test_remote_dataset_access_metadata() -> None:
     ds = Dataset.open_remote("l4_sample", "Organization_X")
```

webknossos/tests/test_cli.py

Lines changed: 38 additions & 1 deletion
```diff
@@ -23,7 +23,7 @@
     TESTDATA_DIR,
     use_minio,
 )
-from webknossos import BoundingBox, DataFormat, Dataset
+from webknossos import BoundingBox, DataFormat, Dataset, Mag
 from webknossos.cli.export_as_tiff import _make_tiff_name
 from webknossos.cli.main import app
 from webknossos.dataset.dataset import PROPERTIES_FILE_NAME
@@ -134,6 +134,43 @@ def test_check_equality() -> None:
     )
 
 
+def test_copy_dataset(tmp_path: Path) -> None:
+    """Tests the functionality of the copy_dataset subcommand."""
+
+    result_without_args = runner.invoke(app, ["copy-dataset"])
+    assert result_without_args.exit_code == 2
+
+    result = runner.invoke(
+        app,
+        [
+            "copy-dataset",
+            str(TESTDATA_DIR / "simple_wkw_dataset"),
+            str(tmp_path / "simple_wkw_dataset"),
+            "--data-format",
+            "zarr3",
+        ],
+    )
+    assert result.exit_code == 0
+    # verify that the data was copied correctly
+    target_ds = Dataset.open(tmp_path / "simple_wkw_dataset")
+    target_layer = target_ds.get_layer("color")
+    assert target_layer.data_format == DataFormat.Zarr3
+    assert Mag(1) in target_layer.mags
+
+    result = runner.invoke(
+        app,
+        [
+            "copy-dataset",
+            str(TESTDATA_DIR / "simple_wkw_dataset"),
+            str(tmp_path / "simple_wkw_dataset"),
+            "--data-format",
+            "zarr3",
+            "--exists-ok",
+        ],
+    )
+    assert result.exit_code == 0
+
+
 def test_check_not_equal() -> None:
     """Tests that the check_equality subcommand detects differing datasets."""
```

webknossos/webknossos/cli/copy_dataset.py

Lines changed: 114 additions & 0 deletions

```diff
@@ -0,0 +1,114 @@
+"""This module copies a WEBKNOSSOS dataset."""
+
+import logging
+from argparse import Namespace
+from multiprocessing import cpu_count
+from typing import Any, Optional
+
+import typer
+from typing_extensions import Annotated
+
+from ..dataset import DataFormat, Dataset
+from ..geometry import Vec3Int
+from ..utils import get_executor_for_args
+from ._utils import DistributionStrategy, parse_path, parse_vec3int
+
+logger = logging.getLogger(__name__)
+
+
+def main(
+    *,
+    source: Annotated[
+        Any,
+        typer.Argument(
+            help="Path to the source WEBKNOSSOS dataset.",
+            show_default=False,
+            parser=parse_path,
+        ),
+    ],
+    target: Annotated[
+        Any,
+        typer.Argument(
+            help="Path to the target WEBKNOSSOS dataset.",
+            show_default=False,
+            parser=parse_path,
+        ),
+    ],
+    data_format: Annotated[
+        Optional[DataFormat],
+        typer.Option(
+            help="Data format to store the target dataset in.",
+        ),
+    ] = None,
+    chunk_shape: Annotated[
+        Optional[Vec3Int],
+        typer.Option(
+            help="Number of voxels to be stored as a chunk in the target dataset "
+            "(e.g. `32` or `32,32,32`).",
+            parser=parse_vec3int,
+            metavar="Vec3Int",
+        ),
+    ] = None,
+    shard_shape: Annotated[
+        Optional[Vec3Int],
+        typer.Option(
+            help="Number of voxels to be stored as a shard in the target dataset "
+            "(e.g. `1024` or `1024,1024,1024`).",
+            parser=parse_vec3int,
+            metavar="Vec3Int",
+        ),
+    ] = None,
+    exists_ok: Annotated[
+        bool, typer.Option(help="Whether it should overwrite an existing dataset.")
+    ] = False,
+    jobs: Annotated[
+        int,
+        typer.Option(
+            help="Number of processes to be spawned.",
+            rich_help_panel="Executor options",
+        ),
+    ] = cpu_count(),
+    distribution_strategy: Annotated[
+        DistributionStrategy,
+        typer.Option(
+            help="Strategy to distribute the task across CPUs or nodes.",
+            rich_help_panel="Executor options",
+        ),
+    ] = DistributionStrategy.MULTIPROCESSING,
+    job_resources: Annotated[
+        Optional[str],
+        typer.Option(
+            help="Necessary when using slurm as distribution strategy. Should be a JSON string "
+            '(e.g., --job-resources=\'{"mem": "10M"}\')',
+            rich_help_panel="Executor options",
+        ),
+    ] = None,
+) -> None:
+    """Make a copy of a WEBKNOSSOS dataset.
+
+    Remote paths (i.e. https and s3) are also allowed.
+    Use the following environment variables to configure remote paths:
+    - HTTP_BASIC_USER
+    - HTTP_BASIC_PASSWORD
+    - S3_ENDPOINT_URL
+    - AWS_ACCESS_KEY_ID
+    - AWS_SECRET_ACCESS_KEY
+    """
+
+    executor_args = Namespace(
+        jobs=jobs,
+        distribution_strategy=distribution_strategy.value,
+        job_resources=job_resources,
+    )
+
+    source_dataset = Dataset.open(source)
+
+    with get_executor_for_args(args=executor_args) as executor:
+        source_dataset.copy_dataset(
+            target,
+            chunk_shape=chunk_shape,
+            shard_shape=shard_shape,
+            data_format=data_format,
+            exists_ok=exists_ok,
+            executor=executor,
+        )
```
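A minimal sketch of exercising the new command through Typer's test runner, mirroring the pattern used in `webknossos/tests/test_cli.py` (the source and target paths here are placeholders):

```python
from typer.testing import CliRunner

from webknossos.cli.main import app

runner = CliRunner()
result = runner.invoke(
    app,
    ["copy-dataset", "data/source", "data/target", "--data-format", "zarr3"],
)
assert result.exit_code == 0
```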

webknossos/webknossos/cli/main.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -9,6 +9,7 @@
     convert_knossos,
     convert_raw,
     convert_zarr,
+    copy_dataset,
     download,
     downsample,
     export_as_tiff,
@@ -25,6 +26,7 @@
 app.command("convert-knossos")(convert_knossos.main)
 app.command("convert-raw")(convert_raw.main)
 app.command("convert-zarr")(convert_zarr.main)
+app.command("copy-dataset")(copy_dataset.main)
 app.command("download")(download.main)
 app.command("downsample")(downsample.main)
 app.command("export-wkw-as-tiff")(export_as_tiff.main)
```

webknossos/webknossos/dataset/_array.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -430,7 +430,7 @@ def _make_kvstore(path: Path) -> Union[str, Dict[str, Union[str, List[str]]]]:
     parsed_url = urlparse(str(path))
     kvstore_spec: dict[str, Any] = {
         "driver": "s3",
-        "path": parsed_url.path,
+        "path": parsed_url.path.lstrip("/"),
         "bucket": parsed_url.netloc,
     }
     if endpoint_url := path.storage_options.get("client_kwargs", {}).get(
```
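This one-line change (the commit message's "remove leading slash" / "fix TensorStoreArray.open") is needed because `urlparse` keeps the leading slash of the object key, while the s3 kvstore apparently expects a bucket-relative key. A small illustration, not part of the commit:

```python
from urllib.parse import urlparse

parsed = urlparse("s3://webknossos-bucket/target")
assert parsed.netloc == "webknossos-bucket"  # becomes the "bucket" entry
assert parsed.path == "/target"              # note the leading slash
assert parsed.path.lstrip("/") == "target"   # bucket-relative key for the kvstore
```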

webknossos/webknossos/dataset/dataset.py

Lines changed: 37 additions & 16 deletions
```diff
@@ -2171,6 +2171,7 @@ def add_copy_layer(
         chunks_per_shard: Optional[Union[Vec3IntLike, int]] = None,
         data_format: Optional[Union[str, DataFormat]] = None,
         compress: Optional[bool] = None,
+        exists_ok: bool = False,
         executor: Optional[Executor] = None,
     ) -> Layer:
         """Copy layer from another dataset to this one.
@@ -2186,6 +2187,7 @@ def add_copy_layer(
             chunks_per_shard: Deprecated, use shard_shape. Optional number of chunks per shard
             data_format: Optional format to store copied data ('wkw', 'zarr', etc.)
             compress: Optional whether to compress copied data
+            exists_ok: Whether to overwrite existing layers
             executor: Optional executor for parallel copying
 
         Returns:
@@ -2217,30 +2219,46 @@ def add_copy_layer(
         if new_layer_name is None:
             new_layer_name = foreign_layer.name
 
-        if new_layer_name in self.layers.keys():
-            raise IndexError(
-                f"Cannot copy {foreign_layer}. This dataset already has a layer called {new_layer_name}."
+        if exists_ok:
+            layer = self.get_or_add_layer(
+                new_layer_name,
+                category=foreign_layer.category,
+                dtype_per_channel=foreign_layer.dtype_per_channel,
+                num_channels=foreign_layer.num_channels,
+                data_format=data_format or foreign_layer.data_format,
+                largest_segment_id=foreign_layer._get_largest_segment_id_maybe(),
+                bounding_box=foreign_layer.bounding_box,
+            )
+        else:
+            if new_layer_name in self.layers.keys():
+                raise IndexError(
+                    f"Cannot copy {foreign_layer}. This dataset already has a layer called {new_layer_name}."
+                )
+            layer = self.add_layer(
+                new_layer_name,
+                category=foreign_layer.category,
+                dtype_per_channel=foreign_layer.dtype_per_channel,
+                num_channels=foreign_layer.num_channels,
+                data_format=data_format or foreign_layer.data_format,
+                largest_segment_id=foreign_layer._get_largest_segment_id_maybe(),
+                bounding_box=foreign_layer.bounding_box,
             )
-
-        layer = self.add_layer(
-            new_layer_name,
-            category=foreign_layer.category,
-            dtype_per_channel=foreign_layer.dtype_per_channel,
-            num_channels=foreign_layer.num_channels,
-            data_format=data_format or foreign_layer.data_format,
-            largest_segment_id=foreign_layer._get_largest_segment_id_maybe(),
-        )
-        layer.bounding_box = foreign_layer.bounding_box
 
         for mag_view in foreign_layer.mags.values():
+            progress_desc = (
+                f"Copying {mag_view.layer.name}/{mag_view.mag.to_layer_name()}"
+            )
+
             layer.add_copy_mag(
                 mag_view,
                 extend_layer_bounding_box=False,
                 chunk_shape=chunk_shape,
                 shard_shape=shard_shape,
                 chunks_per_shard=chunks_per_shard,
                 compress=compress,
+                exists_ok=exists_ok,
                 executor=executor,
+                progress_desc=progress_desc,
             )
 
         return layer
@@ -2458,6 +2476,7 @@ def copy_dataset(
         chunks_per_shard: Optional[Union[Vec3IntLike, int]] = None,
         data_format: Optional[Union[str, DataFormat]] = None,
         compress: Optional[bool] = None,
+        exists_ok: bool = False,
         executor: Optional[Executor] = None,
         voxel_size_with_unit: Optional[VoxelSize] = None,
     ) -> "Dataset":
@@ -2473,6 +2492,7 @@ def copy_dataset(
             chunks_per_shard: Deprecated, use shard_shape. Optional number of chunks per shard
             data_format: Optional format to store data ('wkw', 'zarr', 'zarr3')
             compress: Optional whether to compress data
+            exists_ok: Whether to overwrite existing datasets and layers
             executor: Optional executor for parallel copying
             voxel_size_with_unit: Optional voxel size specification with units
 
@@ -2507,13 +2527,13 @@ def copy_dataset(
         if data_format == DataFormat.WKW:
             assert is_fs_path(
                 new_dataset_path
-            ), "Cannot create WKW-based remote datasets. Use `data_format='zarr'` instead."
+            ), "Cannot create WKW-based remote datasets. Use `data_format='zarr3'` instead."
         if data_format is None and any(
             layer.data_format == DataFormat.WKW for layer in self.layers.values()
         ):
             assert is_fs_path(
                 new_dataset_path
-            ), "Cannot create WKW layers in remote datasets. Use explicit `data_format='zarr'`."
+            ), "Cannot create WKW layers in remote datasets. Use explicit `data_format='zarr3'`."
 
         if voxel_size_with_unit is None:
             if voxel_size is None:
@@ -2523,7 +2543,7 @@ def copy_dataset(
         new_ds = Dataset(
             new_dataset_path,
             voxel_size_with_unit=voxel_size_with_unit,
-            exist_ok=False,
+            exist_ok=exists_ok,
         )
 
         with get_executor_for_args(None, executor) as executor:
@@ -2535,6 +2555,7 @@ def copy_dataset(
                 chunks_per_shard=chunks_per_shard,
                 data_format=data_format,
                 compress=compress,
+                exists_ok=exists_ok,
                 executor=executor,
             )
         new_ds._export_as_json()
```
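Taken together with `test_copy_dataset_exists_ok` above, the new `exists_ok` flag behaves roughly like this at the Python level (a sketch with placeholder paths):

```python
from webknossos import DataFormat, Dataset

ds = Dataset.open("data/source")
ds.copy_dataset("data/target", data_format=DataFormat.Zarr3)

# A second copy to the same target raises (RuntimeError in the test above),
# because the target dataset already exists:
# ds.copy_dataset("data/target", data_format=DataFormat.Zarr3)

# With exists_ok=True the existing dataset and layers are reused:
# exist_ok is forwarded to Dataset(...) and get_or_add_layer is used
# instead of add_layer.
ds.copy_dataset("data/target", data_format=DataFormat.Zarr3, exists_ok=True)
```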
