Skip to content

Commit 185cf72

Browse files
authored
Copy files, if possible (#1362)
* use fs-based copy * move to Layer.add_mag_as_copy * deprecations and changelog * fix tests * rm test * pr feedback
1 parent 4af80cb commit 185cf72

File tree

6 files changed

+104
-57
lines changed

6 files changed

+104
-57
lines changed

webknossos/Changelog.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ For upgrade instructions, please check the respective _Breaking Changes_ section
1919
### Added
2020

2121
### Changed
22+
- `Layer.add_mag_as_copy` now automatically uses file-based copy, if possible. Therefore, `Dataset.fs_copy_dataset`, `Dataset.add_fs_copy_layer` and `Layer.add_fs_copy_mag` are deprecated. [#1362](https://github.com/scalableminds/webknossos-libs/pull/1362)
2223
- The API methods now use `v10` API version of Webknossos. Datasets are now referenced by ID instead of dataset name or directory name. [#1363](https://github.com/scalableminds/webknossos-libs/pull/1363)
2324

2425
### Fixed

webknossos/tests/dataset/test_attachments.py

Lines changed: 0 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -190,47 +190,6 @@ def test_copy_layer(tmp_upath: UPath) -> None:
190190
)
191191

192192

193-
def test_fs_copy_layer(tmp_upath: UPath) -> None:
194-
dataset, seg_layer = make_dataset(tmp_upath)
195-
196-
mesh_path = dataset.path / "seg" / "meshes" / "meshfile.hdf5"
197-
mesh_path.parent.mkdir(parents=True, exist_ok=True)
198-
mesh_path.write_text("test")
199-
200-
agglomerate_path = (tmp_upath / "agglomerate_view_15").resolve()
201-
agglomerate_path.mkdir(parents=True, exist_ok=True)
202-
(agglomerate_path / "zarr.json").write_text("test")
203-
204-
seg_layer.attachments.add_mesh(
205-
mesh_path,
206-
name="meshfile",
207-
data_format=AttachmentDataFormat.HDF5,
208-
)
209-
210-
seg_layer.attachments.add_agglomerate(
211-
Path("../agglomerate_view_15"),
212-
name="agglomerate_view_15",
213-
data_format=AttachmentDataFormat.Zarr3,
214-
)
215-
216-
copy_dataset = Dataset(tmp_upath / "test_copy", voxel_size=(10, 10, 10))
217-
copy_layer = copy_dataset.add_fs_copy_layer(seg_layer).as_segmentation_layer()
218-
219-
# has been copied
220-
assert (
221-
copy_layer.attachments.meshes[0].path
222-
== copy_dataset.path / "seg" / "meshes" / "meshfile.hdf5"
223-
)
224-
assert (copy_dataset.path / "seg" / "meshes" / "meshfile.hdf5").exists()
225-
226-
# has not been copied
227-
assert copy_layer.attachments.agglomerates[0].path == agglomerate_path
228-
assert (
229-
copy_layer.attachments.agglomerates[0]._properties.path
230-
== agglomerate_path.as_posix()
231-
)
232-
233-
234193
def test_symlink_layer(tmp_upath: UPath) -> None:
235194
dataset, seg_layer = make_dataset(tmp_upath)
236195

webknossos/tests/dataset/test_dataset.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from collections.abc import Iterator
66
from pathlib import Path
77
from typing import cast
8+
from unittest import mock
89

910
import numpy as np
1011
import pytest
@@ -2113,7 +2114,15 @@ def test_add_fs_copy_mag(data_format: DataFormat, output_path: UPath) -> None:
21132114
copy_layer = copy_ds.add_layer(
21142115
"color", COLOR_CATEGORY, dtype_per_channel="uint8", data_format=data_format
21152116
)
2116-
copy_mag = copy_layer.add_fs_copy_mag(original_mag, extend_layer_bounding_box=True)
2117+
2118+
with mock.patch.object(
2119+
copy_layer, "_add_fs_copy_mag", wraps=copy_layer._add_fs_copy_mag
2120+
) as mocked_method:
2121+
copy_mag = copy_layer.add_mag_as_copy(
2122+
original_mag, extend_layer_bounding_box=True
2123+
)
2124+
mocked_method.assert_called_once()
2125+
21172126
assert not copy_layer.read_only
21182127
assert not copy_mag.read_only
21192128

@@ -3533,9 +3542,7 @@ def test_zarr_copy_to_remote_dataset(data_format: DataFormat) -> None:
35333542

35343543
@pytest.mark.parametrize("input_path", OUTPUT_PATHS)
35353544
@pytest.mark.parametrize("output_path", OUTPUT_PATHS)
3536-
def test_fs_copy_dataset_with_attachments(
3537-
input_path: UPath, output_path: UPath
3538-
) -> None:
3545+
def test_copy_dataset_with_attachments(input_path: UPath, output_path: UPath) -> None:
35393546
ds_path = copy_simple_dataset(DEFAULT_DATA_FORMAT, input_path)
35403547
new_ds_path = prepare_dataset_path(DEFAULT_DATA_FORMAT, output_path, "copied")
35413548

@@ -3562,7 +3569,7 @@ def test_fs_copy_dataset_with_attachments(
35623569
)
35633570

35643571
# Copy
3565-
copy_ds = ds.fs_copy_dataset(new_ds_path)
3572+
copy_ds = ds.copy_dataset(new_ds_path)
35663573

35673574
assert (
35683575
copy_ds.default_view_configuration

webknossos/webknossos/dataset/_array.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,13 @@ def from_array_info(cls, array_info: ArrayInfo) -> "Zarr3ArrayInfo":
133133
axis_order=array_info.axis_order,
134134
)
135135

136+
@property
137+
def zarr3_config(self) -> "Zarr3Config":
138+
return Zarr3Config(
139+
codecs=self.codecs,
140+
chunk_key_encoding=self.chunk_key_encoding,
141+
)
142+
136143

137144
@dataclass(frozen=True)
138145
class Zarr3Config:

webknossos/webknossos/dataset/dataset.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
)
4040
from ..geometry.mag import MagLike
4141
from ..geometry.nd_bounding_box import derive_nd_bounding_box_from_shape
42-
from ._array import ArrayException, ArrayInfo, BaseArray
42+
from ._array import ArrayException, ArrayInfo, BaseArray, Zarr3Config
4343
from ._metadata import DatasetMetadata
4444
from ._utils import pims_images
4545
from .defaults import (
@@ -2248,7 +2248,7 @@ def add_layer_as_copy(
22482248
shard_shape: Vec3IntLike | int | None = None,
22492249
chunks_per_shard: Vec3IntLike | int | None = None,
22502250
data_format: str | DataFormat | None = None,
2251-
compress: bool | None = None,
2251+
compress: bool | Zarr3Config | None = None,
22522252
exists_ok: bool = False,
22532253
executor: Executor | None = None,
22542254
with_attachments: bool = True,
@@ -2544,13 +2544,15 @@ def add_fs_copy_layer(
25442544
foreign_layer: str | Path | Layer,
25452545
new_layer_name: str | None = None,
25462546
) -> Layer:
2547-
"""
2547+
"""Deprecated. File-based copy is automatically used in `Dataset.add_layer_as_copy`.
2548+
25482549
Copies the files at `foreign_layer` which belongs to another dataset
25492550
to the current dataset via the filesystem. Additionally, the relevant
25502551
information from the `datasource-properties.json` of the other dataset
25512552
are copied too. If new_layer_name is None, the name of the foreign
25522553
layer is used.
25532554
"""
2555+
warn_deprecated("add_fs_copy_layer", "add_layer_as_copy")
25542556
self._ensure_writable()
25552557
foreign_layer = Layer._ensure_layer(foreign_layer)
25562558

@@ -2725,7 +2727,8 @@ def fs_copy_dataset(
27252727
exists_ok: bool = False,
27262728
layers_to_ignore: Iterable[str] | None = None,
27272729
) -> "Dataset":
2728-
"""
2730+
"""Deprecated. File-based copy is automatically used by `Dataset.copy_dataset`.
2731+
27292732
Creates an independent copy of the dataset with all layers at a new location.
27302733
27312734
This method copies the files of the dataset as is and, therefore, might be faster than Dataset.copy_dataset, which decodes and encodes all the data.
@@ -2751,12 +2754,13 @@ def fs_copy_dataset(
27512754
Note:
27522755
WKW layers can only be copied to datasets on local file systems.
27532756
"""
2757+
warn_deprecated("fs_copy_dataset", "copy_dataset")
27542758

27552759
new_dataset_path = UPath(new_dataset_path)
27562760

27572761
if any(layer.data_format == DataFormat.WKW for layer in self.layers.values()):
27582762
assert is_fs_path(new_dataset_path), (
2759-
"Cannot create WKW layers in remote datasets. Use explicit `data_format='zarr3'`."
2763+
"Cannot create WKW layers in remote datasets. Use `Dataset.copy_dataset` with `data_format='zarr3'`."
27602764
)
27612765

27622766
new_dataset = Dataset(

webknossos/webknossos/dataset/layer.py

Lines changed: 75 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import operator
33
import re
44
import warnings
5+
from inspect import getframeinfo, stack
56
from os import PathLike
67
from os.path import relpath
78
from pathlib import Path
@@ -14,7 +15,7 @@
1415

1516
from ..geometry import Mag, NDBoundingBox, Vec3Int, Vec3IntLike
1617
from ..geometry.mag import MagLike
17-
from ._array import ArrayException, TensorStoreArray, Zarr3Config
18+
from ._array import ArrayException, TensorStoreArray, Zarr3ArrayInfo, Zarr3Config
1819
from ._downsampling_utils import (
1920
calculate_default_coarsest_mag,
2021
calculate_mags_to_downsample,
@@ -60,6 +61,8 @@
6061
DEFAULT_SHARD_SHAPE,
6162
)
6263

64+
logger = logging.getLogger(__name__)
65+
6366

6467
def _is_int(s: str) -> bool:
6568
try:
@@ -870,6 +873,43 @@ def add_mag_as_copy(
870873
self._ensure_writable()
871874
foreign_mag_view = MagView._ensure_mag_view(foreign_mag_view_or_path)
872875

876+
has_same_shapes = (
877+
(
878+
chunk_shape is None
879+
or Vec3Int.from_vec_or_int(chunk_shape)
880+
== foreign_mag_view.info.chunk_shape
881+
)
882+
and (
883+
shard_shape is None
884+
or Vec3Int.from_vec_or_int(shard_shape)
885+
== foreign_mag_view.info.shard_shape
886+
)
887+
and (
888+
chunks_per_shard is None
889+
or Vec3Int.from_vec_or_int(chunks_per_shard)
890+
== foreign_mag_view.info.chunks_per_shard
891+
)
892+
)
893+
has_same_format = self.data_format == foreign_mag_view.info.data_format
894+
if compress is None:
895+
has_same_compression = True
896+
elif isinstance(compress, Zarr3Config) and isinstance(
897+
foreign_mag_view.info, Zarr3ArrayInfo
898+
):
899+
has_same_compression = compress == foreign_mag_view.info.zarr3_config
900+
else:
901+
has_same_compression = compress == foreign_mag_view.info.compression_mode
902+
903+
if has_same_shapes and has_same_format and has_same_compression:
904+
logger.debug(
905+
f"Optimization: Copying files from {foreign_mag_view.path} to {self.path}/{foreign_mag_view.mag} directly without re-encoding."
906+
)
907+
return self._add_fs_copy_mag(
908+
foreign_mag_view,
909+
extend_layer_bounding_box=extend_layer_bounding_box,
910+
exists_ok=exists_ok,
911+
)
912+
873913
chunk_shape = Vec3Int.from_vec_or_int(
874914
chunk_shape or foreign_mag_view.info.chunk_shape
875915
)
@@ -1036,21 +1076,26 @@ def add_mag_as_ref(
10361076

10371077
return self.get_mag(foreign_mag_view.mag)
10381078

1039-
def add_fs_copy_mag(
1079+
def _add_fs_copy_mag(
10401080
self,
10411081
foreign_mag_view_or_path: PathLike | str | MagView,
10421082
*,
10431083
extend_layer_bounding_box: bool = True,
1084+
exists_ok: bool = False,
10441085
) -> MagView:
1045-
"""
1046-
Copies the data at `foreign_mag_view_or_path` which belongs to another dataset to the current dataset via the filesystem.
1047-
Additionally, the relevant information from the `datasource-properties.json` of the other dataset are copied, too.
1048-
"""
10491086
self._ensure_writable()
10501087
foreign_mag_view = MagView._ensure_mag_view(foreign_mag_view_or_path)
10511088
self._assert_mag_does_not_exist_yet(foreign_mag_view.mag)
10521089

1090+
assert foreign_mag_view.info.data_format == self.data_format, (
1091+
f"Cannot use file-based copy, because the foreign data format {foreign_mag_view.info.data_format} does not match the layer's data format {self.data_format}."
1092+
)
1093+
10531094
mag_path = self.path / str(foreign_mag_view.mag)
1095+
if not exists_ok and mag_path.exists():
1096+
raise FileExistsError(
1097+
f"Cannot copy {foreign_mag_view.path} to {mag_path} because it already exists."
1098+
)
10541099
copytree(
10551100
foreign_mag_view.path,
10561101
mag_path,
@@ -1067,6 +1112,30 @@ def add_fs_copy_mag(
10671112

10681113
return mag
10691114

1115+
def add_fs_copy_mag(
1116+
self,
1117+
foreign_mag_view_or_path: PathLike | str | MagView,
1118+
*,
1119+
extend_layer_bounding_box: bool = True,
1120+
exists_ok: bool = False,
1121+
) -> MagView:
1122+
"""Deprecated. File-copy is automatically selected when using `Layer.add_mag_as_copy`.
1123+
1124+
Copies the data at `foreign_mag_view_or_path` which belongs to another dataset to the current dataset via the filesystem.
1125+
Additionally, the relevant information from the `datasource-properties.json` of the other dataset are copied, too.
1126+
"""
1127+
caller = getframeinfo(stack()[2][0])
1128+
warnings.warn(
1129+
f"[DEPRECATION] Direct use of `Layer.add_fs_copy_mag` is deprecated, please use `Layer.add_mag_as_copy` instead (see {caller.filename}:{caller.lineno}), which automatically uses file-based copy if available.",
1130+
DeprecationWarning,
1131+
stacklevel=2,
1132+
)
1133+
return self._add_fs_copy_mag(
1134+
foreign_mag_view_or_path,
1135+
extend_layer_bounding_box=extend_layer_bounding_box,
1136+
exists_ok=exists_ok,
1137+
)
1138+
10701139
def add_mag_from_zarrarray(
10711140
self,
10721141
mag: MagLike,

0 commit comments

Comments
 (0)