Skip to content

Commit df6f292

Browse files
authored
Fix compressed from images oom (#920)
* Implement chunked version of _handle_compressed_write. * Add test to verify if data written in chunks is correct. * Run formatter and linter. * Add docstrings and rename methods.
1 parent 9491958 commit df6f292

File tree

5 files changed

+88
-27
lines changed

5 files changed

+88
-27
lines changed

webknossos/Changelog.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ For upgrade instructions, please check the respective _Breaking Changes_ section
1919
### Changed
2020

2121
### Fixed
22+
- Fixed a bug where compression in add_layer_from_images uses too much memory [#900](https://github.com/scalableminds/webknossos-libs/issues/900)
2223

2324

2425
## [0.13.0](https://github.com/scalableminds/webknossos-libs/releases/tag/v0.13.0) - 2023-06-21

webknossos/tests/dataset/test_dataset.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -541,6 +541,29 @@ def test_update_new_bounding_box_offset(
541541
assure_exported_properties(ds)
542542

543543

544+
def test_chunked_compressed_write() -> None:
545+
ds_path = prepare_dataset_path(DataFormat.WKW, TESTOUTPUT_DIR)
546+
mag = (
547+
Dataset(ds_path, voxel_size=(1, 1, 1))
548+
.get_or_add_layer("color", COLOR_CATEGORY, data_format=DataFormat.WKW)
549+
.get_or_add_mag("1", compress=True)
550+
)
551+
552+
np.random.seed(1234)
553+
data: np.ndarray = (np.random.rand(10, 10, 10) * 255).astype(np.uint8)
554+
555+
# write data in the bottom-right corner of a shard so that other shards have to be written too
556+
mag.write(data, absolute_offset=mag.info.shard_shape - Vec3Int(5, 5, 5))
557+
558+
assert (
559+
mag.get_view(
560+
absolute_offset=mag.info.shard_shape - Vec3Int(5, 5, 5),
561+
size=Vec3Int(10, 10, 10),
562+
).read()
563+
== data
564+
).all()
565+
566+
544567
@pytest.mark.parametrize("data_format,output_path", DATA_FORMATS_AND_OUTPUT_PATHS)
545568
def test_write_multi_channel_uint8(data_format: DataFormat, output_path: Path) -> None:
546569
ds_path = prepare_dataset_path(data_format, output_path, "multichannel")

webknossos/tests/test_cli.py

Lines changed: 16 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -202,29 +202,28 @@ def test_convert() -> None:
202202
assert (wkw_path / PROPERTIES_FILE_NAME).exists()
203203

204204

205-
@pytest.mark.filterwarnings("ignore::UserWarning")
206205
def test_convert_with_all_params() -> None:
207206
"""Tests the functionality of convert subcommand."""
208207

209208
with tmp_cwd():
210209
origin_path = TESTDATA_DIR / "tiff"
211210
wkw_path = Path("wkw_from_tiff_extended")
212-
213-
result = runner.invoke(
214-
app,
215-
[
216-
"convert",
217-
"--voxel-size",
218-
"11.0,11.0,11.0",
219-
"--data-format",
220-
"wkw",
221-
"--name",
222-
"wkw_from_tiff",
223-
"--compress",
224-
str(origin_path),
225-
str(wkw_path),
226-
],
227-
)
211+
with pytest.warns(UserWarning):
212+
result = runner.invoke(
213+
app,
214+
[
215+
"convert",
216+
"--voxel-size",
217+
"11.0,11.0,11.0",
218+
"--data-format",
219+
"wkw",
220+
"--name",
221+
"wkw_from_tiff",
222+
"--compress",
223+
str(origin_path),
224+
str(wkw_path),
225+
],
226+
)
228227

229228
assert result.exit_code == 0
230229
assert (wkw_path / PROPERTIES_FILE_NAME).exists()

webknossos/webknossos/cli/convert.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -92,15 +92,12 @@ def main(
9292
)
9393

9494
with get_executor_for_args(args=executor_args) as executor:
95-
dataset = Dataset.from_images(
95+
Dataset.from_images(
9696
source,
9797
target,
9898
voxel_size,
9999
name=name,
100100
data_format=data_format,
101101
executor=executor,
102+
compress=compress,
102103
)
103-
# TODO pylint: disable=fixme
104-
# Include this in the from_images() call as soon as issue #900 is resolved
105-
if compress:
106-
dataset.compress()

webknossos/webknossos/dataset/view.py

Lines changed: 46 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,17 @@
22
from argparse import Namespace
33
from pathlib import Path
44
from types import TracebackType
5-
from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, Optional, Tuple, Type
5+
from typing import (
6+
TYPE_CHECKING,
7+
Any,
8+
Callable,
9+
Dict,
10+
Iterable,
11+
Iterator,
12+
Optional,
13+
Tuple,
14+
Type,
15+
)
616

717
import numpy as np
818
import wkw
@@ -243,15 +253,46 @@ def write(
243253
current_mag_bbox = mag1_bbox.in_mag(self._mag)
244254

245255
if self._is_compressed():
246-
current_mag_bbox, data = self._handle_compressed_write(
256+
for current_mag_bbox, chunked_data in self._prepare_compressed_write(
247257
current_mag_bbox, data
248-
)
258+
):
259+
self._array.write(current_mag_bbox.topleft, chunked_data)
260+
else:
261+
self._array.write(current_mag_bbox.topleft, data)
249262

250-
self._array.write(current_mag_bbox.topleft, data)
263+
def _prepare_compressed_write(
264+
self, current_mag_bbox: BoundingBox, data: np.ndarray
265+
) -> Iterator[Tuple[BoundingBox, np.ndarray]]:
266+
"""This method takes an arbitrary sized chunk of data with an accompanying bbox,
267+
divides these into chunks of shard_shape size and delegates
268+
the preparation to _prepare_compressed_write_chunk."""
269+
270+
chunked_bboxes = current_mag_bbox.chunk(
271+
self.info.shard_shape,
272+
chunk_border_alignments=self.info.shard_shape,
273+
)
274+
for chunked_bbox in chunked_bboxes:
275+
source_slice: Any
276+
if len(data.shape) == 3:
277+
source_slice = chunked_bbox.offset(
278+
-current_mag_bbox.topleft
279+
).to_slices()
280+
else:
281+
source_slice = (slice(None, None),) + chunked_bbox.offset(
282+
-current_mag_bbox.topleft
283+
).to_slices()
251284

252-
def _handle_compressed_write(
285+
yield self._prepare_compressed_write_chunk(chunked_bbox, data[source_slice])
286+
287+
def _prepare_compressed_write_chunk(
253288
self, current_mag_bbox: BoundingBox, data: np.ndarray
254289
) -> Tuple[BoundingBox, np.ndarray]:
290+
"""This method takes an arbitrary sized chunk of data with an accompanying bbox
291+
(ideally not larger than a shard) and enlarges that chunk to fit the shard it
292+
resides in (by reading the entire shard data and writing the passed data ndarray
293+
into the specified volume). That way, the returned data can be written as a whole
294+
shard which is a requirement for compressed writes."""
295+
255296
aligned_bbox = current_mag_bbox.align_with_mag(self.info.shard_shape, ceil=True)
256297

257298
if current_mag_bbox != aligned_bbox:

0 commit comments

Comments
 (0)