Skip to content

Commit b2d156c

Browse files
authored
make largest_segment_id optional (#925)
* Remove assertions to make largest_segment_id optional. * Adapt tests to accept None for largest_segment_id. * Add update_largest_segment_id method to determine and save id. * Refactor and add test. * Implement requested changes. * Implement requested changes. * Update docstring.
1 parent 7a1acf1 commit b2d156c

File tree

6 files changed

+130
-31
lines changed

6 files changed

+130
-31
lines changed

webknossos/Changelog.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,11 @@ For upgrade instructions, please check the respective _Breaking Changes_ section
1515
### Breaking Changes
1616

1717
### Added
18+
- `View` has a `map_chunk` method now to run a function on each chunk and collect the results in a list.
1819

1920
### Changed
21+
- As WEBKNOSSOS does not require the largest segment id, it is no longer mandatory in the WEBKNOSSOS libs either. [#917](https://github.com/scalableminds/webknossos-libs/issues/917) The method `SegmentationLayer.refresh_largest_segment_id` was added to look up the highest value in the segmentation data and set `largest_segment_id` accordingly.
22+
- The `convert` command of the CLI now has a `--category` flag to select the `LayerCategoryType`.
2023

2124
### Fixed
2225

webknossos/tests/dataset/test_dataset.py

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -787,24 +787,16 @@ def test_open_dataset_without_num_channels_in_properties() -> None:
787787
assure_exported_properties(ds)
788788

789789

790-
def test_largest_segment_id_requirement() -> None:
790+
def test_no_largest_segment_id() -> None:
791791
ds_path = prepare_dataset_path(DataFormat.WKW, TESTOUTPUT_DIR)
792792
ds = Dataset(ds_path, voxel_size=(10, 10, 10))
793793

794-
with pytest.raises(AssertionError):
795-
ds.add_layer("segmentation", SEGMENTATION_CATEGORY)
796-
797-
largest_segment_id = 10
798-
ds.add_layer(
799-
"segmentation",
800-
SEGMENTATION_CATEGORY,
801-
largest_segment_id=largest_segment_id,
802-
).add_mag(Mag(1))
794+
ds.add_layer("segmentation", SEGMENTATION_CATEGORY).add_mag(Mag(1))
803795

804796
ds = Dataset.open(ds_path)
797+
805798
assert (
806-
cast(SegmentationLayer, ds.get_layer("segmentation")).largest_segment_id
807-
== largest_segment_id
799+
cast(SegmentationLayer, ds.get_layer("segmentation")).largest_segment_id is None
808800
)
809801

810802
assure_exported_properties(ds)
@@ -2211,6 +2203,26 @@ def test_get_largest_segment_id() -> None:
22112203
assure_exported_properties(ds)
22122204

22132205

2206+
def test_refresh_largest_segment_id() -> None:
2207+
ds_path = prepare_dataset_path(DataFormat.WKW, TESTOUTPUT_DIR)
2208+
ds = Dataset(ds_path, voxel_size=(1, 1, 1))
2209+
2210+
segmentation_layer = cast(
2211+
SegmentationLayer,
2212+
ds.add_layer("segmentation", SEGMENTATION_CATEGORY),
2213+
)
2214+
mag = segmentation_layer.add_mag(Mag(1))
2215+
2216+
assert segmentation_layer.largest_segment_id is None
2217+
2218+
write_data = (np.random.rand(10, 20, 30) * 255).astype(np.uint8)
2219+
mag.write(data=write_data)
2220+
2221+
segmentation_layer.refresh_largest_segment_id()
2222+
2223+
assert segmentation_layer.largest_segment_id == np.max(write_data, initial=0)
2224+
2225+
22142226
def test_get_or_add_layer_by_type() -> None:
22152227
ds_path = prepare_dataset_path(DataFormat.WKW, TESTOUTPUT_DIR)
22162228
ds = Dataset(ds_path, voxel_size=(1, 1, 1))

webknossos/tests/dataset/test_dataset_deprecated.py

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -520,25 +520,16 @@ def test_open_dataset_without_num_channels_in_properties() -> None:
520520
assure_exported_properties(ds)
521521

522522

523-
def test_largest_segment_id_requirement() -> None:
523+
def test_no_largest_segment_id() -> None:
524524
path = TESTOUTPUT_DIR / "largest_segment_id"
525525
rmtree(path)
526526
ds = Dataset(path, scale=(10, 10, 10))
527527

528-
with pytest.raises(AssertionError):
529-
ds.add_layer("segmentation", SEGMENTATION_CATEGORY)
530-
531-
largest_segment_id = 10
532-
ds.add_layer(
533-
"segmentation",
534-
SEGMENTATION_CATEGORY,
535-
largest_segment_id=largest_segment_id,
536-
).add_mag(Mag(1))
528+
ds.add_layer("segmentation", SEGMENTATION_CATEGORY).add_mag(Mag(1))
537529

538530
ds = Dataset.open(path)
539531
assert (
540-
cast(SegmentationLayer, ds.get_layer("segmentation")).largest_segment_id
541-
== largest_segment_id
532+
cast(SegmentationLayer, ds.get_layer("segmentation")).largest_segment_id is None
542533
)
543534

544535
assure_exported_properties(ds)

webknossos/webknossos/dataset/dataset.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -769,8 +769,6 @@ def add_layer(
769769
Creates a new layer called `layer_name` and adds it to the dataset.
770770
The dtype can either be specified per layer or per channel.
771771
If neither of them are specified, `uint8` per channel is used as default.
772-
When creating a "Segmentation Layer" (`category="segmentation"`),
773-
the parameter `largest_segment_id` also has to be specified.
774772
775773
Creates the folder `layer_name` in the directory of `self.path`.
776774
@@ -837,10 +835,6 @@ def add_layer(
837835
self._properties.data_layers += [layer_properties]
838836
self._layers[layer_name] = Layer(self, layer_properties)
839837
elif category == SEGMENTATION_CATEGORY:
840-
assert (
841-
"largest_segment_id" in kwargs
842-
), f"Failed to create segmentation layer {layer_name}: the parameter 'largest_segment_id' was not specified, which is necessary for segmentation layers."
843-
844838
segmentation_layer_properties: SegmentationLayerProperties = (
845839
SegmentationLayerProperties(
846840
**(

webknossos/webknossos/dataset/layer.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from upath import UPath
1515

1616
from webknossos.dataset.sampling_modes import SamplingModes
17-
from webknossos.dataset.view import _copy_job
17+
from webknossos.dataset.view import View, _copy_job
1818
from webknossos.geometry import BoundingBox, Mag, Vec3Int, Vec3IntLike
1919

2020
from ._array import ArrayException, BaseArray, DataFormat
@@ -1158,3 +1158,23 @@ def category(self) -> LayerCategoryType:
11581158

11591159
def _get_largest_segment_id_maybe(self) -> Optional[int]:
11601160
return self.largest_segment_id
1161+
1162+
def _get_largest_segment_id(self, view: View) -> int:
1163+
return np.max(view.read(), initial=0)
1164+
1165+
def refresh_largest_segment_id(
1166+
self, chunk_shape: Optional[Vec3Int] = None, executor: Optional[Executor] = None
1167+
) -> None:
1168+
"""Sets the largest segment id to the highest value in the data.
1169+
largest_segment_id is set to `None` if the data is empty."""
1170+
1171+
try:
1172+
chunk_results = self.get_finest_mag().map_chunk(
1173+
self._get_largest_segment_id,
1174+
chunk_shape=chunk_shape,
1175+
executor=executor,
1176+
progress_desc="Searching largest segment id",
1177+
)
1178+
self.largest_segment_id = max(chunk_results)
1179+
except ValueError:
1180+
self.largest_segment_id = None

webknossos/webknossos/dataset/view.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
Dict,
1010
Iterable,
1111
Iterator,
12+
List,
1213
Optional,
1314
Tuple,
1415
Type,
@@ -203,6 +204,26 @@ def write(
203204
This parameter used to be relative for `View` and absolute for `MagView`,
204205
and specified in the mag of the respective view.
205206
207+
Writing data to a segmentation layer manually does not automatically update the largest_segment_id. To set
208+
the largest segment id properly run the `refresh_largest_segment_id` method on your layer or set the
209+
`largest_segment_id` property manually.
210+
211+
Example:
212+
213+
```python
214+
ds = Dataset(DS_PATH, voxel_size=(1, 1, 1))
215+
216+
segmentation_layer = cast(
217+
SegmentationLayer,
218+
ds.add_layer("segmentation", SEGMENTATION_CATEGORY),
219+
)
220+
mag = segmentation_layer.add_mag(Mag(1))
221+
222+
mag.write(data=MY_NP_ARRAY)
223+
224+
segmentation_layer.refresh_largest_segment_id()
225+
```
226+
206227
Note that writing compressed data which is not aligned with the blocks on disk may result in
207228
diminished performance, as full blocks will automatically be read to pad the write actions.
208229
"""
@@ -809,6 +830,64 @@ def some_work(args: Tuple[View, int], some_parameter: int) -> None:
809830
executor.map_to_futures(func_per_chunk, job_args), progress_desc
810831
)
811832

833+
def map_chunk(
834+
self,
835+
func_per_chunk: Callable[["View"], Any],
836+
chunk_shape: Optional[Vec3IntLike] = None, # in Mag(1)
837+
executor: Optional[Executor] = None,
838+
progress_desc: Optional[str] = None,
839+
) -> List[Any]:
840+
"""
841+
The view is chunked into multiple sub-views of size `chunk_shape` (in Mag(1)),
842+
by default one chunk per file.
843+
Then, `func_per_chunk` is performed on each sub-view and the results are collected
844+
in a list.
845+
Additional parameters for `func_per_chunk` can be specified using `functools.partial`.
846+
The computation of each chunk has to be independent of each other.
847+
Therefore, the work can be parallelized with `executor`.
848+
849+
If the `View` is of type `MagView` only the bounding box from the properties is chunked.
850+
851+
Example:
852+
```python
853+
from webknossos.utils import named_partial
854+
855+
def some_work(view: View, some_parameter: int) -> None:
856+
# perform operations on the view
857+
...
858+
859+
# ...
860+
# let 'mag1' be a `MagView`
861+
func = named_partial(some_work, some_parameter=42)
862+
results = mag1.map_chunk(
863+
func,
864+
)
865+
```
866+
"""
867+
868+
if chunk_shape is None:
869+
chunk_shape = self._get_file_dimensions_mag1()
870+
else:
871+
chunk_shape = Vec3Int(chunk_shape)
872+
self._check_chunk_shape(chunk_shape, read_only=self.read_only)
873+
874+
job_args = []
875+
for chunk in self.bounding_box.chunk(chunk_shape, chunk_shape):
876+
chunk_view = self.get_view(
877+
absolute_offset=chunk.topleft,
878+
size=chunk.size,
879+
)
880+
job_args.append(chunk_view)
881+
882+
# execute the work for each chunk
883+
with get_executor_for_args(None, executor) as executor:
884+
results = wait_and_ensure_success(
885+
executor.map_to_futures(func_per_chunk, job_args),
886+
progress_desc=progress_desc,
887+
)
888+
889+
return results
890+
812891
def for_zipped_chunks(
813892
self,
814893
func_per_chunk: Callable[[Tuple["View", "View", int]], None],

0 commit comments

Comments
 (0)