Fixed PimsTiffReader (#1212)

normanrz · web-flow · commit ce9c8b03faf3 · 2024-12-04T16:48:16.000Z
* wip fixed PimsTiffReader

* types, lint

* fixes?

* SequentialExecutor

* fixes issues

* improved out shape

* types for 3.9

* more efficient

* changelog
diff --git a/webknossos/Changelog.md b/webknossos/Changelog.md
@@ -20,6 +20,7 @@ For upgrade instructions, please check the respective _Breaking Changes_ section
 ### Changed
 - Removes vcr-py from developer dependencies for testing and adds proxay for recording and replaying API requests. [#1198](https://github.com/scalableminds/webknossos-libs/pull/1198)
 - Removed the CZI installation extra from `pip install webknossos[all]` by default. Users need to manually install it with `pip install --extra-index-url https://pypi.scm.io/simple/ webknossos[czi]`. [#1219](https://github.com/scalableminds/webknossos-libs/pull/1219)
+- Refactored the PimsTiffReader to read data directly from the TIFF file instead of first creating a memory-mappable copy. This greatly reduces the time and storage required to convert large TIFF files. [#1212](https://github.com/scalableminds/webknossos-libs/pull/1212)
 
 ### Fixed
 - Fixed unpickling of the SSL_Context to allow for a second or third pickling. [#1223](https://github.com/scalableminds/webknossos-libs/pull/1223)
diff --git a/webknossos/tests/dataset/test_add_layer_from_images.py b/webknossos/tests/dataset/test_add_layer_from_images.py
@@ -9,6 +9,7 @@
 import httpx
 import numpy as np
 import pytest
+from cluster_tools import SequentialExecutor
 from tifffile import TiffFile
 
 import webknossos as wk
@@ -38,20 +39,22 @@ def test_compare_tifffile(tmp_path: Path) -> None:
     for z_index in range(0, data.shape[-1]):
         with TiffFile("testdata/tiff/test.0200.tiff") as tif_file:
             comparison_slice = tif_file.asarray().T
-        assert np.array_equal(data[:, :, z_index], comparison_slice)
+        np.testing.assert_array_equal(data[:, :, z_index], comparison_slice)
 
 
 def test_compare_nd_tifffile(tmp_path: Path) -> None:
     ds = wk.Dataset(tmp_path, (1, 1, 1))
-    layer = ds.add_layer_from_images(
-        "testdata/4D/4D_series/4D-series.ome.tif",
-        layer_name="color",
-        category="color",
-        topleft=(2, 55, 100, 100),
-        data_format="zarr3",
-        chunk_shape=(8, 8, 8),
-        chunks_per_shard=(8, 8, 8),
-    )
+    with SequentialExecutor() as executor:
+        layer = ds.add_layer_from_images(
+            "testdata/4D/4D_series/4D-series.ome.tif",
+            layer_name="color",
+            category="color",
+            topleft=(2, 55, 100, 100),
+            data_format="zarr3",
+            chunk_shape=(8, 8, 8),
+            chunks_per_shard=(8, 8, 8),
+            executor=executor,
+        )
     assert layer.bounding_box.topleft == wk.VecInt(
         2, 55, 100, 100, axes=("t", "z", "y", "x")
     )
@@ -62,7 +65,9 @@ def test_compare_nd_tifffile(tmp_path: Path) -> None:
         "testdata/4D/4D_series/4D-series.ome.tif"
     ).asarray()
     read_first_channel_from_dataset = layer.get_finest_mag().read()[0]
-    assert np.array_equal(read_with_tifffile_reader, read_first_channel_from_dataset)
+    np.testing.assert_array_equal(
+        read_with_tifffile_reader, read_first_channel_from_dataset
+    )
 
 
 REPO_IMAGES_ARGS: List[
@@ -184,19 +189,16 @@ def test_compare_nd_tifffile(tmp_path: Path) -> None:
 ]
 
 
-@pytest.mark.parametrize(
-    "path, kwargs, dtype, num_channels, num_layers, size", REPO_IMAGES_ARGS
-)
-def test_repo_images(
+def _test_repo_images(
     tmp_path: Path,
-    path: str,
+    path: Union[str, list[Path]],
     kwargs: Dict,
     dtype: str,
     num_channels: int,
     num_layers: int,
     size: Tuple[int, ...],
 ) -> wk.Dataset:
-    with wk.utils.get_executor_for_args(None) as executor:
+    with SequentialExecutor() as executor:
         ds = wk.Dataset(tmp_path, (1, 1, 1))
         layer = ds.add_layer_from_images(
             path,
@@ -216,6 +218,21 @@ def test_repo_images(
     return ds
 
 
+@pytest.mark.parametrize(
+    "path, kwargs, dtype, num_channels, num_layers, size", REPO_IMAGES_ARGS
+)
+def test_repo_images(
+    tmp_path: Path,
+    path: str,
+    kwargs: Dict,
+    dtype: str,
+    num_channels: int,
+    num_layers: int,
+    size: Tuple[int, ...],
+) -> None:
+    _test_repo_images(tmp_path, path, kwargs, dtype, num_channels, num_layers, size)
+
+
 def download_and_unpack(
     url: Union[str, List[str]], out_path: Path, filename: Union[str, List[str]]
 ) -> None:
@@ -245,7 +262,7 @@ def download_and_unpack(
                 copy(download_file.name, out_path / filename_i)
 
 
-BIOFORMATS_ARGS = [
+BIOFORMATS_ARGS: list[tuple[str, str, dict, str, int, tuple[int, int, int], int]] = [
     (
         "https://samples.scif.io/wtembryo.zip",
         "wtembryo.mov",
@@ -294,10 +311,7 @@ def download_and_unpack(
 ]
 
 
-@pytest.mark.parametrize(
-    "url, filename, kwargs, dtype, num_channels, size, num_layers", BIOFORMATS_ARGS
-)
-def test_bioformats(
+def _test_bioformats(
     tmp_path: Path,
     url: str,
     filename: str,
@@ -326,9 +340,36 @@ def test_bioformats(
     return ds
 
 
+@pytest.mark.parametrize(
+    "url, filename, kwargs, dtype, num_channels, size, num_layers", BIOFORMATS_ARGS
+)
+def test_bioformats(
+    tmp_path: Path,
+    url: str,
+    filename: str,
+    kwargs: Dict,
+    dtype: str,
+    num_channels: int,
+    size: Tuple[int, int, int],
+    num_layers: int,
+) -> None:
+    _test_bioformats(
+        tmp_path, url, filename, kwargs, dtype, num_channels, size, num_layers
+    )
+
+
 # All scif images used here are published with CC0 license,
 # see https://scif.io/images.
-TEST_IMAGES_ARGS = [
+TEST_IMAGES_ARGS: list[
+    tuple[
+        Union[str, list[str]],
+        Union[str, list[str]],
+        dict,
+        str,
+        int,
+        tuple[int, int, int],
+    ]
+] = [
     (
         "https://static.webknossos.org/data/webknossos-libs/slice_0420.dm4",
         "slice_0420.dm4",
@@ -409,10 +450,7 @@ def test_bioformats(
 ]
 
 
-@pytest.mark.parametrize(
-    "url, filename, kwargs, dtype, num_channels, size", TEST_IMAGES_ARGS
-)
-def test_test_images(
+def _test_test_images(
     tmp_path: Path,
     url: Union[str, List[str]],
     filename: Union[str, List[str]],
@@ -461,34 +499,49 @@ def test_test_images(
         assert l_normal.num_channels == num_channels
         assert l_normal.bounding_box.size.to_tuple() == size
         if l_bio is not None:
-            assert np.array_equal(
+            np.testing.assert_array_equal(
                 l_bio.get_finest_mag().read(), l_normal.get_finest_mag().read()
             )
     return ds
 
 
+@pytest.mark.parametrize(
+    "url, filename, kwargs, dtype, num_channels, size", TEST_IMAGES_ARGS
+)
+def test_test_images(
+    tmp_path: Path,
+    url: Union[str, List[str]],
+    filename: Union[str, List[str]],
+    kwargs: Dict,
+    dtype: str,
+    num_channels: int,
+    size: Tuple[int, int, int],
+) -> None:
+    _test_test_images(tmp_path, url, filename, kwargs, dtype, num_channels, size)
+
+
 if __name__ == "__main__":
     time = lambda: strftime("%Y-%m-%d_%H-%M-%S", gmtime())  # noqa: E731
 
-    for repo_images_args in REPO_IMAGES_ARGS:
+    for repo_image in REPO_IMAGES_ARGS:
         with TemporaryDirectory() as tempdir:
-            image_path = repo_images_args[0]
+            image_path = repo_image[0]
             if isinstance(image_path, list):
                 image_path = str(image_path[0])
             name = "".join(filter(str.isalnum, image_path))
-            print(*repo_images_args)
+            print(repo_image)
             print(
-                test_repo_images(Path(tempdir), *repo_images_args)
+                _test_repo_images(Path(tempdir), *repo_image)
                 .upload(f"test_repo_images_{name}_{time()}")
                 .url
             )
 
-    for bioformats_args in BIOFORMATS_ARGS:
+    for bioformat_image in BIOFORMATS_ARGS:
         with TemporaryDirectory() as tempdir:
-            name = "".join(filter(str.isalnum, bioformats_args[1]))
-            print(*bioformats_args)
+            name = "".join(filter(str.isalnum, bioformat_image[1]))
+            print(bioformat_image)
             print(
-                test_bioformats(Path(tempdir), *bioformats_args)
+                _test_bioformats(Path(tempdir), *bioformat_image)
                 .upload(f"test_bioformats_{name}_{time()}")
                 .url
             )
@@ -498,7 +551,7 @@ def test_test_images(
             name = "".join(filter(str.isalnum, test_images_args[1]))
             print(*test_images_args)
             print(
-                test_test_images(Path(tempdir), *test_images_args)
+                _test_test_images(Path(tempdir), *test_images_args)
                 .upload(f"test_test_images_{name}_{time()}")
                 .url
             )
diff --git a/webknossos/tests/dataset/test_from_images.py b/webknossos/tests/dataset/test_from_images.py
@@ -5,6 +5,7 @@
 
 import numpy as np
 import pytest
+from cluster_tools import SequentialExecutor
 from tifffile import TiffFile
 
 import webknossos as wk
@@ -36,7 +37,7 @@ def test_compare_tifffile(tmp_path: Path) -> None:
     for z_index in range(0, data.shape[-1]):
         with TiffFile(TESTDATA_DIR / "tiff" / "test.0000.tiff") as tif_file:
             comparison_slice = tif_file.asarray().T
-        assert np.array_equal(data[:, :, z_index], comparison_slice)
+        np.testing.assert_array_equal(data[:, :, z_index], comparison_slice)
 
 
 def test_multiple_multitiffs(tmp_path: Path) -> None:
@@ -95,11 +96,13 @@ def test_no_slashes_in_layername(tmp_path: Path) -> None:
     )
 
     for strategy in Dataset.ConversionLayerMapping:
-        dataset = wk.Dataset.from_images(
-            tmp_path / "tiff",
-            tmp_path / str(strategy),
-            voxel_size=(10, 10, 10),
-            map_filepath_to_layer_name=strategy,
-        )
-
-        assert all("/" not in layername for layername in dataset.layers)
+        with SequentialExecutor() as executor:
+            dataset = wk.Dataset.from_images(
+                tmp_path / "tiff",
+                tmp_path / str(strategy),
+                voxel_size=(10, 10, 10),
+                map_filepath_to_layer_name=strategy,
+                executor=executor,
+            )
+
+            assert all("/" not in layername for layername in dataset.layers)
diff --git a/webknossos/webknossos/dataset/_utils/pims_tiff_reader.py b/webknossos/webknossos/dataset/_utils/pims_tiff_reader.py