Add lazy loading for File_Manager for better multiprocessing; Add cv2 IO

Lars-Kraemer · Lars-Kraemer · commit c52b10e40055 · 2025-11-05T14:34:51.000+01:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -37,6 +37,7 @@ dependencies = [
     "pyyaml",
     "seaborn",
     "scikit-learn",
+    "opencv-python-headless"
 ]
 
 [project.optional-dependencies]
diff --git a/src/vidata/analysis/image_analyzer.py b/src/vidata/analysis/image_analyzer.py
@@ -44,8 +44,8 @@ def __init__(self, data_loader: BaseLoader, file_manager: FileManager, nchannels
         self.stats = None
         self.global_stats = None
 
-    def analyze_case(self, index, verbose=False):
-        file = self.file_manager[index]
+    def analyze_case(self, file, verbose=False):
+        # file = self.file_manager[index]
         data, meta = self.data_loader.load(file)
         data = data[...]  # To resolve memmap dtypes
         stats = {
@@ -79,7 +79,7 @@ def analyze_case(self, index, verbose=False):
     def run(self, n_processes=8, progressbar=True, verbose=False):
         stats = multiprocess_iter(
             self.analyze_case,
-            iterables={"index": np.arange(0, len(self.file_manager))},
+            iterables={"file": self.file_manager},
             const={"verbose": verbose},
             p=n_processes,
             progressbar=progressbar,
diff --git a/src/vidata/analysis/label_analyzer.py b/src/vidata/analysis/label_analyzer.py
@@ -31,8 +31,8 @@ def __init__(
         self.n_classes = n_classes
         self.ignore_bg = ignore_bg
 
-    def analyze_case(self, index, verbose=False):
-        file = self.file_manager[index]
+    def analyze_case(self, file, verbose=False):
+        # file = self.file_manager[index]
         data, meta = self.data_loader.load(file)
         data = data[...]  # To resolve memmap dtypes
         data = data.astype(np.uint8)
@@ -58,7 +58,7 @@ def analyze_case(self, index, verbose=False):
     def run(self, n_processes=8, progressbar=True, verbose=False):
         stats = multiprocess_iter(
             self.analyze_case,
-            iterables={"index": np.arange(0, len(self.file_manager))},
+            iterables={"file": self.file_manager},
             const={"verbose": verbose},
             p=n_processes,
             progressbar=progressbar,
@@ -173,13 +173,15 @@ def plot(self, path, name=""):
         # --- Size - Frequency Plot --- #
         colors = get_colormap("tab10", len(class_cnt), as_uint=True)
         fig = go.Figure()
-        for cnt, size, name, col in zip(class_cnt, class_size, categories, colors, strict=False):
+        for cnt, size, legend_name, col in zip(
+            class_cnt, class_size, categories, colors, strict=False
+        ):
             fig.add_trace(
                 go.Scatter(
                     x=[cnt],
                     y=[size],
                     mode="markers",
-                    name=name,  # ← legend label
+                    name=legend_name,  # ← legend label
                     marker={
                         "size": 15,
                         "color": f"rgb{col}",
diff --git a/src/vidata/file_manager/file_manager.py b/src/vidata/file_manager/file_manager.py
@@ -10,6 +10,7 @@
 class FileManager:
     """
     Flexible file collector with optional patterns and name based filtering.
+    Also supports lazy loading (useful for multiprocessing).
 
     Parameters
     ----------
@@ -25,6 +26,8 @@ class FileManager:
         Drop files whose RELATIVE path contains ANY of these substrings. (Exclude wins.)
     recursive: bool
         Whether to recursively search subdirectories.
+    lazy_init : bool
+        If True, defer file collection until the first access (default: False).
     """
 
     def __init__(
@@ -35,56 +38,154 @@ def __init__(
         include_names: list[str] | None = None,
         exclude_names: list[str] | None = None,
         recursive: bool = False,
+        lazy_init: bool = False,
     ):
         self.path = path
         self.file_type = file_type
         self.pattern = pattern
         self.include_names = include_names
         self.exclude_names = exclude_names
         self.recursive = recursive
-        self.collect_files()
-        self.filter_files()
 
-    def filter_files(self):
-        if self.include_names is not None:
-            _files_re = [str(_file.relative_to(self.path)) for _file in self.files]
-            self.files = [
+        self._files: list[Path] | None
+        if not lazy_init:
+            self.refresh()
+        else:
+            self._files = None
+
+    def refresh(self):
+        """
+        (Re)collect and filter files immediately.
+
+        Scans ``path`` via :meth:`collect_files`, applies :meth:`filter_files`, and only
+        then replaces the cached list, so a failing refresh leaves the old state intact.
+        """
+        collected = self.collect_files(self.path, self.file_type, self.pattern, self.recursive)
+        self._files = self.filter_files(
+            collected, self.path, self.include_names, self.exclude_names
+        )
+
+    @property
+    def files(self) -> list[Path]:
+        """
+        Return the file list, collecting it on first access while the cache is ``None``
+        (``lazy_init=True`` or a freshly unpickled instance).
+        """
+        cached = self._files
+        if cached is None:  # nothing collected yet
+            self.refresh()
+            cached = self._files
+            assert cached is not None
+        return cached
+
+    @files.setter
+    def files(self, value: list[Path]):
+        """Replace the cached file list as given (advanced use only)."""
+        self._files = value
+
+    @staticmethod
+    def filter_files(
+        files: list[Path],
+        path: Path,
+        include_names: list[str] | None = None,
+        exclude_names: list[str] | None = None,
+    ) -> list[Path]:
+        """
+        Filter a list of files based on inclusion or exclusion substrings.
+
+        Parameters
+        ----------
+        files : list[Path]
+            Input file list.
+        path : Path
+            Root path used to compute relative paths for filtering.
+        include_names : list[str] | None
+            Substrings; keep files containing any of these in their relative path.
+        exclude_names : list[str] | None
+            Substrings; remove files containing any of these in their relative path.
+
+        Returns
+        -------
+        list[Path]
+            Filtered file list.
+        """
+        if include_names is not None:
+            _files_re = [str(_file.relative_to(path)) for _file in files]
+            files = [
                 _file
-                for _file, rel in zip(list(self.files), _files_re, strict=False)
-                if any(_token in rel for _token in self.include_names)
+                for _file, rel in zip(list(files), _files_re, strict=False)
+                if any(_token in rel for _token in include_names)
             ]
 
-        if self.exclude_names is not None:
-            _files_re = [str(_file.relative_to(self.path)) for _file in self.files]
-            self.files = [
+        if exclude_names is not None:
+            _files_re = [str(_file.relative_to(path)) for _file in files]
+            files = [
                 _file
-                for _file, rel in zip(list(self.files), _files_re, strict=False)
-                if not any(_token in rel for _token in self.exclude_names)
+                for _file, rel in zip(list(files), _files_re, strict=False)
+                if not any(_token in rel for _token in exclude_names)
             ]
+        return files
 
-    def collect_files(self):
-        if self.file_type == "" or self.path == "":
-            self.files = []
-            return
+    @staticmethod
+    def collect_files(
+        path: Path, file_type: str, pattern: str | None, recursive: bool = False
+    ) -> list[Path]:
+        """
+        Collect files under the given directory according to a pattern and extension.
+
+        Parameters
+        ----------
+        path : Path
+            Root directory to search.
+        file_type : str
+            File extension to match (e.g., ".png").
+        pattern : str | None
+            Glob-like pattern (e.g., "*_image").
+        recursive : bool, optional
+            Whether to recursively search subdirectories.
 
-        if self.pattern is None:
+        Returns
+        -------
+        list[Path]
+            Naturally sorted list of file paths.
+        """
+        if file_type == "" or path == "":
+            return []
+
+        if pattern is None:
             pattern = "*"
-        elif "*" not in self.pattern:
-            pattern = "*" + self.pattern
+        elif "*" not in pattern:
+            pattern = "*" + pattern
         else:
-            pattern = self.pattern
+            pattern = pattern
 
-        if self.recursive:
-            files = list(Path(self.path).rglob(pattern + self.file_type))
+        if recursive:
+            files = list(Path(path).rglob(pattern + file_type))
         else:
-            files = list(Path(self.path).glob(pattern + self.file_type))
-        self.files = natsorted(files, key=lambda p: p.name)
+            files = list(Path(path).glob(pattern + file_type))
+        # self.files = natsorted(files, key=lambda p: p.name)
+        return natsorted(files, key=lambda p: p.name)
 
     def get_name(self, file: str | int, with_file_type=True) -> str:
-        """Just keep this for backwards compatibility"""
+        """Legacy alias for :meth:`name_from_path` (kept for backward compatibility)."""
         return self.name_from_path(file, with_file_type)
 
     def name_from_path(self, file: str | int, include_ext: bool = True) -> str:
+        """
+        Get the relative name of a file (e.g., 'subdir/sample.png').
+
+        Parameters
+        ----------
+        file : str | int
+            File path or index into the internal file list.
+        include_ext : bool
+            Whether to keep the file extension.
+
+        Returns
+        -------
+        str
+            Relative file name.
+        """
         if isinstance(file, int):
             file = str(self.files[file])
         name = str(Path(file).relative_to(self.path))
@@ -93,6 +194,9 @@ def name_from_path(self, file: str | int, include_ext: bool = True) -> str:
         return name
 
     def path_from_name(self, name: str | Path, include_ext=True):
+        """
+        Convert a relative name (as from :meth:`name_from_path`) to an absolute path.
+        """
         rel = Path(name)
         if include_ext and rel.suffix != self.file_type:
             rel = rel.with_suffix(self.file_type)
@@ -107,6 +211,36 @@ def __len__(self):
     def __iter__(self):
         return iter(self.files)
 
+    def __getstate__(self):
+        """
+        Return a lightweight pickle state that leaves out the collected file list.
+
+        ``path`` is stored as is (not stringified) so its type, and an empty ``""``
+        path, survive the round trip. ``_files`` is always ``None`` in the state.
+        """
+        return {
+            "path": self.path,
+            "file_type": self.file_type,
+            "pattern": self.pattern,
+            "include_names": self.include_names,
+            "exclude_names": self.exclude_names,
+            "recursive": self.recursive,
+            "_files": None,
+        }
+
+    def __setstate__(self, state):
+        """
+        Restore the attributes saved by ``__getstate__`` (used when unpickling).
+        ``path`` is taken verbatim: ``Path("")`` would turn an empty path into ``"."``.
+        """
+        self.path = state["path"]
+        self.file_type = state["file_type"]
+        self.pattern = state["pattern"]
+        self.include_names = state["include_names"]
+        self.exclude_names = state["exclude_names"]
+        self.recursive = state["recursive"]
+        self._files = state.get("_files", None)
+
 
 class FileManagerStacked(FileManager):
     """
diff --git a/src/vidata/io/__init__.py b/src/vidata/io/__init__.py
@@ -1,12 +1,13 @@
 # isort: skip_file # order matters, first ones in list are the defaults
 # ruff: noqa: I001, I002  # disable Ruff's import-sorting checks for this file
-from .image_io import load_image, save_image
+from .image_io import load_image, save_image, load_imageRGB
+from .cv2_io import load_cv2, save_cv2, load_cv2RGB, save_cv2RGB
 from .sitk_io import load_sitk, save_sitk
 from .nib_io import load_nib, save_nib, load_nibRO, save_nibRO
 from .tif_io import load_tif, save_tif
 from .blosc2_io import load_blosc2, load_blosc2pkl, save_blosc2, save_blosc2pkl
 from .numpy_io import load_npy, load_npz, save_npy, save_npz
-from .json_io import load_json, save_json
+from .json_io import load_json, save_json, load_jsongz, save_jsongz
 from .pickle_io import load_pickle, save_pickle
 from .txt_io import load_txt, save_txt
 from .yaml_io import load_yaml, save_yaml
@@ -26,6 +27,11 @@
     "save_tif",
     "load_image",
     "save_image",
+    "load_imageRGB",
+    "load_cv2",
+    "save_cv2",
+    "load_cv2RGB",
+    "save_cv2RGB",
     "load_npy",
     "save_npy",
     "load_npz",
@@ -34,6 +40,8 @@
     "save_yaml",
     "load_json",
     "save_json",
+    "load_jsongz",
+    "save_jsongz",
     "load_pickle",
     "save_pickle",
     "load_txt",
diff --git a/src/vidata/io/cv2_io.py b/src/vidata/io/cv2_io.py
@@ -0,0 +1,127 @@
+from pathlib import Path
+
+import cv2
+import numpy as np
+
+from vidata.registry import register_loader, register_writer
+
+# 0 makes OpenCV run its functions sequentially (no internal threading).
+# NOTE(review): presumably avoids oversubscription under multiprocessing -- confirm.
+cv2.setNumThreads(0)
+
+
+def _imread(file: str | Path, flags: int) -> np.ndarray:
+    """Read ``file`` with ``cv2.imread``, raising instead of returning ``None``."""
+    # str(): OpenCV builds without path-like support reject ``pathlib.Path``.
+    data = cv2.imread(str(file), flags)
+    if data is None:  # imread reports a missing/undecodable file this way
+        raise OSError(f"cv2.imread could not read '{file}'")
+    return data
+
+
+def _imwrite(data: np.ndarray, file: str | Path) -> None:
+    """Write ``data`` with ``cv2.imwrite``, raising if OpenCV reports failure."""
+    if not cv2.imwrite(str(file), data):
+        raise OSError(f"cv2.imwrite could not write '{file}'")
+
+
+@register_loader("image", ".png", ".jpg", ".jpeg", ".bmp", backend="cv2")
+@register_loader("mask", ".png", ".bmp", backend="cv2")
+def load_cv2(file: str | Path):
+    """
+    Load an image or mask as stored on disk (``cv2.IMREAD_UNCHANGED``).
+
+    Parameters
+    ----------
+    file : str | Path
+        File to read.
+
+    Returns
+    -------
+    tuple[np.ndarray, dict]
+        Pixel array (color data keeps OpenCV's BGR(A) channel order) and an
+        empty metadata dict.
+
+    Raises
+    ------
+    OSError
+        If OpenCV cannot read the file.
+    """
+    return _imread(file, cv2.IMREAD_UNCHANGED), {}
+
+
+@register_writer("image", ".png", ".jpg", ".jpeg", ".bmp", backend="cv2")
+@register_writer("mask", ".png", ".bmp", backend="cv2")
+def save_cv2(data: np.ndarray, file: str | Path) -> list[str]:
+    """
+    Write ``data`` to ``file`` without any channel reordering.
+
+    Parameters
+    ----------
+    data : np.ndarray
+        Array handed to ``cv2.imwrite`` as is (OpenCV treats color data as BGR).
+    file : str | Path
+        Destination; the extension selects the encoder.
+
+    Returns
+    -------
+    list[str]
+        Single-element list with the written path.
+
+    Raises
+    ------
+    OSError
+        If OpenCV reports that the file was not written.
+    """
+    _imwrite(data, file)
+    return [str(file)]
+
+
+@register_writer("image", ".png", ".jpg", ".jpeg", ".bmp", backend="cv2RGB")
+def save_cv2RGB(data: np.ndarray, file: str | Path) -> list[str]:
+    """
+    Write an RGB array to ``file``, converting to OpenCV's BGR order first.
+
+    Parameters
+    ----------
+    data : np.ndarray
+        Image in RGB channel order.
+    file : str | Path
+        Destination; the extension selects the encoder.
+
+    Returns
+    -------
+    list[str]
+        Single-element list with the written path.
+
+    Raises
+    ------
+    OSError
+        If OpenCV reports that the file was not written.
+    """
+    _imwrite(cv2.cvtColor(data, cv2.COLOR_RGB2BGR), file)
+    return [str(file)]
+
+
+@register_loader("image", ".png", ".jpg", ".jpeg", ".bmp", backend="cv2RGB")
+def load_cv2RGB(file: str | Path):
+    """
+    Load ``file`` as a 3-channel color image and return it in RGB order.
+
+    Parameters
+    ----------
+    file : str | Path
+        File to read.
+
+    Returns
+    -------
+    tuple[np.ndarray, dict]
+        RGB pixel array and an empty metadata dict.
+
+    Raises
+    ------
+    OSError
+        If OpenCV cannot read the file.
+    """
+    bgr = _imread(file, cv2.IMREAD_COLOR)
+    return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB), {}  # BGR -> RGB
diff --git a/src/vidata/io/image_io.py b/src/vidata/io/image_io.py
diff --git a/src/vidata/io/json_io.py b/src/vidata/io/json_io.py

Original file line number	Diff line number	Diff line change
`@@ -37,6 +37,7 @@ dependencies = [`
`37`	`37`	`"pyyaml",`
`38`	`38`	`"seaborn",`
`39`	`39`	`"scikit-learn",`
	`40`	`+ "opencv-python-headless"`
`40`	`41`	`]`
`41`	`42`
`42`	`43`	`[project.optional-dependencies]`