Commit 3cf3611

Merge pull request #300 from bioimage-io/impl_scale_mean_var
Implement scale_mean_var
2 parents b0ceac8 + e25b27a commit 3cf3611

File tree

6 files changed: +213 additions, -59 deletions

bioimageio/core/prediction_pipeline/_combined_processing.py

Lines changed: 68 additions & 33 deletions

@@ -1,7 +1,8 @@
-from typing import List, Optional, Sequence, Union
+import dataclasses
+from typing import Any, Dict, List, Optional, Sequence, Union
 
 from bioimageio.core.resource_io import nodes
-from ._processing import EnsureDtype, KNOWN_PROCESSING, Processing
+from ._processing import AssertDtype, EnsureDtype, KNOWN_PROCESSING, Processing, TensorName
 from ._utils import ComputedMeasures, PER_DATASET, PER_SAMPLE, RequiredMeasures, Sample
 
 try:
@@ -10,44 +11,78 @@
     from typing_extensions import Literal  # type: ignore
 
 
+@dataclasses.dataclass
+class ProcessingInfoStep:
+    name: str
+    kwargs: Dict[str, Any]
+
+
+@dataclasses.dataclass
+class ProcessingInfo:
+    steps: List[ProcessingInfoStep]
+    assert_dtype_before: Optional[Union[str, Sequence[str]]] = None  # throw AssertionError if data type doesn't match
+    ensure_dtype_before: Optional[str] = None  # cast data type if needed
+    assert_dtype_after: Optional[Union[str, Sequence[str]]] = None  # throw AssertionError if data type doesn't match
+    ensure_dtype_after: Optional[str] = None  # cast data type if needed
+
+
 class CombinedProcessing:
-    def __init__(self, tensor_specs: Union[List[nodes.InputTensor], List[nodes.OutputTensor]]):
-        PRE: Literal["pre"] = "pre"
-        POST: Literal["post"] = "post"
-        proc_prefix: Optional[Literal["pre", "post"]] = None
+    def __init__(self, combine_tensors: Dict[TensorName, ProcessingInfo]):
         self._procs = []
-        for t in tensor_specs:
-            if isinstance(t, nodes.InputTensor):
-                steps = t.preprocessing or []
-                if proc_prefix is not None and proc_prefix != PRE:
-                    raise ValueError(f"Invalid mixed input/output tensor specs: {tensor_specs}")
-
-                proc_prefix = PRE
-            elif isinstance(t, nodes.OutputTensor):
-                steps = t.postprocessing or []
-                if proc_prefix is not None and proc_prefix != POST:
-                    raise ValueError(f"Invalid mixed input/output tensor specs: {tensor_specs}")
-
-                proc_prefix = POST
-            else:
-                raise NotImplementedError(t)
+        known = dict(KNOWN_PROCESSING["pre"])
+        known.update(KNOWN_PROCESSING["post"])
+
+        # ensure all tensors have correct data type before any processing
+        for tensor_name, info in combine_tensors.items():
+            if info.assert_dtype_before is not None:
+                self._procs.append(AssertDtype(tensor_name=tensor_name, dtype=info.assert_dtype_before))
 
-            for step in steps:
-                self._procs.append(KNOWN_PROCESSING[proc_prefix][step.name](tensor_name=t.name, **step.kwargs))
+            if info.ensure_dtype_before is not None:
+                self._procs.append(EnsureDtype(tensor_name=tensor_name, dtype=info.ensure_dtype_before))
 
-        # There is a difference between pre-and-postprocessing:
-        # Pre-processing always returns float32, because its output is consumed by the model.
-        # Post-processing, however, should return the dtype that is specified in the model spec.
-        # todo: cast dtype for inputs before preprocessing? or check dtype?
-        if proc_prefix == POST:
-            for t in tensor_specs:
-                self._procs.append(EnsureDtype(tensor_name=t.name, dtype=t.data_type))
+        for tensor_name, info in combine_tensors.items():
+            for step in info.steps:
+                self._procs.append(known[step.name](tensor_name=tensor_name, **step.kwargs))
+
+            if info.assert_dtype_after is not None:
+                self._procs.append(AssertDtype(tensor_name=tensor_name, dtype=info.assert_dtype_after))
+
+            # ensure tensor has correct data type right after its processing
+            if info.ensure_dtype_after is not None:
+                self._procs.append(EnsureDtype(tensor_name=tensor_name, dtype=info.ensure_dtype_after))
 
         self.required_measures: RequiredMeasures = self._collect_required_measures(self._procs)
-        if proc_prefix == POST and self.required_measures[PER_DATASET]:
-            raise NotImplementedError("computing statistics for output tensors per dataset is not yet implemented")
+        self.tensor_names = list(combine_tensors)
+
+    @classmethod
+    def from_tensor_specs(cls, tensor_specs: List[Union[nodes.InputTensor, nodes.OutputTensor]]):
+        combine_tensors = {}
+        for ts in tensor_specs:
+            # There is a difference between pre- and postprocessing:
+            # After preprocessing we ensure float32, because the output is consumed by the model.
+            # After postprocessing the dtype that is specified in the model spec needs to be ensured.
+            assert ts.name not in combine_tensors
+            if isinstance(ts, nodes.InputTensor):
+                # todo: assert nodes.InputTensor.dtype with assert_dtype_before?
+                # todo: in the long run we do not want to limit model inputs to float32...
+                combine_tensors[ts.name] = ProcessingInfo(
+                    [ProcessingInfoStep(p.name, kwargs=p.kwargs) for p in ts.preprocessing or []],
+                    ensure_dtype_after="float32",
+                )
+            elif isinstance(ts, nodes.OutputTensor):
+                combine_tensors[ts.name] = ProcessingInfo(
+                    [ProcessingInfoStep(p.name, kwargs=p.kwargs) for p in ts.postprocessing or []],
+                    ensure_dtype_after=ts.data_type,
+                )
+            else:
+                raise NotImplementedError(type(ts))
+
+        inst = cls(combine_tensors)
+        for ts in tensor_specs:
+            if isinstance(ts, nodes.OutputTensor) and ts.name in inst.required_measures[PER_DATASET]:
+                raise NotImplementedError("computing statistics for output tensors per dataset is not yet implemented")
 
-        self.tensor_names = [t.name for t in tensor_specs]
+        return inst
 
     def apply(self, sample: Sample, computed_measures: ComputedMeasures) -> None:
         for proc in self._procs:
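For context, a minimal usage sketch (not part of this commit) of the new ProcessingInfo-based construction path; the tensor name "input0" and the zero_mean_unit_variance kwargs are made up for illustration, and it assumes a bioimageio.core version that already contains this change:

# Illustrative sketch only: build a CombinedProcessing by hand from ProcessingInfo objects.
# Tensor name and processing kwargs are hypothetical examples.
from bioimageio.core.prediction_pipeline._combined_processing import (
    CombinedProcessing,
    ProcessingInfo,
    ProcessingInfoStep,
)

pre = CombinedProcessing(
    {
        "input0": ProcessingInfo(
            steps=[ProcessingInfoStep("zero_mean_unit_variance", kwargs={"mode": "per_sample"})],
            ensure_dtype_after="float32",  # model inputs are currently kept as float32
        )
    }
)

# required_measures lists the statistics (e.g. per-sample mean/std) that must be computed
# before `pre.apply(sample, computed_measures)` can run.
print(pre.required_measures)

create_prediction_pipeline keeps using the from_tensor_specs classmethod (see _prediction_pipeline.py below), which builds the same kind of ProcessingInfo entries from the model's input and output tensor specs.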

bioimageio/core/prediction_pipeline/_prediction_pipeline.py

Lines changed: 2 additions & 2 deletions

@@ -213,7 +213,7 @@ def create_prediction_pipeline(
     ipts = [resolve_raw_node(s, nodes) for s in bioimageio_model.inputs]
     outs = [resolve_raw_node(s, nodes) for s in bioimageio_model.outputs]
 
-    preprocessing = CombinedProcessing(ipts)
+    preprocessing = CombinedProcessing.from_tensor_specs(ipts)
 
     def sample_dataset():
         for tensors in dataset_for_initial_statistics:
@@ -225,7 +225,7 @@ def sample_dataset():
         update_dataset_stats_after_n_samples=update_dataset_stats_after_n_samples,
         update_dataset_stats_for_n_samples=update_dataset_stats_for_n_samples,
     )
-    postprocessing = CombinedProcessing(outs)
+    postprocessing = CombinedProcessing.from_tensor_specs(outs)
     out_stats = StatsState(
         postprocessing.required_measures,
         dataset=tuple(),

bioimageio/core/prediction_pipeline/_processing.py

Lines changed: 73 additions & 22 deletions

@@ -1,6 +1,12 @@
-from dataclasses import dataclass, field, fields
-from typing import Mapping, Optional, Sequence, Type, Union
-
+"""Here pre- and postprocessing operations are implemented according to their definitions in bioimageio.spec:
+see https://github.com/bioimage-io/spec-bioimage-io/blob/gh-pages/preprocessing_spec_latest.md
+and https://github.com/bioimage-io/spec-bioimage-io/blob/gh-pages/postprocessing_spec_latest.md
+"""
+import numbers
+from dataclasses import InitVar, dataclass, field, fields
+from typing import List, Mapping, Optional, Sequence, Tuple, Type, Union
+
+import numpy
 import numpy as np
 import xarray as xr
 
@@ -33,7 +39,7 @@ def _get_fixed(
 
 @dataclass
 class Processing:
-    """base class for all Pre- and Postprocessing transformations"""
+    """base class for all Pre- and Postprocessing transformations."""
 
     tensor_name: str
     # todo: in python>=3.10 we should use dataclasses.KW_ONLY instead of MISSING (see child classes) to make inheritance work properly
@@ -87,48 +93,64 @@ def __post_init__(self):
 
 
 #
-# helpers
+# Pre- and Postprocessing implementations
 #
-def ensure_dtype(tensor: xr.DataArray, *, dtype) -> xr.DataArray:
-    """
-    Convert array to a given datatype
-    """
-    return tensor.astype(dtype)
 
 
-#
-# Pre- and Postprocessing implementations
-#
+@dataclass
+class AssertDtype(Processing):
+    """Helper Processing to assert dtype."""
+
+    dtype: Union[str, Sequence[str]] = MISSING
+    assert_with: Tuple[Type[numpy.dtype], ...] = field(init=False)
+
+    def __post_init__(self):
+        if isinstance(self.dtype, str):
+            dtype = [self.dtype]
+        else:
+            dtype = self.dtype
+
+        self.assert_with = tuple(type(numpy.dtype(dt)) for dt in dtype)
+
+    def apply(self, tensor: xr.DataArray) -> xr.DataArray:
+        assert isinstance(tensor.dtype, self.assert_with)
+        return tensor
 
 
 @dataclass
 class Binarize(Processing):
+    """'output = tensor > threshold'."""
+
     threshold: float = MISSING  # make dataclass inheritance work for py<3.10 by using an explicit MISSING value.
 
     def apply(self, tensor: xr.DataArray) -> xr.DataArray:
-        return ensure_dtype(tensor > self.threshold, dtype="float32")
+        return tensor > self.threshold
 
 
 @dataclass
 class Clip(Processing):
+    """Limit tensor values to [min, max]."""
+
     min: float = MISSING
     max: float = MISSING
 
     def apply(self, tensor: xr.DataArray) -> xr.DataArray:
-        return ensure_dtype(tensor.clip(min=self.min, max=self.max), dtype="float32")
+        return tensor.clip(min=self.min, max=self.max)
 
 
 @dataclass
 class EnsureDtype(Processing):
+    """Helper Processing to cast dtype if needed."""
+
     dtype: str = MISSING
 
     def apply(self, tensor: xr.DataArray) -> xr.DataArray:
-        return ensure_dtype(tensor, dtype=self.dtype)
+        return tensor.astype(self.dtype)
 
 
 @dataclass
 class ScaleLinear(Processing):
-    """scale the tensor with a fixed multiplicative and additive factor"""
+    """Scale the tensor with a fixed multiplicative and additive factor."""
 
     gain: Union[float, Sequence[float]] = MISSING
     offset: Union[float, Sequence[float]] = MISSING
@@ -143,7 +165,7 @@ def apply(self, tensor: xr.DataArray) -> xr.DataArray:
             gain = self.gain
             offset = self.offset
 
-        return ensure_dtype(tensor * gain + offset, dtype="float32")
+        return tensor * gain + offset
 
     def __post_init__(self):
         super().__post_init__()
@@ -154,11 +176,37 @@ def __post_init__(self):
 
 @dataclass
 class ScaleMeanVariance(Processing):
-    ...
+    """Scale the tensor s.t. its mean and variance match a reference tensor."""
+
+    mode: Literal[SampleMode, DatasetMode] = PER_SAMPLE
+    reference_tensor: TensorName = MISSING
+    axes: Optional[Sequence[str]] = None
+    eps: float = 1e-6
+
+    def get_required_measures(self) -> RequiredMeasures:
+        axes = None if self.axes is None else tuple(self.axes)
+        return {
+            self.mode: {
+                self.tensor_name: {Mean(axes=axes), Std(axes=axes)},
+                self.reference_tensor: {Mean(axes=axes), Std(axes=axes)},
+            }
+        }
+
+    def apply(self, tensor: xr.DataArray) -> xr.DataArray:
+        axes = None if self.axes is None else tuple(self.axes)
+        assert self.mode in (PER_SAMPLE, PER_DATASET)
+        mean = self.get_computed_measure(self.tensor_name, Mean(axes), mode=self.mode)
+        std = self.get_computed_measure(self.tensor_name, Std(axes), mode=self.mode)
+        ref_mean = self.get_computed_measure(self.reference_tensor, Mean(axes), mode=self.mode)
+        ref_std = self.get_computed_measure(self.reference_tensor, Std(axes), mode=self.mode)
+
+        return (tensor - mean) / (std + self.eps) * (ref_std + self.eps) + ref_mean
 
 
 @dataclass
 class ScaleRange(Processing):
+    """Scale with percentiles."""
+
     mode: Literal[SampleMode, DatasetMode] = PER_SAMPLE
     axes: Optional[Sequence[str]] = None
     min_percentile: float = 0.0
@@ -177,7 +225,7 @@ def apply(self, tensor: xr.DataArray) -> xr.DataArray:
         v_lower = self.get_computed_measure(ref_name, Percentile(self.min_percentile, axes=axes))
        v_upper = self.get_computed_measure(ref_name, Percentile(self.max_percentile, axes=axes))
 
-        return ensure_dtype((tensor - v_lower) / (v_upper - v_lower + self.eps), dtype="float32")
+        return (tensor - v_lower) / (v_upper - v_lower + self.eps)
 
     def __post_init__(self):
         super().__post_init__()
@@ -186,12 +234,16 @@ def __post_init__(self):
 
 @dataclass
 class Sigmoid(Processing):
+    """1 / (1 + e^(-tensor))."""
+
     def apply(self, tensor: xr.DataArray) -> xr.DataArray:
         return 1.0 / (1.0 + np.exp(-tensor))
 
 
 @dataclass
 class ZeroMeanUnitVariance(Processing):
+    """normalize to zero mean, unit variance."""
+
     mode: Mode = PER_SAMPLE
     mean: Optional[Union[float, Sequence[float]]] = None
     std: Optional[Union[float, Sequence[float]]] = None
@@ -218,8 +270,7 @@ def apply(self, tensor: xr.DataArray) -> xr.DataArray:
         else:
             raise ValueError(self.mode)
 
-        tensor = (tensor - mean) / (std + self.eps)
-        return ensure_dtype(tensor, dtype="float32")
+        return (tensor - mean) / (std + self.eps)
 
 
 _KnownProcessing = TypedDict(
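As a quick numeric sanity check of the new ScaleMeanVariance.apply formula, the following standalone numpy sketch (not part of the commit; seeds, shapes, and statistics are arbitrary) shows that the scaled tensor takes on the reference tensor's mean and standard deviation:

# Standalone check of: (tensor - mean) / (std + eps) * (ref_std + eps) + ref_mean
import numpy as np

eps = 1e-6
tensor = np.random.default_rng(0).normal(loc=4.0, scale=3.0, size=(1, 1, 32, 32))
reference = np.random.default_rng(1).normal(loc=-1.0, scale=0.5, size=(1, 1, 32, 32))

mean, std = tensor.mean(), tensor.std()
ref_mean, ref_std = reference.mean(), reference.std()

scaled = (tensor - mean) / (std + eps) * (ref_std + eps) + ref_mean

# after scaling, the tensor's statistics match the reference (up to eps)
assert np.isclose(scaled.mean(), ref_mean)
assert np.isclose(scaled.std(), ref_std, atol=1e-4)

The eps terms keep the division stable for (near-)constant tensors, at the cost of a deviation from an exact match on the order of eps.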

tests/prediction_pipeline/test_combined_processing.py

Lines changed: 1 addition & 1 deletion

@@ -25,7 +25,7 @@ def test_postprocessing_dtype():
             postprocessing=[nodes.Postprocessing("binarize", dict(threshold=threshold))],
         )
     ]
-    com_proc = CombinedProcessing(outputs)
+    com_proc = CombinedProcessing.from_tensor_specs(outputs)
 
     sample = {"out1": data}
     com_proc.apply(sample, {})
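The dtype behaviour this test relies on changes slightly with this commit: Binarize no longer casts to float32 itself, so its output stays boolean until the EnsureDtype step that CombinedProcessing appends per output tensor casts it to the dtype declared in the model spec. A standalone xarray sketch (not from the commit; the float32 target dtype is assumed for illustration):

# Binarize now returns a bool tensor; EnsureDtype restores the spec dtype afterwards.
import numpy as np
import xarray as xr

data = xr.DataArray(np.random.rand(4, 4), dims=("x", "y"))
binarized = data > 0.5          # what Binarize.apply now returns
assert binarized.dtype == bool

restored = binarized.astype("float32")  # what the appended EnsureDtype step does
assert restored.dtype == np.float32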
