|
1 | | -from typing import List, Optional, Sequence, Union |
| 1 | +import dataclasses |
| 2 | +from typing import Dict, Iterable, List, NamedTuple, Optional, Sequence, TypedDict, Union |
2 | 3 |
|
3 | 4 | from bioimageio.core.resource_io import nodes |
4 | | -from ._processing import EnsureDtype, KNOWN_PROCESSING, Processing |
| 5 | +from ._processing import EnsureDtype, KNOWN_PROCESSING, Processing, TensorName |
5 | 6 | from ._utils import ComputedMeasures, PER_DATASET, PER_SAMPLE, RequiredMeasures, Sample |
6 | 7 |
|
7 | 8 | try: |
|
10 | 11 | from typing_extensions import Literal # type: ignore |
11 | 12 |
|
12 | 13 |
|
@dataclasses.dataclass
class TensorProcessingInfo:
    """Processing specification for a single tensor, consumed by CombinedProcessing."""

    # pre- or postprocessing steps to apply to this tensor, in order
    processing_steps: Union[List[nodes.Preprocessing], List[nodes.Postprocessing]]
    # if given, cast the tensor to this dtype before any processing step runs
    data_type_before: Optional[str] = None
    # if given, cast the tensor to this dtype right after its processing steps ran
    data_type_after: Optional[str] = None
| 20 | + |
13 | 21 | class CombinedProcessing: |
14 | | - def __init__(self, tensor_specs: Union[List[nodes.InputTensor], List[nodes.OutputTensor]]): |
15 | | - PRE: Literal["pre"] = "pre" |
16 | | - POST: Literal["post"] = "post" |
17 | | - proc_prefix: Optional[Literal["pre", "post"]] = None |
| 22 | + def __init__(self, combine_tensors: Dict[TensorName, TensorProcessingInfo]): |
18 | 23 | self._procs = [] |
19 | | - for t in tensor_specs: |
20 | | - if isinstance(t, nodes.InputTensor): |
21 | | - steps = t.preprocessing or [] |
22 | | - if proc_prefix is not None and proc_prefix != PRE: |
23 | | - raise ValueError(f"Invalid mixed input/output tensor specs: {tensor_specs}") |
24 | | - |
25 | | - proc_prefix = PRE |
26 | | - elif isinstance(t, nodes.OutputTensor): |
27 | | - steps = t.postprocessing or [] |
28 | | - if proc_prefix is not None and proc_prefix != POST: |
29 | | - raise ValueError(f"Invalid mixed input/output tensor specs: {tensor_specs}") |
30 | | - |
31 | | - proc_prefix = POST |
32 | | - else: |
33 | | - raise NotImplementedError(t) |
| 24 | + known = dict(KNOWN_PROCESSING["pre"]) |
| 25 | + known.update(KNOWN_PROCESSING["post"]) |
| 26 | + |
| 27 | + # ensure all tensors have correct data type before any processing |
| 28 | + for tensor_name, info in combine_tensors.items(): |
| 29 | + if info.data_type_before is not None: |
| 30 | + self._procs.append(EnsureDtype(tensor_name=tensor_name, dtype=info.data_type_before)) |
34 | 31 |
|
35 | | - for step in steps: |
36 | | - self._procs.append(KNOWN_PROCESSING[proc_prefix][step.name](tensor_name=t.name, **step.kwargs)) |
| 32 | + for tensor_name, info in combine_tensors.items(): |
| 33 | + for step in info.processing_steps: |
| 34 | + self._procs.append(known[step.name](tensor_name=tensor_name, **step.kwargs)) |
37 | 35 |
|
38 | | - # There is a difference between pre-and postprocessing: |
39 | | - # Preprocessing always returns float32, because its output is consumed by the model. |
40 | | - # Postprocessing, however, should return the dtype that is specified in the model spec. |
41 | | - # todo: cast dtype for inputs before preprocessing? or check dtype? |
42 | | - if proc_prefix == POST: |
43 | | - for t in tensor_specs: |
44 | | - self._procs.append(EnsureDtype(tensor_name=t.name, dtype=t.data_type)) |
| 36 | + # ensure tensor has correct data type right after its processing |
| 37 | + if info.data_type_after is not None: |
| 38 | + self._procs.append(EnsureDtype(tensor_name=tensor_name, dtype=info.data_type_after)) |
45 | 39 |
|
46 | 40 | self.required_measures: RequiredMeasures = self._collect_required_measures(self._procs) |
47 | | - if proc_prefix == POST and self.required_measures[PER_DATASET]: |
48 | | - raise NotImplementedError("computing statistics for output tensors per dataset is not yet implemented") |
| 41 | + self.tensor_names = list(combine_tensors) |
| 42 | + |
| 43 | + @classmethod |
| 44 | + def from_tensor_specs(cls, tensor_specs: List[Union[nodes.InputTensor, nodes.OutputTensor]]): |
| 45 | + combine_tensors = {} |
| 46 | + for ts in tensor_specs: |
| 47 | + # There is a difference between pre-and postprocessing: |
| 48 | + # Preprocessing always returns float32, because its output is consumed by the model. |
| 49 | + # Postprocessing, however, should return the dtype that is specified in the model spec. |
| 50 | + # todo: cast dtype for inputs before preprocessing? or check dtype? |
| 51 | + assert ts.name not in combine_tensors |
| 52 | + if isinstance(ts, nodes.InputTensor): |
| 53 | + # todo: move preprocessing ensure_dtype here as data_type_after |
| 54 | + combine_tensors[ts.name] = TensorProcessingInfo(ts.preprocessing) |
| 55 | + elif isinstance(ts, nodes.OutputTensor): |
| 56 | + combine_tensors[ts.name] = TensorProcessingInfo(ts.postprocessing, None, ts.data_type) |
| 57 | + else: |
| 58 | + raise NotImplementedError(type(ts)) |
| 59 | + |
| 60 | + inst = cls(combine_tensors) |
| 61 | + for ts in tensor_specs: |
| 62 | + if isinstance(ts, nodes.OutputTensor) and ts.name in inst.required_measures[PER_DATASET]: |
| 63 | + raise NotImplementedError("computing statistics for output tensors per dataset is not yet implemented") |
49 | 64 |
|
50 | | - self.tensor_names = [t.name for t in tensor_specs] |
| 65 | + return inst |
51 | 66 |
|
52 | 67 | def apply(self, sample: Sample, computed_measures: ComputedMeasures) -> None: |
53 | 68 | for proc in self._procs: |
|
0 commit comments