feat: add new MultiTaskDecoder class to wrap the decoder into one module inside models

okunator · okunator · commit 5fc95bb9882a · 2025-02-15T21:37:41.000+02:00
diff --git a/cellseg_models_pytorch/decoders/multitask_decoder.py b/cellseg_models_pytorch/decoders/multitask_decoder.py
@@ -0,0 +1,278 @@
+from itertools import chain
+from typing import Any, Dict, List, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from cellseg_models_pytorch.decoders.long_skips import StemSkip
+from cellseg_models_pytorch.decoders.unet_decoder import UnetDecoder
+from cellseg_models_pytorch.models.base._initialization import (
+    initialize_decoder,
+    initialize_head,
+)
+from cellseg_models_pytorch.models.base._seg_head import SegHead
+from cellseg_models_pytorch.modules.misc_modules import StyleReshape
+
+ALLOWED_HEADS = [  # the only head names accepted as inner keys of `heads`
+    "inst",
+    "type",
+    "sem",
+    "cellpose",
+    "omnipose",
+    "stardist",
+    "hovernet",
+    "dist",
+    "dcan",
+    "dran",
+]
+
+__all__ = ["MultiTaskDecoder"]  # public API of this module
+
+
+class MultiTaskDecoder(nn.ModuleDict):
+    """Multi-branch decoder bundling decoders, stem skips and seg heads.
+
+    Sub-modules are stored under the keys `{branch}_decoder`,
+    `{branch}_stem_skip` and `{branch}-{head}_head` and looked up by suffix.
+    """
+
+    def __init__(
+        self,
+        decoders: Tuple[str, ...],
+        heads: Dict[str, Dict[str, int]],
+        out_channels: Tuple[int, ...],
+        enc_channels: Tuple[int, ...],
+        enc_reductions: Tuple[int, ...],
+        n_layers: Tuple[int, ...],
+        n_blocks: Tuple[int, ...],
+        stage_kws: Tuple[Dict[str, Any], ...],
+        stem_skip_kws: Dict[str, Any] = None,
+        long_skip: str = "unet",
+        out_size: int = None,
+        style_channels: int = None,
+        head_excitation_channels: int = None,
+    ) -> None:
+        """Create a multi-task decoder.
+
+        Parameters:
+            decoders (Tuple[str, ...]):
+                Tuple of decoder names. E.g. ("decoder1", "decoder2").
+            heads (Dict[str, Dict[str, int]]):
+                Dict containing the heads for each decoder. The inner dict contains the
+                head name and the number of output channels. For example:
+                {"decoder1": {"inst": 2, "sem": 5}, "decoder2": {"cellpose": 2}}.
+            out_channels (Tuple[int, ...]):
+                Tuple of output channels for each decoder stage. The length of the tuple
+                should be equal to the number of enc_channels.
+            enc_channels (Tuple[int, ...]):
+                Tuple of encoder channels.
+            enc_reductions (Tuple[int, ...]):
+                Tuple of encoder reduction factors.
+            n_layers (Tuple[int, ...]):
+                Tuple of number of conv layers in each decoder stage.
+            n_blocks (Tuple[int, ...]):
+                Tuple of number of conv blocks in each decoder stage.
+            stage_kws (Tuple[Dict[str, Any], ...]):
+                Tuple of kwargs for each decoder stage. See UnetDecoderStage for info.
+            stem_skip_kws (Dict[str, Any], default=None):
+                Optional kwargs for the stem skip connection.
+            long_skip (str, default="unet"):
+                The long skip connection method to be used in the decoder.
+            out_size (int, default=None):
+                The output size of the model. If given, the output will be interpolated to this size.
+            style_channels (int, default=None):
+                The number of style channels for domain adaptation.
+            head_excitation_channels (int, default=None):
+                The number of excitation channels for the head. If None, no excitation is
+                used. Excitation is a conv block before the head that widens the output
+                channels before the head to avoid 'fight over features' (stardist).
+
+        Raises:
+            ValueError: If the names, head types or tuple lengths are invalid.
+        """
+        super().__init__()
+        self.out_size = out_size
+        self._check_head_args(heads, decoders)
+        self._check_decoder_args(decoders)
+        self._check_depth(
+            len(enc_channels),
+            {
+                "n_blocks": n_blocks,
+                "n_layers": n_layers,
+                "out_channels": out_channels,
+                "enc_reductions": enc_reductions,
+            },
+        )
+
+        # style
+        self.make_style = None
+        if style_channels is not None:
+            self.make_style = StyleReshape(enc_channels[0], style_channels)
+
+        # set decoders
+        for decoder_name in decoders:
+            decoder = UnetDecoder(
+                enc_channels=enc_channels,
+                enc_reductions=enc_reductions,
+                out_channels=out_channels,
+                style_channels=style_channels,
+                long_skip=long_skip,
+                n_conv_layers=n_layers,
+                n_conv_blocks=n_blocks,
+                stage_params=stage_kws,
+            )
+            self.add_module(f"{decoder_name}_decoder", decoder)
+
+        # optional stem skip
+        self.has_stem_skip = stem_skip_kws is not None
+        if self.has_stem_skip:
+            for decoder_name in decoders:
+                stem_skip = StemSkip(out_channels=out_channels[-1], **stem_skip_kws)
+                self.add_module(f"{decoder_name}_stem_skip", stem_skip)
+
+        # set heads; each head reads the channels of its own branch decoder
+        for decoder_name, branch_heads in heads.items():
+            for output_name, n_classes in branch_heads.items():
+                seg_head = SegHead(
+                    in_channels=self[f"{decoder_name}_decoder"].out_channels,
+                    out_channels=n_classes,
+                    kernel_size=1,
+                    excitation_channels=head_excitation_channels,
+                )
+                self.add_module(f"{decoder_name}-{output_name}_head", seg_head)
+
+    def forward_features(
+        self, feats: List[torch.Tensor], style: torch.Tensor = None
+    ) -> Dict[str, List[torch.Tensor]]:
+        """Forward all the decoders and return multi-res feature-lists per branch."""
+        suffix = "_decoder"
+        return {
+            key[: -len(suffix)]: self[key](*feats, style=style)
+            for key in self.keys()
+            if key.endswith(suffix)
+        }
+
+    def forward_heads(
+        self, dec_feats: Dict[str, List[torch.Tensor]]
+    ) -> Dict[str, torch.Tensor]:
+        """Forward pass all the seg heads on the last feature map of their branch."""
+        res = {}
+        for key in self.keys():
+            if not key.endswith("_head"):
+                continue
+            out_key = key[: -len("_head")]  # "{branch}-{head_name}"
+            branch = out_key.split("-")[0]
+            x = self[key](dec_feats[branch][-1])  # the last decoder stage feat map
+            if self.out_size is not None:
+                x = F.interpolate(
+                    x, size=self.out_size, mode="bilinear", align_corners=False
+                )
+            res[out_key] = x
+
+        return res
+
+    def forward_style(self, feat: torch.Tensor) -> torch.Tensor:
+        """Forward the style domain adaptation layer; None if it is disabled."""
+        if self.make_style is None:
+            return None
+        return self.make_style(feat)
+
+    def forward_stem_skip(
+        self, x: torch.Tensor, dec_feats: Dict[str, List[torch.Tensor]]
+    ) -> Dict[str, List[torch.Tensor]]:
+        """Forward the stem skips, replacing each branch's last feature map."""
+        for key in self.keys():
+            if key.endswith("_stem_skip"):
+                branch = key[: -len("_stem_skip")]  # branch name may contain '_'
+                dec_feats[branch][-1] = self[key](x, dec_feats[branch][-1])
+
+        return dec_feats
+
+    def forward(
+        self, enc_feats: Tuple[torch.Tensor, ...], x_in: torch.Tensor = None
+    ) -> Tuple[Dict[str, List[torch.Tensor]], Dict[str, torch.Tensor]]:
+        """Forward pass style, decoders, optional stem skip and seg heads.
+
+        Parameters:
+            enc_feats (Tuple[torch.Tensor, ...]):
+                Tuple containing encoder feature tensors.
+            x_in (torch.Tensor, default=None):
+                Optional (the input image) tensor for stem skip connection.
+
+        Returns:
+            Tuple[Dict[str, List[torch.Tensor]], Dict[str, torch.Tensor]]:
+                The per-branch decoder feature lists and the seg head outputs.
+        """
+        style = self.forward_style(enc_feats[0])
+        dec_feats = self.forward_features(enc_feats, style)
+
+        # final input resolution skip connection
+        if self.has_stem_skip and x_in is not None:
+            dec_feats = self.forward_stem_skip(x_in, dec_feats)
+
+        return dec_feats, self.forward_heads(dec_feats)
+
+    def initialize(self) -> None:
+        """Initialize the decoders and segmentation heads."""
+        for name, module in self.items():
+            if name.endswith("_decoder"):
+                initialize_decoder(module)
+            elif name.endswith("_head"):
+                initialize_head(module)
+
+    def _get_inner_keys(self, d: Dict[str, Dict[str, Any]]) -> List[str]:
+        """Get the inner dict keys from a nested dict."""
+        return list(chain.from_iterable(inner.keys() for inner in d.values()))
+
+    def _flatten_inner_dicts(self, d: Dict[str, Dict[str, Any]]) -> Dict[str, Any]:
+        """Merge the inner dicts of a nested dict into one flat dict."""
+        return dict(chain.from_iterable(inner.items() for inner in d.values()))
+
+    def _check_string_arg(self, arg: str) -> None:
+        """Raise if `arg` contains '-', the separator used in the seg head keys."""
+        if "-" in arg:
+            raise ValueError(
+                f"The dict key '{arg}' contains '-', which is not allowed. Use '_' instead."
+            )
+
+    def _check_decoder_args(self, decoders: Tuple[str, ...]) -> None:
+        """Check that the decoder names are unique and contain no '-'."""
+        if len(decoders) != len(set(decoders)):
+            raise ValueError("The decoder names need to be unique.")
+
+        for dec in decoders:
+            self._check_string_arg(dec)
+
+    def _check_head_args(
+        self, heads: Dict[str, Dict[str, int]], decoders: Tuple[str, ...]
+    ) -> None:
+        """Check head names/types and that the `heads` keys match `decoders`."""
+        for head in heads.keys():
+            self._check_string_arg(head)
+
+        for head in self._get_inner_keys(heads):
+            if head not in ALLOWED_HEADS:
+                raise ValueError(
+                    f"Unknown head type: '{head}'. Allowed: {ALLOWED_HEADS}."
+                )
+
+        if not set(decoders) == set(heads.keys()):
+            raise ValueError(
+                "The decoder names need match exactly to the keys of `heads`. "
+                f"Got decoders: {decoders} and heads: {list(heads.keys())}."
+            )
+
+    def _check_depth(self, depth: int, arrs: Dict[str, Tuple[Any, ...]]) -> None:
+        """Check that depth is in [3, 5] and equals the length of every tuple arg."""
+        if not 3 <= depth <= 5:
+            raise ValueError(
+                f"max value for `depth` is 5, min value is 3. Got: {depth}"
+            )
+
+        for name, arr in arrs.items():
+            if depth != len(arr):
+                raise ValueError(
+                    f"The length of `{name}` should be equal to arg `depth`: {depth}. "
+                    f"For `{name}`, got: {arr}."
+                )
diff --git a/cellseg_models_pytorch/decoders/unet_decoder.py b/cellseg_models_pytorch/decoders/unet_decoder.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Tuple, Union
 
 import torch
 import torch.nn as nn
@@ -13,13 +13,13 @@ def __init__(
         self,
         enc_channels: Tuple[int, ...],
         enc_reductions: Tuple[int, ...],
-        out_channels: Tuple[int, ...] = (256, 128, 64, 32, 16),
+        out_channels: Tuple[int, ...],
         long_skip: Union[None, str, Tuple[str, ...]] = "unet",
         n_conv_layers: Union[None, int, Tuple[int, ...]] = 1,
-        n_transformers: Union[None, int, Tuple[int, ...]] = None,
         n_conv_blocks: Union[int, Tuple[Tuple[int, ...], ...]] = 2,
+        n_transformers: Union[None, int, Tuple[int, ...]] = None,
         n_transformer_blocks: Union[int, Tuple[Tuple[int], ...]] = 1,
-        stage_params: Optional[Tuple[Dict, ...]] = None,
+        stage_params: Tuple[Dict, ...] = None,
         style_channels: int = None,
         **kwargs,
     ) -> None:
@@ -41,7 +41,7 @@ def __init__(
                 Number of channels at each encoder layer.
             enc_reductions : Tuple[int, ...]
                 The reduction factor from the input image size at each encoder layer.
-            out_channels : Tuple[int, ...], default=(256, 128, 64, 32, 16)
+            out_channels : Tuple[int, ...]
                 Number of channels at each decoder layer output.
             long_skip : Union[None, str, Tuple[str, ...]], default="unet"
                 long skip method to be used. The argument can be given as a tuple, where
@@ -71,7 +71,7 @@ def __init__(
                 value indicates the number of `SelfAttention`s inside a single
                 `TranformerLayer` allowing different sized transformer blocks inside
                 each transformer-layer in the decoder.
-            stage_params : Optional[Tuple[Dict, ...]], default=None
+            stage_params : Tuple[Dict, ...], default=None
                 The keyword args for each of the distinct decoder stages. Incudes the
                 parameters for the long skip connections, convolutional layers of the
                 decoder and transformer layers itself. See the `DecoderStage`
diff --git a/cellseg_models_pytorch/decoders/unet_decoder_stage.py b/cellseg_models_pytorch/decoders/unet_decoder_stage.py
@@ -1,4 +1,4 @@
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Tuple, Union
 
 import torch
 import torch.nn as nn
@@ -18,33 +18,33 @@ def __init__(
         skip_channels: Tuple[int, ...],
         long_skip: str = "unet",
         merge_policy: str = "sum",
-        skip_params: Optional[Dict[str, Any]] = None,
+        skip_params: Dict[str, Any] = None,
         upsampling: str = "fixed-unpool",
-        n_conv_layers: Optional[int] = 1,
-        style_channels: Optional[int] = None,
-        layer_residual: Optional[bool] = False,
-        n_conv_blocks: Optional[Tuple[int, ...]] = (2,),
-        short_skips: Optional[Tuple[str, ...]] = ("residual",),
-        expand_ratios: Optional[Tuple[float, float]] = ((1.0, 1.0),),
-        block_types: Optional[Tuple[Tuple[str, ...], ...]] = (("basic", "basic"),),
-        normalizations: Optional[Tuple[Tuple[str, ...], ...]] = (("bn", "bn"),),
-        activations: Optional[Tuple[Tuple[str, ...], ...]] = (("relu", "relu"),),
-        convolutions: Optional[Tuple[Tuple[str, ...], ...]] = (("conv", "conv"),),
-        attentions: Optional[Tuple[Tuple[str, ...], ...]] = ((None, "se"),),
-        preactivates: Optional[Tuple[Tuple[bool, ...], ...]] = ((False, False),),
-        preattends: Optional[Tuple[Tuple[bool, ...], ...]] = ((False, False),),
-        use_styles: Optional[Tuple[Tuple[bool, ...], ...]] = ((False, False),),
-        kernel_sizes: Optional[Tuple[Tuple[int, ...]]] = ((3, 3),),
-        groups: Optional[Tuple[Tuple[int, ...]]] = ((1, 1),),
-        biases: Optional[Tuple[Tuple[bool, ...]]] = ((False, False),),
-        n_transformers: Optional[int] = None,
-        n_transformer_blocks: Optional[Tuple[int, ...]] = (1,),
-        transformer_blocks: Optional[Tuple[Tuple[str, ...], ...]] = (("exact",),),
-        transformer_computations: Optional[Tuple[Tuple[str, ...], ...]] = (("basic",),),
-        transformer_biases: Optional[Tuple[Tuple[bool, ...], ...]] = ((False,),),
-        transformer_dropouts: Optional[Tuple[Tuple[float, ...], ...]] = ((0.0,),),
-        transformer_layer_scales: Optional[Tuple[Tuple[bool, ...], ...]] = ((False,),),
-        transformer_params: Optional[List[Dict[str, Any]]] = None,
+        n_conv_layers: int = 1,
+        style_channels: int = None,
+        layer_residual: bool = False,
+        n_conv_blocks: Tuple[int, ...] = (2,),
+        short_skips: Tuple[str, ...] = ("residual",),
+        expand_ratios: Tuple[float, float] = ((1.0, 1.0),),
+        block_types: Tuple[Tuple[str, ...], ...] = (("basic", "basic"),),
+        normalizations: Tuple[Tuple[str, ...], ...] = (("bn", "bn"),),
+        activations: Tuple[Tuple[str, ...], ...] = (("relu", "relu"),),
+        convolutions: Tuple[Tuple[str, ...], ...] = (("conv", "conv"),),
+        attentions: Tuple[Tuple[str, ...], ...] = ((None, "se"),),
+        preactivates: Tuple[Tuple[bool, ...], ...] = ((False, False),),
+        preattends: Tuple[Tuple[bool, ...], ...] = ((False, False),),
+        use_styles: Tuple[Tuple[bool, ...], ...] = ((False, False),),
+        kernel_sizes: Tuple[Tuple[int, ...]] = ((3, 3),),
+        groups: Tuple[Tuple[int, ...]] = ((1, 1),),
+        biases: Tuple[Tuple[bool, ...]] = ((False, False),),
+        n_transformers: int = None,
+        n_transformer_blocks: Tuple[int, ...] = (1,),
+        transformer_blocks: Tuple[Tuple[str, ...], ...] = (("exact",),),
+        transformer_computations: Tuple[Tuple[str, ...], ...] = (("basic",),),
+        transformer_biases: Tuple[Tuple[bool, ...], ...] = ((False,),),
+        transformer_dropouts: Tuple[Tuple[float, ...], ...] = ((0.0,),),
+        transformer_layer_scales: Tuple[Tuple[bool, ...], ...] = ((False,),),
+        transformer_params: List[Dict[str, Any]] = None,
         **kwargs,
     ) -> None:
         """Build a decoder stage.
@@ -73,7 +73,7 @@ def __init__(
                 Allowed: "cross-attn", "unet", "unetpp", "unet3p", "unet3p-lite", None
             merge_policy : str, default="sum"
                 The long skip merge policy. One of: "sum", "cat"
-            skip_params : Optional[Dict]
+            skip_params : Dict[str, Any], default=None
                 Extra keyword arguments for the skip-connection module. These depend
                 on the skip module. Refer to specific skip modules for more info.
             upsampling : str, default="fixed-unpool"