# ---------------------------------------------------------------
# Copyright (c) 2021, NVIDIA Corporation. All rights reserved.
#
-# This work is licensed under the NVIDIA Source Code License
+# Licensed under the NVIDIA Source Code License. For full license
+# terms, please refer to the LICENSE file provided with this code
+# or visit NVIDIA's official repository at
+# https://github.com/NVlabs/SegFormer/tree/master.
+#
+# This code has been modified.
# ---------------------------------------------------------------
import math
import torch
from timm.layers import DropPath, to_2tuple, trunc_normal_


+class LayerNorm(nn.LayerNorm):
+    def forward(self, x):
+        if x.ndim == 4:
+            B, C, H, W = x.shape
+            x = x.view(B, C, -1).transpose(1, 2)
+            x = nn.functional.layer_norm(
+                x, self.normalized_shape, self.weight, self.bias, self.eps
+            )
+            x = x.transpose(1, 2).view(B, -1, H, W).contiguous()
+        else:
+            x = nn.functional.layer_norm(
+                x, self.normalized_shape, self.weight, self.bias, self.eps
+            )
+        return x
+
class Mlp(nn.Module):
    def __init__(
        self,
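
The LayerNorm subclass added above accepts both (B, N, C) token sequences and (B, C, H, W) feature maps, which is what lets the later hunks drop the explicit flatten/reshape around normalization. A minimal sketch of the 4-D path, not part of the diff; the shapes are illustrative and only standard PyTorch is assumed:

# Illustrative check of the channels-first branch (not part of the diff).
x = torch.randn(2, 64, 32, 32)                # (B, C, H, W), arbitrary sizes
ln = LayerNorm(64)                            # normalizes over the 64 channels
ref = nn.functional.layer_norm(
    x.flatten(2).transpose(1, 2), (64,), ln.weight, ln.bias, ln.eps
).transpose(1, 2).view(2, 64, 32, 32)
assert torch.allclose(ln(x), ref)             # same result, no manual reshaping
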
@@ -36,9 +56,6 @@ def _init_weights(self, m):
            trunc_normal_(m.weight, std=0.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
-            nn.init.constant_(m.bias, 0)
-            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
@@ -86,7 +103,7 @@ def __init__(
        self.sr_ratio = sr_ratio
        if sr_ratio > 1:
            self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio)
-            self.norm = nn.LayerNorm(dim)
+            self.norm = LayerNorm(dim)

        self.apply(self._init_weights)

@@ -95,7 +112,7 @@ def _init_weights(self, m):
            trunc_normal_(m.weight, std=0.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
+        elif isinstance(m, LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
@@ -153,7 +170,7 @@ def __init__(
        attn_drop=0.0,
        drop_path=0.0,
        act_layer=nn.GELU,
-        norm_layer=nn.LayerNorm,
+        norm_layer=LayerNorm,
        sr_ratio=1,
    ):
        super().__init__()
@@ -185,7 +202,7 @@ def _init_weights(self, m):
            trunc_normal_(m.weight, std=0.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
+        elif isinstance(m, LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
@@ -195,10 +212,12 @@ def _init_weights(self, m):
            if m.bias is not None:
                m.bias.data.zero_()

-    def forward(self, x, H, W):
+    def forward(self, x: torch.Tensor):
+        B, _, H, W = x.shape
+        x = x.flatten(2).transpose(1, 2)
        x = x + self.drop_path(self.attn(self.norm1(x), H, W))
        x = x + self.drop_path(self.mlp(self.norm2(x), H, W))
-
+        x = x.transpose(1, 2).view(B, -1, H, W)
        return x


@@ -221,7 +240,7 @@ def __init__(self, img_size=224, patch_size=7, stride=4, in_chans=3, embed_dim=7
            stride=stride,
            padding=(patch_size[0] // 2, patch_size[1] // 2),
        )
-        self.norm = nn.LayerNorm(embed_dim)
+        self.norm = LayerNorm(embed_dim)

        self.apply(self._init_weights)

@@ -230,7 +249,7 @@ def _init_weights(self, m):
            trunc_normal_(m.weight, std=0.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
+        elif isinstance(m, LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
@@ -242,11 +261,8 @@ def _init_weights(self, m):

    def forward(self, x):
        x = self.proj(x)
-        _, _, H, W = x.shape
-        x = x.flatten(2).transpose(1, 2)
        x = self.norm(x)
-
-        return x, H, W
+        return x


class MixVisionTransformer(nn.Module):
@@ -307,8 +323,8 @@ def __init__(
            x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))
        ]  # stochastic depth decay rule
        cur = 0
-        self.block1 = nn.ModuleList(
-            [
+        self.block1 = nn.Sequential(
+            *[
                Block(
                    dim=embed_dims[0],
                    num_heads=num_heads[0],
@@ -327,8 +343,8 @@ def __init__(
        self.norm1 = norm_layer(embed_dims[0])

        cur += depths[0]
-        self.block2 = nn.ModuleList(
-            [
+        self.block2 = nn.Sequential(
+            *[
                Block(
                    dim=embed_dims[1],
                    num_heads=num_heads[1],
@@ -347,8 +363,8 @@ def __init__(
        self.norm2 = norm_layer(embed_dims[1])

        cur += depths[1]
-        self.block3 = nn.ModuleList(
-            [
+        self.block3 = nn.Sequential(
+            *[
                Block(
                    dim=embed_dims[2],
                    num_heads=num_heads[2],
@@ -367,8 +383,8 @@ def __init__(
        self.norm3 = norm_layer(embed_dims[2])

        cur += depths[2]
-        self.block4 = nn.ModuleList(
-            [
+        self.block4 = nn.Sequential(
+            *[
                Block(
                    dim=embed_dims[3],
                    num_heads=num_heads[3],
@@ -396,7 +412,7 @@ def _init_weights(self, m):
            trunc_normal_(m.weight, std=0.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
+        elif isinstance(m, LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
@@ -454,35 +470,27 @@ def forward_features(self, x):
        outs = []

        # stage 1
-        x, H, W = self.patch_embed1(x)
-        for i, blk in enumerate(self.block1):
-            x = blk(x, H, W)
+        x = self.patch_embed1(x)
+        x = self.block1(x)
        x = self.norm1(x)
-        x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
        outs.append(x)

        # stage 2
-        x, H, W = self.patch_embed2(x)
-        for i, blk in enumerate(self.block2):
-            x = blk(x, H, W)
+        x = self.patch_embed2(x)
+        x = self.block2(x)
        x = self.norm2(x)
-        x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
        outs.append(x)

        # stage 3
-        x, H, W = self.patch_embed3(x)
-        for i, blk in enumerate(self.block3):
-            x = blk(x, H, W)
+        x = self.patch_embed3(x)
+        x = self.block3(x)
        x = self.norm3(x)
-        x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
        outs.append(x)

        # stage 4
-        x, H, W = self.patch_embed4(x)
-        for i, blk in enumerate(self.block4):
-            x = blk(x, H, W)
+        x = self.patch_embed4(x)
+        x = self.block4(x)
        x = self.norm4(x)
-        x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
        outs.append(x)

        return outs
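
With every stage now operating in channel-first layout end to end, forward_features returns the four feature maps directly, at strides 4, 8, 16 and 32. A hedged usage sketch; the constructor arguments below follow a mit_b0-style SegFormer configuration and are assumptions, not something shown in this diff:

# Sketch; embed_dims/num_heads/depths/sr_ratios mirror a mit_b0-style config.
model = MixVisionTransformer(
    embed_dims=[32, 64, 160, 256], num_heads=[1, 2, 5, 8],
    depths=[2, 2, 2, 2], sr_ratios=[8, 4, 2, 1], norm_layer=LayerNorm,
)
feats = model.forward_features(torch.randn(1, 3, 224, 224))
for f, ch, stride in zip(feats, (32, 64, 160, 256), (4, 8, 16, 32)):
    assert f.shape[1:] == (ch, 224 // stride, 224 // stride)
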
@@ -500,7 +508,7 @@ def __init__(self, dim=768):
        self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)

    def forward(self, x, H, W):
-        B, N, C = x.shape
+        B, _, C = x.shape
        x = x.transpose(1, 2).view(B, C, H, W)
        x = self.dwconv(x)
        x = x.flatten(2).transpose(1, 2)
@@ -522,8 +530,15 @@ def __init__(self, out_channels, depth=5, **kwargs):
        self._depth = depth
        self._in_channels = 3

-    def make_dilated(self, *args, **kwargs):
-        raise ValueError("MixVisionTransformer encoder does not support dilated mode")
+    def get_stages(self):
+        return [
+            nn.Identity(),
+            nn.Identity(),
+            nn.Sequential(self.patch_embed1, self.block1, self.norm1),
+            nn.Sequential(self.patch_embed2, self.block2, self.norm2),
+            nn.Sequential(self.patch_embed3, self.block3, self.norm3),
+            nn.Sequential(self.patch_embed4, self.block4, self.norm4),
+        ]

    def set_in_channels(self, in_channels, *args, **kwargs):
        if in_channels != 3:
@@ -532,11 +547,20 @@ def set_in_channels(self, in_channels, *args, **kwargs):
            )

    def forward(self, x):
+        stages = self.get_stages()
+
        # create dummy output for the first block
        B, C, H, W = x.shape
        dummy = torch.empty([B, 0, H // 2, W // 2], dtype=x.dtype, device=x.device)

-        return [x, dummy] + self.forward_features(x)[: self._depth - 1]
+        features = []
+        for i in range(self._depth + 1):
+            if i == 1:
+                features.append(dummy)
+            else:
+                x = stages[i](x)
+                features.append(x)
+        return features

    def load_state_dict(self, state_dict):
        state_dict.pop("head.weight", None)
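
Taken together, the encoder's forward now walks get_stages(), keeping the raw input at index 0 and the zero-channel dummy at index 1 so the returned list still lines up with a depth-5 feature pyramid. A usage sketch; the constructor arguments are assumptions based on how segmentation_models_pytorch typically instantiates its encoders and are not shown in this diff:

# Sketch; out_channels and the backbone kwargs below are illustrative guesses.
encoder = MixVisionTransformerEncoder(
    out_channels=(3, 0, 32, 64, 160, 256), depth=5,
    embed_dims=[32, 64, 160, 256], num_heads=[1, 2, 5, 8],
    depths=[2, 2, 2, 2], sr_ratios=[8, 4, 2, 1], norm_layer=LayerNorm,
)
feats = encoder(torch.randn(1, 3, 224, 224))
assert len(feats) == 6                        # input, dummy, then 4 stage outputs
assert feats[0].shape == (1, 3, 224, 224)     # stages[0] is nn.Identity()
assert feats[1].shape[1] == 0                 # the zero-channel dummy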