Segformer backbone Mix Visual Transformer (#632)

qubvel · web-flow · commit 792c2735aa0e · 2022-08-08T09:57:37.000+03:00
* Segformer backbone

* Add limitations for FPN, Unet++, Linknet

* fix tests
diff --git a/README.md b/README.md
@@ -20,7 +20,7 @@ The main features of this library are:
 
  - High level API (just two lines to create a neural network)
  - 9 models architectures for binary and multi class segmentation (including legendary Unet)
- - 113 available encoders (and 400+ encoders from [timm](https://github.com/rwightman/pytorch-image-models))
+ - 119 available encoders (and 400+ encoders from [timm](https://github.com/rwightman/pytorch-image-models))
  - All encoders have pre-trained weights for faster and better convergence
  - Popular metrics and losses for training routines
  
@@ -352,6 +352,29 @@ The following is a list of supported encoders in the SMP. Select the appropriate
 </div>
 </details>
 
+<details>
+<summary style="margin-left: 25px;">Mix Vision Transformer</summary>
+<div style="margin-left: 25px;">
+
+Backbone from SegFormer, pretrained on ImageNet! It can be used with the other decoders in this package: you can combine Mix Vision Transformer with Unet, FPN and others!
+
+Limitations:  
+
+   - encoder is not supported by Linknet, Unet++
+   - encoder is not supported by FPN if encoder depth != 5
+
+|Encoder                         |Weights                         |Params, M                       |
+|--------------------------------|:------------------------------:|:------------------------------:|
+|mit_b0                          |imagenet                        |3M                              |
+|mit_b1                          |imagenet                        |13M                             |
+|mit_b2                          |imagenet                        |24M                             |
+|mit_b3                          |imagenet                        |44M                             |
+|mit_b4                          |imagenet                        |60M                             |
+|mit_b5                          |imagenet                        |81M                             |
+
+</div>
+</details>
+
 
 \* `ssl`, `swsl` - semi-supervised and weakly-supervised learning on ImageNet ([repo](https://github.com/facebookresearch/semi-supervised-ImageNet1K-models)).
 
diff --git a/docs/encoders.rst b/docs/encoders.rst
@@ -324,3 +324,23 @@ VGG
 +-------------+------------+-------------+
 | vgg19\_bn   | imagenet   | 20M         |
 +-------------+------------+-------------+
+
+
+Mix Visual Transformer
+~~~~~~~~~~~~~~~~~~~~~~
+
++-----------+----------+------------+
+| Encoder   | Weights  | Params, M  |
++===========+==========+============+
+| mit\_b0   | imagenet | 3M         |
++-----------+----------+------------+
+| mit\_b1   | imagenet | 13M        |
++-----------+----------+------------+
+| mit\_b2   | imagenet | 24M        |
++-----------+----------+------------+
+| mit\_b3   | imagenet | 44M        |
++-----------+----------+------------+
+| mit\_b4   | imagenet | 60M        |
++-----------+----------+------------+
+| mit\_b5   | imagenet | 81M        |
++-----------+----------+------------+
diff --git a/segmentation_models_pytorch/decoders/fpn/model.py b/segmentation_models_pytorch/decoders/fpn/model.py
@@ -66,6 +66,10 @@ def __init__(
     ):
         super().__init__()
 
+        # validate input params
+        if encoder_name.startswith("mit_b") and encoder_depth != 5:
+            raise ValueError("Encoder {} support only encoder_depth=5".format(encoder_name))
+
         self.encoder = get_encoder(
             encoder_name,
             in_channels=in_channels,
diff --git a/segmentation_models_pytorch/decoders/linknet/model.py b/segmentation_models_pytorch/decoders/linknet/model.py
@@ -64,6 +64,9 @@ def __init__(
     ):
         super().__init__()
 
+        if encoder_name.startswith("mit_b"):
+            raise ValueError("Encoder `{}` is not supported for Linknet".format(encoder_name))
+
         self.encoder = get_encoder(
             encoder_name,
             in_channels=in_channels,
diff --git a/segmentation_models_pytorch/decoders/unetplusplus/model.py b/segmentation_models_pytorch/decoders/unetplusplus/model.py
@@ -68,6 +68,9 @@ def __init__(
     ):
         super().__init__()
 
+        if encoder_name.startswith("mit_b"):
+            raise ValueError("UnetPlusPlus is not support encoder_name={}".format(encoder_name))
+
         self.encoder = get_encoder(
             encoder_name,
             in_channels=in_channels,
diff --git a/segmentation_models_pytorch/encoders/__init__.py b/segmentation_models_pytorch/encoders/__init__.py
@@ -19,6 +19,7 @@
 from .timm_sknet import timm_sknet_encoders
 from .timm_mobilenetv3 import timm_mobilenetv3_encoders
 from .timm_gernet import timm_gernet_encoders
+from .mix_transformer import mix_transformer_encoders
 
 from .timm_universal import TimmUniversalEncoder
 
@@ -42,6 +43,7 @@
 encoders.update(timm_sknet_encoders)
 encoders.update(timm_mobilenetv3_encoders)
 encoders.update(timm_gernet_encoders)
+encoders.update(mix_transformer_encoders)
 
 
 def get_encoder(name, in_channels=3, depth=5, weights=None, output_stride=32, **kwargs):
diff --git a/segmentation_models_pytorch/encoders/mix_transformer.py b/segmentation_models_pytorch/encoders/mix_transformer.py
diff --git a/tests/test_models.py b/tests/test_models.py