Timm universal encoder (qubvel#433)

qubvel · web-flow · commit ad3e5c127346 · 2021-07-05T21:33:43.000+03:00
diff --git a/README.md b/README.md
@@ -25,6 +25,7 @@ Visit [Read The Docs Project Page](https://smp.readthedocs.io/) or read followin
  3. [Models](#models)
     1. [Architectures](#architectures)
     2. [Encoders](#encoders)
+    3. [Timm Encoders](#timm)
  4. [Models API](#api)
     1. [Input channels](#input-channels)
     2. [Auxiliary classification output](#auxiliary-classification-output)
@@ -344,6 +345,17 @@ The following is a list of supported encoders in the SMP. Select the appropriate
 
 \* `ssl`, `swsl` - semi-supervised and weakly-supervised learning on ImageNet ([repo](https://github.com/facebookresearch/semi-supervised-ImageNet1K-models)).
 
+#### Timm Encoders <a name="timm"></a>
+
+[docs](https://smp.readthedocs.io/en/latest/encoders_timm.html)
+
+PyTorch Image Models (a.k.a. timm) has a lot of pretrained models and an interface which allows using these models as encoders in smp. However, not all models are supported:
+
+ - transformer models do not have ``features_only`` functionality implemented
+ - some models do not have appropriate strides
+
+Total number of supported encoders: 467
+ - [table with available encoders](https://smp.readthedocs.io/en/latest/encoders_timm.html)
 
 ### 🔁 Models API <a name="api"></a>
 
diff --git a/docs/encoders_timm.rst b/docs/encoders_timm.rst
diff --git a/docs/index.rst b/docs/index.rst
@@ -14,6 +14,7 @@ Welcome to Segmentation Models's documentation!
    quickstart
    models
    encoders
+   encoders_timm
    losses
    insights
 
diff --git a/misc/generate_table_timm.py b/misc/generate_table_timm.py
@@ -0,0 +1,52 @@
+import timm
+from tqdm import tqdm
+
+
+def check_features_and_reduction(name):
+    """Raise ``ValueError`` unless the timm model *name* exposes the expected strides.
+
+    The model is built with ``features_only=True`` and without pretrained
+    weights; its reported feature reductions must be exactly
+    ``[2, 4, 8, 16, 32]``. Any exception raised by ``timm.create_model``
+    propagates to the caller.
+    """
+    expected_reduction = [2, 4, 8, 16, 32]
+    model = timm.create_model(name, features_only=True, pretrained=False)
+    if model.feature_info.reduction() != expected_reduction:
+        raise ValueError
+
+def has_dilation_support(name):
+    """Return ``True`` if timm can build *name* with output strides 8 and 16.
+
+    Both strides are tried in turn (8 first, then 16) with
+    ``features_only=True`` and no pretrained weights; any exception raised
+    while building either variant is reported as ``False``.
+    """
+    try:
+        for stride in (8, 16):
+            timm.create_model(name, features_only=True, output_stride=stride, pretrained=False)
+    except Exception:
+        return False
+    return True
+
+def make_table(data):
+    """Render the encoder mapping as a grid-style text table.
+
+    The table uses ``+---+`` row borders and a ``+===+`` rule under the
+    header, with one row per encoder sorted by name.
+
+    Args:
+        data: mapping of encoder name to a dict holding a ``"has_dilation"``
+            flag.
+
+    Returns:
+        The whole table as a single newline-terminated string.
+    """
+    name_header = "Encoder name"
+    dilation_header = "Support dilation"
+
+    # Size the name column from ``data`` itself (the previous code read an
+    # undefined ``supported`` global and raised NameError). Seeding the widths
+    # with the header length keeps the header aligned for short names and
+    # avoids ``max()`` failing on an empty mapping.
+    max_len1 = max([len(name_header)] + [len(x) for x in data]) + 2
+    max_len2 = len(dilation_header) + 2
+
+    l1 = "+" + "-" * max_len1 + "+" + "-" * max_len2 + "+\n"
+    l2 = "+" + "=" * max_len1 + "+" + "=" * max_len2 + "+\n"
+    top = "| " + name_header.ljust(max_len1 - 2) + " | " + dilation_header.center(max_len2 - 2) + " |\n"
+
+    rows = [l1, top, l2]
+    for k in sorted(data):
+        if data[k]["has_dilation"]:
+            # One column narrower than the blank cell; presumably this
+            # compensates for the emoji rendering two columns wide.
+            support = "✅".center(max_len2 - 3)
+        else:
+            support = " ".center(max_len2 - 2)
+        rows.append("| " + k.ljust(max_len1 - 2) + " | " + support + " |\n")
+        rows.append(l1)
+
+    return "".join(rows)
+    
+
+if __name__ == "__main__":
+    # Probe every model timm lists and keep those usable as encoders,
+    # recording for each whether dilated variants can be built.
+    supported_models = {}
+
+    with tqdm(timm.list_models()) as names:
+        for name in names:
+            try:
+                check_features_and_reduction(name)
+                supported_models[name] = {"has_dilation": has_dilation_support(name)}
+            except Exception:
+                # Best effort: a model that cannot be built with
+                # ``features_only`` or has other strides is simply left out.
+                continue
+
+    print(make_table(supported_models))
+    print(f"Total encoders: {len(supported_models)}")
diff --git a/segmentation_models_pytorch/deeplabv3/model.py b/segmentation_models_pytorch/deeplabv3/model.py
@@ -58,10 +58,7 @@ def __init__(
             in_channels=in_channels,
             depth=encoder_depth,
             weights=encoder_weights,
-        )
-        self.encoder.make_dilated(
-            stage_list=[4, 5],
-            dilation_list=[2, 4]
+            output_stride=8,
         )
 
         self.decoder = DeepLabV3Decoder(
@@ -136,29 +133,19 @@ def __init__(
     ):
         super().__init__()
 
+        if encoder_output_stride not in [8, 16]:
+            raise ValueError(
+                "Encoder output stride should be 8 or 16, got {}".format(encoder_output_stride)
+            )
+
         self.encoder = get_encoder(
             encoder_name,
             in_channels=in_channels,
             depth=encoder_depth,
             weights=encoder_weights,
+            output_stride=encoder_output_stride,
         )
 
-        if encoder_output_stride == 8:
-            self.encoder.make_dilated(
-                stage_list=[4, 5],
-                dilation_list=[2, 4]
-            )
-
-        elif encoder_output_stride == 16:
-            self.encoder.make_dilated(
-                stage_list=[5],
-                dilation_list=[2]
-            )
-        else:
-            raise ValueError(
-                "Encoder output stride should be 8 or 16, got {}".format(encoder_output_stride)
-            )
-
         self.decoder = DeepLabV3PlusDecoder(
             encoder_channels=self.encoder.out_channels,
             out_channels=decoder_channels,
diff --git a/segmentation_models_pytorch/encoders/__init__.py b/segmentation_models_pytorch/encoders/__init__.py
@@ -19,6 +19,8 @@
 from .timm_mobilenetv3 import timm_mobilenetv3_encoders
 from .timm_gernet import timm_gernet_encoders
 
+from .timm_universal import TimmUniversalEncoder
+
 from ._preprocessing import preprocess_input
 
 encoders = {}
@@ -41,7 +43,19 @@
 encoders.update(timm_gernet_encoders)
 
 
-def get_encoder(name, in_channels=3, depth=5, weights=None):
+def get_encoder(name, in_channels=3, depth=5, weights=None, output_stride=32, **kwargs):
+
+    if name.startswith("tu-"):
+        # Drop exactly the "tu-" prefix. str.lstrip("tu-") strips every
+        # leading "t", "u" or "-" character and would corrupt names such as
+        # "tu-tf_efficientnet_b0" (-> "f_efficientnet_b0").
+        name = name[len("tu-"):]
+        # Timm-backed encoders are built directly and returned here, so the
+        # registry lookup further down is never reached for them.
+        encoder = TimmUniversalEncoder(
+            name=name,
+            in_channels=in_channels,
+            depth=depth,
+            output_stride=output_stride,
+            pretrained=weights is not None,
+            **kwargs
+        )
+        return encoder
 
     try:
         Encoder = encoders[name]["encoder"]
@@ -62,7 +76,9 @@ def get_encoder(name, in_channels=3, depth=5, weights=None):
         encoder.load_state_dict(model_zoo.load_url(settings["url"]))
 
     encoder.set_in_channels(in_channels, pretrained=weights is not None)
-
+    if output_stride != 32:
+        encoder.make_dilated(output_stride)
+    
     return encoder
 
 
diff --git a/segmentation_models_pytorch/encoders/_base.py b/segmentation_models_pytorch/encoders/_base.py
@@ -32,7 +32,19 @@ def get_stages(self):
         """Method should be overridden in encoder"""
         raise NotImplementedError
 
-    def make_dilated(self, stage_list, dilation_list):
+    def make_dilated(self, output_stride):
+
+        if output_stride == 16:
+            stage_list=[5,]
+            dilation_list=[2,]
+            
+        elif output_stride == 8:
+            stage_list=[4, 5]
+            dilation_list=[2, 4] 
+
+        else:
+            raise ValueError("Output stride should be 16 or 8, got {}.".format(output_stride))
+        
         stages = self.get_stages()
         for stage_indx, dilation_rate in zip(stage_list, dilation_list):
             utils.replace_strides_with_dilation(
diff --git a/segmentation_models_pytorch/encoders/timm_universal.py b/segmentation_models_pytorch/encoders/timm_universal.py
@@ -0,0 +1,34 @@
+import timm
+import torch.nn as nn
+
+
+class TimmUniversalEncoder(nn.Module):
+    """Encoder wrapping a timm model created with ``features_only=True``.
+
+    Args:
+        name: model name passed to ``timm.create_model``.
+        pretrained: forwarded to ``timm.create_model``.
+        in_channels: number of input channels (timm's ``in_chans``).
+        depth: number of feature stages requested from timm
+            (``out_indices=tuple(range(depth))``).
+        output_stride: forwarded to ``timm.create_model`` unless it is 32.
+    """
+
+    def __init__(self, name, pretrained=True, in_channels=3, depth=5, output_stride=32):
+        super().__init__()
+        kwargs = dict(
+            in_chans=in_channels,
+            features_only=True,
+            output_stride=output_stride,
+            pretrained=pretrained,
+            out_indices=tuple(range(depth)),
+        )
+
+        # not all models support output stride argument, drop it by default
+        if output_stride == 32:
+            kwargs.pop("output_stride")
+
+        self.model = timm.create_model(name, **kwargs)
+
+        self._in_channels = in_channels
+        # ``forward`` prepends the raw input to the feature list, so the first
+        # entry must be the real input channel count, not a hard-coded 3.
+        self._out_channels = [in_channels] + self.model.feature_info.channels()
+        self._depth = depth
+
+    def forward(self, x):
+        """Return ``[x, f1, ..., fN]``: the input followed by timm's feature maps."""
+        features = self.model(x)
+        features = [x] + features
+        return features
+
+    @property
+    def out_channels(self):
+        """Channel count of each element returned by ``forward``."""
+        return self._out_channels
diff --git a/segmentation_models_pytorch/pan/model.py b/segmentation_models_pytorch/pan/model.py
@@ -17,8 +17,8 @@ class PAN(SegmentationModel):
             to extract features of different spatial resolution
         encoder_weights: One of **None** (random initialization), **"imagenet"** (pre-training on ImageNet) and 
             other pretrained weights (see table with available weights for each encoder_name)
-        encoder_dilation: Flag to use dilation in encoder last layer. Doesn't work with ***ception***, **vgg***, 
-            **densenet*`** backbones, default is **True**
+        encoder_output_stride: 16 or 32; if 16, use dilation in the encoder's last layer.
+            Doesn't work with ***ception***, **vgg***, **densenet*** backbones. Default is 16.
         decoder_channels: A number of convolution layer filters in decoder blocks
         in_channels: A number of input channels for the model, default is 3 (RGB images)
         classes: A number of classes for output mask (or you can think as a number of channels of output mask)
@@ -45,7 +45,7 @@ def __init__(
             self,
             encoder_name: str = "resnet34",
             encoder_weights: Optional[str] = "imagenet",
-            encoder_dilation: bool = True,
+            encoder_output_stride: int = 16,
             decoder_channels: int = 32,
             in_channels: int = 3,
             classes: int = 1,
@@ -55,19 +55,17 @@ def __init__(
     ):
         super().__init__()
 
+        if encoder_output_stride not in [16, 32]:
+            raise ValueError("PAN support output stride 16 or 32, got {}".format(encoder_output_stride))
+
         self.encoder = get_encoder(
             encoder_name,
             in_channels=in_channels,
             depth=5,
             weights=encoder_weights,
+            output_stride=encoder_output_stride,
         )
 
-        if encoder_dilation:
-            self.encoder.make_dilated(
-                stage_list=[5],
-                dilation_list=[2]
-            )
-
         self.decoder = PANDecoder(
             encoder_channels=self.encoder.out_channels,
             decoder_channels=decoder_channels,
diff --git a/tests/test_models.py b/tests/test_models.py
@@ -8,19 +8,17 @@
 sys.modules["torchvision._C"] = mock.Mock()
 import segmentation_models_pytorch as smp
 
-IS_TRAVIS = os.environ.get("TRAVIS", False)
-
 
 def get_encoders():
-    travis_exclude_encoders = [
+    exclude_encoders = [
         "senet154",
         "resnext101_32x16d",
         "resnext101_32x32d",
         "resnext101_32x48d",
     ]
     encoders = smp.encoders.get_encoder_names()
-    if IS_TRAVIS:
-        encoders = [e for e in encoders if e not in travis_exclude_encoders]
+    encoders = [e for e in encoders if e not in exclude_encoders]
+    encoders.append("tu-resnet34") # for timm universal encoder
     return encoders
 
 
@@ -127,11 +125,7 @@ def test_dilation(encoder_name):
     ):
         return
 
-    encoder = smp.encoders.get_encoder(encoder_name)
-    encoder.make_dilated(
-        stage_list=[5],
-        dilation_list=[2],
-    )
+    encoder = smp.encoders.get_encoder(encoder_name, output_stride=16)
 
     encoder.eval()
     with torch.no_grad():