Commit f7e4799

Add mmcls.VisionTransformer backbone support (#1908)

* Add mmcls transformer backbones
* Fix VisionTransformer output check
* Add changes
* Disable recording forward hooks in inferrer
* Remove unused import
1 parent abe6aae commit f7e4799

File tree: 7 files changed, +29 −7 lines


otx/algorithms/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -2,3 +2,5 @@
 
 # Copyright (C) 2022 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
+
+TRANSFORMER_BACKBONES = ["VisionTransformer", "T2T_ViT", "Conformer"]
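The new module-level constant gives the rest of the codebase a single place to test whether a backbone needs transformer-specific handling. A minimal sketch of that membership check, with an illustrative backbone type string (the real callers split the scope with mmcv's Registry.split_scope_key):

from otx.algorithms import TRANSFORMER_BACKBONES

# Registry keys look like "mmcls.VisionTransformer"; strip the scope prefix
# before the membership test, as merge_backbone does below.
backbone_type = "mmcls.VisionTransformer"  # illustrative value
backend, backbone_class = backbone_type.split(".", 1)

if backend == "mmcls" and backbone_class in TRANSFORMER_BACKBONES:
    print(f"{backbone_class} takes the transformer-specific config path")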

otx/algorithms/classification/configs/configuration.yaml

Lines changed: 1 addition & 1 deletion

@@ -10,7 +10,7 @@ learning_parameters:
         stable. A larger batch size has higher memory requirements.
       editable: true
       header: Batch size
-      max_value: 512
+      max_value: 2048
       min_value: 1
       type: INTEGER
       ui_rules:
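The raised ceiling can be verified straight from the YAML. A small sketch, assuming PyYAML is installed, the script runs from the repo root, and the batch_size group nests directly under learning_parameters as the hunk header suggests:

import yaml

with open("otx/algorithms/classification/configs/configuration.yaml") as f:
    cfg = yaml.safe_load(f)

batch_size = cfg["learning_parameters"]["batch_size"]  # assumed key layout
assert batch_size["max_value"] == 2048  # raised from 512 by this commit
assert batch_size["min_value"] == 1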

otx/algorithms/common/configs/training_base.py

Lines changed: 1 addition & 1 deletion

@@ -65,7 +65,7 @@ class BaseLearningParameters(ParameterGroup):
     batch_size = configurable_integer(
         default_value=5,
         min_value=1,
-        max_value=512,
+        max_value=2048,
         header="Batch size",
         description="The number of training samples seen in each iteration of training. Increasing this value "
         "improves training time and may make the training more stable. A larger batch size has higher "

otx/cli/builder/builder.py

Lines changed: 9 additions & 2 deletions

@@ -28,6 +28,7 @@
 from mmcv.utils import Registry, build_from_cfg
 from torch import nn
 
+from otx.algorithms import TRANSFORMER_BACKBONES
 from otx.api.entities.model_template import TaskType
 from otx.cli.utils.importing import (
     get_backbone_list,
@@ -101,8 +102,8 @@ def update_backbone_args(backbone_config: dict, registry: Registry, backend: str):
 
 def update_channels(model_config: MPAConfig, out_channels: Any):
     """Update in_channel of head or neck."""
-    if hasattr(model_config.model, "neck"):
-        if model_config.model.neck.type == "GlobalAveragePooling":
+    if hasattr(model_config.model, "neck") and model_config.model.neck:
+        if model_config.model.neck.get("type", None) == "GlobalAveragePooling":
             model_config.model.neck.pop("in_channels", None)
         else:
             print(f"\tUpdate model.neck.in_channels: {out_channels}")
@@ -212,6 +213,12 @@ def merge_backbone(
     out_channels = -1
     if hasattr(model_config.model, "head"):
         model_config.model.head.in_channels = -1
+    # TODO: This is a hard coded part of the Transformer backbone and needs to be refactored.
+    if backend == "mmcls" and backbone_class in TRANSFORMER_BACKBONES:
+        if hasattr(model_config.model, "neck"):
+            model_config.model.neck = None
+        if hasattr(model_config.model, "head"):
+            model_config.model.head["type"] = "VisionTransformerClsHead"
     else:
         # Need to update in/out channel configuration here
         out_channels = get_backbone_out_channels(backbone)
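Taken together, these hunks make update_channels tolerate a neck that has been nulled out and let merge_backbone reroute transformer backbones to VisionTransformerClsHead. A minimal sketch of the resulting config mutation on a toy mmcv config (all values illustrative):

from mmcv import Config

cfg = Config(dict(model=dict(
    backbone=dict(type="mmcls.VisionTransformer"),
    neck=dict(type="GlobalAveragePooling"),
    head=dict(type="LinearClsHead", in_channels=1280),
)))

# What merge_backbone now does for a transformer backbone:
cfg.model.head.in_channels = -1  # defer channel inference to train time
cfg.model.neck = None            # ViT needs no pooling neck
cfg.model.head["type"] = "VisionTransformerClsHead"

# Why update_channels needed the extra guard: the "neck" attribute still
# exists after being set to None, so only the truthiness check skips it.
assert hasattr(cfg.model, "neck") and not cfg.model.neck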

otx/cli/builder/supported_backbone/mmcls.json

Lines changed: 3 additions & 3 deletions

@@ -11,7 +11,7 @@
       "options": {
         "arch": ["tiny", "small", "base"]
       },
-      "available": []
+      "available": ["CLASSIFICATION"]
     },
     "mmcls.ConvMixer": {
       "required": ["arch"],
@@ -287,7 +287,7 @@
     "mmcls.T2T_ViT": {
       "required": [],
       "options": {},
-      "available": []
+      "available": ["CLASSIFICATION"]
     },
     "mmcls.TIMMBackbone": {
       "required": ["model_name"],
@@ -341,7 +341,7 @@
           "deit-base"
         ]
       },
-      "available": []
+      "available": ["CLASSIFICATION"]
     }
   }
 }
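The "available" field is what the CLI reads to decide which backbones to offer for a task. A small sketch that lists classification-ready entries; the exact top-level key of the JSON is not visible in these hunks, so the unwrapping below is an assumption:

import json

with open("otx/cli/builder/supported_backbone/mmcls.json") as f:
    data = json.load(f)

# Assumed layout: one top-level key wrapping "mmcls.<Backbone>" entries.
entries = next(iter(data.values())) if len(data) == 1 else data

classification_ready = [
    name for name, meta in entries.items()
    if "CLASSIFICATION" in meta.get("available", [])
]
print(classification_ready)  # now includes the three transformer backbones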

otx/mpa/cls/inferrer.py

Lines changed: 5 additions & 0 deletions

@@ -11,6 +11,7 @@
 from mmcls.datasets import build_dataset as mmcls_build_dataset
 from mmcv import Config, ConfigDict
 
+from otx.algorithms import TRANSFORMER_BACKBONES
 from otx.algorithms.common.adapters.mmcv.utils import (
     build_data_parallel,
     build_dataloader,
@@ -53,6 +54,10 @@ def run(self, model_cfg, model_ckpt, data_cfg, **kwargs):
         model_builder = kwargs.get("model_builder", None)
         dump_features = kwargs.get("dump_features", False)
         dump_saliency_map = kwargs.get("dump_saliency_map", False)
+        # TODO: It looks like we need to modify that code in an appropriate way.
+        if model_cfg.model.head.get("type", None) == "VisionTransformerClsHead":
+            dump_features = False
+            dump_saliency_map = False
         eval = kwargs.get("eval", False)
         outputs = self.infer(
             cfg,
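Per the commit message, forward-hook recording is disabled when a ViT head is configured, presumably because the feature and saliency dumping path expects CNN-style feature maps. A reduced sketch of the guard, using a plain dict where the real code reads an mmcv config:

# Reduced, dict-based illustration of the guard added to run(); this helper
# is hypothetical and not part of the inferrer.
def resolve_dump_flags(model_cfg: dict, **kwargs):
    dump_features = kwargs.get("dump_features", False)
    dump_saliency_map = kwargs.get("dump_saliency_map", False)
    if model_cfg["model"]["head"].get("type", None) == "VisionTransformerClsHead":
        dump_features = False      # skip forward-hook feature recording
        dump_saliency_map = False  # skip saliency map dumping
    return dump_features, dump_saliency_map

cfg = {"model": {"head": {"type": "VisionTransformerClsHead"}}}
assert resolve_dump_flags(cfg, dump_features=True) == (False, False)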

otx/mpa/cls/stage.py

Lines changed: 8 additions & 0 deletions

@@ -8,6 +8,7 @@
 import torch
 from mmcv import ConfigDict, build_from_cfg
 
+from otx.algorithms import TRANSFORMER_BACKBONES
 from otx.algorithms.classification.adapters.mmcls.utils.builder import build_classifier
 from otx.mpa.stage import Stage
 from otx.mpa.utils.config_utils import recursively_update_cfg, update_or_add_custom_hook
@@ -89,6 +90,13 @@ def configure_in_channel(cfg):
         output = layer(torch.rand([1] + list(input_shape)))
         if isinstance(output, (tuple, list)):
             output = output[-1]
+
+        if layer.__class__.__name__ in TRANSFORMER_BACKBONES and isinstance(output, (tuple, list)):
+            # mmcls.VisionTransformer outputs Tuple[List[...]] and the last index of List is the final logit.
+            _, output = output
+            if cfg.model.head.type != "VisionTransformerClsHead":
+                raise ValueError(f"{layer.__class__.__name__} needs VisionTransformerClsHead as head")
+
         in_channels = output.shape[1]
         if cfg.model.get("neck") is not None:
             if cfg.model.neck.get("in_channels") is not None:
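configure_in_channel probes the backbone with a dummy tensor and reads output.shape[1]. A self-contained sketch of that probe with a stand-in module that mimics the Tuple[List[...]] output shape mentioned in the comment above; the toy shapes are assumptions, not mmcls internals:

import torch
from torch import nn

class ToyViT(nn.Module):
    """Stand-in for mmcls.VisionTransformer: returns a tuple of per-stage
    outputs whose last element is a [patch_tokens, cls_token] pair."""

    def forward(self, x):
        batch = x.shape[0]
        patch_tokens = torch.rand(batch, 768, 14, 14)
        cls_token = torch.rand(batch, 768)
        return ([patch_tokens, cls_token],)

layer = ToyViT()
output = layer(torch.rand(1, 3, 224, 224))

# Generic unwrapping (pre-existing): keep the last stage output.
if isinstance(output, (tuple, list)):
    output = output[-1]

# Transformer-specific unwrapping (this commit): take the cls token.
if isinstance(output, (tuple, list)):
    _, output = output

print(output.shape[1])  # 768 -> written into cfg.model.head.in_channels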
