Commit 00bc6a8

Introducing adaptive LR schedule to Classification task (#2268)
* Change the params
* Fix lr scheduler
* Make faster training
* Final values
* Fix tests
* Fix self-sl error
* Change the interpolation type: bicubic --> default
Parent: 1b11fbf

16 files changed, +95 −114 lines

src/otx/algorithms/classification/adapters/mmcls/task.py

Lines changed: 2 additions & 55 deletions
```diff
@@ -47,14 +47,14 @@
     build_data_parallel,
     get_configs_by_pairs,
     patch_data_pipeline,
+    patch_from_hyperparams,
 )
 from otx.algorithms.common.adapters.mmcv.utils import (
     build_dataloader as otx_build_dataloader,
 )
 from otx.algorithms.common.adapters.mmcv.utils import build_dataset as otx_build_dataset
 from otx.algorithms.common.adapters.mmcv.utils.config_utils import (
     MPAConfig,
-    get_adaptive_num_workers,
     update_or_add_custom_hook,
 )
 from otx.algorithms.common.configs.configuration_enums import BatchSizeAdaptType
@@ -117,7 +117,7 @@ def _init_task(self, export: bool = False):  # noqa
         patch_data_pipeline(self._recipe_cfg, self.data_pipeline_path)
 
         if not export:
-            self._recipe_cfg.merge_from_dict(self._init_hparam())
+            patch_from_hyperparams(self._recipe_cfg, self._hyperparams)
 
         if "custom_hooks" in self.override_configs:
             override_custom_hooks = self.override_configs.pop("custom_hooks")
@@ -656,59 +656,6 @@ def patch_input_shape(deploy_cfg):
 
         return deploy_cfg
 
-    def _init_hparam(self) -> dict:
-        params = self._hyperparams.learning_parameters
-        warmup_iters = int(params.learning_rate_warmup_iters)
-        if self._multilabel:
-            # hack to use 1cycle policy
-            lr_config = ConfigDict(max_lr=params.learning_rate, warmup=None)
-        else:
-            lr_config = (
-                ConfigDict(warmup_iters=warmup_iters) if warmup_iters > 0 else ConfigDict(warmup_iters=0, warmup=None)
-            )
-
-        early_stop = False
-        if self._recipe_cfg is not None:
-            if params.enable_early_stopping and self._recipe_cfg.get("evaluation", None):
-                early_stop = ConfigDict(
-                    start=int(params.early_stop_start),
-                    patience=int(params.early_stop_patience),
-                    iteration_patience=int(params.early_stop_iteration_patience),
-                )
-
-        if self._recipe_cfg.runner.get("type").startswith("IterBasedRunner"):  # type: ignore
-            runner = ConfigDict(max_iters=int(params.num_iters))
-        else:
-            runner = ConfigDict(max_epochs=int(params.num_iters))
-
-        config = ConfigDict(
-            optimizer=ConfigDict(lr=params.learning_rate),
-            lr_config=lr_config,
-            early_stop=early_stop,
-            data=ConfigDict(
-                samples_per_gpu=int(params.batch_size),
-                workers_per_gpu=int(params.num_workers),
-            ),
-            runner=runner,
-        )
-
-        if self._hyperparams.learning_parameters.auto_num_workers:
-            adapted_num_worker = get_adaptive_num_workers()
-            if adapted_num_worker is not None:
-                config.data.workers_per_gpu = adapted_num_worker
-
-        if self._train_type.value == "Semisupervised":
-            unlabeled_config = ConfigDict(
-                data=ConfigDict(
-                    unlabeled_dataloader=ConfigDict(
-                        samples_per_gpu=int(params.unlabeled_batch_size),
-                        workers_per_gpu=int(params.num_workers),
-                    )
-                )
-            )
-            config.update(unlabeled_config)
-        return config
-
     # This should be removed
     def update_override_configurations(self, config):
         """Update override_configs."""
```

src/otx/algorithms/classification/configs/base/data/data_pipeline.py

Lines changed: 3 additions & 5 deletions
```diff
@@ -20,19 +20,17 @@
 __resize_target_size = 224
 
 __train_pipeline = [
-    dict(type="Resize", size=__resize_target_size),
+    dict(type="RandomResizedCrop", size=224, efficientnet_style=True),
     dict(type="RandomFlip", flip_prob=0.5, direction="horizontal"),
-    dict(type="AugMixAugment", config_str="augmix-m5-w3-d1"),
-    dict(type="RandomRotate", p=0.35, angle=(-10, 10)),
-    dict(type="PILImageToNDArray", keys=["img"]),
     dict(type="Normalize", **__img_norm_cfg),
     dict(type="ImageToTensor", keys=["img"]),
     dict(type="ToTensor", keys=["gt_label"]),
     dict(type="Collect", keys=["img", "gt_label"]),
 ]
 
 __test_pipeline = [
-    dict(type="Resize", size=__resize_target_size),
+    dict(type="Resize", size=(256, -1)),
+    dict(type="CenterCrop", crop_size=224),
     dict(type="Normalize", **__img_norm_cfg),
     dict(type="ImageToTensor", keys=["img"]),
     dict(type="Collect", keys=["img"]),
```

src/otx/algorithms/classification/configs/configuration.yaml

Lines changed: 4 additions & 4 deletions
```diff
@@ -171,7 +171,7 @@ learning_parameters:
     visible_in_ui: false
   early_stop_patience:
     affects_outcome_of: TRAINING
-    default_value: 8
+    default_value: 3
     description: Training will stop if the model does not improve within the number of epochs of patience.
     editable: true
     header: Patience for early stopping
@@ -207,17 +207,17 @@ learning_parameters:
     warning: This is applied exclusively when early stopping is enabled.
   use_adaptive_interval:
     affects_outcome_of: TRAINING
-    default_value: false
+    default_value: true
     description: Depending on the size of iteration per epoch, adaptively update the validation interval and related values.
-    editable: false
+    editable: true
     header: Use adaptive validation interval
     type: BOOLEAN
     ui_rules:
       action: DISABLE_EDITING
       operator: AND
       rules: []
       type: UI_RULES
-    visible_in_ui: false
+    visible_in_ui: true
     warning: This will automatically control the patience and interval when early stopping is enabled.
   enable_supcon:
     affects_outcome_of: TRAINING
```
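One way to read the lowered `early_stop_patience` default (8 → 3): patience counts validation runs without improvement, and with `use_adaptive_interval` now on by default the validation interval can grow up to `max_interval`, so the effective wait scales with both. A back-of-envelope sketch (an interpretation, not code from this commit):

```python
# Rough intuition only: wall-clock wait before stopping is roughly
# patience * validation_interval epochs.
def effective_wait_epochs(patience: int, val_interval: int) -> int:
    """Upper bound on epochs trained past the last improvement."""
    return patience * val_interval


# Old defaults: patience=8, interval=1 -> up to ~8 epochs without improvement.
# New defaults: patience=3, interval up to 5 -> up to ~15 epochs on small
# datasets, but as few as 3 once the adaptive interval shrinks to 1.
print(effective_wait_epochs(3, 5))  # 15
```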

src/otx/algorithms/classification/configs/efficientnet_b0_cls_incr/selfsl/hparam.yaml

Lines changed: 2 additions & 0 deletions
```diff
@@ -16,6 +16,8 @@ hyper_parameters:
         default_value: 5000
       enable_early_stopping:
         default_value: false
+      use_adaptive_interval:
+        default_value: false
     algo_backend:
       train_type:
         default_value: Selfsupervised
```

src/otx/algorithms/classification/configs/efficientnet_v2_s_cls_incr/selfsl/hparam.yaml

Lines changed: 2 additions & 0 deletions
```diff
@@ -16,6 +16,8 @@ hyper_parameters:
         default_value: 5000
       enable_early_stopping:
         default_value: false
+      use_adaptive_interval:
+        default_value: false
     algo_backend:
       train_type:
         default_value: Selfsupervised
```

src/otx/algorithms/classification/configs/mobilenet_v3_large_075_cls_incr/selfsl/hparam.yaml

Lines changed: 2 additions & 0 deletions
```diff
@@ -16,6 +16,8 @@ hyper_parameters:
         default_value: 5000
       enable_early_stopping:
         default_value: false
+      use_adaptive_interval:
+        default_value: false
     algo_backend:
       train_type:
         default_value: Selfsupervised
```

src/otx/algorithms/classification/configs/mobilenet_v3_large_1_cls_incr/selfsl/hparam.yaml

Lines changed: 2 additions & 0 deletions
```diff
@@ -16,6 +16,8 @@ hyper_parameters:
         default_value: 5000
       enable_early_stopping:
         default_value: false
+      use_adaptive_interval:
+        default_value: false
     algo_backend:
       train_type:
         default_value: Selfsupervised
```

src/otx/algorithms/classification/configs/mobilenet_v3_small_cls_incr/selfsl/hparam.yaml

Lines changed: 2 additions & 0 deletions
```diff
@@ -16,6 +16,8 @@ hyper_parameters:
         default_value: 5000
       enable_early_stopping:
         default_value: false
+      use_adaptive_interval:
+        default_value: false
     algo_backend:
       train_type:
         default_value: Selfsupervised
```

src/otx/algorithms/common/adapters/mmcv/hooks/adaptive_training_hook.py

Lines changed: 9 additions & 27 deletions
```diff
@@ -23,26 +23,22 @@
 class AdaptiveTrainSchedulingHook(Hook):
     """Adaptive Training Scheduling Hook.
 
-    Depending on the size of iteration per epoch, adaptively update the validation interval and related values.
+    Depending on the size of iteration per epoch, adaptively update the validation interval.
 
     Args:
         max_interval (int): Maximum value of validation interval.
             Defaults to 5.
-        base_lr_patience (int): The value of LR drop patience are expected in total epoch.
-            Patience used when interval is 1, Defaults to 5.
-        min_lr_patience (int): Minumum value of LR drop patience.
-            Defaults to 2.
-        base_es_patience (int): The value of Early-Stopping patience are expected in total epoch.
-            Patience used when interval is 1, Defaults to 10.
+        decay (float): Parameter to control the interval. This value is set by manual manner.
+            Defaults to -0.025.
+        enable_adaptive_interval_hook (bool): If True, adaptive interval will be enabled.
+            Defaults to False.
+        enable_eval_before_run (bool): If True, initial evaluation before training will be enabled.
+            Defaults to False.
     """
 
     def __init__(
         self,
         max_interval=5,
-        base_lr_patience=5,
-        min_lr_patience=2,
-        base_es_patience=10,
-        min_es_patience=3,
         decay=-0.025,
         enable_adaptive_interval_hook=False,
         enable_eval_before_run=False,
@@ -51,10 +47,6 @@ def __init__(
         super().__init__(**kwargs)
 
         self.max_interval = max_interval
-        self.base_lr_patience = base_lr_patience
-        self.min_lr_patience = min_lr_patience
-        self.base_es_patience = base_es_patience
-        self.min_es_patience = min_es_patience
         self.decay = decay
         self.enable_adaptive_interval_hook = enable_adaptive_interval_hook
         self.enable_eval_before_run = enable_eval_before_run
@@ -92,23 +84,13 @@ def before_train_iter(self, runner):
                     logger.info(f"Update EvalHook interval: {hook.interval} -> {adaptive_interval}")
                     hook.interval = adaptive_interval
                 elif isinstance(hook, LrUpdaterHook):
-                    patience = max(
-                        math.ceil((self.base_lr_patience / adaptive_interval)),
-                        self.min_lr_patience,
-                    )
                     if hasattr(hook, "interval") and hasattr(hook, "patience"):
                         hook.interval = adaptive_interval
-                        hook.patience = patience
-                        logger.info(f"Update LrUpdaterHook patience: {hook.patience} -> {patience}")
+                        logger.info(f"Update LrUpdaterHook interval: {hook.interval} -> {adaptive_interval}")
                 elif isinstance(hook, EarlyStoppingHook):
-                    patience = max(
-                        math.ceil((self.base_es_patience / adaptive_interval)),
-                        self.min_es_patience,
-                    )
-                    logger.info(f"Update EarlyStoppingHook patience: {hook.patience} -> {patience}")
+                    logger.info(f"Update EarlyStoppingHook interval: {hook.interval} -> {adaptive_interval}")
                     hook.start = adaptive_interval
                     hook.interval = adaptive_interval
-                    hook.patience = patience
                 elif isinstance(hook, CheckpointHook):
                     # make sure checkpoint is saved at last
                     limit = runner.max_epochs if hook.by_epoch else runner.max_iters
```
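After this change, `decay` and `max_interval` are the only knobs left on the schedule; the patience bookkeeping is gone. The interval formula itself is not part of this diff; a plausible sketch, assuming the exponential form that `decay=-0.025` and `max_interval=5` suggest:

```python
# An assumed formula, not shown in this commit: short epochs keep the
# validation interval near max_interval, long epochs drive it toward 1.
import math


def adaptive_interval(iters_per_epoch: int, max_interval: int = 5, decay: float = -0.025) -> int:
    """Shrink the validation interval exponentially as epochs get longer."""
    return max(round(math.exp(decay * iters_per_epoch) * max_interval), 1)


# Small dataset (10 iters/epoch): validate every ~4 epochs, since epochs are cheap.
# Large dataset (100+ iters/epoch): validate every epoch.
print(adaptive_interval(10), adaptive_interval(100))  # 4 1
```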

src/otx/algorithms/common/adapters/mmcv/hooks/early_stopping_hook.py

Lines changed: 9 additions & 8 deletions
```diff
@@ -40,8 +40,8 @@ class EarlyStoppingHook(Hook):
         continues if the number of iteration is lower than iteration_patience
         This variable makes sure a model is trained enough for some
         iterations after the last improvement before stopping.
-    :param min_delta: Minimal decay applied to lr. If the difference between new and old lr is
-        smaller than eps, the update is ignored
+    :param min_delta_ratio: Minimal ratio value to check the best score. If the difference between current and
+        best score is smaller than (current_score * (1-min_delta_ratio)), best score will not be changed.
     """
 
     rule_map = {"greater": lambda x, y: x > y, "less": lambda x, y: x < y}
@@ -56,16 +56,16 @@ def __init__(
         rule: Optional[str] = None,
         patience: int = 5,
         iteration_patience: int = 500,
-        min_delta: float = 0.0,
+        min_delta_ratio: float = 0.0,
     ):
         super().__init__()
         self.patience = patience
         self.iteration_patience = iteration_patience
         self.interval = interval
-        self.min_delta = min_delta
+        self.min_delta_ratio = min_delta_ratio
         self._init_rule(rule, metric)
 
-        self.min_delta *= 1 if self.rule == "greater" else -1
+        self.min_delta_ratio *= 1 if self.rule == "greater" else -1
         self.last_iter = 0
         self.wait_count = 0
         self.best_score = self.init_value_map[self.rule]
@@ -141,7 +141,7 @@ def _do_check_stopping(self, runner):
             )
 
         key_score = runner.log_buffer.output[self.key_indicator]
-        if self.compare_func(key_score - self.min_delta, self.best_score):
+        if self.compare_func(key_score - (key_score * self.min_delta_ratio), self.best_score):
             self.best_score = key_score
             self.wait_count = 0
             self.last_iter = runner.iter
@@ -184,11 +184,11 @@ def __init__(
         rule: str = None,
         patience: int = 5,
         iteration_patience: int = 500,
-        min_delta: float = 0.0,
+        min_delta_ratio: float = 0.0,
         start: int = None,
     ):
         self.start = start
-        super().__init__(interval, metric, rule, patience, iteration_patience, min_delta)
+        super().__init__(interval, metric, rule, patience, iteration_patience, min_delta_ratio)
 
     def _should_check_stopping(self, runner):
         if self.by_epoch:
@@ -352,6 +352,7 @@ def get_lr(self, runner: BaseRunner, base_lr: float):
                 logger=runner.logger,
             )
             return self.current_lr
+
         self.last_iter = runner.iter
         self.bad_count = 0
         print_log(
```
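The rename from `min_delta` to `min_delta_ratio` makes the improvement margin proportional to the score itself rather than an absolute offset. A small worked example of the comparison in `_do_check_stopping` above, for the greater-is-better rule:

```python
# Mirrors `compare_func(key_score - key_score * min_delta_ratio, best_score)`
# for rule="greater"; the names here are illustrative, not the hook's API.
def is_new_best(key_score: float, best_score: float, min_delta_ratio: float) -> bool:
    """Accept a new best only if it beats the old one by a score-proportional margin."""
    return key_score - key_score * min_delta_ratio > best_score


print(is_new_best(0.802, 0.800, 0.01))  # False: 0.802 * 0.99 = 0.79398 <= 0.800
print(is_new_best(0.850, 0.800, 0.01))  # True:  0.850 * 0.99 = 0.84150 >  0.800
```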

0 commit comments

Comments
 (0)