
Commit 09a8b58

Merge pull request #38 from kozistr/feature/adamd-optimizer
[Feature] Support AdamD optimizer
2 parents 8b68f23 + 2c12ae8 commit 09a8b58

10 files changed: +112 −33 lines changed


README.rst

Lines changed: 13 additions & 0 deletions

@@ -58,6 +58,8 @@ Supported Optimizers
 +--------------+----------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------+
 | AdaHessian | *An Adaptive Second Order Optimizer for Machine Learning* | `github <https://github.com/amirgholami/adahessian>`__ | `https://arxiv.org/abs/2006.00719 <https://arxiv.org/abs/2006.00719>`__ |
 +--------------+----------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------+
+| AdamD | *Improved bias-correction in Adam* | | `https://arxiv.org/abs/2110.10828 <https://arxiv.org/abs/2110.10828>`__ |
++--------------+----------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------+
 | AdamP | *Slowing Down the Slowdown for Momentum Optimizers on Scale-invariant Weights* | `github <https://github.com/clovaai/AdamP>`__ | `https://arxiv.org/abs/2006.08217 <https://arxiv.org/abs/2006.08217>`__ |
 +--------------+----------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------+
 | diffGrad | *An Optimization Method for Convolutional Neural Networks* | `github <https://github.com/shivram1987/diffGrad>`__ | `https://arxiv.org/abs/1909.11015v3 <https://arxiv.org/abs/1909.11015v3>`__ |
@@ -452,6 +454,17 @@ Gradient Surgery for Multi-Task Learning
         year={2020}
     }

+AdamD: Improved bias-correction in Adam
+
+::
+
+    @article{john2021adamd,
+        title={AdamD: Improved bias-correction in Adam},
+        author={John, John St},
+        journal={arXiv preprint arXiv:2110.10828},
+        year={2021}
+    }
+
 Author
 ------
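
For reference, a minimal usage sketch of the new flag. This assumes, as the package layout below suggests, that the optimizers are importable from the pytorch_optimizer top-level package; the model is only a placeholder:

    import torch

    from pytorch_optimizer import AdaBelief  # assumed top-level export

    model = torch.nn.Linear(10, 1)

    # adamd_debias_term=True applies the AdamD rule: keep only the second-moment
    # (denominator) bias correction, so the effective step size is not inflated
    # during the earliest updates.
    optimizer = AdaBelief(model.parameters(), lr=1e-3, adamd_debias_term=True)

    loss = model(torch.randn(8, 10)).pow(2).mean()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()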

pytorch_optimizer/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -20,4 +20,4 @@
 from pytorch_optimizer.sgdp import SGDP
 from pytorch_optimizer.utils import clip_grad_norm, get_optimizer_parameters, normalize_gradient, unit_norm

-__VERSION__ = '0.2.0'
+__VERSION__ = '0.2.1'

pytorch_optimizer/adabelief.py

Lines changed: 21 additions & 10 deletions

@@ -35,6 +35,7 @@ def __init__(
         rectify: bool = True,
         degenerated_to_sgd: bool = True,
         amsgrad: bool = False,
+        adamd_debias_term: bool = False,
         eps: float = 1e-16,
     ):
         """
@@ -48,6 +49,7 @@ def __init__(
         :param rectify: bool. perform the rectified update similar to RAdam
         :param degenerated_to_sgd: bool. perform SGD update when variance of gradient is high
         :param amsgrad: bool. whether to use the AMSBound variant
+        :param adamd_debias_term: bool. Only correct the denominator to avoid inflating step sizes early in training
         :param eps: float. term added to the denominator to improve numerical stability
         """
         self.lr = lr
@@ -58,6 +60,7 @@ def __init__(
         self.fixed_decay = fixed_decay
         self.rectify = rectify
         self.degenerated_to_sgd = degenerated_to_sgd
+        self.adamd_debias_term = adamd_debias_term
         self.eps = eps

         buffer: BUFFER = [[None, None, None] for _ in range(10)]
@@ -73,6 +76,7 @@ def __init__(
             eps=eps,
             weight_decay=weight_decay,
             amsgrad=amsgrad,
+            adamd_debias_term=adamd_debias_term,
             buffer=buffer,
         )
         super().__init__(params, defaults)
@@ -81,17 +85,17 @@ def __setstate__(self, state: STATE):
         super().__setstate__(state)
         for group in self.param_groups:
             group.setdefault('amsgrad', False)
+            group.setdefault('adamd_debias_term', False)

     def reset(self):
         for group in self.param_groups:
             for p in group['params']:
                 state = self.state[p]
-                amsgrad = group['amsgrad']

                 state['step'] = 0
                 state['exp_avg'] = torch.zeros_like(p.data)
                 state['exp_avg_var'] = torch.zeros_like(p.data)
-                if amsgrad:
+                if group['amsgrad']:
                     state['max_exp_avg_var'] = torch.zeros_like(p.data)

     def step(self, closure: CLOSURE = None) -> LOSS:
@@ -114,14 +118,12 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                 if grad.is_sparse:
                     raise RuntimeError('AdaBelief does not support sparse gradients')

-                amsgrad = group['amsgrad']
-
                 state = self.state[p]
                 if len(state) == 0:
                     state['step'] = 0
                     state['exp_avg'] = torch.zeros_like(p.data)
                     state['exp_avg_var'] = torch.zeros_like(p.data)
-                    if amsgrad:
+                    if group['amsgrad']:
                         state['max_exp_avg_var'] = torch.zeros_like(p.data)

                 if self.weight_decouple:
@@ -145,7 +147,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                 grad_residual = grad - exp_avg
                 exp_avg_var.mul_(beta2).addcmul_(grad_residual, grad_residual, value=1.0 - beta2)

-                if amsgrad:
+                if group['amsgrad']:
                     max_exp_avg_var = state['max_exp_avg_var']

                     torch.max(
@@ -159,7 +161,11 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                 denom = (exp_avg_var.add_(group['eps']).sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

                 if not self.rectify:
-                    step_size = group['lr'] / bias_correction1
+                    if group['adamd_debias_term']:
+                        step_size = group['lr']
+                    else:
+                        step_size = group['lr'] / bias_correction1
+
                     p.data.addcdiv_(exp_avg, denom, value=-step_size)
                 else:
                     buffered = group['buffer'][int(state['step'] % 10)]
@@ -173,17 +179,22 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                         buffered[1] = n_sma

                         if n_sma >= self.n_sma_threshold:
-                            step_size = math.sqrt(
+                            rt = math.sqrt(
                                 (1 - beta2_t)
                                 * (n_sma - 4)
                                 / (n_sma_max - 4)
                                 * (n_sma - 2)
                                 / n_sma
                                 * n_sma_max
                                 / (n_sma_max - 2)
-                            ) / (1 - beta1 ** state['step'])
+                            )
+
+                            if group['adamd_debias_term']:
+                                step_size = rt
+                            else:
+                                step_size = rt / bias_correction1
                         elif self.degenerated_to_sgd:
-                            step_size = 1.0 / (1.0 - beta1 ** state['step'])
+                            step_size = 1.0 / bias_correction1
                         else:
                             step_size = -1

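The non-rectified branch above boils down to a single switch on the numerator bias correction; in the rectified branch the same switch is applied to the RAdam term rt instead of the raw learning rate. A standalone numeric sketch of that rule (hypothetical helper, not part of the library):

    import math


    def adam_style_step_size(lr: float, beta1: float, step: int, adamd_debias_term: bool) -> float:
        # Plain Adam divides the learning rate by (1 - beta1**t); for beta1 = 0.9
        # this multiplies the very first step by 10x.
        bias_correction1 = 1 - beta1 ** step
        if adamd_debias_term:
            # AdamD: drop the numerator correction; only the denominator
            # (second-moment) correction, applied to `denom` above, remains.
            return lr
        return lr / bias_correction1


    print(adam_style_step_size(1e-3, 0.9, 1, False))    # 0.01   (10x the base lr)
    print(adam_style_step_size(1e-3, 0.9, 1, True))     # 0.001
    print(adam_style_step_size(1e-3, 0.9, 100, False))  # ~0.001 (the two rules converge)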

pytorch_optimizer/adabound.py

Lines changed: 12 additions & 6 deletions

@@ -34,6 +34,7 @@ def __init__(
         weight_decouple: bool = True,
         fixed_decay: bool = False,
         amsbound: bool = False,
+        adamd_debias_term: bool = False,
         eps: float = 1e-8,
     ):
         """
@@ -46,6 +47,7 @@ def __init__(
         :param weight_decouple: bool. the optimizer uses decoupled weight decay as in AdamW
         :param fixed_decay: bool.
         :param amsbound: bool. whether to use the AMSBound variant
+        :param adamd_debias_term: bool. Only correct the denominator to avoid inflating step sizes early in training
         :param eps: float. term added to the denominator to improve numerical stability
         """
         self.lr = lr
@@ -62,6 +64,7 @@ def __init__(
             gamma=gamma,
             weight_decay=weight_decay,
             amsbound=amsbound,
+            adamd_debias_term=adamd_debias_term,
             eps=eps,
         )
         super().__init__(params, defaults)
@@ -84,6 +87,7 @@ def __setstate__(self, state: STATE):
         super().__setstate__(state)
         for group in self.param_groups:
             group.setdefault('amsbound', False)
+            group.setdefault('adamd_debias_term', False)

     def step(self, closure: CLOSURE = None) -> LOSS:
         loss: LOSS = None
@@ -99,19 +103,17 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                 if grad.is_sparse:
                     raise RuntimeError('AdaBound does not support sparse gradients')

-                amsbound = group['amsbound']
-
                 state = self.state[p]

                 if len(state) == 0:
                     state['step'] = 0
                     state['exp_avg'] = torch.zeros_like(p)
                     state['exp_avg_sq'] = torch.zeros_like(p)
-                    if amsbound:
+                    if group['amsbound']:
                         state['max_exp_avg_sq'] = torch.zeros_like(p)

                 exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
-                if amsbound:
+                if group['amsbound']:
                     max_exp_avg_sq = state['max_exp_avg_sq']

                 state['step'] += 1
@@ -129,15 +131,19 @@ def step(self, closure: CLOSURE = None) -> LOSS:

                 exp_avg.mul_(beta1).add_(1 - beta1, grad)
                 exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
-                if amsbound:
+                if group['amsbound']:
                     max_exp_avg_sq = torch.max(max_exp_avg_sq, exp_avg_sq)
                     denom = max_exp_avg_sq.sqrt().add_(group['eps'])
                 else:
                     denom = exp_avg_sq.sqrt().add_(group['eps'])

                 bias_correction1 = 1 - beta1 ** state['step']
                 bias_correction2 = 1 - beta2 ** state['step']
-                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
+
+                if group['adamd_debias_term']:
+                    step_size = group['lr'] * math.sqrt(bias_correction2)
+                else:
+                    step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                 final_lr = group['final_lr'] * group['lr'] / base_lr
                 lower_bound = final_lr * (1 - 1 / (group['gamma'] * state['step'] + 1))
pytorch_optimizer/adahessian.py

Lines changed: 8 additions & 1 deletion

@@ -32,6 +32,7 @@ def __init__(
         update_each: int = 1,
         num_samples: int = 1,
         average_conv_kernel: bool = False,
+        adamd_debias_term: bool = False,
         eps: float = 1e-8,
         seed: int = 2147483647,
     ):
@@ -44,6 +45,7 @@ def __init__(
         :param update_each: int. compute the hessian trace approximation only after *this* number of steps
         :param num_samples: int. how many times to sample `z` for the approximation of the hessian trace
         :param average_conv_kernel: bool. average out the hessian traces of convolutional kernels as in the paper.
+        :param adamd_debias_term: bool. Only correct the denominator to avoid inflating step sizes early in training
         :param eps: float. term added to the denominator to improve numerical stability
         :param seed: int.
         """
@@ -69,6 +71,7 @@ def __init__(
             eps=eps,
             weight_decay=weight_decay,
             hessian_power=hessian_power,
+            adamd_debias_term=adamd_debias_term,
         )
         super().__init__(params, defaults)

@@ -179,7 +182,11 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                 hessian_power = group['hessian_power']
                 denom = (exp_hessian_diag_sq / bias_correction2).pow_(hessian_power / 2).add_(group['eps'])

-                step_size = group['lr'] / bias_correction1
+                if group['adamd_debias_term']:
+                    step_size = group['lr']
+                else:
+                    step_size = group['lr'] / bias_correction1
+
                 p.addcdiv_(exp_avg, denom, value=-step_size)

         return loss
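
Each optimizer also registers the new key via group.setdefault in __setstate__, so optimizer state saved before this change still loads. A minimal illustration of that pattern (hypothetical toy optimizer, not the library code):

    import torch


    class ToyOptimizer(torch.optim.Optimizer):
        def __init__(self, params, lr: float = 1e-3, adamd_debias_term: bool = False):
            defaults = dict(lr=lr, adamd_debias_term=adamd_debias_term)
            super().__init__(params, defaults)

        def __setstate__(self, state):
            super().__setstate__(state)
            for group in self.param_groups:
                # Old checkpoints have no 'adamd_debias_term' entry; default it
                # to False instead of raising KeyError later in step().
                group.setdefault('adamd_debias_term', False)


    param = torch.nn.Parameter(torch.zeros(3))
    opt = ToyOptimizer([param])

    old_style = opt.state_dict()
    del old_style['param_groups'][0]['adamd_debias_term']  # simulate a pre-0.2.1 checkpoint

    opt.load_state_dict(old_style)
    print(opt.param_groups[0]['adamd_debias_term'])  # False, filled in by __setstate__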

pytorch_optimizer/adamp.py

Lines changed: 8 additions & 3 deletions

@@ -35,6 +35,7 @@ def __init__(
         wd_ratio: float = 0.1,
         use_gc: bool = False,
         nesterov: bool = False,
+        adamd_debias_term: bool = False,
         eps: float = 1e-8,
     ):
         """
@@ -47,6 +48,7 @@ def __init__(
            on scale-variant parameters
         :param use_gc: bool. use gradient centralization
         :param nesterov: bool. enables Nesterov momentum
+        :param adamd_debias_term: bool. Only correct the denominator to avoid inflating step sizes early in training
         :param eps: float. term added to the denominator to improve numerical stability
         """
         self.lr = lr
@@ -65,6 +67,7 @@ def __init__(
             delta=delta,
             wd_ratio=wd_ratio,
             nesterov=nesterov,
+            adamd_debias_term=adamd_debias_term,
             eps=eps,
         )
         super().__init__(params, defaults)
@@ -157,10 +160,12 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                 exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                 denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
-                step_size = group['lr'] / bias_correction1
+                if group['adamd_debias_term']:
+                    step_size = group['lr']
+                else:
+                    step_size = group['lr'] / bias_correction1

-                nesterov = group['nesterov']
-                if nesterov:
+                if group['nesterov']:
                     perturb = (beta1 * exp_avg + (1 - beta1) * grad) / denom
                 else:
                     perturb = exp_avg / denom
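
The Nesterov branch kept above blends the current gradient back into the momentum before dividing by the denominator. A small standalone sketch of the two perturb variants (hypothetical helper mirroring the lines above, not the library code):

    import torch


    def adamp_perturb(exp_avg: torch.Tensor, grad: torch.Tensor, denom: torch.Tensor,
                      beta1: float, nesterov: bool) -> torch.Tensor:
        if nesterov:
            # Look-ahead: mix the fresh gradient into the momentum before normalizing.
            return (beta1 * exp_avg + (1 - beta1) * grad) / denom
        # Standard Adam-style direction: normalized first moment only.
        return exp_avg / denom


    exp_avg = torch.tensor([0.5, -0.2])
    grad = torch.tensor([1.0, 0.0])
    denom = torch.tensor([0.1, 0.1])
    print(adamp_perturb(exp_avg, grad, denom, beta1=0.9, nesterov=False))  # [ 5.0, -2.0]
    print(adamp_perturb(exp_avg, grad, denom, beta1=0.9, nesterov=True))   # [ 5.5, -1.8]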

pytorch_optimizer/diffgrad.py

Lines changed: 9 additions & 2 deletions

@@ -29,13 +29,15 @@ def __init__(
         betas: BETAS = (0.9, 0.999),
         eps: float = 1e-8,
         weight_decay: float = 0.0,
+        adamd_debias_term: bool = False,
     ):
         """
         :param params: PARAMETERS. iterable of parameters to optimize or dicts defining parameter groups
         :param lr: float. learning rate.
         :param betas: BETAS. coefficients used for computing running averages of gradient and the squared hessian trace
         :param eps: float. term added to the denominator to improve numerical stability
         :param weight_decay: float. weight decay (L2 penalty)
+        :param adamd_debias_term: bool. Only correct the denominator to avoid inflating step sizes early in training
         """
         self.lr = lr
         self.eps = eps
@@ -44,7 +46,9 @@ def __init__(

         self.check_valid_parameters()

-        defaults: DEFAULTS = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
+        defaults: DEFAULTS = dict(
+            lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, adamd_debias_term=adamd_debias_term
+        )
         super().__init__(params, defaults)

     def check_valid_parameters(self):
@@ -107,7 +111,10 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                 # update momentum with dfc
                 exp_avg1 = exp_avg * dfc

-                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
+                if group['adamd_debias_term']:
+                    step_size = group['lr'] * math.sqrt(bias_correction2)
+                else:
+                    step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

                 p.data.addcdiv_(-step_size, exp_avg1, denom)

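
A quick way to see the flag's effect end to end is to compare the size of the very first update with it off and on; a rough sketch, assuming DiffGrad is exported at the package top level like the other optimizers:

    import torch

    from pytorch_optimizer import DiffGrad  # assumed top-level export


    def first_update_distance(adamd_debias_term: bool) -> float:
        param = torch.nn.Parameter(torch.ones(4))
        optimizer = DiffGrad([param], lr=1e-2, adamd_debias_term=adamd_debias_term)
        (param ** 2).sum().backward()
        optimizer.step()
        return (param.detach() - 1.0).abs().max().item()


    # With the flag on, the first step is no longer divided by
    # (1 - beta1) = 0.1, so it should come out roughly 10x smaller.
    print(first_update_distance(False))
    print(first_update_distance(True))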
