Commit 26fca7b

refactor: optimizers
1 parent 19c2136 commit 26fca7b


11 files changed: +25 -36 lines


pytorch_optimizer/optimizer/adabelief.py

Lines changed: 2 additions & 4 deletions
@@ -95,6 +95,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
 
         for group in self.param_groups:
             beta1, beta2 = group['betas']
+            n_sma_max: float = 2 / (1 - beta2) - 1
             for p in group['params']:
                 if p.grad is None:
                     continue
@@ -154,12 +155,11 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                 else:
                     buffered[0] = state['step']
                     beta2_t = beta2 ** state['step']
-                    n_sma_max = 2 / (1 - beta2) - 1
                     n_sma = n_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
                     buffered[1] = n_sma
 
                     if n_sma >= self.n_sma_threshold:
-                        rt = math.sqrt(
+                        step_size = math.sqrt(
                             (1 - beta2_t)
                             * (n_sma - 4)
                             / (n_sma_max - 4)
@@ -168,8 +168,6 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                             * n_sma_max
                             / (n_sma_max - 2)
                         )
-
-                        step_size = rt
                         if not group['adamd_debias_term']:
                             step_size /= bias_correction1
                     elif self.degenerated_to_sgd:
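
The hoisted line works because `n_sma_max` is a pure function of `beta2`, which is fixed for a parameter group, so computing it once at the top of the group loop is equivalent to recomputing it inside the buffered-step branch. A minimal sketch of that loop invariance (illustrative values, not the optimizer's code):

    beta2 = 0.999
    n_sma_max = 2 / (1 - beta2) - 1  # 1999.0 -- the same value on every step

    for step in (1, 10, 100):
        beta2_t = beta2 ** step
        # n_sma depends on the step; n_sma_max does not, so it can live outside the loop
        n_sma = n_sma_max - 2 * step * beta2_t / (1 - beta2_t)
        print(step, n_sma)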

pytorch_optimizer/optimizer/adabound.py

Lines changed: 2 additions & 2 deletions
@@ -113,7 +113,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                 state['step'] += 1
                 exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
 
-                if group['weight_decay'] != 0:
+                if group['weight_decay'] > 0.0:
                     if self.weight_decouple:
                         p.mul_(
                             1.0 - (group['weight_decay'] if self.fixed_decay else group['lr'] * group['weight_decay'])
@@ -124,7 +124,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                 exp_avg.mul_(beta1).add_(grad, alpha=1.0 - beta1)
                 exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2)
                 if group['amsbound']:
-                    exp_avg_sq = torch.max(state['max_exp_avg_sq'], exp_avg_sq)
+                    torch.max(state['max_exp_avg_sq'], exp_avg_sq, out=exp_avg_sq)
 
                 de_nom = exp_avg_sq.sqrt().add_(group['eps'])
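
In the AMSBound branch, the old line rebound the local name `exp_avg_sq` to a fresh tensor returned by `torch.max`; the new line writes the element-wise maximum into the existing `exp_avg_sq` buffer via `out=`, so no new tensor is allocated per step. A small sketch of the difference, using illustrative tensors rather than the optimizer's state:

    import torch

    max_exp_avg_sq = torch.tensor([1.0, 5.0, 2.0])
    exp_avg_sq = torch.tensor([3.0, 4.0, 2.5])

    # before: allocates a result tensor and rebinds the local name
    rebound = torch.max(max_exp_avg_sq, exp_avg_sq)

    # after: stores the element-wise maximum in the existing buffer
    torch.max(max_exp_avg_sq, exp_avg_sq, out=exp_avg_sq)

    assert torch.equal(rebound, exp_avg_sq)  # same values, written in place

The same change appears below in adapnm.py for its `amsgrad` branch.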

pytorch_optimizer/optimizer/adai.py

Lines changed: 1 addition & 1 deletion
@@ -113,7 +113,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
 
                 bias_correction2 = 1.0 - beta2 ** state['step']
 
-                if group['weight_decay'] != 0:
+                if group['weight_decay'] > 0.0:
                     if self.weight_decouple:
                         p.mul_(1.0 - group['lr'] * group['weight_decay'])
                     else:
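
This is the same guard normalization applied across the commit (adabound, adamp, lamb, madgrad, and radam as well): the weight-decay check becomes `> 0.0`, so a zero or negative value now skips the decay branch entirely. A standalone sketch of the decoupled path that the guard protects; the function name and arguments here are illustrative, not part of the library:

    import torch

    def apply_decoupled_weight_decay(p: torch.Tensor, lr: float, weight_decay: float) -> None:
        # with the new guard, weight_decay <= 0.0 leaves the parameter untouched
        if weight_decay > 0.0:
            p.mul_(1.0 - lr * weight_decay)

    param = torch.ones(3)
    apply_decoupled_weight_decay(param, lr=1e-3, weight_decay=1e-2)
    print(param)  # every entry scaled by (1 - 1e-5)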

pytorch_optimizer/optimizer/adamp.py

Lines changed: 1 addition & 1 deletion
@@ -136,7 +136,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                     group['eps'],
                 )
 
-                if group['weight_decay'] > 0:
+                if group['weight_decay'] > 0.0:
                     p.mul_(1.0 - group['lr'] * group['weight_decay'] * wd_ratio)
 
                 step_size = group['lr']

pytorch_optimizer/optimizer/adapnm.py

Lines changed: 1 addition & 1 deletion
@@ -122,7 +122,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                 exp_avg.mul_(beta1 ** 2).add_(grad, alpha=1 - beta1 ** 2)  # fmt: skip
                 exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2)
                 if group['amsgrad']:
-                    exp_avg_sq = torch.max(state['max_exp_avg_sq'], exp_avg_sq)
+                    torch.max(state['max_exp_avg_sq'], exp_avg_sq, out=exp_avg_sq)
 
                 de_nom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])

pytorch_optimizer/optimizer/diffrgrad.py

Lines changed: 3 additions & 6 deletions
@@ -81,6 +81,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
 
         for group in self.param_groups:
             beta1, beta2 = group['betas']
+            n_sma_max: float = 2.0 / (1.0 - beta2) - 1.0
             for p in group['params']:
                 if p.grad is None:
                     continue
@@ -107,9 +108,8 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                     state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_fp32)
                     state['previous_grad'] = state['previous_grad'].type_as(p_fp32)
 
-                exp_avg, exp_avg_sq, previous_grad = state['exp_avg'], state['exp_avg_sq'], state['previous_grad']
-
                 state['step'] += 1
+                exp_avg, exp_avg_sq, previous_grad = state['exp_avg'], state['exp_avg_sq'], state['previous_grad']
 
                 bias_correction1 = 1.0 - beta1 ** state['step']
 
@@ -127,12 +127,11 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                 else:
                     buffered[0] = state['step']
                     beta2_t = beta2 ** state['step']
-                    n_sma_max = 2.0 / (1.0 - beta2) - 1.0
                     n_sma = n_sma_max - 2.0 * state['step'] * beta2_t / (1.0 - beta2_t)
                     buffered[1] = n_sma
 
                     if n_sma >= self.n_sma_threshold:
-                        rt = math.sqrt(
+                        step_size = math.sqrt(
                             (1 - beta2_t)
                             * (n_sma - 4)
                             / (n_sma_max - 4)
@@ -141,8 +140,6 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                             * n_sma_max
                             / (n_sma_max - 2)
                         )
-
-                        step_size = rt
                         if not group['adamd_debias_term']:
                             step_size /= bias_correction1
                     elif self.degenerated_to_sgd:

pytorch_optimizer/optimizer/lamb.py

Lines changed: 1 addition & 1 deletion
@@ -142,7 +142,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                 exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2)
 
                 adam_step = exp_avg / exp_avg_sq.sqrt().add(group['eps'])
-                if group['weight_decay'] != 0:
+                if group['weight_decay'] > 0.0:
                     adam_step.add_(p, alpha=group['weight_decay'])
 
                 weight_norm = p.norm(2).clamp(0, self.clamp)

pytorch_optimizer/optimizer/madgrad.py

Lines changed: 7 additions & 7 deletions
@@ -86,7 +86,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
         for group in self.param_groups:
             eps = group['eps']
             lr = group['lr'] + eps
-            decay = group['weight_decay']
+            weight_decay = group['weight_decay']
             momentum = group['momentum']
 
             ck: float = 1.0 - momentum
@@ -111,15 +111,15 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                 grad_sum_sq = state['grad_sum_sq']
                 s = state['s']
 
-                if decay != 0 and not self.decouple_decay:
+                if weight_decay > 0.0 and not self.decouple_decay:
                     if grad.is_sparse:
                         raise NoSparseGradientError(self.__name__, note='weight_decay')
 
                     # original implementation
-                    grad.add_(p, alpha=decay)
+                    grad.add_(p, alpha=weight_decay)
 
                     # Apply weight decay - L2 / AdamW style
-                    # p.mul_(1.0 - lr * decay)
+                    # p.mul_(1.0 - lr * weight_decay)
 
                 if grad.is_sparse:
                     grad = grad.coalesce()
@@ -167,7 +167,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
 
                 s.add_(grad, alpha=_lambda)
 
-                if decay != 0 and self.decouple_decay:
+                if weight_decay > 0.0 and self.decouple_decay:
                     p_old = p.clone()
 
                 if momentum == 0.0:
@@ -176,8 +176,8 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                     z = x0.addcdiv(s, rms, value=-1)
                     p.mul_(1.0 - ck).add_(z, alpha=ck)
 
-                if decay != 0 and self.decouple_decay:
-                    p.add_(p_old, alpha=-lr * decay)
+                if weight_decay > 0.0 and self.decouple_decay:
+                    p.add_(p_old, alpha=-lr * weight_decay)
 
         self.state['k'] += 1
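
Beyond renaming `decay` to `weight_decay`, the hunks above show the two decay paths MADGRAD keeps: the coupled (L2-style) path folds the decay into the gradient before the update, while the decoupled path subtracts `lr * weight_decay` times the pre-update parameter afterwards. A simplified contrast of the two styles on a plain gradient step; the helper names here are illustrative, not the library's API:

    import torch

    def coupled_step(p: torch.Tensor, grad: torch.Tensor, lr: float, weight_decay: float) -> None:
        # L2 regularization: the decay term enters through the gradient
        grad = grad.add(p, alpha=weight_decay)
        p.add_(grad, alpha=-lr)

    def decoupled_step(p: torch.Tensor, grad: torch.Tensor, lr: float, weight_decay: float) -> None:
        # AdamW-style: the decay is applied against the old parameter, independent of the gradient
        p_old = p.clone()
        p.add_(grad, alpha=-lr)
        p.add_(p_old, alpha=-lr * weight_decay)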

pytorch_optimizer/optimizer/radam.py

Lines changed: 3 additions & 5 deletions
@@ -81,6 +81,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
 
         for group in self.param_groups:
             beta1, beta2 = group['betas']
+            n_sma_max: float = 2.0 / (1.0 - beta2) - 1.0
             for p in group['params']:
                 if p.grad is None:
                     continue
@@ -120,12 +121,11 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                 else:
                     buffered[0] = state['step']
                     beta2_t = beta2 ** state['step']
-                    n_sma_max = 2.0 / (1.0 - beta2) - 1.0
                     n_sma = n_sma_max - 2.0 * state['step'] * beta2_t / (1.0 - beta2_t)
                     buffered[1] = n_sma
 
                     if n_sma >= self.n_sma_threshold:
-                        rt = math.sqrt(
+                        step_size = math.sqrt(
                             (1 - beta2_t)
                             * (n_sma - 4)
                             / (n_sma_max - 4)
@@ -134,8 +134,6 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                             * n_sma_max
                             / (n_sma_max - 2)
                         )
-
-                        step_size = rt
                         if not group['adamd_debias_term']:
                             step_size /= bias_correction1
                     elif self.degenerated_to_sgd:
@@ -144,7 +142,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                         step_size = -1
                     buffered[2] = step_size
 
-                if group['weight_decay'] != 0 and (n_sma >= self.n_sma_threshold or step_size > 0):
+                if group['weight_decay'] > 0.0 and (n_sma >= self.n_sma_threshold or step_size > 0):
                     p_fp32.add_(p_fp32, alpha=-group['weight_decay'] * group['lr'])
 
                 if n_sma >= self.n_sma_threshold:
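
With the temporary `rt` gone, the rectification factor is assigned straight to `step_size` and then optionally divided by `bias_correction1`. The hunks above elide a couple of lines in the middle of the product, so the sketch below uses the standard RAdam rectification term rather than a verbatim copy of the file; treat it as illustrative only:

    import math

    def rectified_step_size(step: int, beta1: float, beta2: float, adamd_debias_term: bool = False) -> float:
        beta2_t = beta2 ** step
        n_sma_max = 2.0 / (1.0 - beta2) - 1.0
        n_sma = n_sma_max - 2.0 * step * beta2_t / (1.0 - beta2_t)

        # the optimizer only takes this path when n_sma >= n_sma_threshold
        step_size = math.sqrt(
            (1 - beta2_t) * (n_sma - 4) / (n_sma_max - 4) * (n_sma - 2) / n_sma * n_sma_max / (n_sma_max - 2)
        )
        if not adamd_debias_term:
            step_size /= 1.0 - beta1 ** step  # bias_correction1
        return step_size

    print(rectified_step_size(step=100, beta1=0.9, beta2=0.999))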

pytorch_optimizer/optimizer/ralamb.py

Lines changed: 2 additions & 4 deletions
@@ -104,6 +104,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
 
         for group in self.param_groups:
             beta1, beta2 = group['betas']
+            n_sma_max: float = 2.0 / (1.0 - beta2) - 1.0
             for p in group['params']:
                 if p.grad is None:
                     continue
@@ -147,13 +148,12 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                 else:
                     buffered[0] = state['step']
                     beta2_t = beta2 ** state['step']
-                    n_sma_max = 2 / (1 - beta2) - 1
                     n_sma = n_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
                     buffered[1] = n_sma
 
                     # more conservative since it's an approximated value
                     if n_sma >= self.n_sma_threshold:
-                        rt = math.sqrt(
+                        step_size = math.sqrt(
                             (1 - beta2_t)
                             * (n_sma - 4)
                             / (n_sma_max - 4)
@@ -162,8 +162,6 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                             * n_sma_max
                             / (n_sma_max - 2)
                         )
-
-                        step_size = rt
                         if not group['adamd_debias_term']:
                             step_size /= bias_correction1
                     elif self.degenerated_to_sgd:
