kozistr
diff --git a/poetry.lock b/poetry.lock
Lines changed: 312 additions & 114 deletions
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 12 additions & 6 deletions
diff --git a/pytorch_optimizer/base/exception.py b/pytorch_optimizer/base/exception.py
Lines changed: 3 additions & 3 deletions
diff --git a/pytorch_optimizer/lr_scheduler/cosine_anealing.py b/pytorch_optimizer/lr_scheduler/cosine_anealing.py
Lines changed: 13 additions & 16 deletions
diff --git a/pytorch_optimizer/optimizer/dadapt.py b/pytorch_optimizer/optimizer/dadapt.py
Lines changed: 2 additions & 10 deletions
diff --git a/pytorch_optimizer/optimizer/lookahead.py b/pytorch_optimizer/optimizer/lookahead.py
Lines changed: 63 additions & 58 deletions
diff --git a/pytorch_optimizer/optimizer/madgrad.py b/pytorch_optimizer/optimizer/madgrad.py
Lines changed: 1 addition & 8 deletions
diff --git a/pytorch_optimizer/optimizer/pcgrad.py b/pytorch_optimizer/optimizer/pcgrad.py
Lines changed: 6 additions & 6 deletions
diff --git a/pytorch_optimizer/optimizer/ranger21.py b/pytorch_optimizer/optimizer/ranger21.py
Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pytorch_optimizer"
-version = "2.5.1"
+version = "2.5.2"
 description = "optimizer & lr scheduler implementations in PyTorch with clean-code, strict types. Also, including useful optimization ideas."
 license = "Apache-2.0"
 authors = ["kozistr <[email protected]>"]
@@ -37,13 +37,19 @@ numpy = [
     { version = "=1.21.1", python = ">=3.7,<3.8" },
     { version = "*", python = ">=3.8" },
 ]
-torch = { version = ">=1.10", source = "torch" }
+torch = [
+    { version = ">=1.10,>=2.0", python = ">=3.8", source = "torch" },
+    { version = "^1.10", python = ">=3.7,<3.8", source = "torch" },
+]
 
 [tool.poetry.dev-dependencies]
-isort = "^5.11.5"
-black = "^23.1.0"
-ruff = "^0.0.244"
-pytest = "^7.2.1"
+isort = [
+    { version = "==5.11.5", python = ">=3.7,<3.8"},
+    { version = "^5.12.0", python = ">=3.8"}
+]
+black = "^23.3.0"
+ruff = "^0.0.260"
+pytest = "^7.2.2"
 pytest-cov = "^4.0.0"
 
 [[tool.poetry.source]]
 
@@ -6,7 +6,7 @@ class NoSparseGradientError(Exception):
     """
 
     def __init__(self, optimizer_name: str, note: str = ''):
-        self.note: str = ' ' if note == '' else f' w/ {note} '
+        self.note: str = ' ' if not note else f' w/ {note} '
         self.message: str = f'[-] {optimizer_name}{self.note}does not support sparse gradient.'
         super().__init__(self.message)
 
@@ -31,7 +31,7 @@ class NegativeLRError(Exception):
     """Raised when learning rate is negative."""
 
     def __init__(self, lr: float, lr_type: str = ''):
-        self.note: str = 'learning rate' if lr_type == '' else lr_type
+        self.note: str = lr_type if lr_type else 'learning rate'
         self.message: str = f'[-] {self.note} must be positive. ({lr} > 0)'
         super().__init__(self.message)
 
@@ -40,6 +40,6 @@ class NegativeStepError(Exception):
     """Raised when step is negative."""
 
     def __init__(self, num_steps: int, step_type: str = ''):
-        self.note: str = 'step' if step_type == '' else step_type
+        self.note: str = step_type if step_type else 'step'
         self.message: str = f'[-] {self.note} must be positive. ({num_steps} > 0)'
         super().__init__(self.message)
@@ -91,23 +91,20 @@ def step(self, epoch: Optional[int] = None):
                 self.cur_cycle_steps = (
                     int((self.cur_cycle_steps - self.warmup_steps) * self.cycle_mult) + self.warmup_steps
                 )
-        else:
-            if epoch >= self.first_cycle_steps:
-                if self.cycle_mult == 1.0:
-                    self.step_in_cycle = epoch % self.first_cycle_steps
-                    self.cycle = epoch // self.first_cycle_steps
-                else:
-                    n: int = int(
-                        math.log((epoch / self.first_cycle_steps * (self.cycle_mult - 1) + 1), self.cycle_mult)
-                    )
-                    self.cycle = n
-                    self.step_in_cycle = epoch - int(
-                        self.first_cycle_steps * (self.cycle_mult ** n - 1) / (self.cycle_mult - 1)
-                    )  # fmt: skip
-                    self.cur_cycle_steps = self.first_cycle_steps * self.cycle_mult ** n  # fmt: skip
+        elif epoch >= self.first_cycle_steps:
+            if self.cycle_mult == 1.0:
+                self.step_in_cycle = epoch % self.first_cycle_steps
+                self.cycle = epoch // self.first_cycle_steps
             else:
-                self.cur_cycle_steps = self.first_cycle_steps
-                self.step_in_cycle = epoch
+                n: int = int(math.log((epoch / self.first_cycle_steps * (self.cycle_mult - 1) + 1), self.cycle_mult))
+                self.cycle = n
+                self.step_in_cycle = epoch - int(
+                    self.first_cycle_steps * (self.cycle_mult ** n - 1) / (self.cycle_mult - 1)
+                )  # fmt: skip
+                self.cur_cycle_steps = self.first_cycle_steps * self.cycle_mult ** n  # fmt: skip
+        else:
+            self.cur_cycle_steps = self.first_cycle_steps
+            self.step_in_cycle = epoch
 
         self.max_lr = self.base_max_lr * (self.gamma ** self.cycle)  # fmt: skip
         self.last_epoch = math.floor(epoch)
 
@@ -77,11 +77,7 @@ def reset(self):
 
                 state = self.state[p]
 
-                try:
-                    state['alpha_k'] = torch.full_like(p, fill_value=1e-6)
-                except NotImplementedError:  # there's no fill_() op for SpareTensorCPU
-                    state['alpha_k'] = torch.zeros_like(p)
-
+                state['alpha_k'] = torch.full_like(p, fill_value=1e-6)
                 state['sk'] = torch.zeros_like(p)
                 state['x0'] = torch.clone(p)
                 if p.grad.is_sparse:
@@ -119,11 +115,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
 
                 state = self.state[p]
                 if 'alpha_k' not in state:
-                    try:
-                        state['alpha_k'] = torch.full_like(p, fill_value=1e-6)
-                    except NotImplementedError:  # there's no fill_() op for SpareTensorCPU
-                        state['alpha_k'] = torch.zeros_like(p)
-
+                    state['alpha_k'] = torch.full_like(p, fill_value=1e-6)
                     state['sk'] = torch.zeros_like(p)
                     state['x0'] = torch.clone(p)
                     if grad.is_sparse:
 
@@ -2,13 +2,12 @@
 from typing import Dict
 
 import torch
-from torch.optim import Optimizer
 
 from pytorch_optimizer.base.optimizer import BaseOptimizer
-from pytorch_optimizer.base.types import CLOSURE, DEFAULTS, LOSS, OPTIMIZER, STATE
+from pytorch_optimizer.base.types import CLOSURE, LOSS, OPTIMIZER, STATE
 
 
-class Lookahead(Optimizer, BaseOptimizer):
+class Lookahead(BaseOptimizer):
     r"""k steps forward, 1 step back.
 
     :param optimizer: OPTIMIZER. base optimizer.
@@ -17,7 +16,7 @@ class Lookahead(Optimizer, BaseOptimizer):
     :param pullback_momentum: str. change to inner optimizer momentum on interpolation update.
     """
 
-    def __init__(  # pylint: disable=super-init-not-called
+    def __init__(
         self,
         optimizer: OPTIMIZER,
         k: int = 5,
@@ -32,62 +31,90 @@ def __init__(  # pylint: disable=super-init-not-called
         self.validate_parameters()
 
         self.param_groups = self.optimizer.param_groups
-        self.fast_state: STATE = self.optimizer.state
         self.state: STATE = defaultdict(dict)
-        self.reset()
 
-        self.defaults: DEFAULTS = optimizer.defaults
-        self.defaults.update(
-            {
-                'k': k,
-                'alpha': alpha,
-                'pullback_momentum': pullback_momentum,
-            }
-        )
+        for group in self.param_groups:
+            if 'counter' not in group:
+                group['counter'] = 0
+
+            for p in group['params']:
+                state = self.state[p]
+                state['slow_params'] = torch.empty_like(p)
+                state['slow_params'].copy_(p)
+                if self.pullback_momentum == 'pullback':
+                    state['slow_momentum'] = torch.zeros_like(p)
 
     def validate_parameters(self):
         self.validate_lookahead_k(self.k)
         self.validate_alpha(self.alpha)
         self.validate_pullback_momentum(self.pullback_momentum)
 
+    def __getstate__(self):
+        return {
+            'state': self.state,
+            'optimizer': self.optimizer,
+            'alpha': self.alpha,
+            'k': self.k,
+            'pullback_momentum': self.pullback_momentum,
+        }
+
     @torch.no_grad()
     def reset(self):
         for group in self.param_groups:
             group['counter'] = 0
 
+    def backup_and_load_cache(self):
+        r"""Backup cache parameters."""
+        for group in self.param_groups:
+            for p in group['params']:
+                state = self.state[p]
+                state['backup_params'] = torch.empty_like(p)
+                state['backup_params'].copy_(p)
+                p.data.copy_(state['slow_params'])
+
+    def clear_and_load_backup(self):
+        r"""Load backup parameters."""
+        for group in self.param_groups:
+            for p in group['params']:
+                state = self.state[p]
+                p.data.copy_(state['backup_params'])
+                del state['backup_params']
+
+    def state_dict(self) -> STATE:
+        return self.optimizer.state_dict()
+
+    def load_state_dict(self, state: STATE):
+        r"""Load state."""
+        self.optimizer.load_state_dict(state)
+
+    @torch.no_grad()
+    def zero_grad(self):
+        self.optimizer.zero_grad(set_to_none=True)
+
     @torch.no_grad()
     def update(self, group: Dict):
-        for fast in group['params']:
-            if fast.grad is None:
+        for p in group['params']:
+            if p.grad is None:
                 continue
 
-            param_state = self.state[fast]
-            if 'slow_param' not in param_state:
-                param_state['slow_param'] = torch.empty_like(fast)
-                param_state['slow_param'].copy_(fast)
-                if self.pullback_momentum == 'pullback':
-                    param_state['slow_mom'] = torch.zeros_like(fast)
+            state = self.state[p]
 
-            slow = param_state['slow_param']
-            slow.add_(fast - slow, alpha=self.alpha)
+            slow = state['slow_params']
 
-            fast.copy_(slow)
+            p.mul_(self.alpha).add_(slow, alpha=1.0 - self.alpha)
+            slow.copy_(p)
 
-            if 'momentum_buffer' not in self.optimizer.state[fast]:
-                self.optimizer.state[fast]['momentum_buffer'] = torch.zeros_like(fast)
+            if 'momentum_buffer' not in self.optimizer.state[p]:
+                self.optimizer.state[p]['momentum_buffer'] = torch.zeros_like(p)
 
             if self.pullback_momentum == 'pullback':
-                internal_momentum = self.optimizer.state[fast]['momentum_buffer']
-                self.optimizer.state[fast]['momentum_buffer'] = internal_momentum.mul_(self.alpha).add_(
-                    param_state['slow_mom'], alpha=1.0 - self.alpha
+                internal_momentum = self.optimizer.state[p]['momentum_buffer']
+                self.optimizer.state[p]['momentum_buffer'] = internal_momentum.mul_(self.alpha).add_(
+                    state['slow_momentum'], alpha=1.0 - self.alpha
                 )
-                param_state['slow_mom'] = self.optimizer.state[fast]['momentum_buffer']
+                state['slow_momentum'] = self.optimizer.state[p]['momentum_buffer']
             elif self.pullback_momentum == 'reset':
-                self.optimizer.state[fast]['momentum_buffer'] = torch.zeros_like(fast)
-
-    def update_lookahead(self):
-        for group in self.param_groups:
-            self.update(group)
+                self.optimizer.state[p]['momentum_buffer'] = torch.zeros_like(p)
 
     def step(self, closure: CLOSURE = None) -> LOSS:
         loss: LOSS = self.optimizer.step(closure)
@@ -97,25 +124,3 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                 group['counter'] = 0
                 self.update(group)
         return loss
-
-    def state_dict(self) -> STATE:
-        fast_state: STATE = self.optimizer.state_dict()
-        slow_state: STATE = {(id(k) if isinstance(k, torch.Tensor) else k): v for k, v in self.state.items()}
-
-        return {
-            'fast_state': fast_state['state'],
-            'slow_state': slow_state,
-            'param_groups': fast_state['param_groups'],
-        }
-
-    def load_state_dict(self, state: STATE):
-        slow_state: STATE = {'state': state['slow_state'], 'param_groups': state['param_groups']}
-        fast_state: STATE = {'state': state['fast_state'], 'param_groups': state['param_groups']}
-        super().load_state_dict(slow_state)
-
-        self.optimizer.load_state_dict(fast_state)
-        self.fast_state = self.optimizer.state
-
-    def add_param_group(self, param_group):
-        param_group['counter'] = 0
-        self.optimizer.add_param_group(param_group)
 
@@ -69,8 +69,6 @@ def reset(self):
 
     @torch.no_grad()
     def step(self, closure: CLOSURE = None) -> LOSS:
-        # pylint: disable=W0212
-
         loss: LOSS = None
         if closure is not None:
             with torch.enable_grad():
@@ -80,13 +78,11 @@ def step(self, closure: CLOSURE = None) -> LOSS:
         if 'k' not in self.state:
             self.state['k'] = torch.tensor([0], dtype=torch.long, requires_grad=False)
 
-        k = self.state['k']
-
         for group in self.param_groups:
             weight_decay, momentum, eps = group['weight_decay'], group['momentum'], group['eps']
             lr = group['lr'] + eps
 
-            _lambda = lr * math.pow(k + 1, 0.5)
+            _lambda = lr * math.pow(self.state['k'] + 1, 0.5)
 
             for p in group['params']:
                 if p.grad is None:
@@ -105,7 +101,6 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                     raise NoSparseGradientError(str(self), note='momentum > 0.0')
 
                 grad_sum_sq, s = state['grad_sum_sq'], state['s']
-
                 if weight_decay > 0.0 and not self.decouple_decay:
                     if grad.is_sparse:
                         raise NoSparseGradientError(str(self), note='weight_decay')
@@ -120,11 +115,9 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                     grad_sum_sq_masked = grad_sum_sq.sparse_mask(grad)
                     s_masked = s.sparse_mask(grad)
 
-                    # Compute x_0 from other known quantities
                     rms_masked_values = grad_sum_sq_masked._values().pow(1 / 3).add_(eps)
                     x0_masked_values = p_masked._values().addcdiv(s_masked._values(), rms_masked_values, value=1)
 
-                    # Dense + sparse op
                     grad_sq = grad * grad
                     grad_sum_sq.add_(grad_sq, alpha=_lambda)
                     grad_sum_sq_masked.add_(grad_sq, alpha=_lambda)
 
@@ -44,7 +44,7 @@ def set_grad(self, grads: List[torch.Tensor]):
                 idx += 1
 
     def retrieve_grad(self) -> Tuple[List[torch.Tensor], List[int], List[torch.Tensor]]:
-        r"""get the gradient of the parameters of the network with specific objective."""
+        r"""Get the gradient of the parameters of the network with specific objective."""
         grad, shape, has_grad = [], [], []
         for group in self.optimizer.param_groups:
             for p in group['params']:
@@ -61,7 +61,7 @@ def retrieve_grad(self) -> Tuple[List[torch.Tensor], List[int], List[torch.Tenso
         return grad, shape, has_grad
 
     def pack_grad(self, objectives: Iterable) -> Tuple[List[torch.Tensor], List[List[int]], List[torch.Tensor]]:
-        r"""pack the gradient of the parameters of the network for each objective.
+        r"""Pack the gradient of the parameters of the network for each objective.
 
         :param objectives: Iterable[nn.Module]. a list of objectives.
         :return: torch.Tensor. packed gradients.
@@ -80,7 +80,7 @@ def pack_grad(self, objectives: Iterable) -> Tuple[List[torch.Tensor], List[List
         return grads, shapes, has_grads
 
     def project_conflicting(self, grads: List[torch.Tensor], has_grads: List[torch.Tensor]) -> torch.Tensor:
-        r"""project conflicting.
+        r"""Project conflicting.
 
         :param grads: a list of the gradient of the parameters.
         :param has_grads: a list of mask represent whether the parameter has gradient.
@@ -89,12 +89,12 @@ def project_conflicting(self, grads: List[torch.Tensor], has_grads: List[torch.T
         shared: torch.Tensor = torch.stack(has_grads).prod(0).bool()
 
         pc_grad: List[torch.Tensor] = deepcopy(grads)
-        for g_i in pc_grad:
+        for i, g_i in enumerate(pc_grad):
             random.shuffle(grads)
             for g_j in grads:
                 g_i_g_j: torch.Tensor = torch.dot(g_i, g_j)
                 if g_i_g_j < 0:
-                    g_i -= g_i_g_j * g_j / (g_j.norm() ** 2)
+                    pc_grad[i] -= g_i_g_j * g_j / (g_j.norm() ** 2)
 
         merged_grad: torch.Tensor = torch.zeros_like(grads[0], device=grads[0].device)
 
@@ -109,7 +109,7 @@ def project_conflicting(self, grads: List[torch.Tensor], has_grads: List[torch.T
         return merged_grad
 
     def pc_backward(self, objectives: Iterable[nn.Module]):
-        r"""calculate the gradient of the parameters.
+        r"""Calculate the gradient of the parameters.
 
         :param objectives: Iterable[nn.Module]. a list of objectives.
         """
 
@@ -217,7 +217,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                 param_size += p.numel()
 
                 # Apply Adaptive Gradient Clipping (AGC)
-                p = agc(p, agc_eps=self.agc_eps, agc_clip_val=self.agc_clipping_value)
+                p = agc(p, agc_eps=self.agc_eps, agc_clip_val=self.agc_clipping_value)  # noqa: PLW2901
 
                 state = self.state[p]
                 if len(state) == 0: