
Commit c950609

Merge pull request #359 from kozistr/feature/optimizers
[Feature] Implement StableSPAM optimizer
2 parents b85065d + 85d3541 commit c950609

10 files changed (+185, -9 lines)


README.md

Lines changed: 3 additions & 1 deletion

@@ -10,7 +10,7 @@
 
 ## The reasons why you use `pytorch-optimizer`.
 
-* Wide range of supported optimizers. Currently, **99 optimizers (+ `bitsandbytes`, `qgalore`, `torchao`)**, **16 lr schedulers**, and **13 loss functions** are supported!
+* Wide range of supported optimizers. Currently, **100 optimizers (+ `bitsandbytes`, `qgalore`, `torchao`)**, **16 lr schedulers**, and **13 loss functions** are supported!
 * Including many variants such as `ADOPT`, `Cautious`, `AdamD`, `StableAdamW`, and `Gradient Centralization`
 * Easy to use, clean, and tested codes
 * Active maintenance
@@ -207,6 +207,8 @@ get_supported_optimizers(['adam*', 'ranger*'])
 | GCSAM | *Gradient Centralized Sharpness Aware Minimization* | [github](https://github.com/mhassann22/GCSAM) | <https://arxiv.org/abs/2501.11584> | [cite](https://github.com/mhassann22/GCSAM?tab=readme-ov-file#citation) |
 | LookSAM | *Towards Efficient and Scalable Sharpness-Aware Minimization* | [github](https://github.com/rollovd/LookSAM) | <https://arxiv.org/abs/2203.02714> | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220302714L/exportcitation) |
 | SCION | *Training Deep Learning Models with Norm-Constrained LMOs* | | <https://arxiv.org/abs/2502.07529> | [cite](https://ui.adsabs.harvard.edu/abs/2025arXiv250207529P/exportcitation) |
+| COSMOS | *SOAP with Muon* | [github](https://github.com/lliu606/COSMOS) | | |
+| StableSPAM | *How to Train in 4-Bit More Stably than 16-Bit Adam* | [github](https://github.com/TianjinYellow/StableSPAM) | <https://arxiv.org/abs/2502.17055> | |
 
 ## Supported LR Scheduler
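As context for the table addition above, the registry can also be queried at runtime through the `get_supported_optimizers` helper named in the hunk header. A minimal sketch (the wildcard filter style comes from the README example and the count from this commit's tests):

```python
from pytorch_optimizer import get_supported_optimizers

# 97 entries after this commit, matching the updated assertion in tests/test_load_modules.py.
assert len(get_supported_optimizers()) == 97

# Wildcard filters work as in the README example; the new optimizer shows up under 'stable*'.
names = [getattr(o, '__name__', str(o)).lower() for o in get_supported_optimizers('stable*')]
assert 'stablespam' in names
```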

docs/changelogs/v3.4.3.md

Lines changed: 5 additions & 0 deletions

@@ -1,5 +1,10 @@
 ### Change Log
 
+### Feature
+
+* Support `StableSPAM` optimizer. (#358, #359)
+    * [How to Train in 4-Bit More Stably than 16-Bit Adam](https://arxiv.org/abs/2502.17055)
+
 ### Update
 
 * Update Muon optimizer. (#355, #356)

docs/index.md

Lines changed: 3 additions & 1 deletion

@@ -10,7 +10,7 @@
 
 ## The reasons why you use `pytorch-optimizer`.
 
-* Wide range of supported optimizers. Currently, **99 optimizers (+ `bitsandbytes`, `qgalore`, `torchao`)**, **16 lr schedulers**, and **13 loss functions** are supported!
+* Wide range of supported optimizers. Currently, **100 optimizers (+ `bitsandbytes`, `qgalore`, `torchao`)**, **16 lr schedulers**, and **13 loss functions** are supported!
 * Including many variants such as `ADOPT`, `Cautious`, `AdamD`, `StableAdamW`, and `Gradient Centralization`
 * Easy to use, clean, and tested codes
 * Active maintenance
@@ -207,6 +207,8 @@ get_supported_optimizers(['adam*', 'ranger*'])
 | GCSAM | *Gradient Centralized Sharpness Aware Minimization* | [github](https://github.com/mhassann22/GCSAM) | <https://arxiv.org/abs/2501.11584> | [cite](https://github.com/mhassann22/GCSAM?tab=readme-ov-file#citation) |
 | LookSAM | *Towards Efficient and Scalable Sharpness-Aware Minimization* | [github](https://github.com/rollovd/LookSAM) | <https://arxiv.org/abs/2203.02714> | [cite](https://ui.adsabs.harvard.edu/abs/2022arXiv220302714L/exportcitation) |
 | SCION | *Training Deep Learning Models with Norm-Constrained LMOs* | | <https://arxiv.org/abs/2502.07529> | [cite](https://ui.adsabs.harvard.edu/abs/2025arXiv250207529P/exportcitation) |
+| COSMOS | *SOAP with Muon* | [github](https://github.com/lliu606/COSMOS) | | |
+| StableSPAM | *How to Train in 4-Bit More Stably than 16-Bit Adam* | [github](https://github.com/TianjinYellow/StableSPAM) | <https://arxiv.org/abs/2502.17055> | |
 
 ## Supported LR Scheduler

docs/optimizer.md

Lines changed: 4 additions & 0 deletions

@@ -392,6 +392,10 @@
     :docstring:
     :members:
 
+::: pytorch_optimizer.StableSPAM
+    :docstring:
+    :members:
+
 ::: pytorch_optimizer.SRMM
     :docstring:
     :members:

pyproject.toml

Lines changed: 3 additions & 3 deletions

@@ -19,9 +19,9 @@ keywords = [
     "Muno", "Nero", "NovoGrad", "OrthoGrad", "PAdam", "PCGrad", "PID", "PNM", "Prodigy", "PSGD", "QHAdam", "QHM",
     "RAdam", "Ranger", "Ranger21", "RotoGrad", "SAM", "GCSAM", "LookSAM", "ScheduleFreeSGD", "ScheduleFreeAdamW",
     "ScheduleFreeRAdam", "SCION", "SGDP", "Shampoo", "ScalableShampoo", "SGDW", "SignSGD", "SM3", "SOAP", "SopihaH",
-    "SPAM", "SRMM", "StableAdamW", "SWATS", "TAM", "Tiger", "TRAC", "WSAM", "Yogi", "BCE", "BCEFocal", "Focal",
-    "FocalCosine", "SoftF1", "Dice", "LDAM", "Jaccard", "Bi-Tempered", "Tversky", "FocalTversky", "LovaszHinge",
-    "bitsandbytes", "WSD", "QGaLore",
+    "SPAM", "StableSPAM", "SRMM", "StableAdamW", "SWATS", "TAM", "Tiger", "TRAC", "WSAM", "Yogi", "BCE", "BCEFocal",
+    "Focal", "FocalCosine", "SoftF1", "Dice", "LDAM", "Jaccard", "Bi-Tempered", "Tversky", "FocalTversky",
+    "LovaszHinge", "bitsandbytes", "WSD", "QGaLore",
 ]
 classifiers = [
     "License :: OSI Approved :: Apache Software License",

pytorch_optimizer/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -144,6 +144,7 @@
     SignSGD,
     SophiaH,
     StableAdamW,
+    StableSPAM,
     Tiger,
     Yogi,
     agc,

pytorch_optimizer/optimizer/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -88,7 +88,7 @@
 from pytorch_optimizer.optimizer.sm3 import SM3
 from pytorch_optimizer.optimizer.soap import SOAP
 from pytorch_optimizer.optimizer.sophia import SophiaH
-from pytorch_optimizer.optimizer.spam import SPAM
+from pytorch_optimizer.optimizer.spam import SPAM, StableSPAM
 from pytorch_optimizer.optimizer.srmm import SRMM
 from pytorch_optimizer.optimizer.swats import SWATS
 from pytorch_optimizer.optimizer.tam import TAM, AdaTAM
@@ -302,6 +302,7 @@ def load_optimizer(optimizer: str) -> OPTIMIZER:
     Kron,
     EXAdam,
     SCION,
+    StableSPAM,
     Ranger25,
 ]
 OPTIMIZERS: Dict[str, OPTIMIZER] = {str(optimizer.__name__).lower(): optimizer for optimizer in OPTIMIZER_LIST}
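Since `OPTIMIZERS` is keyed by the lowercased class name, the registration above also makes the new optimizer loadable by string. A minimal sketch using only the `load_optimizer` function named in the hunk header:

```python
from pytorch_optimizer import load_optimizer

optimizer_class = load_optimizer('stablespam')  # resolved via the OPTIMIZERS mapping above
print(optimizer_class.__name__)                 # 'StableSPAM'
```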

pytorch_optimizer/optimizer/spam.py

Lines changed: 161 additions & 2 deletions

@@ -1,4 +1,5 @@
 import math
+from typing import Optional
 
 import torch
 from torch.nn import Parameter, ParameterList
@@ -22,7 +23,7 @@ class CosineDecay:
     def __init__(self, death_rate: float, t_max: int, eta_min: float = 0.0, last_epoch: int = -1):
         self.sgd: Optimizer = SGD(ParameterList([Parameter(torch.zeros(1))]), lr=death_rate)
         self.cosine_stepper: LRScheduler = CosineAnnealingLR(self.sgd, t_max + 1, eta_min, last_epoch)
-        self.T_max = t_max
+        self.t_max = t_max
         self.eta_min = eta_min
 
     def step(self, current_step: int) -> None:
@@ -37,7 +38,7 @@ def get_death_rate(self, current_step: int) -> float:
 
         :param current_step: int. Current step index.
         """
-        if current_step >= self.T_max:
+        if current_step >= self.t_max:
            return self.eta_min
 
        self.step(current_step)
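The renamed `t_max` attribute drives the warm-up scale that the new `StableSPAM` class (added in the hunk below) multiplies into `beta1` and `gamma1`. Because `CosineDecay` simply steps a `CosineAnnealingLR` over a dummy SGD parameter, its output follows the standard cosine-annealing closed form; the sketch below is an illustrative approximation of that behaviour, not part of the commit:

```python
import math

def cosine_death_rate(step: int, death_rate: float, t_max: int, eta_min: float = 0.0) -> float:
    # Closed form of CosineAnnealingLR(T_max=t_max + 1), which CosineDecay wraps internally.
    if step >= t_max:
        return eta_min
    return eta_min + 0.5 * (death_rate - eta_min) * (1.0 + math.cos(math.pi * step / (t_max + 1)))

# StableSPAM constructs CosineDecay(1.0, t_max, eta_min=0.5) by default, so the scale
# decays smoothly from 1.0 toward 0.5 over the course of training.
```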
@@ -267,3 +268,161 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                 self.warmup = CosineDecay(0.99, self.warmup_epoch)
 
         return loss
+
+
+class StableSPAM(BaseOptimizer):
+    r"""How to Train in 4-Bit More Stably than 16-Bit Adam.
+
+    :param params: PARAMETERS. iterable of parameters to optimize or dicts defining parameter groups.
+    :param lr: float. learning rate.
+    :param betas: BETAS. coefficients used for computing running averages of gradient and the squared hessian trace.
+    :param gamma1: float.
+    :param gamma2: float.
+    :param theta: float.
+    :param t_max: Optional[int]. total number of steps.
+    :param eta_min: float. eta_min of CosineDecay.
+    :param weight_decay: float. weight decay (L2 penalty).
+    :param update_proj_gap: int. update projection gap.
+    :param eps: float. term added to the denominator to improve numerical stability.
+    """
+
+    def __init__(
+        self,
+        params: PARAMETERS,
+        lr: float = 1e-3,
+        betas: BETAS = (0.9, 0.999),
+        gamma1: float = 0.7,
+        gamma2: float = 0.9,
+        theta: float = 0.999,
+        t_max: Optional[int] = None,
+        eta_min: float = 0.5,
+        weight_decay: float = 0.0,
+        update_proj_gap: int = 1000,
+        eps: float = 1e-8,
+        **kwargs,
+    ):
+        self.validate_learning_rate(lr)
+        self.validate_betas(betas)
+        self.validate_non_negative(weight_decay, 'weight_decay')
+        self.validate_positive(update_proj_gap, 'update_proj_gap')
+        self.validate_non_negative(eps, 'eps')
+
+        self.gamma1: float = betas[0] if gamma1 == -1.0 else gamma1
+        self.gamma2: float = gamma2
+        self.theta: float = theta
+        self.t_max = t_max
+        self.update_proj_gap = update_proj_gap
+        self.warmup = CosineDecay(1.0, t_max, eta_min=eta_min) if t_max is not None else None
+
+        self.total_step: int = 0
+
+        defaults: DEFAULTS = {'lr': lr, 'betas': betas, 'weight_decay': weight_decay, 'eps': eps, **kwargs}
+        super().__init__(params, defaults)
+
+    def __str__(self) -> str:
+        return 'StableSPAM'
+
+    @torch.no_grad()
+    def reset(self):
+        for group in self.param_groups:
+            group['step'] = 0
+            for p in group['params']:
+                state = self.state[p]
+
+                state['exp_avg'] = torch.zeros_like(p)
+                state['exp_avg_sq'] = torch.zeros_like(p)
+                state['m_norm_t'] = torch.zeros(1, device=p.device, dtype=p.dtype)
+                state['v_norm_t'] = torch.zeros(1, device=p.device, dtype=p.dtype)
+                state['m_max_t'] = torch.zeros(1, device=p.device, dtype=p.dtype)
+
+    @torch.no_grad()
+    def step(self, closure: CLOSURE = None) -> LOSS:
+        loss: LOSS = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+
+        self.total_step += 1
+        scale: float = self.warmup.get_death_rate(self.total_step) if self.warmup is not None else 1.0
+
+        for group in self.param_groups:
+            if 'step' not in group:
+                group['step'] = 1
+            else:
+                group['step'] += 1
+
+            beta1, beta2 = group['betas']
+            beta1 *= scale
+
+            bias_correction1: float = self.debias(beta1, group['step'])
+            bias_correction2: float = self.debias(beta2, group['step'])
+            bias_correction2_sq: float = math.sqrt(bias_correction2)
+
+            step_size: float = group['lr'] / bias_correction1
+
+            theta_t: float = 1.0 - self.theta ** group['step']
+
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+
+                grad = p.grad
+                if grad.is_sparse:
+                    raise NoSparseGradientError(str(self))
+
+                state = self.state[p]
+
+                if 'exp_avg' not in state:
+                    state['exp_avg'] = torch.zeros_like(grad)
+                    state['exp_avg_sq'] = torch.zeros_like(grad)
+                    state['m_norm_t'] = torch.zeros(1, device=grad.device, dtype=grad.dtype)
+                    state['v_norm_t'] = torch.zeros(1, device=grad.device, dtype=grad.dtype)
+                    state['m_max_t'] = torch.zeros(1, device=grad.device, dtype=grad.dtype)
+
+                self.apply_weight_decay(
+                    p,
+                    grad=grad,
+                    lr=group['lr'],
+                    weight_decay=group['weight_decay'],
+                    weight_decouple=True,
+                    fixed_decay=False,
+                )
+
+                max_grad = torch.max(grad.abs())
+
+                exp_avg, exp_avg_sq, m_max_t = state['exp_avg'], state['exp_avg_sq'], state['m_max_t']
+
+                m_max_t.lerp_(max_grad, weight=1.0 - self.theta)
+
+                m_max_hat = m_max_t / theta_t
+
+                mask = grad.abs() > m_max_hat
+                if mask.sum() > 0:
+                    grad[mask] = grad[mask].div(max_grad).mul(m_max_hat)
+
+                grad_norm = torch.norm(grad)
+
+                m_norm_t, v_norm_t = state['m_norm_t'], state['v_norm_t']
+                m_norm_t.lerp_(grad_norm, weight=1.0 - self.gamma1 * scale)
+                v_norm_t.lerp_(grad_norm.pow(2), weight=1.0 - self.gamma2)
+
+                m_norm_hat = m_norm_t / (1.0 - (self.gamma1 * scale) ** group['step'])
+                v_norm_hat = v_norm_t / (1.0 - self.gamma2 ** group['step'])
+
+                c_norm_t = m_norm_hat.div_(v_norm_hat.sqrt_().add_(group['eps']))
+
+                grad.div_(grad_norm).mul_(c_norm_t)
+
+                if self.update_proj_gap > 0 and self.total_step % self.update_proj_gap == 0:
+                    state['exp_avg'] = torch.zeros_like(grad)
+                    state['exp_avg_sq'] = torch.zeros_like(grad)
+                    group['step'] = 1
+
+                exp_avg.mul_(beta1).add_(grad, alpha=1.0 - beta1)
+                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2)
+
+                de_nom = exp_avg_sq.sqrt().div_(bias_correction2_sq).add_(group['eps'])
+
+                p.addcdiv_(exp_avg, de_nom, value=-step_size)
+
+        return loss
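A minimal usage sketch of the new class. Only the `StableSPAM` signature comes from the code above; the model, data, and hyper-parameter values are illustrative:

```python
# Sketch: fit a toy linear regression with StableSPAM for a few steps.
import torch
from pytorch_optimizer import StableSPAM

model = torch.nn.Linear(4, 1)
optimizer = StableSPAM(model.parameters(), lr=1e-3, gamma1=0.7, gamma2=0.9, theta=0.999, t_max=1000)

x, y = torch.randn(32, 4), torch.randn(32, 1)
for _ in range(10):
    optimizer.zero_grad()
    loss = torch.nn.functional.mse_loss(model(x), y)
    loss.backward()
    optimizer.step()
```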

tests/constants.py

Lines changed: 2 additions & 0 deletions

@@ -88,6 +88,7 @@
     SignSGD,
     SophiaH,
     StableAdamW,
+    StableSPAM,
     Tiger,
     Yogi,
 )
@@ -557,6 +558,7 @@
     (SGDSaI, {'lr': 1e0, 'momentum': 0.0}, 15),
     (Grams, {'lr': 1e-1, 'weight_decay': 1e-3}, 5),
     (SPAM, {'lr': 1e0, 'weight_decay': 1e-3, 'warmup_epoch': 1, 'grad_accu_steps': 1, 'update_proj_gap': 1}, 5),
+    (StableSPAM, {'lr': 1e0, 'weight_decay': 1e-3, 'update_proj_gap': 1, 't_max': 5}, 5),
     (TAM, {'lr': 1e0, 'weight_decay': 1e-3}, 5),
     (AdaTAM, {'lr': 1e-1, 'weight_decay': 1e-3}, 5),
     (FOCUS, {'lr': 1e-1, 'weight_decay': 1e-3}, 5),
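The new tuple follows the table's `(optimizer_class, config, num_iterations)` convention. A rough sketch of how such an entry could be exercised by hand on a toy quadratic; `run_entry` is a hypothetical helper for illustration, not the repository's actual test fixture:

```python
# Illustrative only: run the configured optimizer for the listed number of iterations.
import torch
from pytorch_optimizer import StableSPAM

def run_entry(optimizer_class, config, iterations):
    w = torch.nn.Parameter(torch.ones(2))
    optimizer = optimizer_class([w], **config)
    for _ in range(iterations):
        optimizer.zero_grad()
        loss = (w - torch.tensor([1.0, -2.0])).pow(2).sum()
        loss.backward()
        optimizer.step()
    return loss.item()

print(run_entry(StableSPAM, {'lr': 1e0, 'weight_decay': 1e-3, 'update_proj_gap': 1, 't_max': 5}, 5))
```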

tests/test_load_modules.py

Lines changed: 1 addition & 1 deletion

@@ -34,7 +34,7 @@ def test_load_lr_scheduler_invalid(invalid_lr_scheduler_names):
 
 
 def test_get_supported_optimizers():
-    assert len(get_supported_optimizers()) == 96
+    assert len(get_supported_optimizers()) == 97
     assert len(get_supported_optimizers('adam*')) == 8
     assert len(get_supported_optimizers(['adam*', 'ranger*'])) == 11
