
Commit a9fb8a2

Merge pull request #324 from kozistr/feature/ranger25
[Refactor] flexible and consistent `optimizer` parameters for `Lookahead`, `TRAC`, and `OrthoGrad` optimizers
2 parents 5baa713 + 87e1a60 commit a9fb8a2

28 files changed: +496 -130 lines changed

README.md

Lines changed: 3 additions & 2 deletions

@@ -10,8 +10,8 @@

 ## The reasons why you use `pytorch-optimizer`.

-* Wide range of supported optimizers. Currently, **89 optimizers (+ `bitsandbytes`, `qgalore`, `torchao`)**, **16 lr schedulers**, and **13 loss functions** are supported!
-    * Including many variants such as `Cautious`, `AdamD`, `Gradient Centrailiaztion`
+* Wide range of supported optimizers. Currently, **90 optimizers (+ `bitsandbytes`, `qgalore`, `torchao`)**, **16 lr schedulers**, and **13 loss functions** are supported!
+    * Including many variants such as `ADOPT`, `Cautious`, `AdamD`, `StableAdamW`, and `Gradient Centrailiaztion`
 * Easy to use, clean, and tested codes
 * Active maintenance
 * Somewhat a bit more optimized compared to the original implementation
@@ -198,6 +198,7 @@ get_supported_optimizers(['adam*', 'ranger*'])
 | Grams | *Gradient Descent with Adaptive Momentum Scaling* | | <https://arxiv.org/abs/2412.17107> | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv241217107C/exportcitation) |
 | OrthoGrad | *Grokking at the Edge of Numerical Stability* | [github](https://github.com/LucasPrietoAl/grokking-at-the-edge-of-numerical-stability) | <https://arxiv.org/abs/2501.04697> | [cite](https://github.com/LucasPrietoAl/grokking-at-the-edge-of-numerical-stability?tab=readme-ov-file#citation) |
 | Adam-ATAN2 | *Scaling Exponents Across Parameterizations and Optimizers* | | <https://arxiv.org/abs/2407.05872> | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv240705872E/exportcitation) |
+| SPAM | *Spike-Aware Adam with Momentum Reset for Stable LLM Training* | [github](https://github.com/TianjinYellow/SPAM-Optimizer) | <https://arxiv.org/abs/2501.06842> | [cite](https://ui.adsabs.harvard.edu/abs/2025arXiv250106842H/exportcitation) |

 ## Supported LR Scheduler

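Since the supported-optimizers table now lists `SPAM`, the wildcard query shown in the hunk header above can be reused to check that the new entry is registered. A quick sketch, assuming the same filter style as `get_supported_optimizers(['adam*', 'ranger*'])`:

```python
from pytorch_optimizer import get_supported_optimizers

# same wildcard-filter style as the README example in the hunk header above
print(get_supported_optimizers(['spam*']))
```
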
docs/changelogs/v3.3.4.md

Lines changed: 12 additions & 0 deletions

@@ -0,0 +1,12 @@
+### Change Log
+
+### Feature
+
+* Support `OrthoGrad` feature for `create_optimizer()`. (#324)
+* Enhanced flexibility for the `optimizer` parameter in `Lookahead`, `TRAC`, and `OrthoGrad` optimizers. (#324)
+    * Now supports both torch.optim.Optimizer instances and classes
+    * You can now use `Lookahead` optimizer in two ways.
+        * `Lookahead(AdamW(model.parameters(), lr=1e-3), k=5, alpha=0.5)`
+        * `Lookahead(AdamW, k=5, alpha=0.5, params=model.parameters())`
+* Implement `SPAM` optimizer. (#324)
+    * [Spike-Aware Adam with Momentum Reset for Stable LLM Training](https://arxiv.org/abs/2501.06842)

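The two `Lookahead` call styles listed in the changelog can be exercised end to end. A minimal sketch, assuming `torch.optim.AdamW` as the inner optimizer and a toy `nn.Linear` model (both are placeholders, not part of the changelog):

```python
import torch
from torch import nn

from pytorch_optimizer import Lookahead

model = nn.Linear(4, 2)  # toy model for illustration

# style 1: wrap an already-constructed optimizer instance
opt_a = Lookahead(torch.optim.AdamW(model.parameters(), lr=1e-3), k=5, alpha=0.5)

# style 2: pass the optimizer class and supply `params` so Lookahead can construct it
opt_b = Lookahead(torch.optim.AdamW, k=5, alpha=0.5, params=model.parameters())

# either wrapper is then driven like a regular optimizer
loss = model(torch.randn(8, 4)).pow(2).mean()
loss.backward()
opt_a.step()
opt_a.zero_grad()
```
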
docs/index.md

Lines changed: 3 additions & 2 deletions

@@ -10,8 +10,8 @@

 ## The reasons why you use `pytorch-optimizer`.

-* Wide range of supported optimizers. Currently, **89 optimizers (+ `bitsandbytes`, `qgalore`, `torchao`)**, **16 lr schedulers**, and **13 loss functions** are supported!
-    * Including many variants such as `Cautious`, `AdamD`, `Gradient Centrailiaztion`
+* Wide range of supported optimizers. Currently, **90 optimizers (+ `bitsandbytes`, `qgalore`, `torchao`)**, **16 lr schedulers**, and **13 loss functions** are supported!
+    * Including many variants such as `ADOPT`, `Cautious`, `AdamD`, `StableAdamW`, and `Gradient Centrailiaztion`
 * Easy to use, clean, and tested codes
 * Active maintenance
 * Somewhat a bit more optimized compared to the original implementation
@@ -198,6 +198,7 @@ get_supported_optimizers(['adam*', 'ranger*'])
 | Grams | *Gradient Descent with Adaptive Momentum Scaling* | | <https://arxiv.org/abs/2412.17107> | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv241217107C/exportcitation) |
 | OrthoGrad | *Grokking at the Edge of Numerical Stability* | [github](https://github.com/LucasPrietoAl/grokking-at-the-edge-of-numerical-stability) | <https://arxiv.org/abs/2501.04697> | [cite](https://github.com/LucasPrietoAl/grokking-at-the-edge-of-numerical-stability?tab=readme-ov-file#citation) |
 | Adam-ATAN2 | *Scaling Exponents Across Parameterizations and Optimizers* | | <https://arxiv.org/abs/2407.05872> | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv240705872E/exportcitation) |
+| SPAM | *Spike-Aware Adam with Momentum Reset for Stable LLM Training* | [github](https://github.com/TianjinYellow/SPAM-Optimizer) | <https://arxiv.org/abs/2501.06842> | [cite](https://ui.adsabs.harvard.edu/abs/2025arXiv250106842H/exportcitation) |

 ## Supported LR Scheduler

docs/optimizer.md

Lines changed: 4 additions & 0 deletions

@@ -368,6 +368,10 @@
     :docstring:
     :members:

+::: pytorch_optimizer.SPAM
+    :docstring:
+    :members:
+
 ::: pytorch_optimizer.SRMM
     :docstring:
     :members:

pyproject.toml

Lines changed: 3 additions & 3 deletions

@@ -18,9 +18,9 @@ keywords = [
     "Kate", "Lamb", "LaProp", "LARS", "Lion", "LOMO", "Lookahead", "MADGRAD", "MARS", "MSVAG", "Muno", "Nero",
     "NovoGrad", "OrthoGrad", "PAdam", "PCGrad", "PID", "PNM", "Prodigy", "QHAdam", "QHM", "RAdam", "Ranger",
     "Ranger21", "RotoGrad", "SAM", "ScheduleFreeSGD", "ScheduleFreeAdamW", "ScheduleFreeRAdam", "SGDP", "Shampoo",
-    "ScalableShampoo", "SGDW", "SignSGD", "SM3", "SOAP", "SopihaH", "SRMM", "StableAdamW", "SWATS", "Tiger", "TRAC",
-    "WSAM", "Yogi", "BCE", "BCEFocal", "Focal", "FocalCosine", "SoftF1", "Dice", "LDAM", "Jaccard", "Bi-Tempered",
-    "Tversky", "FocalTversky", "LovaszHinge", "bitsandbytes", "WSD", "QGaLore",
+    "ScalableShampoo", "SGDW", "SignSGD", "SM3", "SOAP", "SopihaH", "SPAM", "SRMM", "StableAdamW", "SWATS", "Tiger",
+    "TRAC", "WSAM", "Yogi", "BCE", "BCEFocal", "Focal", "FocalCosine", "SoftF1", "Dice", "LDAM", "Jaccard",
+    "Bi-Tempered", "Tversky", "FocalTversky", "LovaszHinge", "bitsandbytes", "WSD", "QGaLore",
 ]
 classifiers = [
     "License :: OSI Approved :: Apache Software License",

pytorch_optimizer/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -61,6 +61,7 @@
     SGDW,
     SM3,
     SOAP,
+    SPAM,
     SRMM,
     SWATS,
     TRAC,

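With `SPAM` exported from the package root, it can be constructed like the other optimizers in the library. A minimal sketch; only `params` and `lr` are passed here because the full constructor signature is not part of this diff:

```python
from torch import nn

from pytorch_optimizer import SPAM

model = nn.Linear(4, 2)  # toy model for illustration
optimizer = SPAM(model.parameters(), lr=1e-3)  # remaining hyperparameters are left at their defaults
```
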
pytorch_optimizer/base/scheduler.py

Lines changed: 4 additions & 3 deletions

@@ -1,16 +1,17 @@
 from abc import ABC, abstractmethod
 from typing import List

+from torch.optim import Optimizer
+
 from pytorch_optimizer.base.exception import NegativeLRError, NegativeStepError
-from pytorch_optimizer.base.types import OPTIMIZER


 class BaseLinearWarmupScheduler(ABC):
     r"""BaseLinearWarmupScheduler class.

     The LR Scheduler class based on this class has linear warmup strategy.

-    :param optimizer: Optimizer. OPTIMIZER. It will set learning rate to all trainable parameters in optimizer.
+    :param optimizer: Optimizer. It will set learning rate to all trainable parameters in optimizer.
     :param t_max: int. total steps to train.
     :param max_lr: float. maximum lr.
     :param min_lr: float. minimum lr.
@@ -20,7 +21,7 @@ class BaseLinearWarmupScheduler(ABC):

     def __init__(
         self,
-        optimizer: OPTIMIZER,
+        optimizer: Optimizer,
         t_max: int,
         max_lr: float,
         min_lr: float = 0.0,

pytorch_optimizer/base/types.py

Lines changed: 1 addition & 0 deletions

@@ -11,6 +11,7 @@
 PARAMETERS = Optional[Union[Iterable[Dict], Iterable[torch.Tensor]]]
 STATE = Dict
 OPTIMIZER = Type[Optimizer]
+OPTIMIZER_INSTANCE_OR_CLASS = Union[OPTIMIZER, Optimizer]
 SCHEDULER = Type[LRScheduler]

 HUTCHINSON_G = Literal['gaussian', 'rademacher']

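A rough sketch of how the new alias can be used to accept either an optimizer instance or an optimizer class, in the spirit of the `Lookahead`/`TRAC`/`OrthoGrad` change; the helper name `build_optimizer` and its `params` handling are illustrative, not the library's actual implementation:

```python
from typing import Any, Type, Union

from torch.optim import Optimizer

OPTIMIZER = Type[Optimizer]
OPTIMIZER_INSTANCE_OR_CLASS = Union[OPTIMIZER, Optimizer]


def build_optimizer(optimizer: OPTIMIZER_INSTANCE_OR_CLASS, params: Any = None, **kwargs: Any) -> Optimizer:
    """Return an Optimizer instance, constructing one if a class was given (illustrative helper only)."""
    if isinstance(optimizer, Optimizer):  # already an instance: use it as-is
        return optimizer
    if params is None:
        raise ValueError('`params` must be provided when an optimizer class is passed')
    return optimizer(params, **kwargs)  # a class: instantiate it with the given parameters
```
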
pytorch_optimizer/lr_scheduler/cosine_anealing.py

Lines changed: 3 additions & 4 deletions

@@ -1,10 +1,9 @@
 import math
 from typing import List, Optional

+from torch.optim import Optimizer
 from torch.optim.lr_scheduler import LRScheduler

-from pytorch_optimizer.base.types import OPTIMIZER
-

 class CosineAnnealingWarmupRestarts(LRScheduler):
     r"""CosineAnnealingWarmupRestarts.
@@ -21,7 +20,7 @@ class CosineAnnealingWarmupRestarts(LRScheduler):

     def __init__(
         self,
-        optimizer: OPTIMIZER,
+        optimizer: Optimizer,
         first_cycle_steps: int,
         cycle_mult: float = 1.0,
         max_lr: float = 1e-4,
@@ -53,7 +52,7 @@ def __init__(

         self.init_lr()

-    def init_lr(self):
+    def init_lr(self) -> None:
         self.base_lrs = []
         for param_group in self.optimizer.param_groups:
             param_group['lr'] = self.min_lr

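With the annotation switched to `torch.optim.Optimizer`, the scheduler receives an optimizer instance directly. A usage sketch restricted to the parameters visible in this diff (`first_cycle_steps`, `cycle_mult`, `max_lr`); the model and `SGD` optimizer are placeholders:

```python
import torch
from torch import nn

from pytorch_optimizer.lr_scheduler.cosine_anealing import CosineAnnealingWarmupRestarts

model = nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

scheduler = CosineAnnealingWarmupRestarts(optimizer, first_cycle_steps=100, cycle_mult=1.0, max_lr=1e-4)

for _ in range(300):
    optimizer.step()
    scheduler.step()  # lr follows the warmup + cosine-restart schedule
```
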
pytorch_optimizer/lr_scheduler/rex.py

Lines changed: 3 additions & 4 deletions

@@ -1,9 +1,8 @@
 from typing import List, Optional

+from torch.optim import Optimizer
 from torch.optim.lr_scheduler import LRScheduler

-from pytorch_optimizer.base.types import OPTIMIZER
-

 class REXScheduler(LRScheduler):
     r"""Revisiting Budgeted Training with an Improved Schedule.
@@ -16,7 +15,7 @@ class REXScheduler(LRScheduler):

     def __init__(
         self,
-        optimizer: OPTIMIZER,
+        optimizer: Optimizer,
         total_steps: int,
         max_lr: float = 1.0,
         min_lr: float = 0.0,
@@ -35,7 +34,7 @@ def __init__(

         self.init_lr()

-    def init_lr(self):
+    def init_lr(self) -> None:
         self.base_lrs = []
         for param_group in self.optimizer.param_groups:
             param_group['lr'] = self.min_lr

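The same pattern applies to `REXScheduler`, again using only the constructor parameters visible in this diff; the model, optimizer, and step count are placeholders:

```python
import torch
from torch import nn

from pytorch_optimizer.lr_scheduler.rex import REXScheduler

model = nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=1.0)

scheduler = REXScheduler(optimizer, total_steps=1000, max_lr=1.0, min_lr=0.0)

for _ in range(1000):
    optimizer.step()
    scheduler.step()  # decays the lr along the REX budgeted-training schedule
```
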