
Commit 733f246

Merge pull request #380 from kozistr/update/stuff
[Feature] Support complex parameter & `maximize` option for all optimizers
2 parents: b26a78e + 12d0ef7


107 files changed: +4,037 / −3,036 lines

docs/changelogs/v3.5.2.md

Lines changed: 0 additions & 14 deletions
This file was deleted.

docs/changelogs/v3.6.0.md

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
## Change Log

### Feature

* Implement `Fira` optimizer. (#376)
    * [Can We Achieve Full-rank Training of LLMs Under Low-rank Constraint?](https://arxiv.org/abs/2410.01623)
* Implement `RACS` and `Alice` optimizers. (#376)
    * [Towards Efficient Optimizer Design for LLM via Structured Fisher Approximation with a Low-Rank Extension](https://arxiv.org/abs/2502.07752)
* Implement `VSGD` optimizer. (#377, #378)
    * [Variational Stochastic Gradient Descent for Deep Neural Networks](https://openreview.net/forum?id=xu4ATNjcdy)
* Support complex parameters. (#370, #380)
* Support the `maximize` parameter. (#370, #380)

### Update

* Support tensors with more than two dimensions for the `RACS` and `Alice` optimizers. (#380)
* Remove the auxiliary variants from the optimizers' default parameters and rename the corresponding state and parameter. (#380)
    * `use_gc`, `adanorm`, `cautious`, `stable_adamw`, and `adam_debias` are affected.
    * You can still use these variants by passing the parameters via `**kwargs`.
    * Notably, for the `adanorm` variant, you need to pass the `adanorm` parameter (and `adanorm_r` for the `r` option), and the state name changes from `exp_avg_norm` to `exp_avg_adanorm`.
* Refactor the `reset()` method into `init_group()` in the `BaseOptimizer` class. (#380)
* Refactor the `SAM` optimizer family. (#380)

### Fix

* Fix shape-mismatch issues in the GaLore projection for the `reverse_std`, `right`, and `full` projection types. (#376)
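To make the `**kwargs` note in the Update section above concrete, here is a minimal sketch of how the relocated variant flags are expected to be passed after this release. It is illustrative only: `AdamP` stands in for any optimizer that forwards these flags through `**kwargs`, and the keyword names (`adanorm`, `adanorm_r`, `maximize`) are taken from the changelog text rather than a verified signature.

# Illustrative sketch, not part of this commit: assumes AdamP forwards the
# variant flags through **kwargs as described in the v3.6.0 changelog above.
import torch

from pytorch_optimizer import AdamP

model = torch.nn.Linear(4, 1)

# `adanorm` is no longer a default argument; it is passed via **kwargs,
# and its EMA factor is now named `adanorm_r` (state key: `exp_avg_adanorm`).
optimizer = AdamP(model.parameters(), lr=1e-3, adanorm=True, adanorm_r=0.95, maximize=False)

loss = model(torch.randn(8, 4)).pow(2).mean()
loss.backward()
optimizer.step()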

docs/optimizer.md

Lines changed: 8 additions & 0 deletions
@@ -116,6 +116,10 @@
     :docstring:
     :members:
 
+::: pytorch_optimizer.Alice
+    :docstring:
+    :members:
+
 ::: pytorch_optimizer.AliG
     :docstring:
     :members:
@@ -316,6 +320,10 @@
     :docstring:
     :members:
 
+::: pytorch_optimizer.RACS
+    :docstring:
+    :members:
+
 ::: pytorch_optimizer.RAdam
     :docstring:
     :members:

docs/qa.md

Lines changed: 1 addition & 1 deletion
@@ -10,4 +10,4 @@
 
 ## Q3) How to run visualizations?
 
-Run `python3 -m examples.visualize_optimizers` on the project root.
+Run `make visualize` or `python3 -m examples.visualize_optimizers` on the project root.
Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
import os

import pytorch_lightning as pl
import torch
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor

from pytorch_optimizer import Lookahead


class LitAutoEncoder(pl.LightningModule):
    def __init__(self):
        super().__init__()

        self.encoder = nn.Sequential(nn.Linear(28 * 28, 64), nn.ReLU(), nn.Linear(64, 3))
        self.decoder = nn.Sequential(nn.Linear(3, 64), nn.ReLU(), nn.Linear(64, 28 * 28))

    def training_step(self, batch, batch_idx):
        x, y = batch
        x = x.view(x.size(0), -1)

        z = self.encoder(x)
        x_hat = self.decoder(z)

        loss = nn.functional.mse_loss(x_hat, x)

        self.log('train_loss', loss)

        return loss

    def configure_optimizers(self):
        return Lookahead(AdamW(self.parameters(), lr=1e-3), k=5, alpha=0.5)


def main():
    train_dataset = MNIST(os.getcwd(), train=True, download=True, transform=ToTensor())
    train_loader = DataLoader(train_dataset)

    autoencoder = LitAutoEncoder()
    autoencoder.train()

    if torch.cuda.is_available():
        autoencoder.cuda()

    trainer = pl.Trainer(limit_train_batches=100, max_epochs=1)
    trainer.fit(model=autoencoder, train_dataloaders=train_loader)


if __name__ == '__main__':
    main()
Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
import os

import pytorch_lightning as pl
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor

from pytorch_optimizer import SophiaH


class LitAutoEncoder(pl.LightningModule):
    def __init__(self):
        super().__init__()

        self.encoder = nn.Sequential(nn.Linear(28 * 28, 64), nn.ReLU(), nn.Linear(64, 3))
        self.decoder = nn.Sequential(nn.Linear(3, 64), nn.ReLU(), nn.Linear(64, 28 * 28))

        self.automatic_optimization = False

    def training_step(self, batch, batch_idx):
        opt = self.optimizers()
        opt.zero_grad()

        x, y = batch
        x = x.view(x.size(0), -1)

        z = self.encoder(x)
        x_hat = self.decoder(z)

        loss = nn.functional.mse_loss(x_hat, x)

        self.manual_backward(loss, create_graph=True)
        opt.step()

        self.log('train_loss', loss)

        return loss

    def configure_optimizers(self):
        return SophiaH(self.parameters())


def main():
    train_dataset = MNIST(os.getcwd(), train=True, download=True, transform=ToTensor())
    train_loader = DataLoader(train_dataset)

    autoencoder = LitAutoEncoder()
    autoencoder.train()

    if torch.cuda.is_available():
        autoencoder.cuda()

    trainer = pl.Trainer(limit_train_batches=100, max_epochs=1)
    trainer.fit(model=autoencoder, train_dataloaders=train_loader)


if __name__ == '__main__':
    main()

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -100,7 +100,7 @@ select = [
 ignore = [
     "B905",
     "D100", "D102", "D104", "D105", "D107", "D203", "D213", "D413",
-    "PLR0912", "PLR0913", "PLR0915", "PLR2004",
+    "PLR0912", "PLR0913", "PLR0915", "PLR2004", "PLW2901",
     "Q003", "ARG002",
 ]
 fixable = ["ALL"]
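For context, `PLW2901` is Ruff's `redefined-loop-name` rule, which flags a loop variable that is reassigned inside the loop body. Ignoring it fits the complex-parameter handling elsewhere in this commit, where parameters are rebound inside parameter-group loops. A small sketch of the pattern the rule flags (illustrative, not taken from this diff):

import torch

params = [torch.randn(3, dtype=torch.cfloat), torch.randn(2, 2)]

for p in params:
    if torch.is_complex(p):
        p = torch.view_as_real(p)  # PLW2901: loop variable `p` is reassigned here
    print(p.shape)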

pytorch_optimizer/base/exception.py

Lines changed: 13 additions & 0 deletions
@@ -43,3 +43,16 @@ def __init__(self, num_steps: int, step_type: str = ''):
         self.note: str = step_type if step_type else 'step'
         self.message: str = f'{self.note} must be positive. ({num_steps} > 0)'
         super().__init__(self.message)
+
+
+class NoComplexParameterError(Exception):
+    """Raised when the dtype of the parameter is complex.
+
+    :param optimizer_name: str. optimizer name.
+    :param note: str. special conditions to note (default '').
+    """
+
+    def __init__(self, optimizer_name: str, note: str = ''):
+        self.note: str = ' ' if not note else f' w/ {note} '
+        self.message: str = f'{optimizer_name}{self.note}does not support complex parameter.'
+        super().__init__(self.message)
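A hypothetical usage sketch for the new exception: the guard function and the optimizer name below are illustrative, not taken from this commit, and only show how an optimizer without complex support might raise it.

import torch

from pytorch_optimizer.base.exception import NoComplexParameterError


def check_no_complex(params, optimizer_name: str) -> None:
    # Raise the new exception if any parameter tensor has a complex dtype.
    for p in params:
        if torch.is_complex(p):
            raise NoComplexParameterError(optimizer_name)


check_no_complex([torch.randn(2, 2)], 'SomeOptimizer')                        # passes
# check_no_complex([torch.randn(2, 2, dtype=torch.cfloat)], 'SomeOptimizer')  # raises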

pytorch_optimizer/base/optimizer.py

Lines changed: 33 additions & 7 deletions
@@ -10,6 +10,7 @@
     BETAS,
     CLOSURE,
     DEFAULTS,
+    GROUP,
     HUTCHINSON_G,
     LOSS,
     OPTIMIZER_INSTANCE_OR_CLASS,
@@ -163,7 +164,10 @@ def apply_ams_bound(
         :param eps: float. epsilon.
         """
         if ams_bound:
-            torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
+            if torch.is_complex(max_exp_avg_sq):
+                max_exp_avg_sq = torch.view_as_real(max_exp_avg_sq)
+
+            torch.maximum(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq)
             de_nom = max_exp_avg_sq.add(eps)
         else:
             de_nom = exp_avg_sq.add(eps)
@@ -195,7 +199,7 @@ def debias_beta(beta: float, step: int) -> float:
     def apply_adam_debias(adam_debias: bool, step_size: float, bias_correction1: float) -> float:
         r"""Apply AdamD variant.
 
-        :param adam_debias: bool. whether to apply AdamD.
+        :param adam_debias: bool. Only correct the denominator to avoid inflating step sizes early in training.
         :param step_size: float. step size.
         :param bias_correction1: float. bias_correction.
         """
@@ -247,16 +251,19 @@ def get_adanorm_gradient(
         r"""Get AdaNorm gradient.
 
         :param grad: torch.Tensor. gradient.
-        :param adanorm: bool. whether to apply AdaNorm.
+        :param adanorm: bool. whether to use the AdaNorm variant.
         :param exp_grad_norm: Optional[torch.Tensor]. exp_grad_norm.
-        :param r: float. Optional[float]. momentum (ratio).
+        :param r: Optional[float]. EMA factor. between 0.9 ~ 0.99 is preferred.
         """
         if not adanorm or exp_grad_norm is None:
             return grad
 
+        if r is None:
+            r = 0.95
+
         grad_norm = torch.linalg.norm(grad)
 
-        exp_grad_norm.mul_(r).add_(grad_norm, alpha=1.0 - r)
+        exp_grad_norm.mul(r).add_(grad_norm, alpha=1.0 - r)
 
         return grad.mul(exp_grad_norm).div_(grad_norm) if exp_grad_norm > grad_norm else grad
 
@@ -371,8 +378,27 @@ def validate_nus(self, nus: Union[float, Tuple[float, float]]) -> None:
         self.validate_range(nus[1], 'nu2', 0.0, 1.0, range_type='[]')
 
     @abstractmethod
-    def reset(self) -> None:  # pragma: no cover
-        raise NotImplementedError
+    def init_group(self, group: GROUP, **kwargs) -> None:  # pragma: no cover
+        r"""Initialize the group of the optimizer and return is_complex."""
+        return
+
+    @staticmethod
+    def view_as_real(param, *state_and_grads) -> tuple:
+        r"""View imaginary tensors as real tensors."""
+        if torch.is_complex(param):
+            param = torch.view_as_real(param)
+            state_and_grads = tuple(
+                torch.view_as_real(s) if (s is not None and torch.is_complex(s)) else s if s is not None else None
+                for s in state_and_grads
+            )
+
+        return param, *state_and_grads
+
+    @staticmethod
+    def maximize_gradient(grad: torch.Tensor, maximize: bool = False) -> None:
+        r"""Maximize the objective with respect to the params, instead of minimizing."""
+        if maximize:
+            grad.neg_()
 
     def step(self, closure: CLOSURE = None) -> LOSS:  # pragma: no cover
         raise NotImplementedError
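A short standalone sketch (plain PyTorch, outside the library, assumptions only) of what the two new static helpers above amount to: complex tensors are reinterpreted as real tensors with a trailing dimension of 2 so the usual element-wise update math applies, and `maximize` negates the gradient in place so a descent step maximizes the objective.

import torch

param = torch.randn(3, dtype=torch.cfloat)
grad = torch.randn(3, dtype=torch.cfloat)

# Complex (3,) tensors become real (3, 2) views; updates on the view write
# back into the original complex storage.
param_r = torch.view_as_real(param)
grad_r = torch.view_as_real(grad)
print(param_r.shape)  # torch.Size([3, 2])

# With maximize=True the gradient is negated in place before the update,
# turning gradient descent into gradient ascent.
maximize = True
if maximize:
    grad_r.neg_()

# A plain SGD-style step on the real view updates the complex parameter.
param_r.add_(grad_r, alpha=-1e-3)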

pytorch_optimizer/base/type.py

Lines changed: 2 additions & 1 deletion
@@ -8,7 +8,8 @@
 LOSS = Optional[float]
 BETAS = Union[Tuple[float, float], Tuple[float, float, float], Tuple[None, float]]
 DEFAULTS = Dict
-PARAMETERS = Optional[Union[Iterable[Dict], Iterable[torch.Tensor]]]
+GROUP = Dict
+PARAMETERS = Optional[Union[Iterable[GROUP], Iterable[torch.Tensor]]]
 STATE = Dict
 OPTIMIZER = Type[Optimizer]
 OPTIMIZER_INSTANCE_OR_CLASS = Union[OPTIMIZER, Optimizer]
