
Commit e7834d5

Merge pull request #177 from kozistr/refactor/sophia-optimizer
[Refactor] AdaHessian, SophiaH optimizers
2 parents 5c18933 + 6008a00 commit e7834d5

16 files changed, +266 -169 lines changed

README.rst

Lines changed: 8 additions & 2 deletions
@@ -16,7 +16,7 @@ pytorch-optimizer
 
 | **pytorch-optimizer** is optimizer & lr scheduler collections in PyTorch.
 | I just re-implemented (speed & memory tweaks, plug-ins) the algorithm while based on the original paper. Also, It includes useful and practical optimization ideas.
-| Currently, 51 optimizers, 6 lr schedulers are supported!
+| Currently, 54 optimizers, 6 lr schedulers are supported!
 |
 | Highly inspired by `pytorch-optimizer <https://github.com/jettify/pytorch-optimizer>`__.
 
@@ -213,7 +213,13 @@ You can check the supported optimizers & lr schedulers.
 +--------------+---------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+
 | AdaDelta | *An Adaptive Learning Rate Method* | | `https://arxiv.org/abs/1212.5701v1 <https://arxiv.org/abs/1212.5701v1>`__ | `cite <https://ui.adsabs.harvard.edu/abs/2012arXiv1212.5701Z/exportcitation>`__ |
 +--------------+---------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+
-| Amos | * An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale* | `github <https://github.com/google-research/jestimator>`__ | `https://arxiv.org/abs/2210.11693 <https://arxiv.org/abs/2210.11693>`__ | `cite <https://ui.adsabs.harvard.edu/abs/2022arXiv221011693T/exportcitation>`__ |
+| Amos | *An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale* | `github <https://github.com/google-research/jestimator>`__ | `https://arxiv.org/abs/2210.11693 <https://arxiv.org/abs/2210.11693>`__ | `cite <https://ui.adsabs.harvard.edu/abs/2022arXiv221011693T/exportcitation>`__ |
++--------------+---------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+
+| SignSGD | *Compressed Optimisation for Non-Convex Problems* | `github <https://github.com/jxbz/signSGD>`__ | `https://arxiv.org/abs/1802.04434 <https://arxiv.org/abs/1802.04434>`__ | `cite <https://ui.adsabs.harvard.edu/abs/2018arXiv180204434B/exportcitation>`__ |
++--------------+---------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+
+| AdaHessian | *An Adaptive Second Order Optimizer for Machine Learning* | `github <https://github.com/amirgholami/adahessian>`__ | `https://arxiv.org/abs/2006.00719 <https://arxiv.org/abs/2006.00719>`__ | `cite <https://github.com/amirgholami/adahessian#citation>`__ |
++--------------+---------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+
+| Sophia | *A Scalable Stochastic Second-order Optimizer for Language Model Pre-training* | `github <https://github.com/Liuhong99/Sophia>`__ | `https://arxiv.org/abs/2305.14342 <https://arxiv.org/abs/2305.14342>`__ | `cite <https://github.com/Liuhong99/Sophia>`__ |
 +--------------+---------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+
 
 Useful Resources
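The rows added above register SignSGD, AdaHessian and Sophia in the README's optimizer table. As a quick orientation (not part of this commit), the sketch below shows the usual way the Hessian-based optimizers are driven from user code; the constructor arguments are illustrative, and the one commit-relevant detail is that the backward pass must keep the autograd graph alive so the optimizer can take Hessian-vector products at `step()` time.

```python
# Illustrative usage sketch (not from this commit); hyperparameters are made up.
import torch
from pytorch_optimizer import AdaHessian  # SophiaH is driven the same way

model = torch.nn.Linear(16, 1)
optimizer = AdaHessian(model.parameters(), lr=1e-1)

x, y = torch.randn(32, 16), torch.randn(32, 1)
loss = torch.nn.functional.mse_loss(model(x), y)

# create_graph=True keeps the autograd graph alive so the optimizer can
# estimate the Hessian diagonal via Hessian-vector products inside step().
loss.backward(create_graph=True)
optimizer.step()
optimizer.zero_grad(set_to_none=True)
```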

docs/changelogs/v.2.10.0.md

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+## Change Log
+
+### Feature
+
+* Implement Amos optimizer (#174)
+  * [An Adam-style Optimizer with Adaptive Weight Decay towards Model-Oriented Scale](https://arxiv.org/abs/2210.11693)
+* Implement SignSGD optimizer (#176) (thanks to @i404788)
+  * [Compressed Optimisation for Non-Convex Problems](https://arxiv.org/abs/1802.04434)
+* Implement AdaHessian optimizer (#176) (thanks to @i404788)
+  * [An Adaptive Second Order Optimizer for Machine Learning](https://arxiv.org/abs/2006.00719)
+* Implement SophiaH optimizer (#173, #176) (thanks to @i404788)
+  * [A Scalable Stochastic Second-order Optimizer for Language Model Pre-training](https://arxiv.org/abs/2305.14342)
+* Implement re-usable functions to compute hessian in `BaseOptimizer` (#176, #177) (thanks to @i404788)
+  * two types of distribution are supported (`gaussian`, `rademacher`).
+* Support `AdamD` variant for AdaHessian optimizer (#177)
+
+### Diff
+
+[2.9.1...2.10.0](https://github.com/kozistr/pytorch_optimizer/compare/v2.9.1...v2.10.0)
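The changelog's last two feature bullets add reusable Hessian helpers with two probe distributions. As background (not part of the commit), the identity they rely on is that for probe vectors z with E[z zᵀ] = I, the vector z ⊙ (Hz) is an unbiased estimate of diag(H); a self-contained check:

```python
# Self-contained sketch of Hutchinson's diagonal estimator, the technique behind
# `compute_hutchinson_hessian`. Both supported probe distributions are shown.
import torch

torch.manual_seed(0)
a = torch.randn(5, 5)
hess = a @ a.t()  # a symmetric matrix standing in for a Hessian


def estimate_diag(h: torch.Tensor, distribution: str, num_samples: int = 20_000) -> torch.Tensor:
    est = torch.zeros(h.size(0))
    for _ in range(num_samples):
        if distribution == 'rademacher':  # entries in {-1.0, +1.0}
            z = torch.randint(0, 2, (h.size(0),), dtype=h.dtype) * 2.0 - 1.0
        else:  # 'gaussian', N(0, I)
            z = torch.randn(h.size(0), dtype=h.dtype)
        est += z * (h @ z) / num_samples  # E[z * (H @ z)] == diag(H)
    return est


for dist in ('gaussian', 'rademacher'):
    err = (estimate_diag(hess, dist) - hess.diag()).abs().max().item()
    print(f'{dist}: max |error| = {err:.3f}')  # shrinks as num_samples grows
```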

docs/optimizer_api.rst

Lines changed: 24 additions & 0 deletions
@@ -464,3 +464,27 @@ Amos
 
 .. autoclass:: pytorch_optimizer.Amos
     :members:
+
+.. _SignSGD:
+
+SignSGD
+-------
+
+.. autoclass:: pytorch_optimizer.SignSGD
+    :members:
+
+.. _AdaHessian:
+
+AdaHessian
+----------
+
+.. autoclass:: pytorch_optimizer.AdaHessian
+    :members:
+
+.. _SophiaH:
+
+SophiaH
+-------
+
+.. autoclass:: pytorch_optimizer.SophiaH
+    :members:

pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pytorch_optimizer"
-version = "2.9.1"
+version = "2.10.0"
 description = "optimizer & lr scheduler collections in PyTorch"
 license = "Apache-2.0"
 authors = ["kozistr <[email protected]>"]
@@ -9,7 +9,7 @@ readme = "README.rst"
 homepage = "https://github.com/kozistr/pytorch_optimizer"
 repository = "https://github.com/kozistr/pytorch_optimizer"
 documentation = "https://pytorch-optimizers.readthedocs.io/en/latest"
-keywords = ["pytorch", "deep-learning", "optimizer", "lr scheduler", "A2Grad", "ASGD", "AccSGD", "AdaBelief", "AdaBound", "AdaDelta", "AdaFactor", "AdaMax", "AdaMod", "AdaNorm", "AdaPNM", "AdaSmooth", "Adai", "AdamP", "AdamS", "Adan", "AggMo", "AliG", "Amos", "Apollo", "AvaGrad", "DAdaptAdaGrad", "DAdaptAdam", "DAdaptAdan", "DAdaptSGD", "DiffGrad", "Fromage", "Gravity", "LARS", "Lamb", "Lion", "MADGRAD", "MSVAG", "Nero", "NovoGrad", "PID", "PNM", "QHAdam", "QHM", "RAdam", "Ranger", "Ranger21", "SGDP", "SGDW", "SM3", "SRMM", "SWATS", "ScalableShampoo", "Shampoo", "Yogi", "SAM", "GSAM", "PCGrad", "RotoGrad"]
+keywords = ["pytorch", "deep-learning", "optimizer", "lr scheduler", "A2Grad", "ASGD", "AccSGD", "AdaBelief", "AdaBound", "AdaDelta", "AdaFactor", "AdaMax", "AdaMod", "AdaNorm", "AdaPNM", "AdaSmooth", "AdaHessian", "Adai", "AdamP", "AdamS", "Adan", "AggMo", "AliG", "Amos", "Apollo", "AvaGrad", "DAdaptAdaGrad", "DAdaptAdam", "DAdaptAdan", "DAdaptSGD", "DiffGrad", "Fromage", "Gravity", "LARS", "Lamb", "Lion", "MADGRAD", "MSVAG", "Nero", "NovoGrad", "PID", "PNM", "QHAdam", "QHM", "RAdam", "Ranger", "Ranger21", "SGDP", "SGDW", "SignSGD", "SM3", "SopihaH", "SRMM", "SWATS", "ScalableShampoo", "Shampoo", "Yogi", "SAM", "GSAM", "PCGrad", "RotoGrad"]
 classifiers = [
     "License :: OSI Approved :: Apache Software License",
     "Development Status :: 5 - Production/Stable",

pytorch_optimizer/__init__.py

Lines changed: 3 additions & 3 deletions
@@ -21,6 +21,7 @@
 from pytorch_optimizer.optimizer.adabound import AdaBound
 from pytorch_optimizer.optimizer.adadelta import AdaDelta
 from pytorch_optimizer.optimizer.adafactor import AdaFactor
+from pytorch_optimizer.optimizer.adahessian import AdaHessian
 from pytorch_optimizer.optimizer.adai import Adai
 from pytorch_optimizer.optimizer.adamax import AdaMax
 from pytorch_optimizer.optimizer.adamod import AdaMod
@@ -81,10 +82,9 @@
     power_iteration,
 )
 from pytorch_optimizer.optimizer.sm3 import SM3
+from pytorch_optimizer.optimizer.sophia import SophiaH
 from pytorch_optimizer.optimizer.srmm import SRMM
 from pytorch_optimizer.optimizer.swats import SWATS
-from pytorch_optimizer.optimizer.adahessian import AdaHessian
-from pytorch_optimizer.optimizer.sophiah import SophiaH
 from pytorch_optimizer.optimizer.utils import (
     clip_grad_norm,
     disable_running_stats,
@@ -151,7 +151,7 @@
     Amos,
     AdaHessian,
     SophiaH,
-    SignSGD
+    SignSGD,
 ]
 OPTIMIZERS: Dict[str, OPTIMIZER] = {str(optimizer.__name__).lower(): optimizer for optimizer in OPTIMIZER_LIST}
 
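Because `OPTIMIZERS` maps lower-cased class names to classes (last context line of the hunk above), the new optimizers are also reachable by name. A small sketch, assuming only what the diff shows:

```python
# Registry lookup sketch (not from this commit).
from pytorch_optimizer import OPTIMIZERS

for name in ('adahessian', 'sophiah', 'signsgd'):
    print(name, '->', OPTIMIZERS[name].__name__)
```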
pytorch_optimizer/base/optimizer.py

Lines changed: 40 additions & 38 deletions
@@ -4,78 +4,80 @@
 
 import torch
 
-from pytorch_optimizer.base.exception import NegativeLRError, NegativeStepError, NoSparseGradientError
-from pytorch_optimizer.base.types import BETAS, HUTCHINSON_G
+from pytorch_optimizer.base.exception import NegativeLRError, NegativeStepError
+from pytorch_optimizer.base.types import BETAS, HUTCHINSON_G, PARAMETERS, STATE
 
 
 class BaseOptimizer(ABC):
     r"""Base optimizer class."""
 
+    @staticmethod
     @torch.no_grad()
-    def set_hessian(self, hessian):
-        """
-        Helper function to set hessian state from external source
-        Generally useful when using functorch as a base
+    def set_hessian(param_groups: PARAMETERS, state: STATE, hessian: List[torch.Tensor]):
+        r"""Set hessian to state from external source. Generally useful when using functorch as a base.
 
         Example usage:
         ```
         # Hutchinsons Estimator using HVP
         noise = tree_map(lambda v: torch.randn_like(v), params)
         loss_, hvp_est = jvp(grad(run_model_fn), (params,), (noise,))
-        hessian_diag_est = tree_map(lambda a, b: a*b, hvp_est, noise)
+        hessian_diag_est = tree_map(lambda a, b: a * b, hvp_est, noise)
 
         optimizer.set_hessian(hessian_diag_est)
         # OR
         optimizer.step(hessian=hessian_diag_est)
         ````
-
         """
-        i = 0
-        for group in self.param_groups:
+        i: int = 0
+        for group in param_groups:
             for p in group['params']:
-                assert p.shape == hessian[i].shape
-                self.state[p]['hessian'] = hessian[i]
+                if p.size() != hessian[i].size():
+                    raise ValueError(
+                        f'[-] the shape of parameter and hessian does not match. {p.size()} vs {hessian[i].size()}'
+                    )
+
+                state[p]['hessian'] = hessian[i]
                 i += 1
 
+    @staticmethod
     @torch.no_grad()
-    def compute_hutchinson_hessian(self, nsamples: int = 1, pre_zero=True, alpha=1.0, distribution: HUTCHINSON_G = 'gaussian'):
-        """
-        Hutchinsons approximate hessian, added to the state under key 'hessian'
-        """
-        if distribution not in ['gaussian', 'rademacher']:
-            raise NotImplementedError(f"Hessian with distribution {distribution} is not implemented")
+    def compute_hutchinson_hessian(
+        param_groups: PARAMETERS,
+        state: STATE,
+        num_samples: int = 1,
+        pre_zero: bool = True,
+        alpha: float = 1.0,
+        distribution: HUTCHINSON_G = 'gaussian',
+    ):
+        r"""Hutchinson's approximate hessian, added to the state under key `hessian`."""
+        if distribution not in ('gaussian', 'rademacher'):
+            raise NotImplementedError(f'[-] Hessian with distribution {distribution} is not implemented.')
 
         params = []
-        for group in self.param_groups:
+        for group in param_groups:
             for p in group['params']:
-                if p.requires_grad and p.grad is not None:
-                    if p.grad.is_sparse:
-                        raise NoSparseGradientError(str(self))
-                    # Initialize Hessian state
-                    if 'hessian' in self.state[p]:
-                        if pre_zero:
-                            self.state[p]['hessian'].zero_()
-                    else:
-                        self.state[p]['hessian'] = torch.zeros_like(p.data)
+                if p.requires_grad and p.grad is not None and not p.grad.is_sparse:
+                    if 'hessian' not in state[p]:
+                        state[p]['hessian'] = torch.zeros_like(p)
+                    elif pre_zero:
+                        state[p]['hessian'].zero_()
+
                     params.append(p)
 
         if len(params) == 0:
             return
 
         grads = [p.grad for p in params]
 
-        for i in range(nsamples):
-            if distribution == 'gaussian':
-                # Gaussian N(0,Id)
-                zs = [torch.randn(p.size(), device=p.device) for p in params]
-            elif distribution == 'rademacher':
-                # Rademacher distribution {-1.0, 1.0}
-                zs = [torch.randint(0, 2, p.size(), dtype=p.dtype, device=p.device) * 2.0 - 1.0 for p in params]
+        for i in range(num_samples):
+            if distribution == 'rademacher':
+                zs = [torch.randint_like(p, 0, 1) * 2.0 - 1.0 for p in params]
+            else:
+                zs = [torch.randn_like(p) for p in params]
 
-            h_zs = torch.autograd.grad(grads, params, grad_outputs=zs, retain_graph=i < nsamples - 1)
+            h_zs = torch.autograd.grad(grads, params, grad_outputs=zs, retain_graph=i < num_samples - 1)
             for h_z, z, p in zip(h_zs, zs, params):
-                # approximate the expected values of z*(H@z)
-                self.state[p]['hessian'].add_(h_z * z, alpha=(1/nsamples) * alpha)
+                state[p]['hessian'].add_(h_z * z, alpha=(1 / num_samples) * alpha)
 
     @staticmethod
     def apply_weight_decay(
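The hunk above turns `set_hessian` and `compute_hutchinson_hessian` into `@staticmethod`s that receive `param_groups` and `state` explicitly, so they can be reused by anything that exposes those two attributes. A minimal sketch of that reuse (not from this commit), using a plain `torch.optim.SGD` instance purely as a holder for `param_groups`/`state`; everything except the helper signature shown above is illustrative:

```python
import torch
from pytorch_optimizer.base.optimizer import BaseOptimizer

model = torch.nn.Linear(8, 1)
holder = torch.optim.SGD(model.parameters(), lr=1e-2)  # any holder of param_groups/state

loss = model(torch.randn(4, 8)).pow(2).mean()
loss.backward(create_graph=True)  # the helper takes HVPs through the retained graph

# Hutchinson estimate of diag(H), accumulated under state[p]['hessian'].
BaseOptimizer.compute_hutchinson_hessian(
    holder.param_groups, holder.state, num_samples=1, distribution='gaussian'
)

for p in model.parameters():
    print(tuple(p.shape), tuple(holder.state[p]['hessian'].shape))
```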

pytorch_optimizer/base/types.py

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
-from typing import Any, Callable, Dict, Iterable, Optional, Tuple, Type, Union, Literal
+from typing import Any, Callable, Dict, Iterable, Literal, Optional, Tuple, Type, Union
 
 import torch
 from torch.optim import Optimizer
@@ -13,4 +13,4 @@
 OPTIMIZER = Type[Optimizer]
 SCHEDULER = Type[_LRScheduler]
 
-HUTCHINSON_G = Literal['gaussian', 'rademacher']
+HUTCHINSON_G = Literal['gaussian', 'rademacher']
