Commit 5c18933

Merge pull request #176 from i404788/main
Implement SophiaH & AdaHessian
2 parents c78dcd6 + d0a9b1b commit 5c18933

File tree

10 files changed: +435 −10 lines


pytorch_optimizer/__init__.py

Lines changed: 6 additions & 1 deletion
@@ -62,7 +62,7 @@
  from pytorch_optimizer.optimizer.ranger21 import Ranger21
  from pytorch_optimizer.optimizer.rotograd import RotoGrad
  from pytorch_optimizer.optimizer.sam import SAM
- from pytorch_optimizer.optimizer.sgd import ASGD, SGDW, AccSGD
+ from pytorch_optimizer.optimizer.sgd import ASGD, SGDW, AccSGD, SignSGD
  from pytorch_optimizer.optimizer.sgdp import SGDP
  from pytorch_optimizer.optimizer.shampoo import ScalableShampoo, Shampoo
  from pytorch_optimizer.optimizer.shampoo_utils import (
@@ -83,6 +83,8 @@
  from pytorch_optimizer.optimizer.sm3 import SM3
  from pytorch_optimizer.optimizer.srmm import SRMM
  from pytorch_optimizer.optimizer.swats import SWATS
+ from pytorch_optimizer.optimizer.adahessian import AdaHessian
+ from pytorch_optimizer.optimizer.sophiah import SophiaH
  from pytorch_optimizer.optimizer.utils import (
      clip_grad_norm,
      disable_running_stats,
@@ -147,6 +149,9 @@
      AdaShift,
      AdaDelta,
      Amos,
+     AdaHessian,
+     SophiaH,
+     SignSGD
  ]
  OPTIMIZERS: Dict[str, OPTIMIZER] = {str(optimizer.__name__).lower(): optimizer for optimizer in OPTIMIZER_LIST}

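With the new classes registered both in the top-level imports and in OPTIMIZER_LIST, they become reachable through the lowercase-keyed OPTIMIZERS mapping built above. A minimal sketch of both access paths (the linear model and hyper-parameters are made up for illustration):

```
import torch
from pytorch_optimizer import OPTIMIZERS, SignSGD

model = torch.nn.Linear(10, 1)

# keys are lowercased class names: 'adahessian', 'sophiah', 'signsgd', ...
optimizer = OPTIMIZERS['adahessian'](model.parameters(), lr=1e-1)

# or instantiate directly from the package namespace
sign_sgd = SignSGD(model.parameters(), lr=1e-3, beta=0.9)
```
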
pytorch_optimizer/base/optimizer.py

Lines changed: 68 additions & 2 deletions
@@ -4,13 +4,79 @@

  import torch

- from pytorch_optimizer.base.exception import NegativeLRError, NegativeStepError
- from pytorch_optimizer.base.types import BETAS
+ from pytorch_optimizer.base.exception import NegativeLRError, NegativeStepError, NoSparseGradientError
+ from pytorch_optimizer.base.types import BETAS, HUTCHINSON_G


  class BaseOptimizer(ABC):
      r"""Base optimizer class."""

+     @torch.no_grad()
+     def set_hessian(self, hessian):
+         """Set the Hessian state from an external source.
+
+         Generally useful when using functorch as a base.
+
+         Example usage:
+         ```
+         # Hutchinson's estimator using an HVP
+         noise = tree_map(lambda v: torch.randn_like(v), params)
+         loss_, hvp_est = jvp(grad(run_model_fn), (params,), (noise,))
+         hessian_diag_est = tree_map(lambda a, b: a * b, hvp_est, noise)
+
+         optimizer.set_hessian(hessian_diag_est)
+         # or
+         optimizer.step(hessian=hessian_diag_est)
+         ```
+         """
+         i = 0
+         for group in self.param_groups:
+             for p in group['params']:
+                 assert p.shape == hessian[i].shape
+                 self.state[p]['hessian'] = hessian[i]
+                 i += 1
+
+     @torch.no_grad()
+     def compute_hutchinson_hessian(self, nsamples: int = 1, pre_zero: bool = True, alpha: float = 1.0, distribution: HUTCHINSON_G = 'gaussian'):
+         """Hutchinson's approximation of the Hessian diagonal, accumulated in the state under the key 'hessian'."""
+         if distribution not in ('gaussian', 'rademacher'):
+             raise NotImplementedError(f'Hessian with distribution {distribution} is not implemented')
+
+         params = []
+         for group in self.param_groups:
+             for p in group['params']:
+                 if p.requires_grad and p.grad is not None:
+                     if p.grad.is_sparse:
+                         raise NoSparseGradientError(str(self))
+                     # initialize the Hessian state
+                     if 'hessian' in self.state[p]:
+                         if pre_zero:
+                             self.state[p]['hessian'].zero_()
+                     else:
+                         self.state[p]['hessian'] = torch.zeros_like(p.data)
+                     params.append(p)
+
+         if len(params) == 0:
+             return
+
+         grads = [p.grad for p in params]
+
+         for i in range(nsamples):
+             if distribution == 'gaussian':
+                 # Gaussian N(0, Id)
+                 zs = [torch.randn(p.size(), device=p.device) for p in params]
+             elif distribution == 'rademacher':
+                 # Rademacher distribution {-1.0, 1.0}
+                 zs = [torch.randint(0, 2, p.size(), dtype=p.dtype, device=p.device) * 2.0 - 1.0 for p in params]
+
+             h_zs = torch.autograd.grad(grads, params, grad_outputs=zs, retain_graph=i < nsamples - 1)
+             for h_z, z, p in zip(h_zs, zs, params):
+                 # approximate the expected value of z * (H @ z), i.e. the Hessian diagonal
+                 self.state[p]['hessian'].add_(h_z * z, alpha=(1 / nsamples) * alpha)
+
      @staticmethod
      def apply_weight_decay(
          p: torch.Tensor,

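The helper above relies on Hutchinson's identity: for random vectors z with E[z zᵀ] = I (Gaussian or Rademacher), E[z ⊙ (Hz)] equals the diagonal of H, so averaging z * (H @ z) over a few samples gives a cheap diagonal-Hessian estimate. A small self-contained sanity check of that identity, using plain tensors rather than the optimizer state (the matrix A and sample count are purely illustrative):

```
import torch

# Verify E[z * (A @ z)] ~= diag(A) for Rademacher z, as used by compute_hutchinson_hessian.
torch.manual_seed(0)
A = torch.randn(5, 5)
A = (A + A.T) / 2                                        # symmetric, stands in for a Hessian

n_samples = 10_000
est = torch.zeros(5)
for _ in range(n_samples):
    z = torch.randint(0, 2, (5,)).float() * 2.0 - 1.0    # Rademacher {-1, +1}
    est += z * (A @ z) / n_samples

print(torch.allclose(est, torch.diag(A), atol=0.1))      # True with high probability
```
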
pytorch_optimizer/base/types.py

Lines changed: 3 additions & 1 deletion
@@ -1,4 +1,4 @@
- from typing import Any, Callable, Dict, Iterable, Optional, Tuple, Type, Union
+ from typing import Any, Callable, Dict, Iterable, Optional, Tuple, Type, Union, Literal

  import torch
  from torch.optim import Optimizer
@@ -12,3 +12,5 @@
  STATE = Dict[str, Any]
  OPTIMIZER = Type[Optimizer]
  SCHEDULER = Type[_LRScheduler]
+
+ HUTCHINSON_G = Literal['gaussian', 'rademacher']

pytorch_optimizer/optimizer/adahessian.py

Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
+ import torch
+ from torch.optim.optimizer import Optimizer
+
+ from pytorch_optimizer.base.exception import NoSparseGradientError
+ from pytorch_optimizer.base.optimizer import BaseOptimizer
+ from pytorch_optimizer.base.types import BETAS, CLOSURE, DEFAULTS, LOSS, PARAMETERS, HUTCHINSON_G
+
+ # Modified from https://github.com/davda54/ada-hessian/blob/master/ada_hessian.py (MIT, David Samuel)
+
+
+ class AdaHessian(Optimizer, BaseOptimizer):
+     r"""An Adaptive Second Order Optimizer for Machine Learning.
+
+     Requires `loss.backward(create_graph=True)` in order to calculate Hessians.
+
+     :param params: PARAMETERS. iterable of parameters to optimize or dicts defining parameter groups.
+     :param lr: float. learning rate.
+     :param betas: BETAS. coefficients used for computing running averages of the gradient and the squared hessian trace.
+     :param weight_decay: float. weight decay (L2 penalty).
+     :param weight_decouple: bool. the optimizer uses decoupled weight decay as in AdamW.
+     :param fixed_decay: bool. fix weight decay.
+     :param hessian_power: float. exponent of the hessian trace.
+     :param update_period: int. number of steps after which to apply the hessian approximation.
+     :param n_samples: int. times to sample `z` for the approximation of the hessian trace.
+     :param hessian_distribution: HUTCHINSON_G. distribution used to sample `z` ('gaussian' or 'rademacher').
+     :param eps: float. term added to the denominator to improve numerical stability.
+     """
+
+     def __init__(self,
+                  params: PARAMETERS,
+                  lr: float = 1e-1,
+                  betas: BETAS = (0.9, 0.999),
+                  weight_decay: float = 0.0,
+                  weight_decouple: bool = True,
+                  fixed_decay: bool = False,
+                  hessian_power: float = 1.0,
+                  update_period: int = 1,
+                  n_samples: int = 1,
+                  hessian_distribution: HUTCHINSON_G = 'rademacher',
+                  eps: float = 1e-16):
+         self.validate_learning_rate(lr)
+         self.validate_betas(betas)
+         self.validate_non_negative(weight_decay, 'weight_decay')
+         self.validate_non_negative(eps, 'eps')
+         self.validate_range(hessian_power, 'Hessian Power', 0, 1, range_type='(]')
+
+         self.distribution = hessian_distribution
+         self.update_period = update_period
+         self.n_samples = n_samples
+         defaults: DEFAULTS = {
+             'lr': lr,
+             'betas': betas,
+             'weight_decay': weight_decay,
+             'weight_decouple': weight_decouple,
+             'fixed_decay': fixed_decay,
+             'hessian_power': hessian_power,
+             'eps': eps,
+         }
+         self._step = 0
+         super().__init__(params, defaults)
+
+     @torch.no_grad()
+     def reset(self):
+         self._step = 0
+         for group in self.param_groups:
+             for p in group['params']:
+                 state = self.state[p]
+                 state['exp_avg'] = torch.zeros_like(p)
+                 state['exp_hessian_diag_sq'] = torch.zeros_like(p)
+
+     @torch.no_grad()
+     def step(self, closure: CLOSURE = None, hessian: tuple[torch.Tensor] = None) -> LOSS:
+         loss: LOSS = None
+         if closure is not None:
+             with torch.enable_grad():
+                 loss = closure()
+
+         if hessian is not None:
+             self.set_hessian(hessian)
+         elif self._step % self.update_period == 0:
+             self.compute_hutchinson_hessian(self.n_samples, distribution=self.distribution)
+
+         for group in self.param_groups:
+             for p in group['params']:
+                 if p.grad is None:
+                     continue
+
+                 grad = p.grad
+                 if grad.is_sparse:
+                     raise NoSparseGradientError(str(self))
+
+                 # state initialization
+                 state = self.state[p]
+                 if 'exp_avg' not in state:
+                     state['exp_avg'] = torch.zeros_like(p.data)
+                     state['exp_hessian_diag_sq'] = torch.zeros_like(p.data)
+
+                 self.apply_weight_decay(
+                     p=p,
+                     grad=grad,
+                     lr=group['lr'],
+                     weight_decay=group['weight_decay'],
+                     weight_decouple=group['weight_decouple'],
+                     fixed_decay=group['fixed_decay'],
+                 )
+
+                 exp_avg, exp_hessian_diag_sq = state['exp_avg'], state['exp_hessian_diag_sq']
+                 beta1, beta2 = group['betas']
+
+                 # decay the first and second moment running average coefficients
+                 exp_avg.mul_(beta1).add_(p.grad, alpha=1 - beta1)
+                 if (self._step % self.update_period == 0 or hessian is not None) and 'hessian' in state:
+                     # if self.average_conv_kernel and p.dim() == 4:
+                     #     state['hessian'] = torch.abs(state['hessian']).mean(dim=[2, 3], keepdim=True).expand_as(state['hessian']).clone()
+                     exp_hessian_diag_sq.mul_(beta2).addcmul_(state['hessian'], state['hessian'], value=1 - beta2)
+
+                 bias_correction1 = 1 - beta1 ** (self._step + 1)
+                 bias_correction2 = 1 - beta2 ** (self._step + 1)
+
+                 k = group['hessian_power']
+                 denom = (exp_hessian_diag_sq / bias_correction2).pow_(k / 2).add_(group['eps'])
+
+                 # make the update
+                 step_size = group['lr'] / bias_correction1
+                 p.addcdiv_(exp_avg, denom, value=-step_size)
+
+         self._step += 1
+         return loss

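As the class docstring notes, AdaHessian builds Hessian-vector products from the gradients, so the first backward pass must keep its graph alive. A minimal training-step sketch (model, data, and hyper-parameters are illustrative, not part of the commit):

```
import torch
from pytorch_optimizer import AdaHessian

model = torch.nn.Sequential(torch.nn.Linear(4, 16), torch.nn.Tanh(), torch.nn.Linear(16, 1))
optimizer = AdaHessian(model.parameters(), lr=1e-1, update_period=1, n_samples=1)

x, y = torch.randn(32, 4), torch.randn(32, 1)
for _ in range(10):
    optimizer.zero_grad()
    loss = torch.nn.functional.mse_loss(model(x), y)
    loss.backward(create_graph=True)   # required: step() differentiates through the gradients
    optimizer.step()                   # runs compute_hutchinson_hessian, then the Adam-style update
```
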
pytorch_optimizer/optimizer/sgd.py

Lines changed: 72 additions & 0 deletions
@@ -311,3 +311,75 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                  p.add_(grad, alpha=-new_lr)

          return loss
+
+
+ class SignSGD(Optimizer, BaseOptimizer):
+     r"""SignSGD: Compressed Optimisation for Non-Convex Problems.
+
+     :param params: PARAMETERS. iterable of parameters to optimize or dicts defining parameter groups.
+     :param lr: float. learning rate.
+     :param beta: float. momentum factor (0.0 = SignSGD, > 0.0 = Signum).
+     :param weight_decay: float. weight decay (L2 penalty).
+     :param weight_decouple: bool. the optimizer uses decoupled weight decay as in AdamW.
+     """
+
+     def __init__(
+         self,
+         params: PARAMETERS,
+         lr: float = 1e-3,
+         beta: float = 0.9,
+         weight_decay: float = 0.0,
+         weight_decouple: bool = True,
+     ):
+         self.validate_learning_rate(lr)
+         self.validate_range(beta, 'beta', 0.0, 1.0)
+         self.validate_non_negative(weight_decay, 'weight_decay')
+
+         defaults: DEFAULTS = {
+             'lr': lr,
+             'beta': beta,
+             'weight_decay': weight_decay,
+             'weight_decouple': weight_decouple,
+         }
+
+         super().__init__(params, defaults)
+
+     @torch.no_grad()
+     def reset(self):
+         for group in self.param_groups:
+             for p in group['params']:
+                 state = self.state[p]
+
+                 if group['beta'] > 0.0:
+                     state['momentum_buffer'] = p.grad.clone()
+
+     @torch.no_grad()
+     def step(self, closure: CLOSURE = None) -> LOSS:
+         loss: LOSS = None
+         if closure is not None:
+             with torch.enable_grad():
+                 loss = closure()
+
+         for group in self.param_groups:
+             beta = group['beta']
+             for p in group['params']:
+                 if p.grad is None:
+                     continue
+
+                 if p.grad.is_sparse:
+                     raise NoSparseGradientError(str(self))
+
+                 state = self.state[p]
+
+                 if beta > 0.0:
+                     if len(state) == 0:
+                         state['momentum_buffer'] = p.grad.clone()
+
+                     buf = state['momentum_buffer']
+                     buf.mul_(beta).add_(p.grad, alpha=1.0 - beta)
+                 else:
+                     buf = p.grad
+
+                 p.add_(torch.sign(buf), alpha=-group['lr'])
+
+         return loss

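Because the update uses only the sign of the (optionally momentum-smoothed) gradient, every coordinate moves by exactly lr per step, regardless of gradient magnitude. A quick illustrative check on a single parameter (values are made up for the example):

```
import torch
from pytorch_optimizer import SignSGD

p = torch.nn.Parameter(torch.tensor([1.0, -2.0, 3.0]))
optimizer = SignSGD([p], lr=0.1, beta=0.0)    # beta=0.0: plain SignSGD, no momentum buffer

p.grad = torch.tensor([0.5, -0.01, 100.0])    # wildly different gradient magnitudes
optimizer.step()

print(p.data)  # tensor([0.9000, -1.9000, 2.9000]): each coordinate moved by exactly lr
```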