
Commit 2ed564d

Author: ferris (committed)
Allow external hessian source & fix remaining adahessian bugs
1 parent e767029 commit 2ed564d

File tree: 3 files changed (+41, -12 lines)

pytorch_optimizer/base/optimizer.py

Lines changed: 26 additions & 0 deletions
@@ -11,6 +11,32 @@
 class BaseOptimizer(ABC):
     r"""Base optimizer class."""
 
+    @torch.no_grad()
+    def set_hessian(self, hessian):
+        """
+        Helper function to set the hessian state from an external source.
+        Generally useful when using functorch as a base.
+
+        Example usage:
+        ```
+        # Hutchinson's estimator using an HVP
+        noise = tree_map(lambda v: torch.randn_like(v), params)
+        loss_, hvp_est = jvp(grad(run_model_fn), (params,), (noise,))
+        hessian_diag_est = tree_map(lambda a, b: a * b, hvp_est, noise)
+
+        optimizer.set_hessian(hessian_diag_est)
+        # OR
+        optimizer.step(hessian=hessian_diag_est)
+        ```
+
+        """
+        i = 0
+        for group in self.param_groups:
+            for p in group['params']:
+                assert p.shape == hessian[i].shape
+                self.state[p]['hessian'] = hessian[i]
+                i += 1
+
     @torch.no_grad()
     def compute_hutchinson_hessian(self, nsamples: int = 1, pre_zero=True, alpha=1.0, distribution: HUTCHINSON_G = 'gaussian'):
         """

pytorch_optimizer/optimizer/adahessian.py

Lines changed: 4 additions & 4 deletions
@@ -44,6 +44,7 @@ def __init__(self,
         self.validate_non_negative(eps, 'eps')
         self.validate_range(hessian_power, "Hessian Power", 0, 1, range_type='(]')
 
+        self.distribution = hessian_distribution
         self.update_period = update_period
         self.n_samples = n_samples
         defaults: DEFAULTS = {
@@ -65,7 +66,7 @@ def reset(self):
             for p in group['params']:
                 state = self.state[p]
                 state['exp_avg'] = torch.zeros_like(p)
-                state['exp_hessian_diag_sq'] = state['hessian'].clone()
+                state['exp_hessian_diag_sq'] = torch.zeros_like(p)
 
     @torch.no_grad()
     def step(self, closure: CLOSURE = None) -> LOSS:
@@ -75,7 +76,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                 loss = closure()
 
         if self._step % self.update_period == 0:
-            self.compute_hutchinson_hessian(self.n_samples)
+            self.compute_hutchinson_hessian(self.n_samples, distribution=self.distribution)
 
         for group in self.param_groups:
             for p in group['params']:
@@ -90,8 +91,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                 state = self.state[p]
                 if 'exp_avg' not in state:
                     state['exp_avg'] = torch.zeros_like(p.data)
-                    # NOTE: zeroing-out the hessian causes instability
-                    state['exp_hessian_diag_sq'] = state['hessian'].clone()
+                    state['exp_hessian_diag_sq'] = torch.zeros_like(p.data)
 
                 self.apply_weight_decay(
                     p=p,
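
For reference, a minimal AdaHessian loop under the fixed plumbing (the toy model and data are assumptions). The create_graph=True backward is the usual requirement for Hutchinson estimators that differentiate through the gradients, not something introduced here.

```
# Illustrative loop: after this fix, hessian_distribution actually reaches
# compute_hutchinson_hessian(), and state['exp_hessian_diag_sq'] starts at zero.
import torch
import torch.nn as nn
from pytorch_optimizer import AdaHessian  # assumed import path

model = nn.Linear(16, 1)
x, y = torch.randn(32, 16), torch.randn(32, 1)
optimizer = AdaHessian(model.parameters(), hessian_distribution='gaussian')

for _ in range(10):
    optimizer.zero_grad()
    loss = nn.functional.mse_loss(model(x), y)
    loss.backward(create_graph=True)  # keep the graph so the HVP can differentiate p.grad
    optimizer.step()
```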

pytorch_optimizer/optimizer/sophiah.py

Lines changed: 11 additions & 8 deletions
@@ -25,12 +25,12 @@ class SophiaH(Optimizer, BaseOptimizer):
 
     def __init__(self,
                  params: PARAMETERS,
-                 lr: float = 1e-1,
-                 betas: BETAS = (0.965, 0.99),
+                 lr: float = 6e-2,
+                 betas: BETAS = (0.96, 0.99),
                  weight_decay: float = 0.0,
                  weight_decouple: bool = True,
                  fixed_decay: bool = False,
-                 p: float = 25.,
+                 p: float = 1e-2,
                  update_period: int = 10,
                  n_samples: int = 1,
                  hessian_distribution: HUTCHINSON_G = 'gaussian',
@@ -40,8 +40,9 @@ def __init__(self,
         self.validate_betas(betas)
         self.validate_non_negative(weight_decay, 'weight_decay')
         self.validate_non_negative(eps, 'eps')
-        self.validate_positive(p, "p (gradient clip)")
+        self.validate_non_negative(p, "p (gradient clip)")
 
+        self.distribution = hessian_distribution
         defaults: DEFAULTS = {
             'lr': lr,
             'betas': betas,
@@ -66,14 +67,16 @@ def reset(self):
                 state['hessian_moment'] = torch.zeros_like(p)
 
     @torch.no_grad()
-    def step(self, closure: CLOSURE = None) -> LOSS:
+    def step(self, closure: CLOSURE = None, hessian: tuple[torch.Tensor] = None) -> LOSS:
         loss: LOSS = None
         if closure is not None:
             with torch.enable_grad():
                 loss = closure()
 
-        if self._step % self.update_period == 0:
-            self.compute_hutchinson_hessian(self.n_samples)
+        if hessian is not None:
+            self.set_hessian(hessian)
+        elif self._step % self.update_period == 0:
+            self.compute_hutchinson_hessian(self.n_samples, distribution=self.distribution)
 
         for group in self.param_groups:
             for p in group['params']:
@@ -103,7 +106,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                 momentum, hessian_moment = state['momentum'], state['hessian_moment']
 
                 momentum.mul_(beta1).add_(p.grad, alpha=1.0-beta1)
-                if self._step % self.update_period == 0:
+                if self._step % self.update_period == 0 or hessian is not None:
                     hessian_moment.mul_(beta2).add_(state['hessian'], alpha=1.0-beta2)
 
                 # See https://shreyansh26.github.io/post/2023-05-28_sophia_scalable_second_order_optimizer_llms/#per-coordinate-clipping
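
And a sketch of the new external-Hessian path in SophiaH.step(). The constant estimate below is only a stand-in for something like the torch.func estimate sketched under base/optimizer.py; any tuple of tensors whose shapes match the parameters works.

```
# Illustrative loop feeding step() an external Hessian-diagonal estimate
# (placeholder values only).
import torch
import torch.nn as nn
from pytorch_optimizer import SophiaH  # assumed import path

model = nn.Linear(16, 1)
x, y = torch.randn(32, 16), torch.randn(32, 1)
optimizer = SophiaH(model.parameters())

for _ in range(10):
    optimizer.zero_grad()
    loss = nn.functional.mse_loss(model(x), y)
    loss.backward()  # no create_graph needed: the internal Hutchinson path is skipped
    hess_estimate = tuple(torch.ones_like(p) for p in model.parameters())
    # Passing hessian= routes through set_hessian() and updates the hessian-moment
    # EMA on this step instead of waiting for update_period.
    optimizer.step(hessian=hess_estimate)
```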
