
Commit 5f1ef59

Merge pull request #44 from kozistr/fix/adahessian
[Test] Add FP16 & SAM test cases
2 parents: 75463dc + e874336

File tree (7 files changed, +107 / -37 lines changed)

  pytorch_optimizer/adahessian.py
  pytorch_optimizer/agc.py
  pytorch_optimizer/gc.py
  pytorch_optimizer/madgrad.py
  pytorch_optimizer/ranger21.py
  pytorch_optimizer/sam.py
  tests/test_optimizers.py

pytorch_optimizer/adahessian.py

Lines changed: 12 additions & 12 deletions
@@ -34,17 +34,17 @@ def __init__(
         average_conv_kernel: bool = False,
         adamd_debias_term: bool = False,
         eps: float = 1e-8,
-        seed: int = 2147483647,
+        seed: int = 1337,
     ):
-        """
+        """AdaHessian
         :param params: PARAMETERS. iterable of parameters to optimize or dicts defining parameter groups
-        :param lr: float. learning rate.
+        :param lr: float. learning rate
         :param betas: BETAS. coefficients used for computing running averages of gradient and the squared hessian trace
         :param weight_decay: float. weight decay (L2 penalty)
         :param hessian_power: float. exponent of the hessian trace
         :param update_each: int. compute the hessian trace approximation only after *this* number of steps
         :param num_samples: int. how many times to sample `z` for the approximation of the hessian trace
-        :param average_conv_kernel: bool. average out the hessian traces of convolutional kernels as in the paper.
+        :param average_conv_kernel: bool. average out the hessian traces of convolutional kernels as in the paper
         :param adamd_debias_term: bool. Only correct the denominator to avoid inflating step sizes early in training
         :param eps: float. term added to the denominator to improve numerical stability
         :param seed: int.
@@ -103,16 +103,17 @@ def zero_hessian(self):
             if not isinstance(p.hess, float) and self.state[p]['hessian_step'] % self.update_each == 0:
                 p.hess.zero_()

-    @torch.no_grad()
     def set_hessian(self):
-        """Computes the Hutchinson approximation of the hessian trace
-        and accumulates it for each trainable parameter
-        """
+        """Computes the Hutchinson approximation of the hessian trace and accumulates it for each trainable parameter"""
         params = []
-        for p in filter(lambda param: param.grad is not None, self.get_params()):
+        for p in self.get_params():
+            if p.grad is None:
+                continue
+
             # compute the trace only each `update_each` step
             if self.state[p]['hessian_step'] % self.update_each == 0:
                 params.append(p)
+
             self.state[p]['hessian_step'] += 1

         if len(params) == 0:
@@ -126,7 +127,7 @@ def set_hessian(self):

         for i in range(self.num_samples):
             # Rademacher distribution {-1.0, 1.0}
-            zs = [torch.randint(0, 2, p.size(), generator=self.generator, device=p.device) * 2.0 - 1.0 for p in params]
+            zs = [2.0 * torch.randint(0, 2, p.size()).float().requires_grad_(True) - 1.0 for p in params]

             # note that, possible memory leak due to retrain_graph=True
             h_zs = torch.autograd.grad(
@@ -141,7 +142,6 @@ def set_hessian(self):
                 # approximate the expected values of z * (H@z)
                 p.hess += h_z * z / self.num_samples

-    @torch.no_grad()
     def step(self, closure: CLOSURE = None) -> LOSS:
         loss: LOSS = None
         if closure is not None:
@@ -156,7 +156,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
                     continue

                 if self.average_conv_kernel and p.dim() == 4:
-                    p.hess = torch.abs(p.hess).mean(dim=[2, 3], keepdim=True).expand_as(p.hess).clone()
+                    p.hess = torch.abs(p.hess).mean(dim=(2, 3), keepdim=True).expand_as(p.hess).clone()

                 # Perform correct step-weight decay as in AdamW
                 p.mul_(1.0 - group['lr'] * group['weight_decay'])
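For readers unfamiliar with the estimator that `set_hessian` keeps here: for a Rademacher vector z (entries drawn from {-1, +1}), E[z * (Hz)] equals the diagonal of the Hessian H, and Hz can be obtained by differentiating the gradient a second time, which is why AdaHessian-style optimizers require the loss to be backpropagated with create_graph=True. A minimal, self-contained sketch of the idea (illustrative only, not the optimizer's code):

import torch

# Toy objective: f(x) = sum(x^4), whose exact Hessian diagonal is 12 * x^2.
x = torch.randn(5, requires_grad=True)
loss = (x ** 4).sum()

# First differentiation with create_graph=True so the gradient can be differentiated again.
(grad,) = torch.autograd.grad(loss, x, create_graph=True)

num_samples = 200
hess_diag = torch.zeros_like(x)
for _ in range(num_samples):
    # Rademacher vector in {-1.0, +1.0}, as in the `zs` line of the diff above.
    z = 2.0 * torch.randint(0, 2, x.size()).float() - 1.0
    # Hessian-vector product H @ z via d(grad . z)/dx.
    (h_z,) = torch.autograd.grad(grad, x, grad_outputs=z, retain_graph=True)
    # Hutchinson estimate: average of z * (H @ z).
    hess_diag += z * h_z / num_samples

print(hess_diag)               # close to 12 * x^2
print(12.0 * x.detach() ** 2)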

pytorch_optimizer/agc.py

Lines changed: 4 additions & 4 deletions
@@ -4,10 +4,10 @@


 def agc(p: torch.Tensor, agc_eps: float, agc_clip_val: float, eps: float = 1e-6):
-    """Clip gradient values in excess of the unit-wise norm.
-    :param p: parameter.
-    :param agc_eps: float.
-    :param agc_clip_val: float.
+    """Clip gradient values in excess of the unit-wise norm
+    :param p: parameter. parameter
+    :param agc_eps: float. epsilon
+    :param agc_clip_val: float. norm clip
     :param eps: float. simple stop from div by zero and no relation to standard optimizer eps
     """
     p_norm = unit_norm(p).clamp_(agc_eps)
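For context, adaptive gradient clipping rescales a gradient whenever its norm exceeds `agc_clip_val` times the norm of the corresponding parameter, with `agc_eps` guarding against near-zero parameters. A rough sketch of that rule, using plain per-tensor norms instead of the library's unit-wise `unit_norm` helper (names and default values here are illustrative):

import torch

def agc_sketch(param: torch.Tensor, grad: torch.Tensor,
               agc_eps: float = 1e-3, agc_clip_val: float = 1e-2,
               eps: float = 1e-6) -> torch.Tensor:
    # Largest gradient norm allowed relative to the parameter norm.
    p_norm = param.norm().clamp(min=agc_eps)
    g_norm = grad.norm()
    max_norm = p_norm * agc_clip_val

    # Rescale only when the gradient is too large; eps avoids division by zero.
    if g_norm > max_norm:
        grad = grad * (max_norm / g_norm.clamp(min=eps))
    return grad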

pytorch_optimizer/gc.py

Lines changed: 2 additions & 2 deletions
@@ -3,8 +3,8 @@

 def centralize_gradient(x: torch.Tensor, gc_conv_only: bool = False) -> torch.Tensor:
     """Gradient Centralization (GC)
-    :param x: torch.Tensor. gradient.
-    :param gc_conv_only: bool. 'False' for both conv & fc layers.
+    :param x: torch.Tensor. gradient
+    :param gc_conv_only: bool. 'False' for both conv & fc layers
     :return: torch.Tensor. GC-ed gradient
     """
     size: int = x.dim()
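Gradient Centralization itself is a one-line idea: subtract from the gradient its mean taken over every dimension except the first, so the gradient of each output filter (or each row of an fc weight) sums to zero. A minimal sketch of the operation (illustrative, not the library function):

import torch

def centralize_gradient_sketch(grad: torch.Tensor, gc_conv_only: bool = False) -> torch.Tensor:
    size: int = grad.dim()
    # conv kernels have 4 dims (out, in, kh, kw); fc weights have 2 (out, in)
    apply_gc: bool = size > 3 if gc_conv_only else size > 1
    if apply_gc:
        # remove the per-filter mean, keeping dims so the result broadcasts back
        grad = grad - grad.mean(dim=tuple(range(1, size)), keepdim=True)
    return grad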

pytorch_optimizer/madgrad.py

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ def __init__(
     ):
         """A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic (slightly modified)
         :param params: PARAMETERS. iterable of parameters to optimize or dicts defining parameter groups
-        :param lr: float. learning rate.
+        :param lr: float. learning rate
         :param eps: float. term added to the denominator to improve numerical stability
         :param weight_decay: float. weight decay (L2 penalty)
             MADGRAD optimizer requires less weight decay than other methods, often as little as zero

pytorch_optimizer/ranger21.py

Lines changed: 10 additions & 10 deletions
@@ -1,12 +1,3 @@
-__AUTHORS__ = [
-    '@lessw2020',
-    '@NestorDemeure',
-    # with contributions from :
-    '@BrianPugh',
-    '@Kayuksel',
-    '@TheZothen',
-]
-
 import math
 from typing import Optional

@@ -19,6 +10,15 @@
 from pytorch_optimizer.types import BETAS, CLOSURE, DEFAULTS, LOSS, PARAMETERS, STATE
 from pytorch_optimizer.utils import normalize_gradient, unit_norm

+__AUTHORS__ = [
+    '@lessw2020',
+    '@NestorDemeure',
+    # with contributions from :
+    '@BrianPugh',
+    '@Kayuksel',
+    '@TheZothen',
+]
+

 class Ranger21(Optimizer):
     """
@@ -185,7 +185,7 @@ def step(self, closure: CLOSURE = None) -> LOSS:
         param_size: int = 0
         variance_ma_sum: float = 1.0

-        # Phase 1 - Accumulate all of the variance_ma_sum to use in stable weight decay
+        # Phase 1 - Accumulate all the variance_ma_sum to use in stable weight decay
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:

pytorch_optimizer/sam.py

Lines changed: 2 additions & 2 deletions
@@ -57,9 +57,9 @@ def __init__(
         adaptive: bool = False,
         **kwargs,
     ):
-        """
+        """SAM
         :param params: PARAMETERS. iterable of parameters to optimize or dicts defining parameter groups
-        :param base_optimizer: Optimizer.
+        :param base_optimizer: Optimizer. base optimizer
         :param rho: float. size of the neighborhood for computing the max loss
         :param adaptive: bool. element-wise Adaptive SAM
         :param kwargs: Dict. parameters for optimizer.
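SAM needs two forward/backward passes per update: `first_step` perturbs the weights toward the worst-case point inside the rho-neighborhood, and `second_step` applies the base optimizer's update using gradients computed at that perturbed point. The new `test_sam_optimizers` test below exercises exactly this pattern; a condensed sketch of a training loop with the wrapper, assuming a plain SGD base optimizer and toy data, looks like:

import torch
from torch import nn
from pytorch_optimizer import SAM

model = nn.Linear(2, 1)
loss_fn = nn.BCEWithLogitsLoss()
x_data = torch.randn(64, 2)
y_data = torch.randint(0, 2, (64, 1)).float()

# SAM wraps a base optimizer class; extra kwargs (here lr) are forwarded to it.
optimizer = SAM(model.parameters(), torch.optim.SGD, rho=0.05, lr=1e-1)

for _ in range(100):
    # pass 1: gradient at the current weights, then climb to the local worst case
    loss_fn(model(x_data), y_data).backward()
    optimizer.first_step(zero_grad=True)

    # pass 2: gradient at the perturbed weights drives the actual update
    loss_fn(model(x_data), y_data).backward()
    optimizer.second_step(zero_grad=True)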

tests/test_optimizers.py

Lines changed: 76 additions & 6 deletions
@@ -8,6 +8,7 @@

 from pytorch_optimizer import (
     MADGRAD,
+    SAM,
     SGDP,
     AdaBelief,
     AdaBound,
@@ -19,6 +20,7 @@
     RAdam,
     Ranger,
     Ranger21,
+    SafeFP16Optimizer,
 )

 __REFERENCE__ = 'https://github.com/jettify/pytorch-optimizer/blob/master/tests/test_optimizer_with_nn.py'
@@ -66,7 +68,7 @@ def build_lookahead(*parameters, **kwargs):
     return Lookahead(AdamP(*parameters, **kwargs))


-OPTIMIZERS: List[Tuple[Any, Dict[str, Union[float, bool, int]], int]] = [
+FP32_OPTIMIZERS: List[Tuple[Any, Dict[str, Union[float, bool, int]], int]] = [
     (build_lookahead, {'lr': 1e-2, 'weight_decay': 1e-3}, 200),
     (AdaBelief, {'lr': 1e-2, 'weight_decay': 1e-3}, 200),
     (AdaBound, {'lr': 1e-2, 'gamma': 0.1, 'weight_decay': 1e-3}, 200),
@@ -78,21 +80,34 @@ def build_lookahead(*parameters, **kwargs):
     (RAdam, {'lr': 1e-1, 'weight_decay': 1e-3}, 200),
     (SGDP, {'lr': 1e-1, 'weight_decay': 1e-3}, 200),
     (Ranger, {'lr': 1e-1, 'weight_decay': 1e-3}, 200),
-    (Ranger21, {'lr': 5e-1, 'weight_decay': 1e-3, 'num_iterations': 1000}, 500),
-    # (AdaHessian, {'lr': 1e-2, 'weight_decay': 1e-3}, 200),
+    (Ranger21, {'lr': 5e-1, 'weight_decay': 1e-3, 'num_iterations': 500}, 500),
 ]

+FP16_OPTIMIZERS: List[Tuple[Any, Dict[str, Union[float, bool, int]], int]] = [
+    (build_lookahead, {'lr': 5e-1, 'weight_decay': 1e-3}, 500),
+    (AdaBelief, {'lr': 5e-1, 'weight_decay': 1e-3}, 200),
+    (AdaBound, {'lr': 5e-1, 'gamma': 0.1, 'weight_decay': 1e-3}, 200),
+    (AdamP, {'lr': 5e-1, 'weight_decay': 1e-3}, 500),
+    (DiffGrad, {'lr': 15 - 1, 'weight_decay': 1e-3}, 500),
+    (DiffRGrad, {'lr': 1e-1, 'weight_decay': 1e-3}, 200),
+    (Lamb, {'lr': 1e-1, 'weight_decay': 1e-3}, 200),
+    (RAdam, {'lr': 1e-1, 'weight_decay': 1e-3}, 200),
+    (SGDP, {'lr': 5e-1, 'weight_decay': 1e-3}, 500),
+    (Ranger, {'lr': 5e-1, 'weight_decay': 1e-3}, 200),
+    (Ranger21, {'lr': 5e-1, 'weight_decay': 1e-3, 'num_iterations': 500}, 500),
+]

-@pytest.mark.parametrize('optimizer_config', OPTIMIZERS, ids=ids)
-def test_optimizers(optimizer_config):
+
+@pytest.mark.parametrize('optimizer_fp32_config', FP32_OPTIMIZERS, ids=ids)
+def test_f32_optimizers(optimizer_fp32_config):
     torch.manual_seed(42)

     x_data, y_data = make_dataset()

     model: nn.Module = LogisticRegression()
     loss_fn: nn.Module = nn.BCEWithLogitsLoss()

-    optimizer_class, config, iterations = optimizer_config
+    optimizer_class, config, iterations = optimizer_fp32_config
     optimizer = optimizer_class(model.parameters(), **config)

     loss: float = np.inf
@@ -111,3 +126,58 @@ def test_optimizers(optimizer_config):
         optimizer.step()

     assert init_loss > 2.0 * loss
+
+
+@pytest.mark.parametrize('optimizer_fp16_config', FP16_OPTIMIZERS, ids=ids)
+def test_f16_optimizers(optimizer_fp16_config):
+    torch.manual_seed(42)
+
+    x_data, y_data = make_dataset()
+
+    model: nn.Module = LogisticRegression()
+    loss_fn: nn.Module = nn.BCEWithLogitsLoss()
+
+    optimizer_class, config, iterations = optimizer_fp16_config
+    optimizer = SafeFP16Optimizer(optimizer_class(model.parameters(), **config))
+
+    loss: float = np.inf
+    init_loss: float = np.inf
+    for _ in range(1000):
+        optimizer.zero_grad()
+
+        y_pred = model(x_data)
+        loss = loss_fn(y_pred, y_data)
+
+        if init_loss == np.inf:
+            init_loss = loss
+
+        loss.backward()
+
+        optimizer.step()
+
+    assert init_loss - 0.01 > loss
+
+
+@pytest.mark.parametrize('optimizer_config', FP32_OPTIMIZERS, ids=ids)
+def test_sam_optimizers(optimizer_config):
+    torch.manual_seed(42)
+
+    x_data, y_data = make_dataset()
+
+    model: nn.Module = LogisticRegression()
+    loss_fn: nn.Module = nn.BCEWithLogitsLoss()
+
+    optimizer_class, config, iterations = optimizer_config
+    optimizer = SAM(model.parameters(), optimizer_class, **config)
+
+    loss: float = np.inf
+    init_loss: float = np.inf
+    for _ in range(iterations):
+        loss = loss_fn(y_data, model(x_data))
+        loss.backward()
+        optimizer.first_step(zero_grad=True)
+
+        loss_fn(y_data, model(x_data)).backward()
+        optimizer.second_step(zero_grad=True)
+
+    assert init_loss > 2.0 * loss
