
Commit 25618d7

Merge pull request #203 from kozistr/feature/dadaptlion-optimizer
[Feature] Implement DAdaptLion optimizer
2 parents 0a23375 + a20c12c commit 25618d7

File tree

13 files changed (+200, -27 lines)


README.rst

Lines changed: 9 additions & 9 deletions
@@ -16,7 +16,7 @@ pytorch-optimizer
 | **pytorch-optimizer** is optimizer & lr scheduler collections in PyTorch.
 | I just re-implemented (speed & memory tweaks, plug-ins) the algorithm while based on the original paper. Also, It includes useful and practical optimization ideas.
-| Currently, 59 optimizers, 10 lr schedulers, and 13 loss functions are supported!
+| Currently, **60 optimizers**, **10 lr schedulers**, and **13 loss functions** are supported!
 |
 | Highly inspired by `pytorch-optimizer <https://github.com/jettify/pytorch-optimizer>`__.

@@ -31,20 +31,20 @@ So, please double-check the license before using it at your work.
 Installation
 ~~~~~~~~~~~~

-::
+.. code-block:: bash

     $ pip3 install -U pytorch-optimizer

 If there's a version issue when installing the package, try with `--no-deps` option.

-::
+.. code-block:: bash

     $ pip3 install -U --no-deps pytorch-optimizer

 Simple Usage
 ~~~~~~~~~~~~

-::
+.. code-block:: python

     from pytorch_optimizer import AdamP

@@ -61,7 +61,7 @@ Simple Usage
 Also, you can load the optimizer via `torch.hub`

-::
+.. code-block:: python

     import torch

@@ -71,7 +71,7 @@ Also, you can load the optimizer via `torch.hub`
 If you want to build the optimizer with parameters & configs, there's `create_optimizer()` API.

-::
+.. code-block:: python

     from pytorch_optimizer import create_optimizer

@@ -89,7 +89,7 @@ Supported Optimizers
 You can check the supported optimizers with below code.

-::
+.. code-block:: python

     from pytorch_optimizer import get_supported_optimizers

@@ -230,7 +230,7 @@ Supported LR Scheduler
 You can check the supported learning rate schedulers with below code.

-::
+.. code-block:: python

     from pytorch_optimizer import get_supported_lr_schedulers

@@ -249,7 +249,7 @@ Supported Loss Function
 You can check the supported loss functions with below code.

-::
+.. code-block:: python

     from pytorch_optimizer import get_supported_loss_functions
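
The snippets touched by this directive change are the README's standard usage examples. As a point of reference, a minimal sketch of the simple-usage pattern they document (the toy model is illustrative; AdamP is one of the re-exported optimizers):

    import torch
    from torch import nn

    from pytorch_optimizer import AdamP

    model = nn.Linear(10, 2)  # any nn.Module works here
    optimizer = AdamP(model.parameters(), lr=1e-3, weight_decay=1e-2)

    loss = model(torch.randn(4, 10)).pow(2).mean()
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()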

docs/changelogs/v2.11.2.md

Lines changed: 15 additions & 0 deletions
@@ -1,9 +1,24 @@
 ## Change Log

+### Feature
+
+* Implement DAdaptLion optimizer (#203)
+    * [Lion with D-Adaptation](https://github.com/facebookresearch/dadaptation/blob/main/dadaptation/dadapt_lion.py)
+
 ### Fix

 * Fix Lookahead optimizer (#200, #201, #202)
     * When using PyTorch Lightning which expects your optimiser to be a subclass of `Optimizer`.
+* Fix default `rectify` to `False` in `AdaBelief` optimizer (#203)
+
+### Test
+
+* Add `DynamicLossScaler` test case
+
+### Docs
+
+* Highlight the code blocks
+* Fix pepy badges

 ### Contributions
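
For users, the headline entry is that `DAdaptLion` becomes importable from the package root. A minimal sketch of picking it up (defaults taken from the new class added in `dadapt.py` below; the model is illustrative):

    from torch import nn

    from pytorch_optimizer import DAdaptLion

    model = nn.Linear(10, 2)

    # lr is normally left at 1.0; D-Adaptation grows the step size from d0
    optimizer = DAdaptLion(model.parameters(), lr=1.0, d0=1e-6, weight_decay=1e-3)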

docs/optimizer_api.rst

Lines changed: 8 additions & 0 deletions
@@ -225,6 +225,14 @@ DAdaptAdan
 .. autoclass:: pytorch_optimizer.DAdaptAdan
     :members:

+.. _DAdaptLion:
+
+DAdaptLion
+----------
+
+.. autoclass:: pytorch_optimizer.DAdaptLion
+    :members:
+
 .. _AdamS:

 AdamS

pyproject.toml

Lines changed: 5 additions & 6 deletions
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pytorch_optimizer"
-version = "2.11.1"
+version = "2.11.2"
 description = "optimizer & lr scheduler & objective function collections in PyTorch"
 license = "Apache-2.0"
 authors = ["kozistr <[email protected]>"]

@@ -13,10 +13,10 @@ keywords = [
     "pytorch", "deep-learning", "optimizer", "lr scheduler", "A2Grad", "ASGD", "AccSGD", "AdaBelief", "AdaBound",
     "AdaDelta", "AdaFactor", "AdaMax", "AdaMod", "AdaNorm", "AdaPNM", "AdaSmooth", "AdaHessian", "Adai", "AdamP",
     "AdamS", "Adan", "AggMo", "AliG", "Amos", "Apollo", "AvaGrad", "CAME", "DAdaptAdaGrad", "DAdaptAdam", "DAdaptAdan",
-    "DAdaptSGD", "DiffGrad", "Fromage", "Gravity", "GSAM", "LARS", "Lamb", "Lion", "LOMO", "Lookahead", "MADGRAD",
-    "MSVAG", "Nero", "NovoGrad", "PAdam", "PCGrad", "PID", "PNM", "Prodigy", "QHAdam", "QHM", "RAdam", "Ranger",
-    "Ranger21", "RotoGrad", "SAM", "SGDP", "Shampoo", "ScalableShampoo", "SGDW", "SignSGD", "SM3", "SopihaH", "SRMM",
-    "SWATS", "Tiger", "Yogi", "BCE", "BCEFocal", "Focal", "FocalCosine", "SoftF1", "Dice", "LDAM", "Jaccard",
+    "DAdaptSGD", "DAdaptLion", "DiffGrad", "Fromage", "Gravity", "GSAM", "LARS", "Lamb", "Lion", "LOMO", "Lookahead",
+    "MADGRAD", "MSVAG", "Nero", "NovoGrad", "PAdam", "PCGrad", "PID", "PNM", "Prodigy", "QHAdam", "QHM", "RAdam",
+    "Ranger", "Ranger21", "RotoGrad", "SAM", "SGDP", "Shampoo", "ScalableShampoo", "SGDW", "SignSGD", "SM3", "SopihaH",
+    "SRMM", "SWATS", "Tiger", "Yogi", "BCE", "BCEFocal", "Focal", "FocalCosine", "SoftF1", "Dice", "LDAM", "Jaccard",
     "Bi-Tempered", "Tversky", "FocalTversky", "LovaszHinge",
 ]
 classifiers = [

@@ -122,7 +122,6 @@ testpaths = "tests"
 [tool.coverage.run]
 omit = [
     "./pytorch_optimizer/optimizer/gsam.py",
-    "./pytorch_optimizer/optimizer/fp16.py",
     "./pytorch_optimizer/optimizer/rotograd.py",
 ]

pytorch_optimizer/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -48,7 +48,7 @@
 from pytorch_optimizer.optimizer.apollo import Apollo
 from pytorch_optimizer.optimizer.avagrad import AvaGrad
 from pytorch_optimizer.optimizer.came import CAME
-from pytorch_optimizer.optimizer.dadapt import DAdaptAdaGrad, DAdaptAdam, DAdaptAdan, DAdaptSGD
+from pytorch_optimizer.optimizer.dadapt import DAdaptAdaGrad, DAdaptAdam, DAdaptAdan, DAdaptLion, DAdaptSGD
 from pytorch_optimizer.optimizer.diffgrad import DiffGrad
 from pytorch_optimizer.optimizer.fp16 import DynamicLossScaler, SafeFP16Optimizer
 from pytorch_optimizer.optimizer.fromage import Fromage

@@ -171,6 +171,7 @@
     LOMO,
     Tiger,
     CAME,
+    DAdaptLion,
 ]
 OPTIMIZERS: Dict[str, OPTIMIZER] = {str(optimizer.__name__).lower(): optimizer for optimizer in OPTIMIZER_LIST}
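
Because the class is appended to `OPTIMIZER_LIST`, the `OPTIMIZERS` mapping built from it also registers the new entry under its lowercased name. A quick sanity-check sketch (it assumes `OPTIMIZERS` is imported directly from the package root, where it is defined):

    from pytorch_optimizer import OPTIMIZERS, DAdaptLion

    # keys come from str(optimizer.__name__).lower(), per the comprehension above
    assert OPTIMIZERS['dadaptlion'] is DAdaptLion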

pytorch_optimizer/optimizer/adabelief.py

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ def __init__(
         weight_decay: float = 0.0,
         weight_decouple: bool = True,
         fixed_decay: bool = False,
-        rectify: bool = True,
+        rectify: bool = False,
         n_sma_threshold: int = 5,
         degenerated_to_sgd: bool = True,
         ams_bound: bool = False,
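
Since the default flips from `True` to `False`, code that relied on the rectified (RAdam-style) update now has to opt in explicitly. A hedged sketch of keeping the previous behaviour (the model and learning rate are illustrative):

    from torch import nn

    from pytorch_optimizer import AdaBelief

    model = nn.Linear(10, 2)

    # rectify defaulted to True before v2.11.2; pass it explicitly to keep that behaviour
    optimizer = AdaBelief(model.parameters(), lr=1e-3, rectify=True)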

pytorch_optimizer/optimizer/dadapt.py

Lines changed: 129 additions & 0 deletions
@@ -699,3 +699,132 @@ def step(self, closure: CLOSURE = None) -> LOSS:
             group['k'] += 1

         return loss
+
+
+class DAdaptLion(Optimizer, BaseOptimizer):
+    r"""Lion with D-Adaptation. Leave LR set to 1 unless you encounter instability. This implementation is based on V3.
+
+    :param params: PARAMETERS. iterable of parameters to optimize or dicts defining parameter groups.
+    :param lr: float. learning rate.
+    :param betas: BETAS. coefficients used for computing running averages of gradient and the squared hessian trace.
+    :param d0: float. initial D estimate for D-adaptation (default 1e-6). Rarely needs changing.
+    :param weight_decay: float. weight decay (L2 penalty).
+    :param weight_decouple: bool. the optimizer uses decoupled weight decay as in AdamW.
+    :param fixed_decay: bool. fix weight decay.
+    """
+
+    def __init__(
+        self,
+        params: PARAMETERS,
+        lr: float = 1.0,
+        betas: BETAS = (0.9, 0.999),
+        d0: float = 1e-6,
+        weight_decay: float = 0.0,
+        weight_decouple: bool = False,
+        fixed_decay: bool = False,
+    ):
+        self.validate_learning_rate(lr)
+        self.validate_betas(betas)
+        self.validate_non_negative(weight_decay, 'weight_decay')
+
+        defaults: DEFAULTS = {
+            'lr': lr,
+            'betas': betas,
+            'd': d0,
+            'weight_decay': weight_decay,
+            'weight_decouple': weight_decouple,
+            'fixed_decay': fixed_decay,
+            'step': 0,
+        }
+        super().__init__(params, defaults)
+
+    def __str__(self) -> str:
+        return 'DAdaptLion'
+
+    @torch.no_grad()
+    def reset(self):
+        for group in self.param_groups:
+            group['step'] = 0
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+
+                state = self.state[p]
+
+                state['exp_avg'] = torch.zeros_like(p)
+                state['s'] = torch.zeros_like(p)
+
+    @torch.no_grad()
+    def step(self, closure: CLOSURE = None) -> LOSS:
+        loss: LOSS = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+
+        group = self.param_groups[0]
+        device = group['params'][0].device
+
+        if 'numerator_weighted' not in group:
+            group['numerator_weighted'] = torch.tensor([0.0], device=device)
+        numerator_weighted = group['numerator_weighted']
+
+        sk_l1 = torch.tensor([0.0], device=device)
+        numerator_accumulator = torch.tensor([0.0], device=device)
+
+        beta1, beta2 = group['betas']
+        beta2_sq = math.sqrt(beta2)
+
+        d, lr = group['d'], group['lr']
+        d_lr: float = d * lr
+
+        for group in self.param_groups:
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+
+                grad = p.grad
+                if grad.is_sparse:
+                    raise NoSparseGradientError(str(self))
+
+                state = self.state[p]
+                if len(state) == 0:
+                    state['exp_avg'] = torch.zeros_like(p)
+                    state['s'] = torch.zeros_like(p)
+
+                self.apply_weight_decay(
+                    p=p,
+                    grad=grad,
+                    lr=d_lr,
+                    weight_decay=group['weight_decay'],
+                    weight_decouple=group['weight_decouple'],
+                    fixed_decay=group['fixed_decay'],
+                )
+
+                exp_avg, s = state['exp_avg'], state['s']
+
+                update = exp_avg.clone().mul_(beta1).add_(grad, alpha=1.0 - beta1).sign_()
+                p.add_(update, alpha=-d_lr)
+
+                exp_avg.mul_(beta2).add_(grad, alpha=(1.0 - beta2) * d_lr)
+
+                numerator_accumulator.add_(torch.dot(update.flatten(), s.flatten()), alpha=d_lr)
+                s.mul_(beta2_sq).add_(update, alpha=(1.0 - beta2_sq) * d_lr)
+
+                sk_l1.add_(s.abs().sum())
+
+        numerator_weighted.mul_(beta2_sq).add_(numerator_accumulator, alpha=1.0 - beta2_sq)
+
+        if sk_l1 == 0:
+            return loss
+
+        if lr > 0.0:
+            d_hat: float = (numerator_weighted / ((1.0 - beta2_sq) * sk_l1)).item()
+            d = max(d, d_hat)
+
+        for group in self.param_groups:
+            group['step'] += 1
+
+            group['numerator_weighted'] = numerator_weighted
+            group['d'] = d
+
+        return loss
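
A short usage sketch against the implementation above: the effective step is `d * lr`, and the adapted estimate is stored in each param group's `'d'` entry (the toy model and training loop are illustrative):

    import torch
    from torch import nn

    from pytorch_optimizer import DAdaptLion

    model = nn.Linear(10, 2)
    optimizer = DAdaptLion(model.parameters(), lr=1.0, weight_decay=1e-3)

    for _ in range(10):
        loss = model(torch.randn(8, 10)).pow(2).mean()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # d starts at d0 (1e-6) and is ratcheted upward via d_hat once sk_l1 is non-zero
    print(optimizer.param_groups[0]['d'])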

pytorch_optimizer/optimizer/fp16.py

Lines changed: 1 addition & 1 deletion
@@ -90,7 +90,7 @@ def decrease_loss_scale(self):
         self.loss_scale = max(self.loss_scale, self.threshold)


-class SafeFP16Optimizer(Optimizer):
+class SafeFP16Optimizer(Optimizer):  # pragma: no cover
    r"""Safe FP16 Optimizer.

    :param optimizer: OPTIMIZER.

tests/constants.py

Lines changed: 8 additions & 5 deletions
@@ -40,6 +40,7 @@
     DAdaptAdaGrad,
     DAdaptAdam,
     DAdaptAdan,
+    DAdaptLion,
     DAdaptSGD,
     DiffGrad,
     Fromage,

@@ -96,6 +97,7 @@
     'scalableshampoo',
     'dadaptadam',
     'dadaptadan',
+    'dadaptlion',
     'adams',
     'adafactor',
     'novograd',

@@ -127,11 +129,11 @@

 OPTIMIZERS: List[Tuple[Any, Dict[str, Union[float, bool, int]], int]] = [
     (build_lookahead, {'lr': 5e-1, 'weight_decay': 1e-3}, 5),
-    (AdaBelief, {'lr': 5e-1, 'weight_decay': 1e-3}, 10),
-    (AdaBelief, {'lr': 5e-1, 'weight_decay': 1e-3, 'ams_bound': True}, 10),
-    (AdaBelief, {'lr': 5e-1, 'weight_decay': 1e-3, 'weight_decouple': False}, 10),
-    (AdaBelief, {'lr': 5e-1, 'weight_decay': 1e-3, 'fixed_decay': True}, 10),
-    (AdaBelief, {'lr': 5e-1, 'weight_decay': 1e-3, 'rectify': False}, 10),
+    (AdaBelief, {'lr': 5e-1, 'weight_decay': 1e-3}, 5),
+    (AdaBelief, {'lr': 5e-1, 'weight_decay': 1e-3, 'ams_bound': True}, 5),
+    (AdaBelief, {'lr': 5e-1, 'weight_decay': 1e-3, 'weight_decouple': False}, 5),
+    (AdaBelief, {'lr': 5e-1, 'weight_decay': 1e-3, 'fixed_decay': True}, 5),
+    (AdaBelief, {'lr': 5e-1, 'weight_decay': 1e-3, 'rectify': True}, 10),
     (AdaBound, {'lr': 1e0, 'gamma': 0.1, 'weight_decay': 1e-3}, 20),
     (AdaBound, {'lr': 1e0, 'gamma': 0.1, 'weight_decay': 1e-3, 'fixed_decay': True}, 20),
     (AdaBound, {'lr': 1e0, 'gamma': 0.1, 'weight_decay': 1e-3, 'weight_decouple': False}, 20),

@@ -329,6 +331,7 @@
     (DAdaptSGD, {'lr': 2e0, 'weight_decay': 1e-3}, 25),
     (DAdaptAdan, {'lr': 2e0, 'weight_decay': 1e-3}, 20),
     (DAdaptAdan, {'lr': 2e0, 'weight_decay': 1e-3, 'weight_decouple': True}, 20),
+    (DAdaptLion, {'lr': 3e0, 'weight_decay': 1e-3}, 20),
     (AdamS, {'lr': 1e0, 'weight_decay': 1e-3}, 10),
     (AdamS, {'lr': 1e0, 'weight_decay': 1e-3, 'ams_bound': True}, 20),
     (AdaFactor, {'lr': 7.5e-1, 'weight_decay': 1e-3, 'scale_parameter': False}, 100),
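
Each tuple is `(optimizer, constructor kwargs, number of steps)`. A hedged sketch of how such a table is typically consumed in a parametrized test; the cases, fixture, and toy objective below are illustrative, not the repository's actual test code:

    import pytest
    import torch
    from torch import nn

    from pytorch_optimizer import AdaBelief, DAdaptLion

    # mirrors the shape of the entries above: (class, kwargs, iterations)
    CASES = [
        (AdaBelief, {'lr': 5e-1, 'weight_decay': 1e-3}, 5),
        (DAdaptLion, {'lr': 3e0, 'weight_decay': 1e-3}, 20),
    ]


    @pytest.mark.parametrize('optimizer_cls,kwargs,num_iterations', CASES)
    def test_optimizer_step(optimizer_cls, kwargs, num_iterations):
        weight = nn.Parameter(torch.ones(2))  # toy quadratic objective
        optimizer = optimizer_cls([weight], **kwargs)

        for _ in range(num_iterations):
            loss = (weight ** 2).sum()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()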

tests/test_general_optimizer_parameters.py

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@ def test_epsilon(optimizer_name):
     'shampoo',
     'scalableshampoo',
     'dadaptsgd',
+    'dadaptlion',
     'adafactor',
     'lion',
     'a2grad',

0 commit comments
