Merge pull request #130 from kozistr/feature/sm3-optimizer

kozistr · web-flow · commit 19dcf2bdd03d · 2023-04-22T16:56:10.000+09:00
[Feature] Implement SM3 Optimizer
diff --git a/README.rst b/README.rst
@@ -138,6 +138,8 @@ You can check the supported optimizers & lr schedulers.
 +--------------+-------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------+
 | Ali-G        | *Adaptive Learning Rates for Interpolation with Gradients*                                      | `github <https://github.com/oval-group/ali-g>`__                                  | `https://arxiv.org/abs/1906.05661 <https://arxiv.org/abs/1906.05661>`__                       |
 +--------------+-------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------+
+| SM3          | *Memory-Efficient Adaptive Optimization*                                                        | `github <https://github.com/google-research/google-research/tree/master/sm3>`__   | `https://arxiv.org/abs/1901.11150 <https://arxiv.org/abs/1901.11150>`__                       |
++--------------+-------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------+
 
 Useful Resources
 ----------------
@@ -343,6 +345,8 @@ Citations
 
 `Ali-G <https://github.com/oval-group/ali-g#adaptive-learning-rates-for-interpolation-with-gradients>`__
 
+`SM3 <https://ui.adsabs.harvard.edu/abs/2019arXiv190111150A/exportcitation>`__
+
 Citation
 --------
 
diff --git a/docs/optimizer_api.rst b/docs/optimizer_api.rst
@@ -280,3 +280,11 @@ AliG
 
 .. autoclass:: pytorch_optimizer.AliG
     :members:
+
+.. _SM3:
+
+SM3
+---
+
+.. autoclass:: pytorch_optimizer.SM3
+    :members:
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pytorch_optimizer"
-version = "2.5.2"
+version = "2.6.0"
 description = "optimizer & lr scheduler implementations in PyTorch with clean-code, strict types. Also, including useful optimization ideas."
 license = "Apache-2.0"
 authors = ["kozistr <kozistr@gmail.com>"]
@@ -38,7 +38,7 @@ numpy = [
     { version = "*", python = ">=3.8" },
 ]
 torch = [
-    { version = ">=1.10,>=2.0", python = ">=3.8", source = "torch" },
+    { version = ">=1.10", python = ">=3.8", source = "torch" },
     { version = "^1.10", python = ">=3.7,<3.8", source = "torch" },
 ]
 
@@ -48,8 +48,8 @@ isort = [
     { version = "^5.12.0", python = ">=3.8"}
 ]
 black = "^23.3.0"
-ruff = "^0.0.260"
-pytest = "^7.2.2"
+ruff = "^0.0.262"
+pytest = "^7.3.1"
 pytest-cov = "^4.0.0"
 
 [[tool.poetry.source]]
diff --git a/pytorch_optimizer/__init__.py b/pytorch_optimizer/__init__.py
@@ -64,6 +64,7 @@
     merge_small_dims,
     power_iteration,
 )
+from pytorch_optimizer.optimizer.sm3 import SM3
 from pytorch_optimizer.optimizer.utils import (
     clip_grad_norm,
     disable_running_stats,
@@ -103,6 +104,7 @@
     NovoGrad,
     Lion,
     AliG,
+    SM3,
 ]
 OPTIMIZERS: Dict[str, OPTIMIZER] = {str(optimizer.__name__).lower(): optimizer for optimizer in OPTIMIZER_LIST}
 
diff --git a/pytorch_optimizer/optimizer/sm3.py b/pytorch_optimizer/optimizer/sm3.py
@@ -0,0 +1,158 @@
+import torch
+from torch.optim.optimizer import Optimizer
+
+from pytorch_optimizer.base.optimizer import BaseOptimizer
+from pytorch_optimizer.base.types import CLOSURE, DEFAULTS, LOSS, PARAMETERS
+
+
+class SM3(Optimizer, BaseOptimizer):
+    r"""Memory-Efficient Adaptive Optimization.
+
+        Reference : https://github.com/Enealor/PyTorch-SM3/blob/master/src/SM3/SM3.py
+
+    :param params: PARAMETERS. iterable of parameters to optimize or dicts defining parameter groups.
+    :param lr: float. learning rate.
+    :param momentum: float. coefficient used to scale prior updates before adding. This drastically increases
+        memory usage if `momentum > 0.0`. This is ignored if the parameter's gradient is sparse.
+    :param beta: float. coefficient used for exponential moving averages.
+    """
+
+    def __init__(
+        self,
+        params: PARAMETERS,
+        lr: float = 1e-1,
+        momentum: float = 0.0,
+        beta: float = 0.0,
+        eps: float = 1e-30,
+    ):
+        self.lr = lr
+        self.momentum = momentum
+        self.beta = beta
+        self.eps = eps
+
+        self.validate_parameters()
+
+        defaults: DEFAULTS = {'lr': lr, 'momentum': momentum, 'beta': beta}
+        super().__init__(params, defaults)
+
+    def validate_parameters(self):
+        self.validate_learning_rate(self.lr)
+        self.validate_momentum(self.momentum)
+        self.validate_beta(self.beta)
+        self.validate_epsilon(self.eps)
+
+    def __str__(self) -> str:
+        return 'SM3'
+
+    @torch.no_grad()
+    def reset(self):
+        for group in self.param_groups:
+            for p in group['params']:
+                state = self.state[p]
+
+                state['step'] = 0
+                state['momentum_buffer'] = torch.zeros_like(p)
+
+    @staticmethod
+    def max_reduce_except_dim(x: torch.Tensor, dim: int) -> torch.Tensor:
+        r"""Perform reduce-max along all dimensions except the given dim."""
+        rank: int = len(x.shape)
+        if rank == 0:
+            return x
+
+        if dim >= rank:
+            raise ValueError(f'[-] given dim is bigger than rank. {dim} >= {rank}')
+
+        for d in range(rank):
+            if d != dim:
+                x = x.max(dim=d, keepdim=True).values
+        return x
+
+    @staticmethod
+    def make_sparse(grad: torch.Tensor, values: torch.Tensor) -> torch.Tensor:
+        if grad._indices().dim() == 0 or values.dim() == 0:
+            return grad.new().resize_as_(grad)
+        return grad.new(grad._indices(), values, grad.size())
+
+    @torch.no_grad()
+    def step(self, closure: CLOSURE = None) -> LOSS:
+        loss: LOSS = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+
+        for group in self.param_groups:
+            momentum, beta = group['momentum'], group['beta']
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+
+                grad = p.grad
+
+                shape = grad.shape
+                rank: int = len(shape)
+
+                state = self.state[p]
+                if len(state) == 0:
+                    state['step'] = 0
+                    state['momentum_buffer'] = torch.zeros_like(p)
+
+                    if grad.is_sparse:
+                        state['accumulator_0'] = torch.zeros(shape[0])
+                    elif rank == 0:
+                        state['accumulator_0'] = torch.zeros(shape)
+                    else:
+                        for i in range(rank):
+                            state[f'accumulator_{i}'] = torch.zeros([1] * i + [shape[i]] + [1] * (rank - 1 - i))
+
+                state['step'] += 1
+
+                if grad.is_sparse:
+                    grad = grad.coalesce()
+
+                    acc = state['accumulator_0']
+                    update_values = torch.gather(acc, 0, grad._indices()[0])
+                    if beta > 0.0:
+                        update_values.mul_(beta)
+                    update_values.addcmul_(grad._values(), grad._values(), value=1.0 - beta)
+
+                    nu_max = self.max_reduce_except_dim(
+                        x=self.make_sparse(grad, update_values).to_dense(),
+                        dim=0,
+                    ).squeeze_()
+
+                    if beta > 0.0:
+                        torch.max(acc, nu_max, out=acc)
+                    else:
+                        acc.copy_(nu_max)
+
+                    update_values.add_(self.eps).rsqrt_().mul_(grad._values())
+
+                    update = self.make_sparse(grad, update_values)
+                else:
+                    update = state['accumulator_0'].clone()
+                    for i in range(1, rank):
+                        update = torch.min(update, state[f'accumulator_{i}'])
+
+                    if beta > 0.0:
+                        update.mul_(beta)
+                    update.addcmul_(grad, grad, value=1.0 - beta)
+
+                    for i in range(rank):
+                        acc = state[f'accumulator_{i}']
+                        nu_max = self.max_reduce_except_dim(update, i)
+                        if beta > 0.0:
+                            torch.max(acc, nu_max, out=acc)
+                        else:
+                            acc.copy_(nu_max)
+
+                    update.add_(self.eps).rsqrt_().mul_(grad)
+
+                    if momentum > 0.0:
+                        m = state['momentum_buffer']
+                        m.mul_(momentum).add_(update, alpha=1.0 - momentum)
+                        update = m
+
+                p.add_(update, alpha=-group['lr'])
+
+        return loss
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -1,30 +1,29 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 
-attrs==22.2.0 ; python_full_version >= "3.7.2" and python_full_version < "4.0.0"
 black==23.3.0 ; python_full_version >= "3.7.2" and python_full_version < "4.0.0"
 click==8.1.3 ; python_full_version >= "3.7.2" and python_full_version < "4.0.0"
 colorama==0.4.6 ; python_full_version >= "3.7.2" and python_full_version < "4.0.0" and sys_platform == "win32" or python_full_version >= "3.7.2" and python_full_version < "4.0.0" and platform_system == "Windows"
-coverage[toml]==7.2.2 ; python_full_version >= "3.7.2" and python_full_version < "4.0.0"
+coverage[toml]==7.2.3 ; python_full_version >= "3.7.2" and python_full_version < "4.0.0"
 exceptiongroup==1.1.1 ; python_full_version >= "3.7.2" and python_version < "3.11"
-filelock==3.10.7 ; python_version >= "3.8" and python_full_version < "4.0.0"
-importlib-metadata==6.1.0 ; python_full_version >= "3.7.2" and python_version < "3.8"
+filelock==3.12.0 ; python_version >= "3.8" and python_full_version < "4.0.0"
+importlib-metadata==6.5.1 ; python_full_version >= "3.7.2" and python_version < "3.8"
 iniconfig==2.0.0 ; python_full_version >= "3.7.2" and python_full_version < "4.0.0"
 isort==5.11.5 ; python_full_version >= "3.7.2" and python_version < "3.8"
 isort==5.12.0 ; python_version >= "3.8" and python_full_version < "4.0.0"
 jinja2==3.1.2 ; python_version >= "3.8" and python_full_version < "4.0.0"
 markupsafe==2.1.2 ; python_version >= "3.8" and python_full_version < "4.0.0"
 mpmath==1.3.0 ; python_version >= "3.8" and python_full_version < "4.0.0"
 mypy-extensions==1.0.0 ; python_full_version >= "3.7.2" and python_full_version < "4.0.0"
-networkx==3.0 ; python_version >= "3.8" and python_full_version < "4.0.0"
+networkx==3.1 ; python_version >= "3.8" and python_full_version < "4.0.0"
 numpy==1.21.1 ; python_full_version >= "3.7.2" and python_version < "3.8"
 numpy==1.24.2 ; python_version >= "3.8" and python_full_version < "4.0.0"
-packaging==23.0 ; python_full_version >= "3.7.2" and python_full_version < "4.0.0"
+packaging==23.1 ; python_full_version >= "3.7.2" and python_full_version < "4.0.0"
 pathspec==0.11.1 ; python_full_version >= "3.7.2" and python_full_version < "4.0.0"
 platformdirs==3.2.0 ; python_full_version >= "3.7.2" and python_full_version < "4.0.0"
 pluggy==1.0.0 ; python_full_version >= "3.7.2" and python_full_version < "4.0.0"
 pytest-cov==4.0.0 ; python_full_version >= "3.7.2" and python_full_version < "4.0.0"
-pytest==7.2.2 ; python_full_version >= "3.7.2" and python_full_version < "4.0.0"
-ruff==0.0.260 ; python_full_version >= "3.7.2" and python_full_version < "4.0.0"
+pytest==7.3.1 ; python_full_version >= "3.7.2" and python_full_version < "4.0.0"
+ruff==0.0.262 ; python_full_version >= "3.7.2" and python_full_version < "4.0.0"
 sympy==1.11.1 ; python_version >= "3.8" and python_full_version < "4.0.0"
 tomli==2.0.1 ; python_full_version >= "3.7.2" and python_full_version <= "3.11.0a6"
 torch==1.13.1+cpu ; python_full_version >= "3.7.2" and python_version < "3.8"
diff --git a/requirements.txt b/requirements.txt
@@ -1,10 +1,10 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 
-filelock==3.10.7 ; python_version >= "3.8" and python_full_version < "4.0.0"
+filelock==3.12.0 ; python_version >= "3.8" and python_full_version < "4.0.0"
 jinja2==3.1.2 ; python_version >= "3.8" and python_full_version < "4.0.0"
 markupsafe==2.1.2 ; python_version >= "3.8" and python_full_version < "4.0.0"
 mpmath==1.3.0 ; python_version >= "3.8" and python_full_version < "4.0.0"
-networkx==3.0 ; python_version >= "3.8" and python_full_version < "4.0.0"
+networkx==3.1 ; python_version >= "3.8" and python_full_version < "4.0.0"
 numpy==1.21.1 ; python_full_version >= "3.7.2" and python_version < "3.8"
 numpy==1.24.2 ; python_version >= "3.8" and python_full_version < "4.0.0"
 sympy==1.11.1 ; python_version >= "3.8" and python_full_version < "4.0.0"
diff --git a/tests/constants.py b/tests/constants.py
@@ -6,6 +6,7 @@
     OPTIMIZERS,
     PNM,
     SGDP,
+    SM3,
     AdaBelief,
     AdaBound,
     AdaFactor,
@@ -47,7 +48,7 @@
     'lookahead',
 ]
 
-SPARSE_OPTIMIZERS: List[str] = ['madgrad', 'dadaptadagrad']
+SPARSE_OPTIMIZERS: List[str] = ['madgrad', 'dadaptadagrad', 'sm3']
 NO_SPARSE_OPTIMIZERS: List[str] = [
     optimizer for optimizer in VALID_OPTIMIZER_NAMES if optimizer not in SPARSE_OPTIMIZERS
 ]
@@ -300,6 +301,7 @@
     (Lion, {'lr': 5e-1, 'weight_decay': 1e-3, 'weight_decouple': False}, 10),
     (AliG, {'max_lr': 5e-1, 'momentum': 0.9}, 10),
     (AliG, {'max_lr': 5e-1, 'momentum': 0.9, 'adjusted_momentum': True}, 10),
+    (SM3, {'lr': 5e-1, 'momentum': 0.9, 'beta': 0.9}, 10),
 ]
 ADAMD_SUPPORTED_OPTIMIZERS: List[Tuple[Any, Dict[str, Union[float, bool, int]], int]] = [
     (build_lookahead, {'lr': 5e-1, 'weight_decay': 1e-3, 'adamd_debias_term': True}, 10),
diff --git a/tests/test_gradients.py b/tests/test_gradients.py
@@ -49,8 +49,12 @@ def test_sparse(sparse_optimizer):
 
     weight, weight_sparse = simple_sparse_parameter()
 
-    opt_dense = opt([weight], lr=1e-3, momentum=0.0)
-    opt_sparse = opt([weight_sparse], lr=1e-3, momentum=0.0)
+    params = {'lr': 1e-3, 'momentum': 0.0}
+    if sparse_optimizer == 'sm3':
+        params.update({'beta': 0.9})
+
+    opt_dense = opt([weight], **params)
+    opt_sparse = opt([weight_sparse], **params)
 
     opt_dense.step()
     opt_sparse.step()
@@ -89,13 +93,14 @@ def test_sparse_supported(sparse_optimizer):
         with pytest.raises(NoSparseGradientError):
             optimizer.step()
 
-    optimizer = opt([simple_sparse_parameter()[1]], momentum=0.9, weight_decay=1e-3)
-    optimizer.reset()
-    if sparse_optimizer == 'madgrad':
-        with pytest.raises(NoSparseGradientError):
+    if sparse_optimizer in ('madgrad', 'dadapt'):
+        optimizer = opt([simple_sparse_parameter()[1]], momentum=0.9, weight_decay=1e-3)
+        optimizer.reset()
+        if sparse_optimizer == 'madgrad':
+            with pytest.raises(NoSparseGradientError):
+                optimizer.step()
+        else:
             optimizer.step()
-    else:
-        optimizer.step()
 
 
 @pytest.mark.parametrize('optimizer_name', VALID_OPTIMIZER_NAMES)
diff --git a/tests/test_load_optimizers.py b/tests/test_load_optimizers.py
@@ -16,4 +16,4 @@ def test_load_optimizers_invalid(invalid_optimizer_names):
 
 
 def test_get_supported_optimizers():
-    assert len(get_supported_optimizers()) == 29
+    assert len(get_supported_optimizers()) == 30
diff --git a/tests/test_optimizer_parameters.py b/tests/test_optimizer_parameters.py
@@ -66,7 +66,7 @@ def test_adafactor_epsilon():
 
 @pytest.mark.parametrize('optimizer_name', VALID_OPTIMIZER_NAMES)
 def test_weight_decay(optimizer_name):
-    if optimizer_name in ('nero', 'alig'):
+    if optimizer_name in ('nero', 'alig', 'sm3'):
         pytest.skip(f'skip {optimizer_name} optimizer')
 
     optimizer = load_optimizer(optimizer_name)
@@ -111,7 +111,7 @@ def test_trust_coefficient(optimizer_name):
         optimizer(None, trust_coefficient=-1e-3)
 
 
-@pytest.mark.parametrize('optimizer_name', ['madgrad', 'lars'])
+@pytest.mark.parametrize('optimizer_name', ['madgrad', 'lars', 'sm3'])
 def test_momentum(optimizer_name):
     optimizer = load_optimizer(optimizer_name)
     with pytest.raises(ValueError):
@@ -132,7 +132,7 @@ def test_beta0(optimizer_name):
         optimizer(None, num_iterations=200, beta0=-0.1)
 
 
-@pytest.mark.parametrize('optimizer_name', ['nero', 'apollo'])
+@pytest.mark.parametrize('optimizer_name', ['nero', 'apollo', 'sm3'])
 def test_beta(optimizer_name):
     optimizer = load_optimizer(optimizer_name)
     with pytest.raises(ValueError):
diff --git a/tests/test_optimizers.py b/tests/test_optimizers.py
diff --git a/tests/utils.py b/tests/utils.py

Original file line number	Diff line number	Diff line change
`@@ -64,6 +64,7 @@`
`64`	`64`	`merge_small_dims,`
`65`	`65`	`power_iteration,`
`66`	`66`	`)`
	`67`	`+from pytorch_optimizer.optimizer.sm3 import SM3`
`67`	`68`	`from pytorch_optimizer.optimizer.utils import (`
`68`	`69`	`clip_grad_norm,`
`69`	`70`	`disable_running_stats,`
`@@ -103,6 +104,7 @@`
`103`	`104`	`NovoGrad,`
`104`	`105`	`Lion,`
`105`	`106`	`AliG,`
	`107`	`+ SM3,`
`106`	`108`	`]`
`107`	`109`	`OPTIMIZERS: Dict[str, OPTIMIZER] = {str(optimizer.__name__).lower(): optimizer for optimizer in OPTIMIZER_LIST}`
`108`	`110`