
Commit 944a353

Merge pull request #181 from kozistr/fix/chebyshev-scheduler
[Fix] Chebyshev LR Scheduler
2 parents c56b36a + 5622f95 commit 944a353

7 files changed (+114 −23 lines)


README.rst

Lines changed: 22 additions & 8 deletions
@@ -87,14 +87,13 @@ If you want to build the optimizer with parameters & configs, there's `create_op
 Supported Optimizers
 --------------------
 
-You can check the supported optimizers & lr schedulers.
+You can check the supported optimizers with below code.
 
 ::
 
-    from pytorch_optimizer import get_supported_optimizers, get_supported_lr_schedulers
+    from pytorch_optimizer import get_supported_optimizers
 
     supported_optimizers = get_supported_optimizers()
-    supported_lr_schedulers = get_supported_lr_schedulers()
 
 +--------------+---------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+
 | Optimizer | Description | Official Code | Paper | Citation |
@@ -201,14 +200,10 @@ You can check the supported optimizers & lr schedulers.
 +--------------+---------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+
 | Softplus T | *Calibrating the Adaptive Learning Rate to Improve Convergence of ADAM* | | `https://arxiv.org/abs/1908.00700 <https://arxiv.org/abs/1908.00700>`__ | `cite <https://ui.adsabs.harvard.edu/abs/2019arXiv190800700T/exportcitation>`__ |
 +--------------+---------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+
-| EE LRS | *Wide-minima Density Hypothesis and the Explore-Exploit Learning Rate Schedule* | | `https://arxiv.org/abs/2003.03977 <https://arxiv.org/abs/2003.03977>`__ | `cite <https://ui.adsabs.harvard.edu/abs/2020arXiv200303977I/exportcitation>`__ |
+| Un-tuned w/u | *On the adequacy of untuned warmup for adaptive optimization* | | `https://arxiv.org/abs/1910.04209 <https://arxiv.org/abs/1910.04209>`__ | `cite <https://ui.adsabs.harvard.edu/abs/2019arXiv191004209M/exportcitation>`__ |
 +--------------+---------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+
 | Norm Loss | *An efficient yet effective regularization method for deep neural networks* | | `https://arxiv.org/abs/2103.06583 <https://arxiv.org/abs/2103.06583>`__ | `cite <https://ui.adsabs.harvard.edu/abs/2021arXiv210306583G/exportcitation>`__ |
 +--------------+---------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+
-| Chebyshev LR | *Acceleration via Fractal Learning Rate Schedules* | | `https://arxiv.org/abs/2103.01338 <https://arxiv.org/abs/2103.01338>`__ | `cite <https://ui.adsabs.harvard.edu/abs/2021arXiv210301338A/exportcitation>`__ |
-+--------------+---------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+
-| Un-tuned WU | *On the adequacy of untuned warmup for adaptive optimization* | | `https://arxiv.org/abs/1910.04209 <https://arxiv.org/abs/1910.04209>`__ | `cite <https://ui.adsabs.harvard.edu/abs/2019arXiv191004209M/exportcitation>`__ |
-+--------------+---------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+
 | AdaShift | *Decorrelation and Convergence of Adaptive Learning Rate Methods* | `github <https://github.com/MichaelKonobeev/adashift>`__ | `https://arxiv.org/abs/1810.00143v4 <https://arxiv.org/abs/1810.00143v4>`__ | `cite <https://ui.adsabs.harvard.edu/abs/2018arXiv181000143Z/exportcitation>`__ |
 +--------------+---------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+
 | AdaDelta | *An Adaptive Learning Rate Method* | | `https://arxiv.org/abs/1212.5701v1 <https://arxiv.org/abs/1212.5701v1>`__ | `cite <https://ui.adsabs.harvard.edu/abs/2012arXiv1212.5701Z/exportcitation>`__ |
@@ -222,6 +217,25 @@ You can check the supported optimizers & lr schedulers.
 | Sophia | *A Scalable Stochastic Second-order Optimizer for Language Model Pre-training* | `github <https://github.com/Liuhong99/Sophia>`__ | `https://arxiv.org/abs/2305.14342 <https://arxiv.org/abs/2305.14342>`__ | `cite <https://github.com/Liuhong99/Sophia>`__ |
 +--------------+---------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+
 
+Supported LR Scheduler
+----------------------
+
+You can check the supported learning rate schedulers with below code.
+
+::
+
+    from pytorch_optimizer import get_supported_lr_schedulers
+
+    supported_lr_schedulers = get_supported_lr_schedulers()
+
++------------------+---------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+
+| LR Scheduler | Description | Official Code | Paper | Citation |
++==================+===================================================================================================+===================================================================================+===============================================================================================+======================================================================================================================+
+| Explore-Exploit | *Wide-minima Density Hypothesis and the Explore-Exploit Learning Rate Schedule* | | `https://arxiv.org/abs/2003.03977 <https://arxiv.org/abs/2003.03977>`__ | `cite <https://ui.adsabs.harvard.edu/abs/2020arXiv200303977I/exportcitation>`__ |
++------------------+---------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+
+| Chebyshev | *Acceleration via Fractal Learning Rate Schedules* | | `https://arxiv.org/abs/2103.01338 <https://arxiv.org/abs/2103.01338>`__ | `cite <https://ui.adsabs.harvard.edu/abs/2021arXiv210301338A/exportcitation>`__ |
++------------------+---------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+
+
 Useful Resources
 ----------------

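As a quick check of the two helpers documented in the README above, here is a minimal sketch (not part of this commit; it only assumes pytorch-optimizer is installed and that both helpers return an iterable registry):

    from pytorch_optimizer import get_supported_lr_schedulers, get_supported_optimizers

    # Summarize what the installed version exposes.
    optimizers = get_supported_optimizers()
    lr_schedulers = get_supported_lr_schedulers()

    print(f'{len(optimizers)} optimizers, {len(lr_schedulers)} lr schedulers')
    for entry in lr_schedulers:
        print(entry)  # expect the Explore-Exploit and Chebyshev schedulers from the table above

Note that `get_chebyshev_lr`, added in this commit, is also re-exported from the package root (see the `pytorch_optimizer/__init__.py` change below).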
docs/changelogs/v2.10.1.md

Lines changed: 1 addition & 0 deletions
@@ -3,6 +3,7 @@
 ### Fix
 
 * `perturb` isn't multiplied by `-step_size` in SWATS optimizer. (#179)
+* `chebyshev step` has size of `T` while the permutation is `2^T`. (#168, #181)
 
 ### Diff

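To make this changelog entry concrete, here is a small standalone sketch of the pre-fix behaviour (the `old_*` functions are illustrative copies of the removed `chebyshev_steps` / `chebyshev_perm` helpers shown in the diff further below): the steps array has length `T`, while the permutation doubles until it reaches the next power of two at or above `T`, so indexing breaks whenever `T` is not a power of two:

    import numpy as np

    def old_chebyshev_steps(small_m: float, big_m: float, num_epochs: int) -> np.ndarray:
        # copy of the removed chebyshev_steps
        c, r = (big_m + small_m) / 2.0, (big_m - small_m) / 2.0
        thetas = (np.arange(num_epochs) + 0.5) / num_epochs * np.pi
        return 1.0 / (c - r * np.cos(thetas))

    def old_chebyshev_perm(num_epochs: int) -> np.ndarray:
        # copy of the removed chebyshev_perm
        perm = np.array([0])
        while len(perm) < num_epochs:
            perm = np.vstack([perm, 2 * len(perm) - 1 - perm]).T.flatten()
        return perm

    steps = old_chebyshev_steps(0.1, 1, 5)  # 5 step sizes
    perm = old_chebyshev_perm(5)            # length 8: [0, 7, 3, 4, 1, 6, 2, 5]
    try:
        steps[perm]
    except IndexError as exc:
        print(exc)  # index 7 is out of range for a size-5 array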
pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -92,7 +92,7 @@ target-version = "py39"
 "./tests/test_general_optimizer_parameters.py" = ["D", "S101"]
 "./tests/test_load_optimizers.py" = ["D", "S101"]
 "./tests/test_load_lr_schedulers.py" = ["D", "S101"]
-"./tests/test_lr_schedulers.py" = ["D"]
+"./tests/test_lr_schedulers.py" = ["D", "S101"]
 "./tests/test_lr_scheduler_parameters.py" = ["D", "S101"]
 "./tests/test_create_optimizer.py" = ["D"]
 "./pytorch_optimizer/__init__.py" = ["F401"]

pytorch_optimizer/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@
     CyclicLR,
     OneCycleLR,
 )
-from pytorch_optimizer.lr_scheduler.chebyshev import get_chebyshev_schedule
+from pytorch_optimizer.lr_scheduler.chebyshev import get_chebyshev_lr, get_chebyshev_schedule
 from pytorch_optimizer.lr_scheduler.cosine_anealing import CosineAnnealingWarmupRestarts
 from pytorch_optimizer.lr_scheduler.experimental.deberta_v3_lr_scheduler import deberta_v3_large_lr_scheduler
 from pytorch_optimizer.lr_scheduler.linear_warmup import CosineScheduler, LinearScheduler, PolyScheduler
pytorch_optimizer/lr_scheduler/chebyshev.py

Lines changed: 44 additions & 8 deletions
@@ -1,30 +1,66 @@
 import numpy as np
 
 
-def chebyshev_steps(small_m: float, big_m: float, num_epochs: int) -> np.ndarray:
+def get_chebyshev_steps(num_epochs: int, small_m: float = 0.05, big_m: float = 1.0) -> np.ndarray:
     r"""Chebyshev steps.
 
+    gamma_{t} = (M + m) / 2.0 - (M - m) * cos((t - 0.5) * pi / T) / 2, where t = 1, ..., T
+
+    :param num_epochs: int. stands for 'T' notation.
     :param small_m: float. stands for 'm' notation.
     :param big_m: float. stands for 'M' notation.
-    :param num_epochs: int. stands for 'T' notation.
     :return: np.array. chebyshev_steps.
     """
     c, r = (big_m + small_m) / 2.0, (big_m - small_m) / 2.0
-    thetas = (np.arange(num_epochs) + 0.5) / num_epochs * np.pi
+    thetas = (np.arange(num_epochs) + 0.5) * np.pi / num_epochs  # epoch starts from 0, so +0.5 instead of -0.5
 
     return 1.0 / (c - r * np.cos(thetas))
 
 
-def chebyshev_perm(num_epochs: int) -> np.ndarray:
-    r"""Chebyshev permutation."""
+def get_chebyshev_permutation(num_epochs: int) -> np.ndarray:
+    r"""Fractal chebyshev permutation.
+
+    sigma_{2T} := interlace(sigma_{T}, 2T + 1 - sigma_{T}), where
+    interlace([a_{1}, ..., a_{n}], [b_{1}, ..., b_{n}]) := [a_{1}, b_{1}, ..., a_{n}, b_{n}]
+
+    :param num_epochs: int. number of epochs.
+    """
     perm = np.array([0])
     while len(perm) < num_epochs:
         perm = np.vstack([perm, 2 * len(perm) - 1 - perm]).T.flatten()
     return perm
 
 
 def get_chebyshev_schedule(num_epochs: int) -> np.ndarray:
-    r"""Get Chebyshev schedules."""
-    steps: np.ndarray = chebyshev_steps(0.1, 1, num_epochs - 2)
-    perm: np.ndarray = chebyshev_perm(num_epochs - 2)
+    r"""Get Chebyshev schedules.
+
+    :param num_epochs: int. number of total epochs.
+    """
+    steps: np.ndarray = get_chebyshev_steps(num_epochs)
+    perm: np.ndarray = get_chebyshev_permutation(num_epochs - 2)
     return steps[perm]
+
+
+def get_chebyshev_lr(lr: float, epoch: int, num_epochs: int, is_warmup: bool = False) -> float:
+    r"""Get chebyshev learning rate.
+
+    :param lr: float. learning rate.
+    :param epoch: int. current epochs.
+    :param num_epochs: int. number of total epochs.
+    :param is_warmup: bool. whether warm-up stage or not.
+    """
+    if is_warmup:
+        return lr
+
+    epoch_power: int = np.power(2, int(np.log2(num_epochs - 1)) + 1) if num_epochs > 1 else 1
+    scheduler = get_chebyshev_schedule(epoch_power)
+
+    idx: int = epoch - 2
+    if idx < 0:
+        idx = 0
+    elif idx > len(scheduler) - 1:
+        idx = len(scheduler) - 1
+
+    chebyshev_value: float = scheduler[idx]
+
+    return lr * chebyshev_value

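The new `get_chebyshev_lr` helper is intended to be called once per epoch. Below is a minimal training-loop sketch (not from this commit; the model, data, and epoch count are placeholder assumptions) that sets each parameter group's learning rate from it:

    import torch
    from torch import nn

    from pytorch_optimizer import AdamP, get_chebyshev_lr

    model = nn.Linear(10, 1)
    optimizer = AdamP(model.parameters(), lr=1e-3)

    num_epochs = 16
    for epoch in range(num_epochs):
        # treat the first epoch as warm-up so the base lr is used unchanged
        lr = get_chebyshev_lr(1e-3, epoch, num_epochs, is_warmup=epoch == 0)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        loss = model(torch.randn(8, 10)).mean()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

As the diff above shows, the helper rounds the epoch count up to a power of two before building the schedule and clamps the epoch index to the schedule's bounds, so any epoch in `range(num_epochs)` is safe to pass.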
tests/test_lr_scheduler_parameters.py

Lines changed: 1 addition & 1 deletion
@@ -62,4 +62,4 @@ def test_linear_warmup_lr_scheduler_params():
 
 def test_chebyshev_params():
     with pytest.raises(IndexError):
-        get_chebyshev_schedule(2)
+        get_chebyshev_schedule(0)

tests/test_lr_schedulers.py

Lines changed: 44 additions & 4 deletions
@@ -4,8 +4,8 @@
 import pytest
 from torch import nn
 
-from pytorch_optimizer import AdamP, get_chebyshev_schedule
-from pytorch_optimizer.lr_scheduler.chebyshev import chebyshev_perm
+from pytorch_optimizer import AdamP, get_chebyshev_lr, get_chebyshev_schedule
+from pytorch_optimizer.lr_scheduler.chebyshev import get_chebyshev_permutation
 from pytorch_optimizer.lr_scheduler.cosine_anealing import CosineAnnealingWarmupRestarts
 from pytorch_optimizer.lr_scheduler.experimental.deberta_v3_lr_scheduler import deberta_v3_large_lr_scheduler
 from pytorch_optimizer.lr_scheduler.linear_warmup import CosineScheduler, LinearScheduler, PolyScheduler
@@ -152,8 +152,48 @@ def test_cosine_annealing_warmup_restarts(cosine_annealing_warmup_restart_param)
 
 
 def test_get_chebyshev_scheduler():
-    np.testing.assert_almost_equal(get_chebyshev_schedule(3), 1.81818182, decimal=6)
-    np.testing.assert_array_equal(chebyshev_perm(5), np.asarray([0, 7, 3, 4, 1, 6, 2, 5]))
+    # test the first nontrivial permutations sigma_{T}
+    recipes = {
+        2: np.asarray([0, 1]),
+        4: np.asarray([0, 3, 1, 2]),
+        8: np.asarray([0, 7, 3, 4, 1, 6, 2, 5]),
+        16: np.asarray([0, 15, 7, 8, 3, 12, 4, 11, 1, 14, 6, 9, 2, 13, 5, 10]),
+    }
+
+    for k, v in recipes.items():
+        np.testing.assert_array_equal(get_chebyshev_permutation(k), v)
+
+    np.testing.assert_almost_equal(get_chebyshev_schedule(1), 1.904762, decimal=6)
+    np.testing.assert_almost_equal(get_chebyshev_schedule(3), 8.799878, decimal=6)
+
+
+def test_get_chebyshev_lr():
+    recipes = [
+        0.019125119558059765,
+        0.019125119558059765,
+        0.0010022924983586518,
+        0.0020901181252459123,
+        0.0017496032811320122,
+        0.006336331139456458,
+        0.0011208500962143087,
+        0.004471008393917827,
+        0.0012101602977446309,
+        0.014193791132074378,
+        0.0010208804147606497,
+        0.0025832131864890117,
+        0.0015085567867114075,
+        0.009426190153875151,
+        0.0010594201194061095,
+        0.0033213041232648503,
+        0.001335267780289186,
+        0.001335267780289186,
+        0.001335267780289186,
+    ]
+
+    np.testing.assert_almost_equal(get_chebyshev_lr(1e-3, 0, 16, is_warmup=True), 1e-3)
+
+    for i, expected_lr in enumerate(recipes, start=1):
+        np.testing.assert_almost_equal(get_chebyshev_lr(1e-3, i, 16, is_warmup=False), expected_lr)
 
 
 def test_linear_warmup_linear_scheduler():

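To verify the fix locally, the updated tests can be run from the repository root (assuming a development install of the package) with, for example, `pytest tests/test_lr_schedulers.py tests/test_lr_scheduler_parameters.py -k chebyshev`.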