11import math
22from functools import partial
3+ from typing import Literal
34
45from torch .optim import Optimizer
56from torch .optim .lr_scheduler import LambdaLR , LRScheduler
67
# Supported cooldown (decay-phase) curves for the WSD scheduler's final stage.
COOLDOWN_TYPE = Literal['cosine', '1-sqrt', 'linear', '1-square']
79
8- def get_wsd_scheduler_lambda (
10+
def get_cosine_cooldown_lr_ratio(
    current_step: int,
    num_warmup_steps: int,
    num_stable_steps: int,
    num_decay_steps: int,
    min_lr_ratio: float,
    num_cycles: float,
) -> float:
    r"""Get Cosine cooldown learning rate ratio.

    :param current_step: int. the current training step.
    :param num_warmup_steps: int. the number of warmup steps.
    :param num_stable_steps: int. the number of stable steps.
    :param num_decay_steps: int. the number of decay steps.
    :param min_lr_ratio: float. the minimum learning rate as a ratio of the initial learning rate.
    :param num_cycles: float. the number of waves in the cosine schedule.
    """
    # fraction of the decay phase completed; max(1, ...) guards a zero-length decay phase
    steps_into_decay: int = current_step - num_warmup_steps - num_stable_steps
    progress: float = float(steps_into_decay) / float(max(1, num_decay_steps))

    # half-cosine wave rescaled to [0, 1], clamped so it never goes negative
    cosine_wave: float = math.cos(math.pi * float(num_cycles) * 2.0 * progress)
    decay_factor: float = max(0.0, 0.5 * (1.0 + cosine_wave))

    # interpolate between min_lr_ratio (floor) and 1.0 (start of decay)
    return min_lr_ratio + (1.0 - min_lr_ratio) * decay_factor
23+
24+
def get_1sqrt_cooldown_lr_ratio(
    current_step: int,
    num_warmup_steps: int,
    num_stable_steps: int,
    num_decay_steps: int,
) -> float:
    r"""Get 1-sqrt cooldown learning rate ratio.

    :param current_step: int. the current training step.
    :param num_warmup_steps: int. the number of warmup steps.
    :param num_stable_steps: int. the number of stable steps.
    :param num_decay_steps: int. the number of decay steps.
    """
    # max(1, ...) guards against ZeroDivisionError for a zero-length decay phase,
    # consistent with the cosine cooldown variant.
    progress: float = float(current_step - num_warmup_steps - num_stable_steps) / float(max(1, num_decay_steps))
    return 1.0 - math.sqrt(progress)
33+
34+
def get_1square_cooldown_lr_ratio(
    current_step: int,
    num_warmup_steps: int,
    num_stable_steps: int,
    num_decay_steps: int,
) -> float:
    r"""Get 1-square cooldown learning rate ratio.

    :param current_step: int. the current training step.
    :param num_warmup_steps: int. the number of warmup steps.
    :param num_stable_steps: int. the number of stable steps.
    :param num_decay_steps: int. the number of decay steps.
    """
    # max(1, ...) guards against ZeroDivisionError for a zero-length decay phase,
    # consistent with the cosine cooldown variant.
    progress: float = float(current_step - num_warmup_steps - num_stable_steps) / float(max(1, num_decay_steps))
    return 1.0 - math.pow(progress, 2)
43+
44+
def get_linear_cooldown_lr_ratio(
    current_step: int,
    num_warmup_steps: int,
    num_stable_steps: int,
    num_decay_steps: int,
) -> float:
    r"""Get linear cooldown learning rate ratio.

    :param current_step: int. the current training step.
    :param num_warmup_steps: int. the number of warmup steps.
    :param num_stable_steps: int. the number of stable steps.
    :param num_decay_steps: int. the number of decay steps.
    """
    # max(1, ...) guards against ZeroDivisionError for a zero-length decay phase,
    # consistent with the cosine cooldown variant.
    return 1.0 - float(current_step - num_warmup_steps - num_stable_steps) / float(max(1, num_decay_steps))
53+
54+
def get_wsd_scheduler_lambda(  # noqa: PLR0911
    current_step: int,
    *,
    num_warmup_steps: int,
    num_stable_steps: int,
    num_decay_steps: int,
    min_lr_ratio: float,
    num_cycles: float,
    cooldown_type: COOLDOWN_TYPE,
) -> float:
    r"""Get WSD learning rate.

    :param current_step: int. the current training step.
    :param num_warmup_steps: int. the number of warmup steps.
    :param num_stable_steps: int. the number of stable steps.
    :param num_decay_steps: int. the number of decay steps.
    :param min_lr_ratio: float. the minimum learning rate as a ratio of the initial learning rate.
    :param num_cycles: float. the number of waves in the cosine schedule (the defaults is to just decrease from the max
        value to 0 following a half-cosine)
    :param cooldown_type: COOLDOWN_TYPE. cooldown type of the learning rate scheduler.
    """
    # Warmup phase: ramp linearly from 0 to 1; max(1, ...) guards a zero-length warmup.
    if current_step < num_warmup_steps:
        return float(current_step) / float(max(1, num_warmup_steps))
    # Stable phase: hold the peak learning rate.
    if current_step < num_warmup_steps + num_stable_steps:
        return 1.0
    # Decay phase: dispatch to the configured cooldown curve.
    if current_step < num_warmup_steps + num_stable_steps + num_decay_steps:
        if cooldown_type == 'cosine':
            return get_cosine_cooldown_lr_ratio(
                current_step, num_warmup_steps, num_stable_steps, num_decay_steps, min_lr_ratio, num_cycles
            )
        if cooldown_type == '1-sqrt':
            return get_1sqrt_cooldown_lr_ratio(current_step, num_warmup_steps, num_stable_steps, num_decay_steps)
        if cooldown_type == '1-square':
            return get_1square_cooldown_lr_ratio(current_step, num_warmup_steps, num_stable_steps, num_decay_steps)
        if cooldown_type == 'linear':
            return get_linear_cooldown_lr_ratio(current_step, num_warmup_steps, num_stable_steps, num_decay_steps)
    # After decay (or an unrecognized cooldown_type past the stable phase): hold the floor.
    return min_lr_ratio
3692
3793
@@ -42,6 +98,7 @@ def get_wsd_schedule(
4298 num_decay_steps : int ,
4399 min_lr_ratio : float = 0.0 ,
44100 num_cycles : float = 0.5 ,
101+ cooldown_type : COOLDOWN_TYPE = '1-sqrt' ,
45102 last_epoch : int = - 1 ,
46103) -> LRScheduler :
47104 r"""Get Warmup-Stable-Decay learning rate scheduler.
@@ -53,6 +110,7 @@ def get_wsd_schedule(
53110 :param min_lr_ratio: float. the minimum learning rate as a ratio of the initial learning rate.
54111 :param num_cycles: float. the number of waves in the cosine schedule (the defaults is to just decrease from the max
55112 value to 0 following a half-cosine)
113+ :param cooldown_type: COOLDOWN_TYPE. cooldown type of the learning rate scheduler.
56114 :param last_epoch: int. the index of the last epoch when resuming training.
57115 """
58116 lr_scheduler = partial (
@@ -62,6 +120,7 @@ def get_wsd_schedule(
62120 num_decay_steps = num_decay_steps ,
63121 min_lr_ratio = min_lr_ratio ,
64122 num_cycles = num_cycles ,
123+ cooldown_type = cooldown_type ,
65124 )
66125
67126 return LambdaLR (optimizer , lr_scheduler , last_epoch )
0 commit comments