feat: warmup-stable-decay (wsd) lr schedule

Tony Sun · changlan · commit 0fb8d9422311 · 2025-08-04T17:27:39.000-07:00
GitOrigin-RevId: f8131a40f89f38c236ad1f01c52b5c1a0c3e1c2b
diff --git a/axlearn/common/schedule.py b/axlearn/common/schedule.py
@@ -7,6 +7,7 @@
 # Licensed under the Apache License, Version 2.0 (the "License").
 
 """Optimizer schedules."""
+
 import math
 from typing import Callable, Optional, Union
 
@@ -45,22 +46,27 @@ def polynomial(
     Args:
         begin_step: The first step of polynomial schedule.
         begin_value: The begin value of polynomial schedule.
-        end_step: The end step of polynomial schedule. Must be > begin_step.
+        end_step: The end step of polynomial schedule. Must be >= begin_step.
+            If equal to begin_step, the schedule will always return `begin_value`.
         end_value: The end value of polynomial schedule.
         power: The polynomial power.
 
     Returns:
         A ScheduleFn according to the spec.
 
     Raises:
-        ValueError: If begin_step >= end_step.
+        ValueError: If begin_step > end_step.
     """
-    if begin_step >= end_step:
-        raise ValueError(f"begin_step {begin_step} must be < end_step {end_step}.")
+    if begin_step > end_step:
+        raise ValueError(f"begin_step ({begin_step}) must be <= end_step ({end_step}).")
+
+    if begin_step == end_step:
+        # For a zero-duration schedule, always return the starting value.
+        return lambda step: jnp.array(begin_value, dtype=jnp.float32)
 
     def fn(step: Tensor) -> Tensor:
         frac = (step - begin_step) / (end_step - begin_step)
-        frac = jnp.minimum(1.0, jnp.maximum(0.0, frac))
+        frac = jnp.minimum(1.0, jnp.maximum(0.0, frac))  # Clamp progress to [0, 1].
         return begin_value + (frac**power) * (end_value - begin_value)
 
     return fn
@@ -348,6 +354,61 @@ def cosine_with_linear_warmup(
     return segment_wise(segments=segments, segment_steps=segment_steps)
 
 
+def warmup_stable_decay(
+    peak_lr: float,
+    *,
+    max_step: int,
+    decay_begin_step: int,
+    warmup_steps: int = 500,
+    begin_value: float = 0.0,
+    alpha: float = 0.0,
+) -> ScheduleFn:
+    """Warmup stable decay (WSD) learning rate schedule. Linear warmup + constant lr + linear decay.
+
+    Args:
+        peak_lr: The peak learning rate corresponding to the stable part of the schedule.
+        max_step: The total number of steps across the warmup, stable, and decay phases.
+        decay_begin_step: The step to begin linear decay. The learning rate is kept constant
+            in [warmup_steps, decay_begin_step).
+        warmup_steps: The number of steps of the warm-up schedule. Skip warm-up if set to 0.
+        begin_value: The begin value of the linear warm-up.
+        alpha: Multiplier of peak_lr that gives the final lr at the end of the decay phase.
+
+    Returns:
+        A ScheduleFn that chains the warmup, stable, and decay segments.
+
+    Raises:
+        ValueError: If decay_begin_step < warmup_steps, or if max_step < decay_begin_step.
+    """
+    if decay_begin_step < warmup_steps:
+        raise ValueError(
+            f"decay_begin_step ({decay_begin_step}) must be >= warmup_steps ({warmup_steps})."
+        )
+    if max_step < decay_begin_step:
+        raise ValueError(f"max_step ({max_step}) must be >= decay_begin_step ({decay_begin_step}).")
+
+    return segment_wise(
+        segments=[
+            config_for_function(polynomial).set(
+                begin_step=0,
+                begin_value=begin_value,
+                end_step=warmup_steps,
+                end_value=peak_lr,
+            ),
+            config_for_function(constant_schedule).set(
+                value=peak_lr,
+            ),
+            config_for_function(polynomial).set(
+                begin_step=0,  # Presumably segment-relative; confirm in segment_wise.
+                begin_value=peak_lr,
+                end_step=max_step - decay_begin_step,  # Length of the decay phase.
+                end_value=peak_lr * alpha,
+            ),
+        ],
+        segment_steps=[warmup_steps, decay_begin_step - warmup_steps],
+    )
+
+
 def constant_with_linear_warmup(
     peak_lr: float,
     *,
diff --git a/axlearn/common/schedule_test.py b/axlearn/common/schedule_test.py
@@ -1,6 +1,7 @@
 # Copyright © 2023 Apple Inc.
 
 """Tests optimizer schedules."""
+
 import math
 
 import jax
@@ -164,6 +165,72 @@ def test_cosine_with_linear_warmup(self, warmup_steps, decay_begin_step):
                 )
                 self.assertAlmostEqual(cosine_rate, value)
 
+    @parameterized.named_parameters(
+        {
+            "testcase_name": "full_schedule",
+            "warmup_steps": 100,
+            "decay_begin_step": 200,
+        },
+        {
+            "testcase_name": "no_stable_phase",
+            "warmup_steps": 100,
+            "decay_begin_step": 100,
+        },
+        {
+            "testcase_name": "no_warmup_phase",
+            "warmup_steps": 0,
+            "decay_begin_step": 200,
+        },
+    )
+    def test_warmup_stable_decay(self, warmup_steps, decay_begin_step):
+        peak_lr = 0.1
+        max_step = 300
+        alpha = 0.1
+        begin_value = 0.0
+
+        s = jax.jit(
+            schedule.warmup_stable_decay(
+                peak_lr=peak_lr,
+                max_step=max_step,
+                warmup_steps=warmup_steps,
+                begin_value=begin_value,
+                decay_begin_step=decay_begin_step,
+                alpha=alpha,
+            )
+        )
+
+        for step in range(1, max_step + 1, 25):  # Sample every 25th step, starting at 1.
+            lr = s(jnp.array(step, dtype=jnp.int32))
+
+            if warmup_steps > 0 and step <= warmup_steps:  # Linear warmup.
+                warmup_progress = step / warmup_steps
+                expected_lr = begin_value + (peak_lr - begin_value) * warmup_progress
+                self.assertAlmostEqual(expected_lr, lr, places=6)
+
+            elif warmup_steps < step <= decay_begin_step:  # Stable at peak_lr.
+                self.assertAlmostEqual(peak_lr, lr, places=6)
+
+            else:  # Linear decay from peak_lr towards peak_lr * alpha.
+                num_decay_steps = max_step - decay_begin_step
+                decay_progress = (step - decay_begin_step) / num_decay_steps
+                end_lr = peak_lr * alpha
+                expected_lr = peak_lr + (end_lr - peak_lr) * decay_progress
+                self.assertAlmostEqual(expected_lr, lr, places=6)
+
+    def test_warmup_stable_decay_errors(self):
+        """Tests that warmup_stable_decay raises ValueError on invalid step ordering."""
+        # decay_begin_step (100) < warmup_steps (200) must be rejected.
+        with self.assertRaises(ValueError):
+            schedule.warmup_stable_decay(
+                peak_lr=0.1, warmup_steps=200, decay_begin_step=100, max_step=300
+            )
+
+        # max_step (200) < decay_begin_step (300) must be rejected.
+        with self.assertRaises(ValueError):
+            schedule.warmup_stable_decay(
+                peak_lr=0.1, warmup_steps=100, decay_begin_step=300, max_step=200
+            )
+
     def test_constant_with_linear_warmup(self):
         peak_lr = 0.1
         warmup_steps = 100