
Commit 3f8da24

feat(lr): auto-adjust decay_steps instead of raising error
When decay_steps exceeds the decay phase (num_steps - warmup_steps) and decay_rate is not explicitly provided, automatically adjust decay_steps to a sensible default (capped at 100, or decay_total // 100 + 1) instead of raising ValueError. This makes the learning rate scheduler more user-friendly by gracefully handling misconfigured decay_steps values.

Changes:
- LearningRateExp: auto-adjust decay_steps when >= decay_total
- Update argcheck and training-advanced.md documentation
- Update pd/pt/tf test_lr.py to use auto-adjusted decay_steps
- Remove obsolete validation tests in test_learning_rate.py
- Fix tf test dtype: float32 -> float64
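The fallback rule described above can be summarized in a small standalone sketch. This is only an illustration of the rule stated in the message and implemented in the diff below; `decay_total` stands for `num_steps - warmup_steps`, and the example values are hypothetical, chosen to exercise both branches.

```python
# Sketch of the fallback rule (decay_total = num_steps - warmup_steps).
# Example values are hypothetical and only demonstrate both branches.
def default_decay_steps(decay_total: int) -> int:
    # Cap at 100 for long runs; short runs still get at least 1 step.
    return 100 if decay_total // 10 > 100 else decay_total // 100 + 1

assert default_decay_steps(50000) == 100  # 50000 // 10 = 5000 > 100 -> capped at 100
assert default_decay_steps(800) == 9      # 800 // 100 + 1 = 9
assert default_decay_steps(50) == 1       # tiny decay phase still yields a positive value
```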
1 parent 2e11654 commit 3f8da24

File tree

7 files changed: +31 -82 lines changed


deepmd/dpmodel/utils/learning_rate.py

Lines changed: 6 additions & 7 deletions
@@ -288,7 +288,6 @@ def __init__(
             If both stop_lr and stop_lr_ratio are provided, or neither is provided.
             If both warmup_steps and warmup_ratio are provided.
             If decay_steps is not positive.
-            If decay_steps is larger than the decay phase total steps when decay_rate is not provided.
         """
         super().__init__(
             start_lr=start_lr,
@@ -307,12 +306,12 @@ def __init__(
 
         if self.decay_steps <= 0:
             raise ValueError(f"decay_steps ({self.decay_steps}) must be positive.")
-        # Only validate decay_steps <= decay_total when computing decay_rate from start_lr/stop_lr
-        if decay_rate is None and self.decay_steps > decay_total:
-            raise ValueError(
-                f"decay_steps ({self.decay_steps}) must not exceed decay phase steps ({decay_total}) "
-                "when decay_rate is not explicitly provided."
-            )
+
+        # Auto-adjust decay_steps if it exceeds decay_total and decay_rate is not provided
+        if decay_rate is None and self.decay_steps >= decay_total:
+            # Compute sensible default: cap at 100, but ensure at least 1 for small decay_total
+            default_ds = 100 if decay_total // 10 > 100 else decay_total // 100 + 1
+            self.decay_steps = default_ds
 
         # Avoid log(0) issues by clamping stop_lr for computation
         clamped_stop_lr = max(self.stop_lr, 1e-10)

deepmd/utils/argcheck.py

Lines changed: 4 additions & 1 deletion
@@ -2669,7 +2669,10 @@ def learning_rate_exp() -> list[Argument]:
         "Mutually exclusive with stop_lr_ratio."
     )
     doc_decay_steps = (
-        "The learning rate is decaying every this number of training steps."
+        "The learning rate is decaying every this number of training steps. "
+        "If decay_steps exceeds the decay phase steps (num_steps - warmup_steps) "
+        "and decay_rate is not provided, it will be automatically adjusted to a "
+        "sensible default value."
     )
     doc_decay_rate = (
         "The decay rate for the learning rate. "

doc/train/training-advanced.md

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ The {ref}`learning_rate <learning_rate>` section for exponential decay in `input
 
 **Additional parameters for `exp` type only:**
 
-- {ref}`decay_steps <learning_rate[exp]/decay_steps>` specifies the interval (in training steps) at which the learning rate is decayed. The learning rate is updated every {ref}`decay_steps <learning_rate[exp]/decay_steps>` steps during the decay phase.
+- {ref}`decay_steps <learning_rate[exp]/decay_steps>` specifies the interval (in training steps) at which the learning rate is decayed. The learning rate is updated every {ref}`decay_steps <learning_rate[exp]/decay_steps>` steps during the decay phase. If `decay_steps` exceeds the decay phase steps (num_steps - warmup_steps) and `decay_rate` is not explicitly provided, it will be automatically adjusted to a sensible default value.
 - {ref}`smooth <learning_rate[exp]/smooth>` (optional, default: `false`) controls the decay behavior. When set to `false`, the learning rate decays in a stepped manner (updated every `decay_steps` steps). When set to `true`, the learning rate decays smoothly at every step.
 
 **Learning rate formula for `exp` type:**
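A minimal usage sketch of the documented behavior; the import path is assumed from the module touched in this commit (deepmd/dpmodel/utils/learning_rate.py), and the numbers mirror the boundary case removed from the tests below.

```python
from deepmd.dpmodel.utils.learning_rate import LearningRateExp  # assumed import path

# decay_steps (600) exceeds the decay phase (num_steps = 500, no warmup) and
# decay_rate is not provided, so decay_steps is auto-adjusted rather than
# raising ValueError as before.
lr = LearningRateExp(start_lr=1e-3, stop_lr=1e-5, num_steps=500, decay_steps=600)
print(lr.decay_steps)  # expected: 500 // 100 + 1 == 6 under the new rule
print(lr.value(0))     # learning rate at step 0 (start of the decay phase)
```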

source/tests/pd/test_lr.py

Lines changed: 9 additions & 13 deletions
@@ -18,7 +18,7 @@ class TestLearningRate(unittest.TestCase):
     def setUp(self):
         self.start_lr = 0.001
         self.stop_lr = 3.51e-8
-        # decay_steps must not exceed num_steps
+        # decay_steps will be auto-adjusted if >= num_steps
         self.decay_steps = np.arange(400, 501, 100)
         self.num_steps = np.arange(500, 1600, 500)
 
@@ -72,44 +72,40 @@ def decay_rate_pd(self):
             num_steps=self.stop_step,
         )
 
-        default_ds = 100 if self.stop_step // 10 > 100 else self.stop_step // 100 + 1
-        # Use local variable to avoid modifying instance state
-        decay_step_for_rate = self.decay_step
-        if decay_step_for_rate >= self.stop_step:
-            decay_step_for_rate = default_ds
+        # Use the auto-adjusted decay_steps from my_lr for consistency
+        actual_decay_steps = my_lr.decay_steps
         decay_rate = np.exp(
-            np.log(self.stop_lr / self.start_lr)
-            / (self.stop_step / decay_step_for_rate)
+            np.log(self.stop_lr / self.start_lr) / (self.stop_step / actual_decay_steps)
         )
         my_lr_decay = LearningRateExp(
             start_lr=self.start_lr,
             stop_lr=1e-10,
-            decay_steps=self.decay_step,
+            decay_steps=actual_decay_steps,
             num_steps=self.stop_step,
             decay_rate=decay_rate,
         )
         min_lr = 1e-5
         my_lr_decay_trunc = LearningRateExp(
             start_lr=self.start_lr,
             stop_lr=min_lr,
-            decay_steps=self.decay_step,
+            decay_steps=actual_decay_steps,
             num_steps=self.stop_step,
             decay_rate=decay_rate,
         )
         my_vals = [
             my_lr.value(step_id)
             for step_id in range(self.stop_step)
-            if step_id % self.decay_step != 0
+            if step_id % actual_decay_steps != 0
         ]
         my_vals_decay = [
             my_lr_decay.value(step_id)
             for step_id in range(self.stop_step)
-            if step_id % self.decay_step != 0
+            if step_id % actual_decay_steps != 0
         ]
         my_vals_decay_trunc = [
             my_lr_decay_trunc.value(step_id)
             for step_id in range(self.stop_step)
-            if step_id % self.decay_step != 0
+            if step_id % actual_decay_steps != 0
         ]
         self.assertTrue(np.allclose(my_vals_decay, my_vals))
         self.assertTrue(

source/tests/pt/test_lr.py

Lines changed: 9 additions & 13 deletions
@@ -19,7 +19,7 @@ class TestLearningRate(unittest.TestCase):
     def setUp(self) -> None:
         self.start_lr = 0.001
         self.stop_lr = 3.51e-8
-        # decay_steps must not exceed num_steps
+        # decay_steps will be auto-adjusted if >= num_steps
        self.decay_steps = np.arange(400, 501, 100)
         self.num_steps = np.arange(500, 1600, 500)
 
@@ -73,44 +73,40 @@ def decay_rate_pt(self) -> None:
             num_steps=self.stop_step,
         )
 
-        default_ds = 100 if self.stop_step // 10 > 100 else self.stop_step // 100 + 1
-        # Use local variable to avoid modifying instance state
-        decay_step_for_rate = self.decay_step
-        if decay_step_for_rate >= self.stop_step:
-            decay_step_for_rate = default_ds
+        # Use the auto-adjusted decay_steps from my_lr for consistency
+        actual_decay_steps = my_lr.decay_steps
         decay_rate = np.exp(
-            np.log(self.stop_lr / self.start_lr)
-            / (self.stop_step / decay_step_for_rate)
+            np.log(self.stop_lr / self.start_lr) / (self.stop_step / actual_decay_steps)
         )
         my_lr_decay = LearningRateExp(
             start_lr=self.start_lr,
             stop_lr=1e-10,
-            decay_steps=self.decay_step,
+            decay_steps=actual_decay_steps,
             num_steps=self.stop_step,
             decay_rate=decay_rate,
         )
         min_lr = 1e-5
         my_lr_decay_trunc = LearningRateExp(
             start_lr=self.start_lr,
             stop_lr=min_lr,
-            decay_steps=self.decay_step,
+            decay_steps=actual_decay_steps,
             num_steps=self.stop_step,
             decay_rate=decay_rate,
         )
         my_vals = [
             my_lr.value(step_id)
             for step_id in range(self.stop_step)
-            if step_id % self.decay_step != 0
+            if step_id % actual_decay_steps != 0
         ]
         my_vals_decay = [
             my_lr_decay.value(step_id)
             for step_id in range(self.stop_step)
-            if step_id % self.decay_step != 0
+            if step_id % actual_decay_steps != 0
         ]
         my_vals_decay_trunc = [
             my_lr_decay_trunc.value(step_id)
             for step_id in range(self.stop_step)
-            if step_id % self.decay_step != 0
+            if step_id % actual_decay_steps != 0
         ]
         self.assertTrue(np.allclose(my_vals_decay, my_vals))
         self.assertTrue(

source/tests/tf/test_lr.py

Lines changed: 2 additions & 8 deletions
@@ -23,12 +23,6 @@
 class TestLearningRateScheduleValidation(unittest.TestCase):
     """Test TF wrapper validation and error handling."""
 
-    def test_missing_start_lr(self) -> None:
-        """Test that missing start_lr raises ValueError."""
-        with self.assertRaises(ValueError) as cm:
-            LearningRateSchedule({"type": "exp", "stop_lr": 1e-5})
-        self.assertIn("start_lr", str(cm.exception))
-
     def test_value_before_build(self) -> None:
         """Test that calling value() before build() raises RuntimeError."""
         lr_schedule = LearningRateSchedule({"start_lr": 1e-3})
@@ -48,13 +42,13 @@ class TestLearningRateScheduleBuild(unittest.TestCase):
     """Test TF tensor building and integration."""
 
     def test_build_returns_tensor(self) -> None:
-        """Test that build() returns a float32 TF tensor."""
+        """Test that build() returns a float64 TF tensor."""
         lr_schedule = LearningRateSchedule({"start_lr": 1e-3, "stop_lr": 1e-5})
         global_step = tf.constant(0, dtype=tf.int64)
         lr_tensor = lr_schedule.build(global_step, num_steps=10000)
 
         self.assertIsInstance(lr_tensor, tf.Tensor)
-        self.assertEqual(lr_tensor.dtype, tf.float32)
+        self.assertEqual(lr_tensor.dtype, tf.float64)
 
     def test_default_type_exp(self) -> None:
         """Test that default type is 'exp' when not specified."""

source/tests/universal/dpmodel/utils/test_learning_rate.py

Lines changed: 0 additions & 39 deletions
@@ -199,42 +199,3 @@ def test_cosine_beyond_num_steps(self) -> None:
             num_steps=10000,
         )
         np.testing.assert_allclose(lr.value(20000), 1e-5, rtol=1e-10)
-
-
-class TestLearningRateValidation(unittest.TestCase):
-    """Test learning rate parameter validation."""
-
-    def test_decay_steps_exceeds_decay_total_without_warmup(self) -> None:
-        """Test that decay_steps > num_steps raises ValueError."""
-        with self.assertRaises(ValueError) as cm:
-            LearningRateExp(
-                start_lr=1e-3,
-                stop_lr=1e-5,
-                num_steps=500,
-                decay_steps=600,
-            )
-        self.assertIn("decay_steps", str(cm.exception))
-        self.assertIn("exceed", str(cm.exception))
-
-    def test_decay_steps_exceeds_decay_total_with_warmup(self) -> None:
-        """Test that decay_steps > (num_steps - warmup_steps) raises ValueError."""
-        with self.assertRaises(ValueError) as cm:
-            LearningRateExp(
-                start_lr=1e-3,
-                stop_lr=1e-5,
-                num_steps=1000,
-                decay_steps=900,
-                warmup_steps=200,  # decay_total = 800
-            )
-        self.assertIn("decay_steps", str(cm.exception))
-
-    def test_decay_steps_equals_decay_total_allowed(self) -> None:
-        """Test that decay_steps == decay_total is allowed (boundary case)."""
-        # Should not raise
-        lr = LearningRateExp(
-            start_lr=1e-3,
-            stop_lr=1e-5,
-            num_steps=500,
-            decay_steps=500,
-        )
-        self.assertEqual(lr.decay_steps, 500)
