Commit d67a913

Added LR scheduler functionality to Tinker Trainer.
1 parent ae965d8 commit d67a913

File tree

tests/trainer/trainer_test.py
trinity/common/config.py
trinity/trainer/tinker_trainer.py

3 files changed, +97 -10 lines

tests/trainer/trainer_test.py

Lines changed: 38 additions & 1 deletion

```diff
@@ -2,6 +2,7 @@

 import asyncio
 import json
+import math
 import multiprocessing
 import os
 import shutil
@@ -48,6 +49,8 @@
 from trinity.common.models.utils import get_checkpoint_dir_with_step_num
 from trinity.explorer.proxy.client import TrinityClient
 from trinity.manager.state_manager import StateManager
+from trinity.manager.synchronizer import Synchronizer
+from trinity.trainer.tinker_trainer import TinkerTrainerWrapper


 class BaseTrainerCase(RayUnittestBase):
@@ -1448,7 +1451,7 @@ def test_trainer(self):
         self.config.buffer.total_epochs = 1
         self.config.buffer.explorer_input.taskset = get_unittest_dataset_config("gsm8k")
         self.config.model.tinker.enable = True
-        self.config.model.tinker.base_model = "Qwen/Qwen3-4B-Instruct-2507"
+        self.config.model.model_path = "Qwen/Qwen3-4B-Instruct-2507"
         self.config.check_and_update()
         both(self.config)
         parser = TensorBoardParser(os.path.join(self.config.monitor.cache_dir, "tensorboard"))
@@ -1464,6 +1467,40 @@ def test_trainer(self):
         self.assertGreater(len(response_metrics), 0)
         self.assertEqual(parser.metric_max_step(response_metrics[0]), 4)

+    def test_trainer_class(self):
+        total_steps = 100
+        lr_warmup_steps = 10
+        self.config.algorithm.algorithm_type = "grpo"
+        self.config.model.tinker.enable = True
+        self.config.model.model_path = "Qwen/Qwen3-4B-Instruct-2507"
+        self.config.trainer.total_steps = total_steps
+        self.config.algorithm.optimizer.lr_warmup_steps = lr_warmup_steps
+        self.config.algorithm.optimizer.lr_scheduler_type = "cosine"
+        self.config.check_and_update()
+        lr = self.config.algorithm.optimizer.lr
+
+        @ray.remote
+        class FakeExplorer:
+            def __init__(self, config: Config):
+                self.config = config
+                self.synchronizer = Synchronizer.get_actor(config)
+
+        fake_explorer = FakeExplorer.remote(self.config)
+        ray.get(fake_explorer.__ray_ready__.remote())
+
+        tinker_trainer = TinkerTrainerWrapper(self.config)
+        tinker_trainer._train_step_num = 5
+        self.assertEqual(tinker_trainer.current_learning_rate, lr * 0.5)
+        tinker_trainer._train_step_num = 50
+        self.assertEqual(
+            tinker_trainer.current_learning_rate,
+            lr
+            * (
+                0.5
+                * (1 + math.cos((50 - lr_warmup_steps) / (total_steps - lr_warmup_steps) * math.pi))
+            ),
+        )
+
     def tearDown(self):
         # remove dir only when the test passed
         shutil.rmtree(self.config.checkpoint_job_dir, ignore_errors=True)
```
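
The two assertions in the new test_trainer_class pin down the scheduler's behavior at one warmup step and one decay step. As a quick sanity check of the expected numbers (assuming the default min_lr_ratio is 0.0, which the warmup assertion implies), the arithmetic works out as follows:

```python
import math

lr_warmup_steps, total_steps = 10, 100

# Step 5 is still in warmup: factor = 5 / 10 = 0.5, so the expected LR is lr * 0.5.
warmup_factor = 5 / lr_warmup_steps

# Step 50 is in the decay phase: progress = (50 - 10) / (100 - 10) = 4/9, and the
# cosine factor is 0.5 * (1 + cos(pi * 4/9)) ~= 0.587, so the expected LR is ~0.587 * lr.
progress = (50 - lr_warmup_steps) / (total_steps - lr_warmup_steps)
cosine_factor = 0.5 * (1 + math.cos(math.pi * progress))

print(warmup_factor, round(cosine_factor, 3))  # 0.5 0.587
```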

trinity/common/config.py

Lines changed: 3 additions & 3 deletions

```diff
@@ -100,9 +100,9 @@ class OptimizerConfig:
     betas: List[float] = field(default_factory=lambda: [0.9, 0.999])
     weight_decay: float = 0.01
     clip_grad: float = 1.0
-    lr_warmup_init: float = 0.0
-    lr_decay_steps: Optional[int] = None
-    lr_decay_style: str = "constant"  # duplicated with lr_scheduler_type in veRL
+    lr_warmup_init: float = 0.0  # used in megatron
+    lr_decay_steps: Optional[int] = None  # used in megatron
+    lr_decay_style: str = "constant"  # used in megatron, duplicated with lr_scheduler_type in veRL
     min_lr: float = 0.0
```
trinity/trainer/tinker_trainer.py

Lines changed: 56 additions & 6 deletions

```diff
@@ -1,4 +1,6 @@
+import math
 import os
+import sys
 from typing import Dict, List

 import ray
@@ -36,7 +38,7 @@ def __init__(self, config: Config):

     def _init_algorithm(self):
         self.algorithm = ALGORITHM_TYPE.get(self.config.algorithm.algorithm_type)
-        algorithm_config = self.config.algorithm
+        self.algorithm_config = algorithm_config = self.config.algorithm
         if self.algorithm.compute_advantage_in_trainer:
             self.advantage_fn = ADVANTAGE_FN.get(algorithm_config.advantage_fn)(
                 **algorithm_config.advantage_fn_args
@@ -63,12 +65,60 @@ def _init_algorithm(self):
             and (self.loss_agg_mode == "token-mean")
         )

-        self.adam_params = types.AdamParams(
-            learning_rate=algorithm_config.optimizer.lr,
-            beta1=algorithm_config.optimizer.betas[0],
-            beta2=algorithm_config.optimizer.betas[1],
+        self.lr_scheduler_type = algorithm_config.optimizer.lr_scheduler_type
+        self.total_steps = self.config.trainer.total_steps or sys.maxsize
+        self.num_warmup_steps = algorithm_config.optimizer.lr_warmup_steps
+        if self.num_warmup_steps < 0:
+            self.num_warmup_steps = int(
+                algorithm_config.optimizer.lr_warmup_steps_ratio * self.total_steps
+            )
+        self.min_lr_ratio = algorithm_config.optimizer.min_lr_ratio
+        assert 0.0 <= self.min_lr_ratio <= 1.0
+        self.logger.info(
+            f"Total steps: {self.total_steps}, num_warmup_steps: {self.num_warmup_steps}"
+        )
+
+        if self.lr_scheduler_type not in {"constant", "cosine"}:
+            raise NotImplementedError(
+                f"LR scheduler type {self.lr_scheduler_type} is not supported"
+            )
+
+    @property
+    def _current_lr_factor(self):
+        train_step_num = self._train_step_num
+        # warmup
+        if train_step_num < self.num_warmup_steps:
+            factor = float(train_step_num) / float(max(1.0, self.num_warmup_steps))
+            factor = self.min_lr_ratio + (1.0 - self.min_lr_ratio) * factor
+            return factor
+
+        # decay
+        if train_step_num >= self.total_steps:
+            progress = 1.0
+        else:
+            progress = float(train_step_num - self.num_warmup_steps) / float(
+                max(1.0, self.total_steps - self.num_warmup_steps)
+            )
+        if self.lr_scheduler_type == "constant":
+            factor = 1.0
+        elif self.lr_scheduler_type == "cosine":
+            num_cycles = 0.5  # TODO: may add to config
+            factor = 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))
+            factor = self.min_lr_ratio + (1.0 - self.min_lr_ratio) * factor
+        return max(self.min_lr_ratio, factor)
+
+    @property
+    def current_learning_rate(self):
+        return self._current_lr_factor * self.algorithm_config.optimizer.lr
+
+    @property
+    def adam_params(self):
+        return types.AdamParams(
+            learning_rate=self.current_learning_rate,
+            beta1=self.algorithm_config.optimizer.betas[0],
+            beta2=self.algorithm_config.optimizer.betas[1],
             # eps is currently not in config
-            weight_decay=algorithm_config.optimizer.weight_decay,
+            weight_decay=self.algorithm_config.optimizer.weight_decay,
             grad_clip_norm=self.config.trainer.grad_clip,
         )
```
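
Taken together, the commit turns the fixed AdamParams into a property that recomputes the learning rate every step: _current_lr_factor applies a linear warmup from min_lr_ratio to 1.0, then either holds the factor constant or follows a half-cycle cosine decay back down to min_lr_ratio, and current_learning_rate scales the configured lr by that factor. A minimal standalone sketch of the same schedule, where the free function lr_factor is hypothetical and num_cycles is fixed at 0.5 as in the diff:

```python
import math


def lr_factor(step: int, total_steps: int, warmup_steps: int,
              scheduler_type: str = "cosine", min_lr_ratio: float = 0.0) -> float:
    """Illustrative restatement of TinkerTrainerWrapper._current_lr_factor."""
    # Linear warmup from min_lr_ratio up to 1.0 over the first warmup_steps steps.
    if step < warmup_steps:
        frac = step / max(1.0, warmup_steps)
        return min_lr_ratio + (1.0 - min_lr_ratio) * frac

    # Decay phase: progress runs from 0 at the end of warmup to 1 at total_steps.
    progress = min(1.0, (step - warmup_steps) / max(1.0, total_steps - warmup_steps))
    if scheduler_type == "constant":
        factor = 1.0
    else:  # "cosine": half a cycle (num_cycles = 0.5), so the factor falls from 1 to min_lr_ratio
        factor = 0.5 * (1.0 + math.cos(math.pi * 2.0 * 0.5 * progress))
        factor = min_lr_ratio + (1.0 - min_lr_ratio) * factor
    return max(min_lr_ratio, factor)


# With the settings used in the unit test (total_steps=100, warmup_steps=10):
print(lr_factor(10, 100, 10))   # 1.0   -> warmup just finished, full LR
print(lr_factor(55, 100, 10))   # ~0.5  -> halfway through the cosine decay
print(lr_factor(100, 100, 10))  # ~0.0  -> LR has decayed to min_lr_ratio at total_steps
```

One consequence of `self.total_steps = self.config.trainer.total_steps or sys.maxsize` is that when trainer.total_steps is unset, the decay progress stays effectively at zero, so after warmup the cosine schedule behaves like a constant learning rate.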
