Commit ae965d8

rename warmup_style to lr_scheduler_type
1 parent b317708

13 files changed: +18 -15 lines
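For configs written against the old schema, the change is a one-key rename under the optimizer settings; a minimal before/after sketch (keys and values taken from the templates below):

  # before
  optimizer:
    lr: 1e-6
    lr_warmup_steps_ratio: 0.0
    warmup_style: constant

  # after
  optimizer:
    lr: 1e-6
    lr_warmup_steps_ratio: 0.0
    lr_scheduler_type: constant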

benchmark/config/countdown-template.yaml (2 additions, 2 deletions)

@@ -9,7 +9,7 @@ algorithm:
   optimizer:
     lr: 1e-06
     lr_warmup_steps_ratio: 0.0
-    warmup_style: constant
+    lr_scheduler_type: constant
   advantage_fn: ppo
 data_processor: {}
 model:

@@ -78,7 +78,7 @@ trainer:
   optim:
     lr: 1e-5
     lr_warmup_steps_ratio: 0.0
-    warmup_style: constant
+    lr_scheduler_type: constant
   ppo_max_token_len_per_gpu: 12800
   forward_max_token_len_per_gpu: 12800
   cliprange_value: 0.5

benchmark/config/gsm8k-template.yaml (1 addition, 1 deletion)

@@ -9,7 +9,7 @@ algorithm:
   optimizer:
     lr: 1e-5
     lr_warmup_steps_ratio: 0.0
-    warmup_style: constant
+    lr_scheduler_type: constant
   sample_strategy: default
   policy_loss_fn: ppo
   advantage_fn: grpo

benchmark/config/guru_math-template.yaml (1 addition, 1 deletion)

@@ -16,7 +16,7 @@ algorithm:
     lr: 1e-6
     weight_decay: 0.1
     lr_warmup_steps: 80
-    warmup_style: constant
+    lr_scheduler_type: constant
 cluster:
   node_num: 1
   gpu_per_node: 8

docs/sphinx_doc/source/tutorial/trinity_configs.md (3 additions, 2 deletions)

@@ -97,7 +97,7 @@ algorithm:
   repeat_times: 8
   optimizer:
     lr: 1e-6
-    warmup_style: "warmup"
+    lr_scheduler_type: "constant"
   # The following parameters are optional
   # If not specified, they will automatically be set based on the `algorithm_type`
   sample_strategy: "default"

@@ -111,7 +111,8 @@ algorithm:
 - `repeat_times`: Number of times each task is repeated. Default is `1`. In `dpo`, this is automatically set to `2`. Some algorithms such as GRPO and OPMD require `repeat_times` > 1.
 - `optimizer`: Optimizer configuration for the actor.
   - `lr`: Learning rate for the actor.
-  - `warmup_style`: Warmup style for the actor's learning rate.
+  - `warmup_style`: Deprecated, use `lr_scheduler_type` instead.
+  - `lr_scheduler_type`: Learning rate scheduler type for the actor model. Default is `constant`. Supported types: `constant`, `cosine`.
 - `sample_strategy`: The sampling strategy used for loading experiences from the experience buffer. Supported types: `default`, `staleness_control`, `mix`.
 - `advantage_fn`: The advantage function used for computing advantages.
 - `kl_penalty_fn`: The KL penalty function used for computing the KL penalty applied to the reward.
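Following the documented fields above, a minimal `algorithm.optimizer` block written against the new key might look like the sketch below (assembled only from values shown in this hunk, not a complete config):

  algorithm:
    repeat_times: 8
    optimizer:
      lr: 1e-6
      lr_scheduler_type: constant
    sample_strategy: "default"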

docs/sphinx_doc/source_zh/tutorial/trinity_configs.md (3 additions, 2 deletions)

@@ -97,7 +97,7 @@ algorithm:
   repeat_times: 8
   optimizer:
     lr: 1e-6
-    warmup_style: constant
+    lr_scheduler_type: constant
   # 以下参数为可选
   # 若未指定,将根据 `algorithm_type` 自动设置
   sample_strategy: "default"

@@ -111,7 +111,8 @@ algorithm:
 - `repeat_times`: 每个任务重复的次数。默认为 `1`。在 `dpo` 中自动设为 `2`。某些算法如 GRPO 和 OPMD 要求 `repeat_times` > 1。
 - `optimizer`: Actor 优化器的参数。
   - `lr`: 优化器的学习率。
-  - `warmup_style`: 学习率的预热策略。
+  - `warmup_style`: 已弃用,请改用 `lr_scheduler_type`。
+  - `lr_scheduler_type`: Actor 模型的学习率调度器类型。默认值为 `constant`。支持类型:`constant`、`cosine`。
 - `sample_strategy`: 从 experience buffer 加载 experience 时使用的采样策略。支持类型:`default`、`staleness_control`、`mix`。
 - `advantage_fn`: 用于计算优势值的函数。
 - `kl_penalty_fn`: 用于在奖励中计算 KL 惩罚的函数。
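When a decaying schedule is wanted instead of the default, the DPO examples further down pair `lr_scheduler_type: cosine` with the warmup and minimum-LR knobs; a minimal sketch of that combination, with values taken from those examples:

  optimizer:
    lr: 5e-7
    lr_warmup_steps_ratio: 0.03  # total steps are injected at runtime
    min_lr_ratio: 0.1            # only meaningful with the cosine schedule
    lr_scheduler_type: cosine    # select from constant/cosine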

examples/dpo_human_in_the_loop/dpo.yaml (1 addition, 1 deletion)

@@ -42,7 +42,7 @@ algorithm:
     lr: 5e-7
     lr_warmup_steps_ratio: 0.03 # the total steps will be injected during runtime
     min_lr_ratio: 0.1 # only useful for warmup with cosine
-    warmup_style: cosine # select from constant/cosine
+    lr_scheduler_type: cosine # select from constant/cosine
     betas: [0.9, 0.95]
   kl_loss_fn: k1
   kl_loss_fn_args:

examples/dpo_humanlike/dpo.yaml (1 addition, 1 deletion)

@@ -7,7 +7,7 @@ algorithm:
     lr: 5e-7
     lr_warmup_steps_ratio: 0.03 # the total steps will be injected during runtime
     min_lr_ratio: 0.1 # only useful for warmup with cosine
-    warmup_style: cosine # select from constant/cosine
+    lr_scheduler_type: cosine # select from constant/cosine
     betas: [0.9, 0.95]
   kl_loss_fn: k1
   kl_loss_fn_args:

examples/grpo_alfworld/alfworld.yaml (1 addition, 1 deletion)

@@ -65,7 +65,7 @@ trainer:
 # optimizer:
 #   lr: 5e-6
 #   lr_warmup_steps_ratio: 0.0
-#   warmup_style: constant
+#   lr_scheduler_type: constant
 # buffer:
 #   total_epochs: 1
 #   train_batch_size: 32

examples/learn_to_ask/train.yaml (1 addition, 1 deletion)

@@ -13,7 +13,7 @@ algorithm:
   optimizer:
     lr: 5.0e-07
     lr_warmup_steps_ratio: 0.0
-    warmup_style: constant
+    lr_scheduler_type: constant
 data_processor: {}
 model:
   model_path: ${oc.env:TRINITY_MODEL_PATH,Qwen/Qwen2.5-7B-Instruct}

examples/tinker/tinker.yaml (1 addition, 1 deletion)

@@ -11,7 +11,7 @@ algorithm:
   optimizer:
     lr: 1.0e-05
     lr_warmup_steps_ratio: 0.0
-    warmup_style: constant
+    lr_scheduler_type: constant
 data_processor: {}
 model:
   model_path: Qwen/Qwen3-4B-Instruct-2507
