
Commit 0102d04

[trainer] feat: add reward loop config to default config (verl-project#4452)
### What does this PR do?

Future PRs will gradually migrate from the legacy RM implementation to the reward loop (for rule-based, genrm, disrm, ...). This PR adds reward loop configs to the defaults; they inherit the legacy reward model config, so no current API is broken (a minimal, hypothetical config sketch follows this template). Specifically, future PRs will:

- align results between the reward loop disrm and the legacy FSDP/Megatron disrm
- deprecate the FSDP/Megatron disrm and use the reward loop disrm as the default
- use the reward loop rule-based, disrm-based, and genrm-based rewards as the default
- deprecate the legacy reward model config

### Checklist Before Starting

- [ ] Search for similar PRs. Paste at least one query link here: ...
- [ ] Format the PR title as `[{modules}] {type}: {description}` (This will be checked by the CI)
  - `{modules}` include `fsdp`, `megatron`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data`
  - If this PR involves multiple modules, separate them with `,` like `[megatron, fsdp, doc]`
  - `{type}` is in `feat`, `fix`, `refactor`, `chore`, `test`
  - If this PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title.
  - Example: `[BREAKING][fsdp, megatron] feat: dynamic batching`

### Test

> For changes that cannot be tested by CI (e.g., algorithm implementation, new model support), validate by experiment(s) and show results like training curve plots, evaluation results, etc.

### API and Usage Example

> Demonstrate how the API changes if any, and provide usage example(s) if possible.

```python
# Add code snippet or script demonstrating how to use this
```

### Design & Code Changes

> Demonstrate the high-level design if this PR is complex, and list the specific changes.

### Checklist Before Submitting

> [!IMPORTANT]
> Please check all the following items before requesting a review, otherwise the reviewer might deprioritize this PR for review.

- [ ] Read the [Contribute Guide](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md).
- [ ] Apply [pre-commit checks](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md#code-linting-and-formatting): `pre-commit install && pre-commit run --all-files --show-diff-on-failure --color=always`
- [ ] Add / Update [the documentation](https://github.com/volcengine/verl/tree/main/docs).
- [ ] Add unit or end-to-end test(s) to [the CI workflow](https://github.com/volcengine/verl/tree/main/.github/workflows) to cover all the code. If not feasible, explain why: ...
- [ ] Once your PR is ready for CI, send a message in [the `ci-request` channel](https://verl-project.slack.com/archives/C091TCESWB1) in [the `verl` Slack workspace](https://join.slack.com/t/verl-project/shared_invite/zt-3855yhg8g-CTkqXu~hKojPCmo7k_yXTQ). (If not accessible, please try [the Feishu group](https://applink.larkoffice.com/client/chat/chatter/add_by_link?link_token=772jd4f1-cd91-441e-a820-498c6614126a).)
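
Below is a minimal, hypothetical sketch (not part of the PR) of what the new toggle looks like from the config side. Only `reward_model.use_reward_loop` is taken from this change; the surrounding keys and values are illustrative.

```python
# Hedged sketch: toggling the new reward-loop path via config.
# Only `reward_model.use_reward_loop` comes from this PR; other keys are illustrative.
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "reward_model": {
            "enable": False,          # legacy reward-model worker stays off
            "use_reward_loop": True,  # new flag; the new defaults set it to true
        }
    }
)

if cfg.reward_model.use_reward_loop:
    print("reward scores are computed through the reward loop worker")
else:
    print("legacy reward-model path")
```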
1 parent f332fc8 commit 0102d04

7 files changed (+145, -10 lines changed)

verl/experimental/agent_loop/agent_loop.py

Lines changed: 11 additions & 8 deletions

@@ -297,12 +297,15 @@ def __init__(
             self.processor.chat_template = self.config.actor_rollout_ref.model.custom_chat_template
             self.tokenizer.chat_template = self.config.actor_rollout_ref.model.custom_chat_template
 
-        self.reward_manager_worker = RewardLoopWorker.options(
-            scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy(
-                node_id=ray.get_runtime_context().get_node_id(),
-                soft=False,
-            ),
-        ).remote(self.config, self.reward_router_address)
+        use_reward_loop = True if self.config.reward_model.use_reward_loop else None
+        self.use_reward_loop = use_reward_loop
+        if use_reward_loop:
+            self.reward_loop_worker = RewardLoopWorker.options(
+                scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy(
+                    node_id=ray.get_runtime_context().get_node_id(),
+                    soft=False,
+                ),
+            ).remote(self.config, self.reward_router_address)
 
         trace_config = self.config.actor_rollout_ref.rollout.get("trace", {})
         RolloutTraceConfig.init(
@@ -551,7 +554,7 @@ async def _agent_loop_postprocess(self, output, **kwargs) -> _InternalAgentLoopO
         enable_async_reward = (
             self.reward_router_address is not None and self.config.reward_model.enable_resource_pool
         ) or not self.config.reward_model.enable
-        if output.reward_score is None and enable_async_reward:
+        if output.reward_score is None and enable_async_reward and self.use_reward_loop:
             batch = TensorDict(
                 {
                     "prompts": prompt_output["input_ids"],  # [1, prompt_length]
@@ -572,7 +575,7 @@ async def _agent_loop_postprocess(self, output, **kwargs) -> _InternalAgentLoopO
                 batch=batch,
                 non_tensor_batch=non_tensor_batch,
             )
-            result = await self.reward_manager_worker.compute_score.remote(data)
+            result = await self.reward_loop_worker.compute_score.remote(data)
             output.reward_score = result["reward_score"]
             output.extra_fields["reward_extra_info"] = result["reward_extra_info"]
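
For readers unfamiliar with the Ray call used above, here is a standalone sketch (not verl code; `DummyWorker` is an illustrative stand-in class) of pinning an actor to the caller's node with a hard `NodeAffinitySchedulingStrategy`, which is how the `RewardLoopWorker` is placed when the reward loop is enabled.

```python
# Standalone Ray sketch: schedule an actor on the same node as the caller.
# DummyWorker is an illustrative stand-in, not a verl class.
import ray
from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy

ray.init(ignore_reinit_error=True)


@ray.remote
class DummyWorker:
    def where_am_i(self) -> str:
        # Node id the actor actually landed on.
        return ray.get_runtime_context().get_node_id()


worker = DummyWorker.options(
    scheduling_strategy=NodeAffinitySchedulingStrategy(
        node_id=ray.get_runtime_context().get_node_id(),  # the caller's node
        soft=False,  # hard constraint: fail rather than fall back to another node
    ),
).remote()

assert ray.get(worker.where_am_i.remote()) == ray.get_runtime_context().get_node_id()
```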

verl/trainer/config/_generated_ppo_megatron_trainer.yaml

Lines changed: 24 additions & 0 deletions

@@ -571,6 +571,30 @@ reward_model:
   use_remove_padding: ${oc.select:actor_rollout_ref.actor.megatron.use_remove_padding,True}
   dtype: bfloat16
   load_weight: true
+  use_reward_loop: true
+  rollout:
+    _target_: verl.workers.config.RolloutConfig
+    name: ???
+    dtype: bfloat16
+    gpu_memory_utilization: 0.5
+    enforce_eager: true
+    cudagraph_capture_sizes: null
+    free_cache_engine: true
+    data_parallel_size: 1
+    expert_parallel_size: 1
+    tensor_model_parallel_size: 2
+    max_num_batched_tokens: 8192
+    max_model_len: null
+    max_num_seqs: 1024
+    load_format: auto
+    engine_kwargs: {}
+    limit_images: null
+    enable_chunked_prefill: true
+    enable_prefix_caching: true
+    disable_log_stats: true
+    skip_tokenizer_init: true
+    prompt_length: 512
+    response_length: 512
 algorithm:
   rollout_correction:
     rollout_is: null
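
The `${oc.select:...}` entries in the generated config fall back to a default when the referenced key is absent. A small OmegaConf sketch of that behavior, with the structure around the key simplified for illustration:

```python
# Minimal sketch of the ${oc.select:key,default} resolver used in the config above.
# The key path mirrors the diff; the surrounding structure is simplified.
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "actor_rollout_ref": {"actor": {"megatron": {}}},  # use_remove_padding not set
        "reward_model": {
            "use_remove_padding": "${oc.select:actor_rollout_ref.actor.megatron.use_remove_padding,true}"
        },
    }
)

print(cfg.reward_model.use_remove_padding)  # -> True, the fallback default
```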

verl/trainer/config/_generated_ppo_trainer.yaml

Lines changed: 24 additions & 0 deletions

@@ -495,6 +495,30 @@ reward_model:
     save_path: ${oc.select:global_profiler.save_path,null}
     tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}
   ulysses_sequence_parallel_size: 1
+  use_reward_loop: true
+  rollout:
+    _target_: verl.workers.config.RolloutConfig
+    name: ???
+    dtype: bfloat16
+    gpu_memory_utilization: 0.5
+    enforce_eager: true
+    cudagraph_capture_sizes: null
+    free_cache_engine: true
+    data_parallel_size: 1
+    expert_parallel_size: 1
+    tensor_model_parallel_size: 2
+    max_num_batched_tokens: 8192
+    max_model_len: null
+    max_num_seqs: 1024
+    load_format: auto
+    engine_kwargs: {}
+    limit_images: null
+    enable_chunked_prefill: true
+    enable_prefix_caching: true
+    disable_log_stats: true
+    skip_tokenizer_init: true
+    prompt_length: 512
+    response_length: 512
 algorithm:
   rollout_correction:
     rollout_is: null

verl/trainer/config/ppo_megatron_trainer.yaml

Lines changed: 1 addition & 1 deletion

@@ -18,7 +18,7 @@ defaults:
   # Critic model config.
   - critic@critic: megatron_critic
   # Reward model config.
-  - reward_model@reward_model: megatron_reward_model
+  - reward_model@reward_model: megatron_reward_loop
   # Rollout correction config.
   - algorithm@algorithm.rollout_correction: rollout_correction
   - _self_

verl/trainer/config/ppo_trainer.yaml

Lines changed: 1 addition & 1 deletion

@@ -31,7 +31,7 @@ defaults:
   - critic@critic: dp_critic
 
   # Reward model config.
-  - reward_model@reward_model: dp_reward_model
+  - reward_model@reward_model: dp_reward_loop
 
   # Rollout correction config.
   - algorithm@algorithm.rollout_correction: rollout_correction

verl/trainer/config/reward_model/dp_reward_loop.yaml

Lines changed: 42 additions & 0 deletions

@@ -0,0 +1,42 @@
+defaults:
+  - dp_reward_model
+  - _self_
+
+use_reward_loop: True
+reward_manager: naive
+enable: False
+
+# Whether to deploy the model to a separate resource pool.
+enable_resource_pool: False
+n_gpus_per_node: 0
+nnodes: 0
+
+model:
+  path: ~/models/FsfairX-LLaMA3-RM-v0.1
+  external_lib: ${actor_rollout_ref.model.external_lib}
+  trust_remote_code: False
+
+rollout:
+  _target_: verl.workers.config.RolloutConfig
+  name: ???
+  dtype: bfloat16
+  gpu_memory_utilization: 0.5
+  enforce_eager: true
+  cudagraph_capture_sizes: null
+  free_cache_engine: true
+  data_parallel_size: 1
+  expert_parallel_size: 1
+  tensor_model_parallel_size: 2
+  max_num_batched_tokens: 8192
+  max_model_len: null
+  max_num_seqs: 1024
+  load_format: auto
+  engine_kwargs: {}
+  limit_images: null
+  enable_chunked_prefill: true
+  enable_prefix_caching: true
+  disable_log_stats: true
+  skip_tokenizer_init: true
+
+  prompt_length: 512
+  response_length: 512
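
The two new config files compose via Hydra's defaults list: they first pull in the legacy reward model config, then apply their own keys last (`_self_`). Below is a rough stand-in for that composition using a plain OmegaConf merge; apart from `use_reward_loop`, `enable`, and `reward_manager`, the keys and values are illustrative, not taken from verl.

```python
# Rough stand-in for Hydra defaults composition: the reward-loop config inherits the
# legacy reward model config and only overrides what it declares.
# `micro_batch_size_per_gpu` is an illustrative legacy key, not taken from this PR.
from omegaconf import OmegaConf

legacy_reward_model = OmegaConf.create(
    {"enable": True, "reward_manager": "naive", "micro_batch_size_per_gpu": 8}
)
reward_loop_overrides = OmegaConf.create(
    {"use_reward_loop": True, "enable": False}
)

merged = OmegaConf.merge(legacy_reward_model, reward_loop_overrides)  # later config wins
print(merged.enable, merged.use_reward_loop, merged.micro_batch_size_per_gpu)
# -> False True 8
```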

verl/trainer/config/reward_model/megatron_reward_loop.yaml

Lines changed: 42 additions & 0 deletions

@@ -0,0 +1,42 @@
+defaults:
+  - megatron_reward_model
+  - _self_
+
+use_reward_loop: True
+reward_manager: naive
+enable: False
+
+# Whether to deploy the model to a separate resource pool.
+enable_resource_pool: False
+n_gpus_per_node: 0
+nnodes: 0
+
+model:
+  path: ~/models/FsfairX-LLaMA3-RM-v0.1
+  external_lib: ${actor_rollout_ref.model.external_lib}
+  trust_remote_code: False
+
+rollout:
+  _target_: verl.workers.config.RolloutConfig
+  name: ???
+  dtype: bfloat16
+  gpu_memory_utilization: 0.5
+  enforce_eager: true
+  cudagraph_capture_sizes: null
+  free_cache_engine: true
+  data_parallel_size: 1
+  expert_parallel_size: 1
+  tensor_model_parallel_size: 2
+  max_num_batched_tokens: 8192
+  max_model_len: null
+  max_num_seqs: 1024
+  load_format: auto
+  engine_kwargs: {}
+  limit_images: null
+  enable_chunked_prefill: true
+  enable_prefix_caching: true
+  disable_log_stats: true
+  skip_tokenizer_init: true
+
+  prompt_length: 512
+  response_length: 512
