[algo] feat: reduce routed expert padding via NestedTensor and uint8 dtype (verl-project#5240)

xhx1022 · web-flow · commit 3e0d4b13efe7 · 2026-02-09T17:41:54.000+08:00
This PR optimizes the routed expert data to reduce communication and memory overhead. - Converts `routed_experts` into a `NestedTensor` representation to avoid padding-heavy dense tensors. - Packs routed expert data into `uint8` format to reduce transmission size. - Removes unnecessary `attention_mask` propagation for routed expert execution. ### Experimental Results The results indicate that the proposed optimization reduces padding-related communication and memory overhead by **around 15%** compared to the original implementation, while preserving execution correctness. <img width="425" height="292" alt="企业微信截图_92aac7da-0169-491e-af86-c1a38661ac7e" src="https://github.com/user-attachments/assets/393cd669-d75d-405c-b61f-95d8926076e9" /> ### Checklist Before Starting - [ ] Search for similar PRs. Paste at least one query link here: ... - [ ] Format the PR title as `[{modules}] {type}: {description}` (This will be checked by the CI) - `{modules}` include `fsdp`, `megatron`, `veomni`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data`, `cfg`, `reward` - If this PR involves multiple modules, separate them with `,` like `[megatron, fsdp, doc]` - `{type}` is in `feat`, `fix`, `refactor`, `chore`, `test` - If this PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title. - Example: `[BREAKING][fsdp, megatron] feat: dynamic batching` ### Test > For changes that cannot be tested by CI (e.g., algorithm implementation, new model support), validate by experiment(s) and show results like training curve plots, evaluation results, etc. ### API and Usage Example > Demonstrate how the API changes if any, and provide usage example(s) if possible. 
```python # Add code snippet or script demonstrating how to use this ``` ### Design & Code Changes > Demonstrate the high-level design if this PR is complex, and list the specific changes. ### Checklist Before Submitting > [!IMPORTANT] > Please check all the following items before requesting a review, otherwise the reviewer might deprioritize this PR for review. - [ ] Read the [Contribute Guide](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md). - [ ] Apply [pre-commit checks](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md#code-linting-and-formatting): `pre-commit install && pre-commit run --all-files --show-diff-on-failure --color=always` - [ ] Add / Update [the documentation](https://github.com/volcengine/verl/tree/main/docs). - [ ] Add unit or end-to-end test(s) to [the CI workflow](https://github.com/volcengine/verl/tree/main/.github/workflows) to cover all the code. If not feasible, explain why: ... - [ ] Once your PR is ready for CI, send a message in [the `ci-request` channel](https://verl-project.slack.com/archives/C091TCESWB1) in [the `verl` Slack workspace](https://join.slack.com/t/verl-project/shared_invite/zt-3855yhg8g-CTkqXu~hKojPCmo7k_yXTQ). (If not accessible, please try [the Feishu group (飞书群)](https://applink.larkoffice.com/client/chat/chatter/add_by_link?link_token=772jd4f1-cd91-441e-a820-498c6614126a).) - [ ] If your PR is related to the `recipe` submodule, please also update the reference to the submodule commit via `git submodule update --remote` or `cd recipe && git pull origin main`. --------- Signed-off-by: xhx1022 <1737006628@qq.com>
diff --git a/verl/utils/megatron/router_replay_utils.py b/verl/utils/megatron/router_replay_utils.py
@@ -37,6 +37,7 @@
 from verl.models.mcore.util import (
     postprocess_packed_seqs,
     preprocess_packed_seqs,
+    preprocess_thd_no_padding,
 )
 from verl.utils.device import get_device_name
 from verl.utils.megatron.router_replay_patch import RouterReplay, RouterReplayAction
@@ -233,7 +234,10 @@ def set_router_replay_data(layers_topk_idx, attention_mask, tf_config, vp_rank=N
         None: The function updates internal RouterReplay instances in-place.
     """
     with torch.no_grad():
-        layers_topk_idx_rmpad, _ = preprocess_packed_seqs(layers_topk_idx, attention_mask, pre_process=True)
+        if layers_topk_idx.is_nested:
+            layers_topk_idx_rmpad, _ = preprocess_thd_no_padding(layers_topk_idx, pre_process=True)
+        else:
+            layers_topk_idx_rmpad, _ = preprocess_packed_seqs(layers_topk_idx, attention_mask, pre_process=True)
         layers_topk_idx_rmpad = layers_topk_idx_rmpad.contiguous()  # 1, dynamic_bs_all, layer_num, topk
 
         # 1, dynamic_bs_split, layer_num, topk
diff --git a/verl/workers/engine/megatron/transformer_impl.py b/verl/workers/engine/megatron/transformer_impl.py
@@ -666,14 +666,12 @@ def prepare_model_inputs(self, batch: TensorDict):
         multi_modal_inputs = extract_multi_modal_inputs(batch.get("multi_modal_inputs", []))
 
         routed_experts = batch.get("routed_experts", [])
-        attention_mask = batch.get("attention_mask", None)
 
         return {
             "input_ids": input_ids,
             "loss_mask": loss_mask,
             "multi_modal_inputs": multi_modal_inputs,
             "routed_experts": routed_experts,
-            "attention_mask": attention_mask,
         }
 
     def prepare_model_outputs(self, output: dict, data: TensorDict):
@@ -712,8 +710,7 @@ def forward_step(self, batch_iter: Iterator[TensorDict], model, postprocess_micr
 
         if RouterReplayHelper.is_replay_forward_action(self.tf_config, vp_rank):
             layers_topk_idx = model_inputs["routed_experts"]
-            attention_mask = model_inputs["attention_mask"].to(bool)
-            set_router_replay_data(layers_topk_idx, attention_mask, self.tf_config, vp_rank)
+            set_router_replay_data(layers_topk_idx, None, self.tf_config, vp_rank)
 
         if pad_mode == DatasetPadMode.NO_PADDING:
             label = input_ids.clone()
diff --git a/verl/workers/utils/padding.py b/verl/workers/utils/padding.py
@@ -17,7 +17,7 @@
 from tensordict import TensorDict
 
 from verl.utils import tensordict_utils as tu
-from verl.utils.attention_utils import unpad_input
+from verl.utils.attention_utils import index_first_axis, unpad_input
 
 
 def left_right_2_no_padding(data: TensorDict) -> TensorDict:
@@ -70,6 +70,16 @@ def left_right_2_no_padding(data: TensorDict) -> TensorDict:
     data["position_ids"] = position_ids_nested
     data["loss_mask"] = data["response_mask"]
 
+    routed_experts = data.get("routed_experts", None)
+    if routed_experts is not None and not routed_experts.is_nested:
+        if routed_experts.max() <= 255:
+            routed_experts = routed_experts.to(torch.uint8)
+        routed_experts_rmpad = index_first_axis(routed_experts.unsqueeze(-1).flatten(0, 1), indices)
+        routed_experts_nested = torch.nested.nested_tensor_from_jagged(
+            routed_experts_rmpad.squeeze(-1), offsets=cu_seqlens
+        )
+        data["routed_experts"] = routed_experts_nested
+
     return data