From 922627931e13306bd2841627ff4e76394e4021b5 Mon Sep 17 00:00:00 2001 From: aleskeymalakhov11 Date: Wed, 8 Apr 2026 15:02:50 +0300 Subject: [PATCH 1/8] add eagle speculative decoding for rollouts --- docs/advance/mtp.md | 8 ++-- verl/experimental/agent_loop/agent_loop.py | 45 ++++++++++++++++--- .../agent_loop/prometheus_utils.py | 23 ++++++++++ .../_generated_ppo_megatron_trainer.yaml | 8 ++++ .../_generated_ppo_torchtitan_trainer.yaml | 8 ++++ .../config/_generated_ppo_trainer.yaml | 8 ++++ .../config/_generated_ppo_veomni_trainer.yaml | 8 ++++ verl/trainer/config/rollout/rollout.yaml | 9 ++++ verl/trainer/ppo/metric_utils.py | 18 ++++++++ verl/workers/config/rollout.py | 41 +++++++++++++++++ verl/workers/rollout/vllm_rollout/utils.py | 42 ++++++++++++++++- .../rollout/vllm_rollout/vllm_async_server.py | 17 +++++++ .../rollout/vllm_rollout/vllm_rollout.py | 3 +- 13 files changed, 226 insertions(+), 12 deletions(-) diff --git a/docs/advance/mtp.md b/docs/advance/mtp.md index b342b670d74..29456af8b9a 100644 --- a/docs/advance/mtp.md +++ b/docs/advance/mtp.md @@ -2,7 +2,7 @@ **Author**: `https://github.com/meituan-search` -Last updated: 02/15/2026 +Last updated: 04/08/2026 # 1. Scope of Support @@ -12,6 +12,8 @@ Currently, RL training can be performed on mimo-7B-RL, Qwen-next, and Deepseek s - **Inference Engine**: Compatible with all engines, but the model must be in the corresponding engine's compatibility list; +- **vLLM Inference-Only Speculative Decoding**: In addition to MTP-based rollout acceleration, vLLM also supports inference-only speculative decoding with external `EAGLE` / `EAGLE3` draft models. This path is currently supported only for vLLM rollout. 
SGLang is not supported right now; + - **Dependency Versions**: - mbridge: Apply the patches and review suggestions from PR: [#62](https://github.com/ISEEKYAN/mbridge/pull/62) (Already merged into the main branch); @@ -31,7 +33,8 @@ The MTP training process can be flexibly controlled through the following config | Load MTP Parameters Only | `enable=True` | VRAM usage will increase, but the exported parameters include the MTP module and can be directly used for online deployment | | Full-Parameter MTP Training | `enable=True`
`enable_train=True`
`mtp_loss_scaling_factor=0.1` | MTP Loss will apply to all model parameters | | MTP Parameter-Only Training | `enable=True`
`enable_train=True`
`detach_encoder=True` | Freeze the Encoder layer, update only MTP module parameters, MTP Loss applies only to MTP parameters | -| MTP Accelerated Rollout | 1. vLLM configuration:
`enable=True`
`enable_rollout=True`
`method="mtp"`
`num_speculative_tokens=1`
2. SGLang configuration:
`enable=True`
`enable_rollout=True`
`speculative_algorithm="EAGLE"`
`speculative_num_steps=2`
`speculative_eagle_topk=2`
`speculative_num_draft_tokens=4` | Achieve inference acceleration during the Rollout phase based on MTP | +| MTP Accelerated Rollout | 1. vLLM configuration:
`enable=True`
`enable_rollout=True`
`method="mtp"`
`num_speculative_tokens=1`
2. SGLang configuration:
`enable=True`
`enable_rollout=True`
`speculative_algorithm="EAGLE"`
`speculative_num_steps=2`
`speculative_eagle_topk=2`
`speculative_num_draft_tokens=4` | Achieve inference acceleration during the Rollout phase based on trainable MTP parameters | +| vLLM Inference-Only EAGLE / EAGLE3 Rollout Acceleration | `actor_rollout_ref.rollout.speculative_decoding.enable=True`
`actor_rollout_ref.rollout.speculative_decoding.method="EAGLE"` or `"EAGLE3"`
`actor_rollout_ref.rollout.speculative_decoding.draft_model_path=/path/to/draft/model`
`actor_rollout_ref.rollout.speculative_decoding.num_draft_tokens=4`
`actor_rollout_ref.rollout.speculative_decoding.draft_tensor_parallel_size=1` or `tensor_model_parallel_size` | Achieve rollout acceleration on vLLM with an external draft model. This does not require trainable MTP parameters. SGLang does not support this path right now. | # 3. Experimental Results @@ -109,4 +112,3 @@ The experiment was conducted using following data: The result: [wandb link](https://wandb.ai/hou-zg-meituan/mimo-7b-sft-mtp?nw=nwuserhouzg) The presence of mtp layer has limited effect on main loss. However, when MTP layer is detached, the mtp_loss converges to a higher value. - diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index c13a6723890..edd8d4defda 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -31,7 +31,10 @@ from tensordict import TensorDict from transformers import AutoProcessor, AutoTokenizer -from verl.experimental.agent_loop.prometheus_utils import update_prometheus_config +from verl.experimental.agent_loop.prometheus_utils import ( + read_spec_decoding_metrics_from_prometheus, + update_prometheus_config, +) from verl.experimental.agent_loop.utils import resolve_config_path from verl.experimental.teacher_loop import TeacherModelManager from verl.protocol import DataProto @@ -42,11 +45,7 @@ from verl.utils.dataset.rl_dataset import RLHFDataset, get_dataset_class from verl.utils.model import compute_position_id_with_mask from verl.utils.ray_utils import auto_await, get_event_loop -from verl.utils.rollout_trace import ( - RolloutTraceConfig, - rollout_trace_attr, - rollout_trace_op, -) +from verl.utils.rollout_trace import RolloutTraceConfig, rollout_trace_attr, rollout_trace_op from verl.utils.tokenizer import normalize_token_ids from verl.workers.config import DistillationConfig, DistillationLossConfig, HFModelConfig, RolloutConfig from verl.workers.rollout.replica import TokenOutput, get_rollout_replica_class @@ -1149,6 
+1148,13 @@ async def generate_sequences(self, prompts: DataProto) -> DataProto: Returns: DataProto: Output batch. """ + spec_before = None + if self.rollout_config.name == "vllm" and self.rollout_config.speculative_decoding.enable: + try: + spec_before = read_spec_decoding_metrics_from_prometheus(self.server_addresses) + except Exception as e: + print(f"speculative decoding unavailable: {e}") + if self.distillation_enabled: await self.teacher_model_manager.wake_up() chunkes = prompts.chunk(len(self.agent_loop_workers)) @@ -1167,6 +1173,33 @@ async def generate_sequences(self, prompts: DataProto) -> DataProto: timing = self._performance_metrics(metrics, output) output.meta_info = {"timing": timing, **outputs[0].meta_info} + + if spec_before is not None: + try: + spec_after = read_spec_decoding_metrics_from_prometheus(self.server_addresses) + spec_delta = {key: spec_after[key] - spec_before[key] for key in spec_before} + acceptance_rate = ( + spec_delta["num_accepted_tokens"] / spec_delta["num_draft_tokens"] + if spec_delta["num_draft_tokens"] > 0 + else float("inf") + ) + + mean_acceptance_length = ( + 1.0 + (spec_delta["num_accepted_tokens"] / spec_delta["num_drafts"]) + if spec_delta["num_drafts"] > 0 + else 1.0 + ) + + output.meta_info["speculative_decoding_metrics"] = { + "num_drafts": spec_delta["num_drafts"], + "num_draft_tokens": spec_delta["num_draft_tokens"], + "num_accepted_tokens": spec_delta["num_accepted_tokens"], + "avg_draft_acceptance_rate": acceptance_rate, + "mean_acceptance_length": mean_acceptance_length, + } + except Exception as e: + print(f"speculative decoding unavailable: {e}") + return output def _performance_metrics(self, metrics: list[list[dict[str, str]]], output: DataProto) -> dict[str, float]: diff --git a/verl/experimental/agent_loop/prometheus_utils.py b/verl/experimental/agent_loop/prometheus_utils.py index 0ce582df61e..a11d050d3d7 100644 --- a/verl/experimental/agent_loop/prometheus_utils.py +++ 
b/verl/experimental/agent_loop/prometheus_utils.py @@ -108,3 +108,26 @@ def reload_prometheus(port): except Exception as e: logger.error(f"Failed to update Prometheus configuration: {e}") + + +def read_spec_decoding_metrics_from_prometheus(server_addresses: list[str]) -> dict[str, float]: + import requests + from prometheus_client.parser import text_string_to_metric_families + + metric_name_to_key = { + "vllm:spec_decode_num_drafts_total": "num_drafts", + "vllm:spec_decode_num_draft_tokens_total": "num_draft_tokens", + "vllm:spec_decode_num_accepted_tokens_total": "num_accepted_tokens", + } + totals = {key: 0.0 for key in metric_name_to_key.values()} + session = requests.Session() + session.trust_env = False + + for address in server_addresses: + metrics_text = session.get(f"http://{address}/metrics", timeout=5).text + for family in text_string_to_metric_families(metrics_text): + for sample in family.samples: + key = metric_name_to_key.get(sample.name) + if key is not None: + totals[key] += float(sample.value) + return totals diff --git a/verl/trainer/config/_generated_ppo_megatron_trainer.yaml b/verl/trainer/config/_generated_ppo_megatron_trainer.yaml index 46602372986..eb1459ef22a 100644 --- a/verl/trainer/config/_generated_ppo_megatron_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_megatron_trainer.yaml @@ -358,6 +358,14 @@ actor_rollout_ref: quantization: null quantization_config_file: null mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + speculative_decoding: + _target_: verl.workers.config.rollout.SpeculativeDecodingConfig + enable: false + method: eagle3 + num_steps: 1 + num_draft_tokens: 4 + draft_model_path: null + draft_tensor_parallel_size: 1 qat: ${oc.select:actor_rollout_ref.actor.fsdp_config.qat,${oc.select:actor_rollout_ref.actor.megatron.qat,null}} layer_name_map: qkv_layer_name: qkv diff --git a/verl/trainer/config/_generated_ppo_torchtitan_trainer.yaml b/verl/trainer/config/_generated_ppo_torchtitan_trainer.yaml index 
ccc03d127f1..818ae7ef40d 100644 --- a/verl/trainer/config/_generated_ppo_torchtitan_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_torchtitan_trainer.yaml @@ -329,6 +329,14 @@ actor_rollout_ref: quantization: null quantization_config_file: null mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + speculative_decoding: + _target_: verl.workers.config.rollout.SpeculativeDecodingConfig + enable: false + method: eagle3 + num_steps: 1 + num_draft_tokens: 4 + draft_model_path: null + draft_tensor_parallel_size: 1 qat: ${oc.select:actor_rollout_ref.actor.fsdp_config.qat,${oc.select:actor_rollout_ref.actor.megatron.qat,null}} layered_summon: false model: diff --git a/verl/trainer/config/_generated_ppo_trainer.yaml b/verl/trainer/config/_generated_ppo_trainer.yaml index cbb30d418e4..53cc10e5967 100644 --- a/verl/trainer/config/_generated_ppo_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_trainer.yaml @@ -338,6 +338,14 @@ actor_rollout_ref: quantization: null quantization_config_file: null mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + speculative_decoding: + _target_: verl.workers.config.rollout.SpeculativeDecodingConfig + enable: false + method: eagle3 + num_steps: 1 + num_draft_tokens: 4 + draft_model_path: null + draft_tensor_parallel_size: 1 qat: ${oc.select:actor_rollout_ref.actor.fsdp_config.qat,${oc.select:actor_rollout_ref.actor.megatron.qat,null}} layered_summon: false model: diff --git a/verl/trainer/config/_generated_ppo_veomni_trainer.yaml b/verl/trainer/config/_generated_ppo_veomni_trainer.yaml index bb2dd113796..bc919191c90 100644 --- a/verl/trainer/config/_generated_ppo_veomni_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_veomni_trainer.yaml @@ -308,6 +308,14 @@ actor_rollout_ref: quantization: null quantization_config_file: null mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + speculative_decoding: + _target_: verl.workers.config.rollout.SpeculativeDecodingConfig + enable: false + method: eagle3 + num_steps: 1 + 
num_draft_tokens: 4 + draft_model_path: null + draft_tensor_parallel_size: 1 qat: ${oc.select:actor_rollout_ref.actor.fsdp_config.qat,${oc.select:actor_rollout_ref.actor.megatron.qat,null}} layered_summon: false model: diff --git a/verl/trainer/config/rollout/rollout.yaml b/verl/trainer/config/rollout/rollout.yaml index 1144c56a0c9..123b8b58883 100644 --- a/verl/trainer/config/rollout/rollout.yaml +++ b/verl/trainer/config/rollout/rollout.yaml @@ -401,5 +401,14 @@ quantization_config_file: null # MTP configuration, reuse model configuration mtp: ${oc.select:actor_rollout_ref.model.mtp, null} +speculative_decoding: + _target_: verl.workers.config.rollout.SpeculativeDecodingConfig + enable: False + method: eagle3 + num_steps: 1 + num_draft_tokens: 4 + draft_model_path: null + draft_tensor_parallel_size: 1 + # QAT configuration (inherited from actor's engine config) qat: ${oc.select:actor_rollout_ref.actor.fsdp_config.qat,${oc.select:actor_rollout_ref.actor.megatron.qat,null}} diff --git a/verl/trainer/ppo/metric_utils.py b/verl/trainer/ppo/metric_utils.py index 4dd7d2d00a5..f0538a5af5a 100644 --- a/verl/trainer/ppo/metric_utils.py +++ b/verl/trainer/ppo/metric_utils.py @@ -222,6 +222,24 @@ def compute_data_metrics(batch: DataProto, use_critic: bool = True) -> dict[str, metrics["tool_call_counts/max"] = tool_call_counts.max() metrics["tool_call_counts/mean"] = tool_call_counts.mean() + # speculative decoding + if "speculative_decoding_metrics" in batch.meta_info: + metrics["speculative_decoding/num_drafts"] = np.mean( + batch.meta_info["speculative_decoding_metrics"]["num_drafts"] + ) + metrics["speculative_decoding/num_draft_tokens"] = np.mean( + batch.meta_info["speculative_decoding_metrics"]["num_draft_tokens"] + ) + metrics["speculative_decoding/num_accepted_tokens"] = np.mean( + batch.meta_info["speculative_decoding_metrics"]["num_accepted_tokens"] + ) + metrics["speculative_decoding/avg_draft_acceptance_rate"] = np.mean( + 
batch.meta_info["speculative_decoding_metrics"]["avg_draft_acceptance_rate"] + ) + metrics["speculative_decoding/mean_acceptance_length"] = np.mean( + batch.meta_info["speculative_decoding_metrics"]["mean_acceptance_length"] + ) + return metrics diff --git a/verl/workers/config/rollout.py b/verl/workers/config/rollout.py index 886cb1f836e..a38500c0229 100644 --- a/verl/workers/config/rollout.py +++ b/verl/workers/config/rollout.py @@ -33,6 +33,7 @@ "RolloutConfig", "DiffusionRolloutConfig", "CheckpointEngineConfig", + "SpeculativeDecodingConfig", ] @@ -145,6 +146,18 @@ class CheckpointEngineConfig(BaseConfig): engine_kwargs: dict = field(default_factory=dict) +@dataclass +class SpeculativeDecodingConfig(BaseConfig): + enable: bool = False + + method: str = "eagle3" + num_steps: int = 1 + num_draft_tokens: int = 4 + draft_model_path: str | None = None + + draft_tensor_parallel_size: int = 1 + + @dataclass class RolloutConfig(BaseConfig): _mutable_fields = { @@ -262,6 +275,8 @@ class RolloutConfig(BaseConfig): mtp: MtpConfig = field(default_factory=MtpConfig) + speculative_decoding: SpeculativeDecodingConfig = field(default_factory=SpeculativeDecodingConfig) + qat: Optional[dict] = None def __post_init__(self): @@ -311,6 +326,32 @@ def __post_init__(self): f"Current rollout {self.name=} not implemented pipeline_model_parallel_size > 1 yet." 
) + if self.name != "vllm" and self.speculative_decoding.enable: + raise NotImplementedError( + f"Rollout {self.name=} does not support speculative decoding " + f"{self.speculative_decoding.method=} for rollout acceleration yet" + ) + + if self.name == "vllm" and self.speculative_decoding.enable: + if self.speculative_decoding.method.lower() not in {"eagle", "eagle3"}: + warnings.warn( + "Speculative decoding methods other than 'eagle' and 'eagle3' are untested and may be buggy ", + stacklevel=2, + ) + + if ( + self.speculative_decoding.draft_tensor_parallel_size != 1 + and self.speculative_decoding.draft_tensor_parallel_size != self.tensor_model_parallel_size + ): + raise ValueError( + f"draft_tensor_parallel_size={self.speculative_decoding.draft_tensor_parallel_size} " + "must be either 1 or equal to the target model " + f"tensor_parallel_size={self.tensor_model_parallel_size} " + ) + + if self.speculative_decoding.enable and self.mtp.enable_rollout: + raise ValueError("Use either speculative_decoding or mtp, but not both simultaneously") + + @dataclass class DiffusionRolloutConfig(RolloutConfig): diff --git a/verl/workers/rollout/vllm_rollout/utils.py b/verl/workers/rollout/vllm_rollout/utils.py index 043a10cf790..437ffe9f0b2 100644 --- a/verl/workers/rollout/vllm_rollout/utils.py +++ b/verl/workers/rollout/vllm_rollout/utils.py @@ -176,7 +176,13 @@ def monkey_patch_model(self, vocab_size: int): # patch weight loader to support MoE model patch_vllm_moe_model_weight_loader(self.model_runner.model) - def update_weights_from_ipc(self, peft_config: dict = None, base_sync_done=False, use_shm: bool = False): + def update_weights_from_ipc( + self, + peft_config: dict = None, + base_sync_done=False, + use_shm: bool = False, + use_speculative_decoding: bool = False, + ): """Update the weights of the rollout model.""" from vllm.platforms import current_platform @@ -216,10 +222,27 @@ def update_weights_from_ipc(self, peft_config: dict = None, base_sync_done=False 
receiver.receive_weights( on_bucket_received=lambda weights: self._update_weights( - weights, peft_config=peft_config, base_sync_done=base_sync_done + weights, + peft_config=peft_config, + base_sync_done=base_sync_done, ) ) + if use_speculative_decoding: + # Reload draft weights because they are discarded after each model load. + from vllm.model_executor.model_loader import get_model_loader + + loader = get_model_loader(self.model_runner.drafter.vllm_config.load_config) + self.model_runner.drafter.model.load_weights( + loader.get_all_weights( + self.vllm_config.speculative_config.draft_model_config, + self.model_runner.drafter.model, + ) + ) + + # Rebuild RoPE caches because reloading weights clears the cos/sin cache. + rebuild_rope_caches(self.model_runner.drafter.model) + if self._is_qat_model: # QAT (compressed-tensors): call process_weights_after_loading AFTER all buckets are received from verl.utils.qat import manual_process_weights_after_loading @@ -239,6 +262,11 @@ def update_weights_from_ipc(self, peft_config: dict = None, base_sync_done=False model_config = self.model_runner.vllm_config.model_config process_weights_after_loading(model, model_config, self.device) + if use_speculative_decoding: + drafter_model = self.model_runner.drafter.model + drafter_model_config = self.model_runner.drafter.vllm_config.model_config + process_weights_after_loading(drafter_model, drafter_model_config, self.device) + def _update_weights(self, weights: list[tuple[str, torch.Tensor]], peft_config: dict, base_sync_done: bool): if peft_config and base_sync_done: weights = dict(weights) @@ -428,3 +456,13 @@ def extract_prompt_logprobs(output: RequestOutput, num_prompt_logprobs: Optional result_dict["prompt_ids"] = prompt_ids_ls result_dict["prompt_logprobs"] = prompt_logprobs_ls + + +@torch.no_grad() +def rebuild_rope_caches(root_module: torch.nn.Module): + for _, m in root_module.named_modules(): + if hasattr(m, "rotary_emb"): + old = m.rotary_emb.cos_sin_cache + cache = 
m.rotary_emb._compute_cos_sin_cache() + cache = cache.to(device=old.device, dtype=old.dtype) + m.rotary_emb.cos_sin_cache = cache diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index ff37780cff2..371b2f5cb0e 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -290,6 +290,23 @@ async def launch_server(self, master_address: str = None, master_port: int = Non } args["speculative_config"] = speculative_config + # speculative decoding: + if self.config.speculative_decoding.enable: + if self.config.speculative_decoding.draft_model_path is None: + raise ValueError( + "self.config.speculative_decoding.draft_model_path should not be None when using speculative decoding with vLLM" + ) + + speculative_config = { + "model": self.config.speculative_decoding.draft_model_path, + "max_model_len": self.config.max_model_len, + "num_speculative_tokens": self.config.speculative_decoding.num_draft_tokens, + "method": self.config.speculative_decoding.method.lower(), + "draft_tensor_parallel_size": self.config.speculative_decoding.draft_tensor_parallel_size, + } + + args["speculative_config"] = speculative_config + if self.config.data_parallel_size > 1: assert self.gpus_per_node % self.config.tensor_model_parallel_size == 0, ( "gpus_per_node should be divisible by tensor_model_parallel_size" diff --git a/verl/workers/rollout/vllm_rollout/vllm_rollout.py b/verl/workers/rollout/vllm_rollout/vllm_rollout.py index 90641b575a2..0a854df3c7a 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_rollout.py +++ b/verl/workers/rollout/vllm_rollout/vllm_rollout.py @@ -98,6 +98,7 @@ def __init__( self.zmq_handle = f"ipc:///tmp/rl-colocate-zmq-{self.device_uuid}.sock" self.use_shm = not is_support_ipc() + self.use_speculative_decoding = config.speculative_decoding.enable if self.use_shm: logger.warning( "IPC is not supported on your devices. 
Falling back to shared memory for weight transfer, " @@ -160,7 +161,7 @@ async def update_weights( future = await self._execute_method( "update_weights_from_ipc", non_block=True, - kwargs={**kwargs, "use_shm": self.use_shm}, + kwargs={**kwargs, "use_shm": self.use_shm, "use_speculative_decoding": self.use_speculative_decoding}, ) bucket_size_mb = self.config.checkpoint_engine.update_weights_bucket_megabytes From 157c8fc5b4c9708cf869fcad2f20bcc081bd6810 Mon Sep 17 00:00:00 2001 From: aleskeymalakhov11 Date: Wed, 8 Apr 2026 16:59:33 +0300 Subject: [PATCH 2/8] merge origin/master --- .agent/skills/issue.md | 47 + .agent/skills/pr.md | 33 + .claude/skills/issue.md | 1 + .claude/skills/pr.md | 1 + .codex/skills/issue.md | 1 + .codex/skills/pr.md | 1 + .github/workflows/cpu_unit_tests.yml | 5 +- .../docker-build-ascend-sglang-a2.yml | 84 ++ .../docker-build-ascend-sglang-a3.yml | 84 ++ .github/workflows/e2e_ascend.yml | 63 +- .github/workflows/e2e_fully_async_policy.yml | 5 +- .../e2e_fully_async_policy_ascend.yml | 2 - .github/workflows/e2e_one_step_off_policy.yml | 6 +- .../e2e_one_step_off_policy_ascend.yml | 2 - .../workflows/e2e_ppo_grpo_trainer_trtllm.yml | 56 +- .../e2e_ppo_trainer_megatron_sglang.yml | 4 +- .../e2e_ppo_trainer_megatron_sglang_2.yml | 2 - .../e2e_ppo_trainer_megatron_vllm.yml | 12 +- .../e2e_ppo_trainer_megatron_vllm_2.yml | 15 +- ...e2e_ppo_trainer_megatron_vllm_2_ascend.yml | 2 - .../workflows/e2e_ppo_trainer_veomni_vllm.yml | 3 +- .github/workflows/e2e_sft_llm.yml | 1 - .github/workflows/e2e_sft_llm_ascend.yml | 3 +- .github/workflows/e2e_sft_vlm.yml | 1 - .github/workflows/gpu_unit_tests.yml | 2 +- .github/workflows/model.yml | 19 +- .github/workflows/model_ascend.yml | 2 - .github/workflows/nightly_ascend.yml | 54 +- .github/workflows/npu_unit_tests.yml | 2 +- .github/workflows/reward_model_sglang.yml | 18 +- .github/workflows/reward_model_vllm.yml | 12 +- .../workflows/reward_model_vllm_ascend.yml | 1 - .github/workflows/sanity.yml | 28 +- 
.github/workflows/sgl.yml | 2 - .github/workflows/vllm.yml | 8 +- .github/workflows/vllm_omni.yml | 13 +- .pre-commit-config.yaml | 38 +- AGENTS.md | 87 ++ CLAUDE.md | 1 + CONTRIBUTING.md | 21 +- README.md | 2 + config/plain.yaml | 129 ++ docker/Dockerfile.stable.sglang | 2 +- docker/Dockerfile.stable.vllm | 35 +- .../Dockerfile.ascend.sglang_8.3.rc1_a2 | 2 +- .../Dockerfile.ascend.sglang_8.3.rc1_a3 | 2 +- docs/advance/nvfp4_qat.md | 87 ++ docs/advance/ppo_lora.rst | 2 +- docs/advance/rollout_skip.rst | 76 +- docs/algo/rollout_corr.md | 10 +- .../amd_build_dockerfile_page.rst | 4 +- .../contribution_guide/ascend_ci_guide_zh.rst | 5 +- .../ascend_performance_analysis_guide.md | 26 +- .../examples/ascend_retool_best_pratice.rst | 12 +- .../examples/ascend_sglang_best_practices.rst | 9 +- .../dapo_multi_model_optimization_practice.md | 24 +- .../examples/gspo_optimization_practice.md | 95 +- .../run_qwen3_32B_megatron_1k_256k_npu.md | 4 +- docs/ascend_tutorial/faq/faq.rst | 100 +- .../features/ascend_backend_features.md | 4 +- .../features/ascend_consistency.rst | 2 - .../profiling/precision_debugger.md | 367 ++++++ .../quick_start/ascend_quick_start.rst | 122 +- .../quick_start/ascend_sglang_quick_start.rst | 50 +- .../quick_start/dockerfile_build_guidance.rst | 6 +- .../editing-agent-instructions.md | 91 ++ docs/examples/config.rst | 2 +- docs/examples/gsm8k_example.rst | 4 +- docs/index.rst | 8 + docs/start/multinode.rst | 4 +- docs/workers/sglang_worker.rst | 8 +- .../flowgrpo_trainer/diffusers/qwen_image.py | 146 +++ examples/flowgrpo_trainer/reward_fn.py | 152 +++ .../flowgrpo_trainer/scheduler/__init__.py | 17 + .../scheduling_flow_match_sde_discrete.py | 0 .../vllm_omni/pipeline_qwenimage.py | 2 +- .../run_qwen2-7b_math_megatron_lora.sh | 2 +- .../run_qwen2-7b_math_megatron_trtllm.sh | 2 +- .../run_qwen3-30b_dapo_megatron_fp8_trtllm.sh | 2 +- examples/grpo_trainer/run_qwen3-32b_npu.sh | 1 + .../run_qwen3-32b_sglang_mindspeedllm_npu.sh | 233 ++++ 
.../run_qwen3_235b_256k_megatron_npu.sh | 207 +++ .../run_qwen3_235b_megatron_npu.sh | 5 +- .../run_qwen3_5-122b-a10b-megatron.sh | 189 +++ .../grpo_trainer/run_qwen3_5_27b_vllm_fsdp.sh | 85 ++ .../run_qwen3_5_27b_vllm_fsdp_npu.sh | 85 ++ .../grpo_trainer/run_qwen3_5_35b_vllm_fsdp.sh | 85 ++ .../run_qwen3_5_35b_vllm_fsdp_npu.sh | 85 ++ .../run_qwen3moe-30b_megatron_lora.sh | 2 +- ...un_qwen3moe-30b_sglang_mindspeedllm_npu.sh | 245 ++++ .../run_qwen_gsm8k.sh | 2 +- ...megatron.sh => run_qwen_gsmk8_megatron.sh} | 0 examples/ppo_trainer/run_deepseek7b_llm.sh | 4 +- .../run_deepseek7b_llm_modelscope.sh | 4 +- .../ppo_trainer/run_deepseek7b_llm_pfppo.sh | 4 +- .../run_deepseek7b_llm_sandbox_fusion.sh | 4 +- .../ppo_trainer/run_deepseek7b_llm_sp2.sh | 4 +- examples/ppo_trainer/run_gemma.sh | 4 +- examples/ppo_trainer/run_qwen2-7b_rm.sh | 4 +- .../run_qwen2-7b_rm_reward_loop_colocate.sh | 4 +- .../run_qwen2-7b_rm_seq_balance.sh | 4 +- ...n_qwen2-7b_rm_seq_balance_fused_kernels.sh | 4 +- .../run_qwen2-7b_rm_seq_balance_nsys.sh | 4 +- .../ppo_trainer/run_qwen2-7b_seq_balance.sh | 4 +- .../run_qwen2-7b_sglang_seq_balance.sh | 4 +- examples/ppo_trainer/run_qwen2.5-32b.sh | 4 +- .../run_qwen2.5-3b_rm_reward_loop_colocate.sh | 4 +- examples/ppo_trainer/run_qwen3-8b_npu.sh | 4 +- .../gsm8k/run_qwen3_8b_sft_peft_sp2_npu.sh | 2 + .../split_placement/run_deepseek7b_llm.sh | 4 +- requirements-npu.txt | 1 + scripts/generate_trainer_config.sh | 3 +- tests/experimental/agent_loop/conftest.py | 28 + ...t_agent_loop_extra_fields_schema_on_cpu.py | 7 +- .../agent_loop/test_diffusion_agent_loop.py | 141 +++ .../reward_loop/test_visual_reward_manager.py | 146 +++ tests/models/test_diffusers_fsdp_engine.py | 206 +++ tests/models/test_engine.py | 13 +- .../special_e2e/generation/run_gen_qwen05.sh | 26 - .../generation/run_gen_qwen05_server.sh | 26 - .../ppo_trainer/run_function_reward.sh | 13 +- .../ppo_trainer/run_model_reward.sh | 100 -- .../special_e2e/ppo_trainer/run_single_gpu.sh | 24 - 
.../ppo_trainer/run_single_gpu_with_engine.sh | 25 - tests/special_e2e/run_dapo.sh | 90 -- tests/special_e2e/run_fully_async_policy.sh | 20 +- .../run_geo3k_fsdp_sgl_multiturn_w_tool.sh | 58 - tests/special_e2e/run_grpo_lora_with_merge.sh | 93 -- .../run_gsm8k_fsdp_sgl_multiturn_sf_tool.sh | 62 - .../run_gsm8k_fsdp_sgl_multiturn_w_tool.sh | 58 - tests/special_e2e/run_one_step_off_policy.sh | 15 +- tests/special_e2e/run_ppo_trainer_megatron.sh | 8 +- .../special_e2e/run_ppo_trainer_torchtitan.sh | 69 - tests/special_e2e/run_test.sh | 13 - .../run_dapo_moonlight-16b_megatron_npu.sh | 189 +++ .../run_grpo_qwen25-7b-instruct_fsdp_npu.sh | 2 +- ...run_grpo_qwen25-vl-3b-instruct_fsdp_npu.sh | 2 +- .../run_ppo_qwen3-8b_fsdp_npu.sh | 6 +- tests/special_npu/run_qwen3_06b_ppo.sh | 4 +- .../run_qwen3_30b_grpo_mindspeedllm.sh | 268 ++++ .../run_qwen3_8b_grpo_mindspeedllm.sh | 223 ++++ .../special_sanity/check_device_api_usage.py | 1 + tests/special_sanity/check_license.py | 19 +- tests/special_sanity/test_config_docs.py | 1 + .../config/legacy_ppo_megatron_trainer.yaml | 471 ------- tests/trainer/config/legacy_ppo_trainer.yaml | 1126 ----------------- .../trainer/config/test_algo_config_on_cpu.py | 204 --- .../config/test_legacy_config_on_cpu.py | 180 --- tests/trainer/ppo/test_core_algos_on_cpu.py | 45 + tests/trainer/ppo/test_rollout_corr.py | 47 + .../ppo/test_rollout_corr_integration.py | 43 + .../test_multiturn_sft_dataset_on_cpu.py | 2 - tests/utils/test_padding_on_cpu.py | 40 +- tests/utils/test_rollout_skip_on_cpu.py | 310 +++-- .../test_special_adapter_path_integration.py | 224 ++++ .../rollout_trtllm/test_async_server.py | 2 +- .../test_trtllm_rollout_utils.py | 8 +- .../rollout_vllm/test_vllm_omni_generate.py | 25 +- .../workers/test_engine_workers_lora_sync.py | 465 +++++++ verl/checkpoint_engine/base.py | 5 + verl/experimental/agent_loop/__init__.py | 10 +- verl/experimental/agent_loop/agent_loop.py | 142 +-- .../agent_loop/diffusion_agent_loop.py | 382 ++++++ 
.../agent_loop/single_turn_agent_loop.py | 113 ++ .../agent_loop/agent_loop.py | 3 + .../fully_async_policy/detach_utils.py | 4 +- .../shell/dapo_30b_a3b_math_fsdp_npu.sh | 168 +++ verl/experimental/reward_loop/reward_loop.py | 58 +- .../reward_loop/reward_manager/__init__.py | 2 + .../reward_loop/reward_manager/visual.py | 92 ++ .../teacher_loop/teacher_manager.py | 175 +++ .../teacher_loop/teacher_model.py | 33 + verl/model_merger/base_model_merger.py | 12 +- verl/models/diffusers_model/__init__.py | 24 + verl/models/diffusers_model/base.py | 170 +++ verl/models/diffusers_model/utils.py | 114 ++ verl/models/mcore/__init__.py | 8 +- verl/models/mcore/config_converter.py | 4 +- verl/models/mcore/model_forward.py | 86 +- verl/models/mcore/model_forward_fused.py | 18 +- verl/models/mcore/model_initializer.py | 14 +- verl/models/mcore/registry.py | 34 +- verl/models/mcore/util.py | 222 +++- verl/models/transformers/glm4v.py | 15 +- verl/models/transformers/monkey_patch.py | 33 + verl/models/transformers/npu_patch.py | 36 + verl/models/transformers/qwen2_vl.py | 21 +- verl/models/transformers/qwen3_5.py | 262 ++++ verl/models/transformers/qwen3_vl.py | 16 +- verl/single_controller/ray/base.py | 5 +- .../config/_generated_diffusion_trainer.yaml | 665 ++++++++++ .../_generated_ppo_megatron_trainer.yaml | 134 +- .../_generated_ppo_torchtitan_trainer.yaml | 108 +- .../config/_generated_ppo_trainer.yaml | 182 ++- .../config/_generated_ppo_veomni_trainer.yaml | 108 +- .../trainer/config/actor/mindspeed_actor.yaml | 18 + verl/trainer/config/algorithm.py | 53 +- .../config/algorithm/rollout_correction.yaml | 4 +- verl/trainer/config/critic/critic.yaml | 19 - verl/trainer/config/critic/dp_critic.yaml | 38 +- .../config/critic/megatron_critic.yaml | 78 -- .../config/critic/mindspeed_critic.yaml | 30 + .../config/critic/torchtitan_critic.yaml | 6 - verl/trainer/config/critic/veomni_critic.yaml | 6 - verl/trainer/config/diffusion_trainer.yaml | 190 +++ 
verl/trainer/config/engine/megatron.yaml | 6 + verl/trainer/config/engine/mindspeed.yaml | 45 + .../trainer/config/model/diffusion_model.yaml | 65 + verl/trainer/config/model/hf_model.yaml | 66 + .../trainer/config/model_engine/megatron.yaml | 2 + .../config/model_engine/mindspeed.yaml | 2 + verl/trainer/config/ppo_megatron_trainer.yaml | 248 +--- verl/trainer/config/ppo_trainer.yaml | 37 +- verl/trainer/config/profiler/profiler.yaml | 23 +- verl/trainer/config/ref/mindspeed_ref.yaml | 36 + .../config/rollout/diffusion_rollout.yaml | 51 + verl/trainer/config/rollout/rollout.yaml | 37 +- verl/trainer/config/sft_trainer_engine.yaml | 1 + verl/trainer/distillation/megatron/losses.py | 12 +- verl/trainer/main_ppo.py | 38 +- verl/trainer/ppo/diffusion_algos.py | 97 ++ verl/trainer/ppo/ray_trainer.py | 57 +- verl/trainer/ppo/reward.py | 8 +- verl/trainer/ppo/rollout_corr_helper.py | 117 +- verl/trainer/sft_trainer.py | 6 + verl/trainer/sft_trainer_ray.py | 19 + .../checkpoint/megatron_checkpoint_manager.py | 151 ++- verl/utils/dataset/dataset_utils.py | 1 + verl/utils/dataset/multiturn_sft_dataset.py | 4 +- verl/utils/debug/metrics.py | 12 + verl/utils/experimental/reward_utils.py | 36 + verl/utils/flops_counter.py | 9 +- verl/utils/fp8_utils.py | 7 +- verl/utils/fsdp_utils.py | 45 +- verl/utils/import_utils.py | 10 +- verl/utils/megatron/router_replay_utils.py | 10 +- verl/utils/megatron_peft_utils.py | 169 --- verl/utils/megatron_utils.py | 143 ++- verl/utils/model.py | 11 +- verl/utils/profiler/config.py | 27 + .../profiler/precision_debugger_profile.py | 265 ++++ verl/utils/profiler/profile.py | 5 + verl/utils/reward_score/__init__.py | 49 + .../reward_score/jpeg_compressibility.py | 61 + verl/utils/reward_score/math_verify.py | 56 +- verl/utils/rollout_skip.py | 421 +++++- verl/utils/tensordict_utils.py | 2 +- verl/utils/tracking.py | 13 +- verl/utils/transformers_compat.py | 18 + verl/utils/vllm/npu_vllm_patch.py | 24 +- verl/utils/vllm/patch.py | 9 +- 
verl/utils/vllm/vllm_fp8_utils.py | 179 ++- verl/workers/actor/dp_actor.py | 10 +- verl/workers/actor/megatron_actor.py | 2 + verl/workers/config/actor.py | 39 +- verl/workers/config/critic.py | 60 +- verl/workers/config/distillation.py | 8 - verl/workers/config/engine.py | 33 +- verl/workers/config/model.py | 21 +- verl/workers/config/rollout.py | 37 +- verl/workers/engine/__init__.py | 10 +- verl/workers/engine/fsdp/__init__.py | 8 + verl/workers/engine/fsdp/diffusers_impl.py | 827 ++++++++++++ verl/workers/engine/fsdp/transformer_impl.py | 35 +- .../engine/megatron/transformer_impl.py | 109 +- verl/workers/engine/mindspeed/__init__.py | 4 +- .../engine/mindspeed/transformer_impl.py | 60 +- verl/workers/engine/mindspeed/utils.py | 223 ++++ verl/workers/engine_workers.py | 78 +- verl/workers/fsdp_workers.py | 8 +- verl/workers/megatron_workers.py | 7 +- verl/workers/rollout/replica.py | 13 +- .../sglang_rollout/async_sglang_server.py | 20 +- .../rollout/sglang_rollout/sglang_rollout.py | 6 +- .../rollout/trtllm_rollout}/__init__.py | 2 +- .../rollout/trtllm_rollout/trtllm_rollout.py | 14 +- verl/workers/rollout/utils.py | 6 +- verl/workers/rollout/vllm_rollout/utils.py | 2 +- .../rollout/vllm_rollout/vllm_async_server.py | 12 +- .../vllm_rollout/vllm_omni_async_server.py | 4 +- verl/workers/utils/losses.py | 54 +- verl/workers/utils/padding.py | 40 + 282 files changed, 12698 insertions(+), 4489 deletions(-) create mode 100644 .agent/skills/issue.md create mode 100644 .agent/skills/pr.md create mode 120000 .claude/skills/issue.md create mode 120000 .claude/skills/pr.md create mode 120000 .codex/skills/issue.md create mode 120000 .codex/skills/pr.md create mode 100644 .github/workflows/docker-build-ascend-sglang-a2.yml create mode 100644 .github/workflows/docker-build-ascend-sglang-a3.yml create mode 100644 AGENTS.md create mode 120000 CLAUDE.md create mode 100644 config/plain.yaml create mode 100644 docs/advance/nvfp4_qat.md create mode 100644 
docs/ascend_tutorial/profiling/precision_debugger.md create mode 100644 docs/contributing/editing-agent-instructions.md create mode 100644 examples/flowgrpo_trainer/diffusers/qwen_image.py create mode 100644 examples/flowgrpo_trainer/reward_fn.py create mode 100644 examples/flowgrpo_trainer/scheduler/__init__.py rename examples/{vllm_omni => flowgrpo_trainer/scheduler}/scheduling_flow_match_sde_discrete.py (100%) rename examples/{ => flowgrpo_trainer}/vllm_omni/pipeline_qwenimage.py (99%) create mode 100644 examples/grpo_trainer/run_qwen3-32b_sglang_mindspeedllm_npu.sh create mode 100644 examples/grpo_trainer/run_qwen3_235b_256k_megatron_npu.sh create mode 100644 examples/grpo_trainer/run_qwen3_5-122b-a10b-megatron.sh create mode 100644 examples/grpo_trainer/run_qwen3_5_27b_vllm_fsdp.sh create mode 100644 examples/grpo_trainer/run_qwen3_5_27b_vllm_fsdp_npu.sh create mode 100644 examples/grpo_trainer/run_qwen3_5_35b_vllm_fsdp.sh create mode 100644 examples/grpo_trainer/run_qwen3_5_35b_vllm_fsdp_npu.sh create mode 100644 examples/grpo_trainer/run_qwen3moe-30b_sglang_mindspeedllm_npu.sh rename examples/on_policy_distillation_trainer/{run_qwen_gsmk8k_megatron.sh => run_qwen_gsmk8_megatron.sh} (100%) create mode 100644 tests/experimental/agent_loop/conftest.py create mode 100644 tests/experimental/agent_loop/test_diffusion_agent_loop.py create mode 100644 tests/experimental/reward_loop/test_visual_reward_manager.py create mode 100644 tests/models/test_diffusers_fsdp_engine.py delete mode 100755 tests/special_e2e/generation/run_gen_qwen05.sh delete mode 100644 tests/special_e2e/generation/run_gen_qwen05_server.sh delete mode 100644 tests/special_e2e/ppo_trainer/run_model_reward.sh delete mode 100644 tests/special_e2e/ppo_trainer/run_single_gpu.sh delete mode 100644 tests/special_e2e/ppo_trainer/run_single_gpu_with_engine.sh delete mode 100644 tests/special_e2e/run_dapo.sh delete mode 100644 tests/special_e2e/run_geo3k_fsdp_sgl_multiturn_w_tool.sh delete mode 100644 
tests/special_e2e/run_grpo_lora_with_merge.sh delete mode 100644 tests/special_e2e/run_gsm8k_fsdp_sgl_multiturn_sf_tool.sh delete mode 100644 tests/special_e2e/run_gsm8k_fsdp_sgl_multiturn_w_tool.sh delete mode 100644 tests/special_e2e/run_ppo_trainer_torchtitan.sh delete mode 100644 tests/special_e2e/run_test.sh create mode 100644 tests/special_npu/nightly_ci_ascend/run_dapo_moonlight-16b_megatron_npu.sh create mode 100644 tests/special_npu/run_qwen3_30b_grpo_mindspeedllm.sh create mode 100644 tests/special_npu/run_qwen3_8b_grpo_mindspeedllm.sh delete mode 100644 tests/trainer/config/legacy_ppo_megatron_trainer.yaml delete mode 100644 tests/trainer/config/legacy_ppo_trainer.yaml delete mode 100644 tests/trainer/config/test_algo_config_on_cpu.py delete mode 100644 tests/trainer/config/test_legacy_config_on_cpu.py create mode 100644 tests/utils/test_special_adapter_path_integration.py create mode 100644 tests/workers/test_engine_workers_lora_sync.py create mode 100644 verl/experimental/agent_loop/diffusion_agent_loop.py create mode 100644 verl/experimental/fully_async_policy/shell/dapo_30b_a3b_math_fsdp_npu.sh create mode 100644 verl/experimental/reward_loop/reward_manager/visual.py create mode 100644 verl/experimental/teacher_loop/teacher_manager.py create mode 100644 verl/models/diffusers_model/__init__.py create mode 100644 verl/models/diffusers_model/base.py create mode 100644 verl/models/diffusers_model/utils.py create mode 100644 verl/models/transformers/qwen3_5.py create mode 100644 verl/trainer/config/_generated_diffusion_trainer.yaml create mode 100644 verl/trainer/config/actor/mindspeed_actor.yaml create mode 100644 verl/trainer/config/critic/mindspeed_critic.yaml create mode 100644 verl/trainer/config/diffusion_trainer.yaml create mode 100644 verl/trainer/config/engine/mindspeed.yaml create mode 100644 verl/trainer/config/model/diffusion_model.yaml create mode 100644 verl/trainer/config/model_engine/megatron.yaml create mode 100644 
verl/trainer/config/model_engine/mindspeed.yaml create mode 100644 verl/trainer/config/ref/mindspeed_ref.yaml create mode 100644 verl/trainer/config/rollout/diffusion_rollout.yaml create mode 100644 verl/trainer/ppo/diffusion_algos.py create mode 100644 verl/utils/experimental/reward_utils.py create mode 100644 verl/utils/profiler/precision_debugger_profile.py create mode 100644 verl/utils/reward_score/jpeg_compressibility.py create mode 100644 verl/workers/engine/fsdp/diffusers_impl.py create mode 100644 verl/workers/engine/mindspeed/utils.py rename {tests/trainer/config => verl/workers/rollout/trtllm_rollout}/__init__.py (91%) diff --git a/.agent/skills/issue.md b/.agent/skills/issue.md new file mode 100644 index 00000000000..84ceb8cd11e --- /dev/null +++ b/.agent/skills/issue.md @@ -0,0 +1,47 @@ +--- +name: issue +description: Create or update a GitHub issue following verl project conventions. +user_invocable: true +--- + +When the user asks to create or update an issue, follow these steps: + +### 1. Gather Context + +Read the following to understand available issue types and their required fields: + +- [`bug-report.yml`](.github/ISSUE_TEMPLATE/bug-report.yml) +- [`feature-request.yml`](.github/ISSUE_TEMPLATE/feature-request.yml) + +If updating an existing issue, read its current title, body, labels, and comments first. + +### 2. Determine Issue Type + +Based on the user's description, select the appropriate template: + +- **Bug report** ([`bug-report.yml`](.github/ISSUE_TEMPLATE/bug-report.yml)) — something is broken or behaves unexpectedly +- **Feature request** ([`feature-request.yml`](.github/ISSUE_TEMPLATE/feature-request.yml)) — a new capability or enhancement +- **Blank issue** — if neither template fits + +### 3. Compose the Issue + +Fill in the template fields based on information from the user and the codebase. For bug reports, run `python scripts/diagnose.py` to gather system info if possible. 
+ +When updating, ensure the title and body still accurately reflect the current state of the issue. + +### 4. Check for Duplicates + +Search for existing issues before creating: + +``` +gh issue list --repo verl-project/verl --state open --search "" +``` + +If a duplicate exists, inform the user instead of creating a new one. + +### 5. Create or Update the Issue + +- **Create**: add `good first issue` and/or `call for contribution` labels if the issue is straightforward and suitable for new contributors. +- **Update**: update title, body, and labels as needed. + +Return the issue URL when done. diff --git a/.agent/skills/pr.md b/.agent/skills/pr.md new file mode 100644 index 00000000000..58055f61777 --- /dev/null +++ b/.agent/skills/pr.md @@ -0,0 +1,33 @@ +--- +name: pr +description: Create or update a pull request following verl project conventions. +user_invocable: true +--- + +When the user asks to create or update a PR, follow these steps: + +### 1. Gather Context + +Read the following and understand the current branch's changes compared to main: + +- [`CONTRIBUTING.md`](CONTRIBUTING.md) +- [`PULL_REQUEST_TEMPLATE.md`](.github/PULL_REQUEST_TEMPLATE.md) + +If a PR already exists for this branch, also read its current title, body, and review comments. + +### 2. Compose PR Title and Body + +Follow the PR template strictly for both title format and body sections. Only check checklist boxes for steps that have actually been completed. + +When updating, ensure the title and body still accurately reflect **all** changes on the branch, not just the latest commit. + +### 3. Pre-submit Checks + +Run pre-commit and fix any issues before creating or pushing. + +### 4. Create or Update the PR + +- **Create**: target `main` by default unless the user specifies otherwise. +- **Update**: push new commits and update the title and body if the scope has changed. 
**Read the current PR title and body first** and incorporate any edits the user may have made directly on GitHub — never overwrite with a version generated from scratch. + +Return the PR URL when done. diff --git a/.claude/skills/issue.md b/.claude/skills/issue.md new file mode 120000 index 00000000000..c78661ce68d --- /dev/null +++ b/.claude/skills/issue.md @@ -0,0 +1 @@ +../../.agent/skills/issue.md \ No newline at end of file diff --git a/.claude/skills/pr.md b/.claude/skills/pr.md new file mode 120000 index 00000000000..2b4377c0b96 --- /dev/null +++ b/.claude/skills/pr.md @@ -0,0 +1 @@ +../../.agent/skills/pr.md \ No newline at end of file diff --git a/.codex/skills/issue.md b/.codex/skills/issue.md new file mode 120000 index 00000000000..c78661ce68d --- /dev/null +++ b/.codex/skills/issue.md @@ -0,0 +1 @@ +../../.agent/skills/issue.md \ No newline at end of file diff --git a/.codex/skills/pr.md b/.codex/skills/pr.md new file mode 120000 index 00000000000..2b4377c0b96 --- /dev/null +++ b/.codex/skills/pr.md @@ -0,0 +1 @@ +../../.agent/skills/pr.md \ No newline at end of file diff --git a/.github/workflows/cpu_unit_tests.yml b/.github/workflows/cpu_unit_tests.yml index 48ce123bc07..776f37ff98b 100644 --- a/.github/workflows/cpu_unit_tests.yml +++ b/.github/workflows/cpu_unit_tests.yml @@ -44,6 +44,7 @@ on: - v0.* paths: - "**/*.py" + - "!tests/special_sanity/**" - .github/workflows/cpu_unit_tests.yml # Cancel jobs on the same ref if a new one is triggered @@ -56,7 +57,7 @@ permissions: contents: read env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm017.dev2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm018.dev1" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" jobs: @@ -80,7 +81,6 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 20 # Increase this timeout value as needed env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ 
secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" @@ -95,7 +95,6 @@ jobs: run: | pip3 install -r requirements-test.txt pip3 install --no-deps -e . - pip3 install --upgrade "transformers>=5.0.0" - name: Download datasets run: | python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k diff --git a/.github/workflows/docker-build-ascend-sglang-a2.yml b/.github/workflows/docker-build-ascend-sglang-a2.yml new file mode 100644 index 00000000000..4b968778fff --- /dev/null +++ b/.github/workflows/docker-build-ascend-sglang-a2.yml @@ -0,0 +1,84 @@ +name: docker-build-ascend-sglang-a2 + +on: + workflow_dispatch: + push: + branches: ["main"] + paths: + - "docker/ascend/Dockerfile.ascend.sglang_8.3.rc1_a2" + - ".github/workflows/docker-build-ascend-sglang-a2.yml" + release: + types: [published] + schedule: + - cron: "0 16 * * *" + +jobs: + build-ascend-sglang-image-a2: + if: ${{ github.event_name != 'pull_request' && github.repository_owner == 'verl-project' }} + runs-on: ubuntu-latest + concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-build-ascend-sglang-image-a2 + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + steps: + - name: Remove unnecessary parts in github actions runners to free up disk space + uses: jlumbroso/free-disk-space@v1.3.1 + with: + tool-cache: true + + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Get base image name and tag + id: base_image + run: | + BASE_IMAGE_FULL=$(grep '^FROM' ./docker/ascend/Dockerfile.ascend.sglang_8.3.rc1_a2 | head -1 | cut -d' ' -f2) + echo "Base image full: $BASE_IMAGE_FULL" + BASE_IMAGE_TAG=$(echo "$BASE_IMAGE_FULL" | cut -d':' -f2) + echo "Base image tag: $BASE_IMAGE_TAG" + NEW_IMAGE_NAME="verl-sglang-$BASE_IMAGE_TAG" + echo "New image name: $NEW_IMAGE_NAME" + echo "base_image_tag=$BASE_IMAGE_TAG" >> 
"$GITHUB_OUTPUT" + echo "new_image_name=$NEW_IMAGE_NAME" >> "$GITHUB_OUTPUT" + + - name: Get image tag + id: version + run: | + BRANCH_NAME=$(echo "${{ github.ref }}" | sed 's/refs\/heads\///g' | sed 's/[^a-zA-Z0-9._-]/_/g') + if [ "${{ github.event_name }}" = "release" ]; then + echo "tag=${{ steps.base_image.outputs.new_image_name }}-${{ github.event.release.tag_name }}" >> "$GITHUB_OUTPUT" + elif [ "$BRANCH_NAME" = "main" ]; then + echo "tag=${{ steps.base_image.outputs.new_image_name }}-latest" >> "$GITHUB_OUTPUT" + fi + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to Quay.io + uses: docker/login-action@v3 + with: + registry: quay.io + username: ${{ secrets.QUAY_USERNAME }} + password: ${{ secrets.QUAY_PASSWORD }} + + - name: Clean Docker cache before build + run: | + docker system prune -a -f --volumes || true + + - name: Build and push images Quay + uses: docker/build-push-action@v6 + with: + context: . + platforms: linux/amd64,linux/arm64 + file: ./docker/ascend/Dockerfile.ascend.sglang_8.3.rc1_a2 + push: true + tags: | + quay.io/ascend/verl:${{ steps.version.outputs.tag }} + cache-from: type=gha + cache-to: type=gha,mode=max + build-args: | + BUILDKIT_INLINE_CACHE=1 diff --git a/.github/workflows/docker-build-ascend-sglang-a3.yml b/.github/workflows/docker-build-ascend-sglang-a3.yml new file mode 100644 index 00000000000..5ff3c38b9ab --- /dev/null +++ b/.github/workflows/docker-build-ascend-sglang-a3.yml @@ -0,0 +1,84 @@ +name: docker-build-ascend-sglang-a3 + +on: + workflow_dispatch: + push: + branches: ["main"] + paths: + - "docker/ascend/Dockerfile.ascend.sglang_8.3.rc1_a3" + - ".github/workflows/docker-build-ascend-sglang-a3.yml" + release: + types: [published] + schedule: + - cron: "0 16 * * *" + +jobs: + build-ascend-sglang-image-a3: + if: ${{ github.event_name != 'pull_request' && github.repository_owner == 'verl-project' }} + runs-on: ubuntu-latest + concurrency: + group: ${{ github.workflow }}-${{ 
github.ref }}-build-ascend-sglang-image-a3 + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + steps: + - name: Remove unnecessary parts in github actions runners to free up disk space + uses: jlumbroso/free-disk-space@v1.3.1 + with: + tool-cache: true + + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Get base image name and tag + id: base_image + run: | + BASE_IMAGE_FULL=$(grep '^FROM' ./docker/ascend/Dockerfile.ascend.sglang_8.3.rc1_a3 | head -1 | cut -d' ' -f2) + echo "Base image full: $BASE_IMAGE_FULL" + BASE_IMAGE_TAG=$(echo "$BASE_IMAGE_FULL" | cut -d':' -f2) + echo "Base image tag: $BASE_IMAGE_TAG" + NEW_IMAGE_NAME="verl-sglang-$BASE_IMAGE_TAG" + echo "New image name: $NEW_IMAGE_NAME" + echo "base_image_tag=$BASE_IMAGE_TAG" >> "$GITHUB_OUTPUT" + echo "new_image_name=$NEW_IMAGE_NAME" >> "$GITHUB_OUTPUT" + + - name: Get image tag + id: version + run: | + BRANCH_NAME=$(echo "${{ github.ref }}" | sed 's/refs\/heads\///g' | sed 's/[^a-zA-Z0-9._-]/_/g') + if [ "${{ github.event_name }}" = "release" ]; then + echo "tag=${{ steps.base_image.outputs.new_image_name }}-${{ github.event.release.tag_name }}" >> "$GITHUB_OUTPUT" + elif [ "$BRANCH_NAME" = "main" ]; then + echo "tag=${{ steps.base_image.outputs.new_image_name }}-latest" >> "$GITHUB_OUTPUT" + fi + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to Quay.io + uses: docker/login-action@v3 + with: + registry: quay.io + username: ${{ secrets.QUAY_USERNAME }} + password: ${{ secrets.QUAY_PASSWORD }} + + - name: Clean Docker cache before build + run: | + docker system prune -a -f --volumes || true + + - name: Build and push images Quay + uses: docker/build-push-action@v6 + with: + context: . 
+ platforms: linux/amd64,linux/arm64 + file: ./docker/ascend/Dockerfile.ascend.sglang_8.3.rc1_a3 + push: true + tags: | + quay.io/ascend/verl:${{ steps.version.outputs.tag }} + cache-from: type=gha + cache-to: type=gha,mode=max + build-args: | + BUILDKIT_INLINE_CACHE=1 diff --git a/.github/workflows/e2e_ascend.yml b/.github/workflows/e2e_ascend.yml index b98fe13d6b5..22ffb039f94 100644 --- a/.github/workflows/e2e_ascend.yml +++ b/.github/workflows/e2e_ascend.yml @@ -91,8 +91,7 @@ jobs: clean: true - name: Install the current repository run: | - pip install -r requirements-npu.txt - pip install -e . + pip install --no-deps -e . - name: Check final pip list run: | pip list @@ -120,6 +119,61 @@ jobs: ray stop --force USE_DIST_CKPT=True USE_DUMMY_MODEL=True DUMMY_MODEL_CONFIG_PATH=tests/special_e2e/ppo_trainer/expert_parallel/qwen3moe_minimal.json DUMMY_MODEL_PATH=$HOME/dist_ckpt/qwen3_30b_grpo_mindspeed bash tests/special_npu/run_qwen3_30b_grpo_mindspeed.sh + engine_mindspeed_llm_rl_job: + if: github.repository_owner == 'verl-project' + name: E2E Ascend testing for RL training scenarios of LLM models using MindSpeed_LLM engine + runs-on: linux-aarch64-a2b3-8 + timeout-minutes: 120 + container: + image: swr.cn-southwest-2.myhuaweicloud.com/modelfoundry/ascend-ci/verl/verl:verl-sglang-8.3.rc1-910b-ubuntu22.04-py3.11-latest + options: >- + --shm-size 16g + env: + HF_ENDPOINT: "https://hf-mirror.com" + HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable + steps: + - name: Check npu and CANN info + run: | + cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info + npu-smi info + - name: Check initial pip list from image + run: | + pip list + - name: Checkout volcengine/verl repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + clean: true + - name: Install the current repository + run: | + pip install --no-deps --no-build-isolation -e . 
+ - name: Check final pip list + run: | + pip list + - name: Configure related dependencies + run: | + git clone --depth 1 --branch core_v0.12.1 https://github.com/NVIDIA/Megatron-LM.git /Megatron-LM + rm -rf /MindSpeed + git clone https://gitcode.com/ascend/MindSpeed.git /MindSpeed + git clone https://gitcode.com/ascend/MindSpeed-LLM.git /MindSpeed-LLM + - name: Preprocess gsm8k dataset + run: | + python examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/.cache/datasets/openai/gsm8k + - name: Running gsm8k e2e training tests with GRPO on ASCEND NPU (MindSpeedLLM backend) + run: | + ray stop --force + export PYTHONPATH=$PYTHONPATH:/Megatron-LM + export PYTHONPATH=$PYTHONPATH:/MindSpeed + export PYTHONPATH=$PYTHONPATH:/MindSpeed-LLM + bash tests/special_npu/run_qwen3_8b_grpo_mindspeedllm.sh + - name: Running gsm8k e2e training tests with GRPO on ASCEND NPU (MindSpeedLLM backend, MoE Model) + run: | + ray stop --force + export PYTHONPATH=$PYTHONPATH:/Megatron-LM + export PYTHONPATH=$PYTHONPATH:/MindSpeed + export PYTHONPATH=$PYTHONPATH:/MindSpeed-LLM + USE_DIST_CKPT=True USE_DUMMY_MODEL=True DUMMY_MODEL_CONFIG_PATH=tests/special_e2e/ppo_trainer/expert_parallel/qwen3moe_minimal.json DUMMY_MODEL_PATH=$HOME/dist_ckpt/qwen3_30b_grpo_mindspeedllm bash tests/special_npu/run_qwen3_30b_grpo_mindspeedllm.sh + vlm_rl_job: if: github.repository_owner == 'verl-project' name: E2E Ascend testing for RL training scenarios of VLM models @@ -147,8 +201,7 @@ jobs: clean: true - name: Install the current repository run: | - pip install -r requirements-npu.txt - pip install -e . + pip install --no-deps -e . 
- name: Check final pip list run: | pip list @@ -159,4 +212,4 @@ jobs: run: | ray stop --force bash tests/special_npu/run_qwen2_5_vl_3b_npu.sh - rm -rf $HOME/ckpts + rm -rf $HOME/ckpts \ No newline at end of file diff --git a/.github/workflows/e2e_fully_async_policy.yml b/.github/workflows/e2e_fully_async_policy.yml index a46be304814..5a60e9c37f9 100644 --- a/.github/workflows/e2e_fully_async_policy.yml +++ b/.github/workflows/e2e_fully_async_policy.yml @@ -80,7 +80,7 @@ permissions: contents: read env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm017.dev2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm018.dev1" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" jobs: @@ -105,7 +105,6 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 10 # Increase timeout for async training env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" @@ -134,7 +133,6 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 10 # Increase timeout for async training env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" @@ -149,6 +147,7 @@ jobs: pip3 install -r requirements-test.txt pip3 install --no-deps -e . 
pip3 install cupy-cuda12x==13.6.0 + pip3 install git+https://github.com/ISEEKYAN/mbridge.git@main --no-deps --no-build-isolation - name: Prepare GSM8K dataset run: | python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k diff --git a/.github/workflows/e2e_fully_async_policy_ascend.yml b/.github/workflows/e2e_fully_async_policy_ascend.yml index 57a2c971c1d..973629f4509 100644 --- a/.github/workflows/e2e_fully_async_policy_ascend.yml +++ b/.github/workflows/e2e_fully_async_policy_ascend.yml @@ -109,7 +109,6 @@ jobs: clean: true - name: Install the current repository run: | - pip install -r requirements-npu.txt pip install --no-deps -e . - name: Check final pip list run: | @@ -154,7 +153,6 @@ jobs: clean: true - name: Install the current repository run: | - pip install -r requirements-npu.txt pip install --no-deps -e . - name: Check final pip list run: | diff --git a/.github/workflows/e2e_one_step_off_policy.yml b/.github/workflows/e2e_one_step_off_policy.yml index de3f8df5c1e..3d776f199c1 100644 --- a/.github/workflows/e2e_one_step_off_policy.yml +++ b/.github/workflows/e2e_one_step_off_policy.yml @@ -80,7 +80,7 @@ permissions: contents: read env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm017.dev2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm018.dev1" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" jobs: @@ -105,13 +105,12 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 10 # Increase timeout for async training env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable ACTOR_STRATEGY: "fsdp2" - steps: + steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 @@ -134,7 +133,6 @@ jobs: runs-on: ["${{ 
needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 10 # Increase timeout for async training env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" diff --git a/.github/workflows/e2e_one_step_off_policy_ascend.yml b/.github/workflows/e2e_one_step_off_policy_ascend.yml index 6b14d69cbed..8068ad818a7 100644 --- a/.github/workflows/e2e_one_step_off_policy_ascend.yml +++ b/.github/workflows/e2e_one_step_off_policy_ascend.yml @@ -109,7 +109,6 @@ jobs: clean: true - name: Install the current repository run: | - pip install -r requirements-npu.txt pip install --no-deps -e . - name: Check final pip list run: | @@ -154,7 +153,6 @@ jobs: clean: true - name: Install the current repository run: | - pip install -r requirements-npu.txt pip install --no-deps -e . - name: Check final pip list run: | diff --git a/.github/workflows/e2e_ppo_grpo_trainer_trtllm.yml b/.github/workflows/e2e_ppo_grpo_trainer_trtllm.yml index 61a19d43419..3c2e62f99cd 100644 --- a/.github/workflows/e2e_ppo_grpo_trainer_trtllm.yml +++ b/.github/workflows/e2e_ppo_grpo_trainer_trtllm.yml @@ -117,7 +117,6 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 30 # Increase this timeout value as needed env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" @@ -135,7 +134,7 @@ jobs: run: | export TRTLLM_TEST_MODEL_PATH_ROOT="${HOME}/models" ray stop --force - pytest -v -s \ + pytest -v -s --durations=20 \ tests/workers/rollout/rollout_trtllm/test_adapter.py \ tests/workers/rollout/rollout_trtllm/test_async_server.py \ tests/workers/rollout/rollout_trtllm/test_trtllm_rollout_utils.py @@ -145,7 +144,6 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 30 # Increase this timeout value as needed env: - 
HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" @@ -167,6 +165,17 @@ jobs: DATADIR=${HOME}/data \ bash examples/grpo_trainer/run_qwen2-7b_math_trtllm.sh 2 \ trainer.total_training_steps=1 \ + data.train_batch_size=32 \ + data.max_prompt_length=128 \ + data.max_response_length=64 \ + actor_rollout_ref.rollout.n=1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=32 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.max_num_seqs=32 \ + actor_rollout_ref.rollout.max_num_batched_tokens=1024 \ + actor_rollout_ref.rollout.max_model_len=256 \ data.train_files="['${PWD}/data/gsm8k/train.parquet']" \ data.val_files="['${PWD}/data/gsm8k/test.parquet']" \ trainer.logger='["console"]' \ @@ -180,7 +189,6 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 30 # Increase this timeout value as needed env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" @@ -203,6 +211,14 @@ jobs: ACTOR_TP=2 \ bash examples/grpo_trainer/run_qwen2-7b_math_megatron_trtllm.sh 2 \ trainer.total_training_steps=1 \ + data.train_batch_size=32 \ + data.max_prompt_length=128 \ + data.max_response_length=64 \ + actor_rollout_ref.rollout.n=1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=32 \ + actor_rollout_ref.rollout.max_num_seqs=32 \ + actor_rollout_ref.rollout.max_num_batched_tokens=1024 \ + actor_rollout_ref.rollout.max_model_len=256 \ data.train_files="['${PWD}/data/gsm8k/train.parquet']" \ data.val_files="['${PWD}/data/gsm8k/test.parquet']" \ trainer.logger='["console"]' \ @@ -210,12 +226,11 @@ jobs: - name: clean up run: | rm -rf checkpoints - e2e_grpo_trainer_fsdp-vlm: + 
e2e_grpo_trainer_megatron-vlm: needs: setup runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 30 # Increase this timeout value as needed env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" @@ -233,16 +248,17 @@ jobs: - name: Prepare GEO3K dataset run: | python3 examples/data_preprocess/geo3k.py --local_dataset_path ${HOME}/models/hf_data/hiyouga/geometry3k --local_save_dir ${PWD}/data/geo3k - - name: Running GEO3K E2E training tests with FSDP on 8 L20 GPUs (VLM) - run: | - ray stop --force - DATADIR=${HOME}/data \ - bash examples/grpo_trainer/run_qwen2_5_vl_3b_trtllm.sh 2 \ - trainer.total_training_steps=1 \ - data.train_files="['${PWD}/data/geo3k/train.parquet']" \ - data.val_files="['${PWD}/data/geo3k/test.parquet']" \ - trainer.logger='["console"]' \ - actor_rollout_ref.model.path="${HOME}/models/Qwen/Qwen3-VL-2B-Instruct" + # FIXME: timeout in 30 minutes + # - name: Running GEO3K E2E training tests with FSDP on 8 L20 GPUs (VLM) + # run: | + # ray stop --force + # DATADIR=${HOME}/data \ + # bash examples/grpo_trainer/run_qwen2_5_vl_3b_trtllm.sh 2 \ + # trainer.total_training_steps=1 \ + # data.train_files="['${PWD}/data/geo3k/train.parquet']" \ + # data.val_files="['${PWD}/data/geo3k/test.parquet']" \ + # trainer.logger='["console"]' \ + # actor_rollout_ref.model.path="${HOME}/models/Qwen/Qwen3-VL-2B-Instruct" - name: clean up run: | rm -rf checkpoints @@ -259,10 +275,10 @@ jobs: bash examples/grpo_trainer/run_qwen3-30b_dapo_megatron_fp8_trtllm.sh \ reward_model.reward_kwargs.overlong_buffer_cfg.len=258 \ reward_model.reward_kwargs.max_resp_len=512 \ - data.max_prompt_length=512 \ - data.max_response_length=512 \ + data.max_prompt_length=128 \ + data.max_response_length=64 \ data.train_batch_size=32 \ - actor_rollout_ref.rollout.n=4 \ + actor_rollout_ref.rollout.n=1 \ actor_rollout_ref.rollout.max_num_seqs=16 \ 
actor_rollout_ref.rollout.max_num_batched_tokens=1024 \ actor_rollout_ref.rollout.max_model_len=1024 \ @@ -276,7 +292,7 @@ jobs: cleanup: runs-on: ubuntu-latest - needs: [setup, trtllm_unit_tests, e2e_grpo_trainer_fsdp-qwen2, e2e_grpo_trainer_megatron-qwen2, e2e_grpo_trainer_fsdp-vlm] + needs: [setup, trtllm_unit_tests, e2e_grpo_trainer_fsdp-qwen2, e2e_grpo_trainer_megatron-qwen2, e2e_grpo_trainer_megatron-vlm] if: always() steps: - id: destroy-runner diff --git a/.github/workflows/e2e_ppo_trainer_megatron_sglang.yml b/.github/workflows/e2e_ppo_trainer_megatron_sglang.yml index 172ddf1d156..93bf17324a5 100644 --- a/.github/workflows/e2e_ppo_trainer_megatron_sglang.yml +++ b/.github/workflows/e2e_ppo_trainer_megatron_sglang.yml @@ -110,7 +110,6 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 60 # Increase this timeout value as needed env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" @@ -123,6 +122,7 @@ jobs: - name: Install the current repository run: | pip3 install -r requirements-test.txt + pip3 install megatron-bridge --no-deps pip3 install git+https://github.com/ISEEKYAN/mbridge.git@main --no-deps --no-build-isolation pip3 install --no-deps -e . - name: Prepare GSM8K dataset @@ -158,7 +158,6 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 60 # Increase this timeout value as needed env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" @@ -172,6 +171,7 @@ jobs: run: | pip3 install -r requirements-test.txt pip3 install --no-deps -e . 
+ pip3 install megatron-bridge --no-deps - name: Prepare GSM8K dataset run: | python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k diff --git a/.github/workflows/e2e_ppo_trainer_megatron_sglang_2.yml b/.github/workflows/e2e_ppo_trainer_megatron_sglang_2.yml index ba9d3b23545..ec90add1759 100644 --- a/.github/workflows/e2e_ppo_trainer_megatron_sglang_2.yml +++ b/.github/workflows/e2e_ppo_trainer_megatron_sglang_2.yml @@ -110,7 +110,6 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 40 # Increase this timeout value as needed env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" @@ -137,7 +136,6 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 60 # Increase this timeout value as needed env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" diff --git a/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml b/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml index 7d3577e176b..5cca191d7e9 100644 --- a/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml +++ b/.github/workflows/e2e_ppo_trainer_megatron_vllm.yml @@ -87,7 +87,7 @@ permissions: contents: read env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm017.dev2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm018.dev1" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" jobs: @@ -112,7 +112,6 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 60 # Increase this timeout value as needed env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" @@ -125,6 +124,7 @@ 
jobs: run: | pip3 install -r requirements-test.txt pip3 install --no-deps --force-reinstall . + pip3 install megatron-bridge --no-deps pip3 install git+https://github.com/ISEEKYAN/mbridge.git@main --no-deps --no-build-isolation pip3 install math-verify - name: Prepare GSM8K dataset @@ -145,9 +145,9 @@ jobs: - name: clean up and install Megatron-Bridge run: | rm -rf checkpoints - pip3 install git+https://github.com/NVIDIA-NeMo/Megatron-Bridge.git@83a7c11 --no-deps --no-build-isolation - pip3 install git+https://github.com/NVIDIA/Megatron-LM.git@5455f0a --no-deps --no-build-isolation - pip3 install "nvidia-modelopt[torch]>=0.37.0" transformers==4.57.1 + pip3 install git+https://github.com/NVIDIA-NeMo/Megatron-Bridge.git@6259ae8 --no-deps --no-build-isolation + pip3 install git+https://github.com/NVIDIA/Megatron-LM.git@7ca9dc5 --no-deps --no-build-isolation + pip3 install "nvidia-modelopt[torch]>=0.37.0" - name: Running GSM8K E2E training tests with 3D parallelism on 8 L20 GPUs with Megatron, use Megatron-Bridge LoRA e2e to pre-load and save (Deepseek) run: | ray stop --force @@ -168,7 +168,6 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 60 # Increase this timeout value as needed env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" @@ -181,6 +180,7 @@ jobs: run: | pip3 install -r requirements-test.txt pip3 install --no-deps -e . 
+ pip3 install megatron-bridge --no-deps pip3 install math-verify - name: Prepare GSM8K dataset run: | diff --git a/.github/workflows/e2e_ppo_trainer_megatron_vllm_2.yml b/.github/workflows/e2e_ppo_trainer_megatron_vllm_2.yml index 83a1faf8832..a4f26d2beb7 100644 --- a/.github/workflows/e2e_ppo_trainer_megatron_vllm_2.yml +++ b/.github/workflows/e2e_ppo_trainer_megatron_vllm_2.yml @@ -87,7 +87,7 @@ permissions: contents: read env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm017.dev2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm018.dev1" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" jobs: @@ -111,7 +111,6 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 60 # Increase this timeout value as needed env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" @@ -124,9 +123,9 @@ jobs: run: | pip3 install -r requirements-test.txt pip3 install --no-deps --force-reinstall . 
- pip3 install git+https://github.com/NVIDIA-NeMo/Megatron-Bridge.git@83a7c11 --no-deps --no-build-isolation - pip3 install git+https://github.com/NVIDIA/Megatron-LM.git@5455f0a --no-deps --no-build-isolation - pip3 install "nvidia-modelopt[torch]>=0.37.0" transformers==4.57.1 + pip3 install git+https://github.com/NVIDIA-NeMo/Megatron-Bridge.git@6259ae8 --no-deps --no-build-isolation + pip3 install git+https://github.com/NVIDIA/Megatron-LM.git@7ca9dc5 --no-deps --no-build-isolation + pip3 install "nvidia-modelopt[torch]>=0.37.0" - name: Prepare GSM8K dataset run: | python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k @@ -138,7 +137,7 @@ jobs: MAX_PROMPT_LENGTH=512 MAX_RESPONSE_LENGTH=512 \ MODEL_ID=Qwen/Qwen3-30B-A3B-Instruct-2507 USE_MBRIDGE=True VANILLA_MBRIDGE=False VALUE_VANILLA_MBRIDGE=False \ COMMON_PP=2 COMMON_VPP=null COMMON_CP=1 COMMON_TP=4 COMMON_EP=4 COMMON_ETP=1 INFER_TP=8 \ - USE_DIST_CKPT=True ALL_OFFLOAD=True SKIP_SAVE_HF_MODEL=1 bash tests/special_e2e/run_ppo_trainer_megatron.sh + USE_DIST_CKPT=False ALL_OFFLOAD=True SKIP_SAVE_HF_MODEL=1 bash tests/special_e2e/run_ppo_trainer_megatron.sh - name: Running GSM8K E2E training tests with 3D parallelism with FP8 rollout on 8 L20 GPUs with Megatron-Bridge (Qwen3-30B-A3B-Instruct-2507) run: | ray stop --force @@ -147,7 +146,7 @@ jobs: MAX_PROMPT_LENGTH=512 MAX_RESPONSE_LENGTH=512 \ MODEL_ID=Qwen/Qwen3-30B-A3B-Instruct-2507 USE_MBRIDGE=True VANILLA_MBRIDGE=False VALUE_VANILLA_MBRIDGE=False \ COMMON_PP=2 COMMON_VPP=null COMMON_CP=1 COMMON_TP=4 COMMON_EP=4 COMMON_ETP=1 INFER_TP=2 \ - USE_DIST_CKPT=True ALL_OFFLOAD=True SKIP_SAVE_HF_MODEL=1 ROLLOUT_QUANTIZATION=fp8 bash tests/special_e2e/run_ppo_trainer_megatron.sh + USE_DIST_CKPT=False ALL_OFFLOAD=True SKIP_SAVE_HF_MODEL=1 ROLLOUT_QUANTIZATION=fp8 bash tests/special_e2e/run_ppo_trainer_megatron.sh - name: clean up run: | rm -rf checkpoints @@ -169,7 +168,6 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 
'L20x8' }}"] timeout-minutes: 60 # Increase this timeout value as needed env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" @@ -252,7 +250,6 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 40 # Increase this timeout value as needed env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" diff --git a/.github/workflows/e2e_ppo_trainer_megatron_vllm_2_ascend.yml b/.github/workflows/e2e_ppo_trainer_megatron_vllm_2_ascend.yml index d0abdcc60e3..878f087651c 100644 --- a/.github/workflows/e2e_ppo_trainer_megatron_vllm_2_ascend.yml +++ b/.github/workflows/e2e_ppo_trainer_megatron_vllm_2_ascend.yml @@ -113,7 +113,6 @@ jobs: clean: true - name: Install the current repository run: | - pip install -r requirements-npu.txt pip install --no-deps -e . - name: Check final pip list run: | @@ -190,7 +189,6 @@ jobs: clean: true - name: Install the current repository run: | - pip install -r requirements-npu.txt pip install --no-deps -e . 
pip install trl==0.26.0 - name: Check final pip list diff --git a/.github/workflows/e2e_ppo_trainer_veomni_vllm.yml b/.github/workflows/e2e_ppo_trainer_veomni_vllm.yml index 0accafd58e8..c02329dd025 100644 --- a/.github/workflows/e2e_ppo_trainer_veomni_vllm.yml +++ b/.github/workflows/e2e_ppo_trainer_veomni_vllm.yml @@ -81,7 +81,7 @@ permissions: contents: read env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm017.dev2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm018.dev1" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" jobs: @@ -105,7 +105,6 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 60 # Increase this timeout value as needed env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" diff --git a/.github/workflows/e2e_sft_llm.yml b/.github/workflows/e2e_sft_llm.yml index 435a0a626db..3af3caf9f1f 100644 --- a/.github/workflows/e2e_sft_llm.yml +++ b/.github/workflows/e2e_sft_llm.yml @@ -92,7 +92,6 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 30 # Increase this timeout value as needed env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" diff --git a/.github/workflows/e2e_sft_llm_ascend.yml b/.github/workflows/e2e_sft_llm_ascend.yml index 3919da747a9..3f08a87d16e 100644 --- a/.github/workflows/e2e_sft_llm_ascend.yml +++ b/.github/workflows/e2e_sft_llm_ascend.yml @@ -95,8 +95,7 @@ jobs: clean: true - name: Install the current repository run: | - pip install -r requirements-npu.txt - pip install -e . + pip install --no-deps -e . 
pip install git+https://github.com/ByteDance-Seed/VeOmni.git@v0.1.4 pip install pandas==2.3.3 pip uninstall -y mbridge diff --git a/.github/workflows/e2e_sft_vlm.yml b/.github/workflows/e2e_sft_vlm.yml index 93d02c83c8c..e00385a15c5 100644 --- a/.github/workflows/e2e_sft_vlm.yml +++ b/.github/workflows/e2e_sft_vlm.yml @@ -92,7 +92,6 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 30 # Increase this timeout value as needed env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" diff --git a/.github/workflows/gpu_unit_tests.yml b/.github/workflows/gpu_unit_tests.yml index d3b8c714bc9..23e6258c982 100644 --- a/.github/workflows/gpu_unit_tests.yml +++ b/.github/workflows/gpu_unit_tests.yml @@ -52,6 +52,7 @@ on: - "**/*.py" # Other entrypoints - "!examples/**" + - "!tests/special_sanity/**" - "!verl/trainer/main_*.py" - "!verl/trainer/fsdp_sft_trainer.py" # Entrypoints @@ -95,7 +96,6 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 60 # Increase this timeout value as needed env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1" HF_HUB_ENABLE_HF_TRANSFER: 1 diff --git a/.github/workflows/model.yml b/.github/workflows/model.yml index 5522ba71466..6202f1dbf0f 100644 --- a/.github/workflows/model.yml +++ b/.github/workflows/model.yml @@ -62,7 +62,7 @@ concurrency: cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm017.dev2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm018.dev1" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" jobs: @@ -86,7 +86,6 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 20 # Increase this timeout value as needed env: - 
HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" @@ -95,11 +94,10 @@ jobs: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 - - name: Install the current repository and upgrade to latest transformers(4.54.0)/flash_attn, transformers 4.55.0 has strange behavior with model backward + - name: Install the current repository and upgrade to latest transformers run: | pip3 install -r requirements-test.txt pip3 install --no-deps -e . - pip3 install --upgrade "transformers<5.0.0" - name: Running rmpad model tests on 8 L20 GPUs + flash_attn 2.5.8 run: | pytest -s tests/models/test_transformer.py @@ -119,6 +117,9 @@ jobs: - name: Run distributed test run: | bash tests/special_distributed/run_all.sh + - name: Clean up and recover transformers + run: | + pip3 install --upgrade "transformers==5.3.0" # TODO: Move this back to model_rmpad once FSDP2 is stable. # NOTE: List as an independent job to make rerun easier. @@ -127,7 +128,6 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 20 # Increase this timeout value as needed env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" @@ -149,7 +149,6 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 20 # Increase this timeout value as needed env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" @@ -162,15 +161,21 @@ jobs: run: | pip3 install -r requirements-test.txt pip3 install --no-deps -e . 
+ pip3 install --upgrade "accelerate>=1.13.0" - name: Download model config files run: | hf download Qwen/Qwen2.5-0.5B-Instruct --local-dir $HOME/models/Qwen/Qwen2.5-0.5B-Instruct - - name: Running mcore engine tests on 8 L20 GPUs run: | ray stop --force pytest -s -x tests/models/test_engine.py + - name: Running diffusers FSDP engine tests on 8 L20 GPUs + run: | + ray stop --force + pip3 install diffusers==0.37.0 + pytest -s -x tests/models/test_diffusers_fsdp_engine.py + cleanup: runs-on: ubuntu-latest needs: [setup, model_rmpad, model_rmpad_fsdp2_unstable, model_engine] diff --git a/.github/workflows/model_ascend.yml b/.github/workflows/model_ascend.yml index a5ab7620ee3..f797cca5c94 100644 --- a/.github/workflows/model_ascend.yml +++ b/.github/workflows/model_ascend.yml @@ -87,7 +87,6 @@ jobs: clean: true - name: Install the current repository run: | - pip install -r requirements-npu.txt pip install --no-deps -e .[test] - name: Check final pip list run: | @@ -127,7 +126,6 @@ jobs: fetch-depth: 0 - name: Install the current repository run: | - pip install -r requirements-npu.txt pip install --no-deps -e .[test] - name: Prepare weights run: | diff --git a/.github/workflows/nightly_ascend.yml b/.github/workflows/nightly_ascend.yml index c74ea4ba7d9..451f224b906 100644 --- a/.github/workflows/nightly_ascend.yml +++ b/.github/workflows/nightly_ascend.yml @@ -71,7 +71,6 @@ jobs: clean: true - name: Install the current repository run: | - pip install -r requirements-npu.txt pip install --no-deps -e . - name: Check final pip list run: | @@ -114,7 +113,6 @@ jobs: clean: true - name: Install the current repository run: | - pip install -r requirements-npu.txt pip install --no-deps -e . - name: Check final pip list run: | @@ -157,7 +155,6 @@ jobs: clean: true - name: Install the current repository run: | - pip install -r requirements-npu.txt pip install --no-deps -e . 
- name: Check final pip list run: | @@ -172,3 +169,54 @@ jobs: run: | ray stop --force bash tests/special_npu/nightly_ci_ascend/run_grpo_qwen25-vl-3b-instruct_fsdp_npu.sh + + # Test dapo moonlight-16b megatron vllm + nightlyCI_dapo-moonlight-16b-megatron-vllm_ascend: + if: github.repository_owner == 'verl-project' + runs-on: linux-aarch64-a2b3-8 + timeout-minutes: 180 # Increase this timeout value as needed + container: + image: swr.ap-southeast-1.myhuaweicloud.com/base_image/ascend-ci/verl/verl:verl-8.5.0-910b-ubuntu22.04-py3.11-latest + options: >- + --shm-size 16g + env: + HF_ENDPOINT: "https://hf-mirror.com" + HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable + steps: + - name: Check npu and CANN info + run: | + cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info + npu-smi info + - name: Check initial pip list from image + run: | + pip list + - name: Checkout verl-project/verl repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + submodules: recursive + clean: true + - name: Install the current repository + run: | + pip install -r requirements-npu.txt + pip install --no-deps -e . 
+ - name: Check final pip list + run: | + pip list + - name: Prepare weights + run: | + ln -s /root/.cache/models ~/models + - name: Preprocess geo3k dataset + run: | + python examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/.cache/datasets/openai/gsm8k + - name: update mbridge + run: | + # get mbridge path + MBRIDGE_PATH=$(pip show mbridge | grep Location | awk '{print $2}') + # cuda to npu + TARGET_FILE="${MBRIDGE_PATH}/mbridge/models/ext/deepseek_v3/dequant_fp8_safetensor_io.py" + sed -i '34s/cuda/npu/;51s/cuda/npu/' "$TARGET_FILE" + - name: Running nightlyCI_dapo-moonlight-16b-megatron-vllm_ascend + run: | + ray stop --force + bash tests/special_npu/nightly_ci_ascend/run_dapo_moonlight-16b_megatron_npu.sh diff --git a/.github/workflows/npu_unit_tests.yml b/.github/workflows/npu_unit_tests.yml index 7f678409da0..87a119598bb 100644 --- a/.github/workflows/npu_unit_tests.yml +++ b/.github/workflows/npu_unit_tests.yml @@ -52,6 +52,7 @@ on: - "**/*.py" # Other entrypoints - "!examples/**" + - "!tests/special_sanity/**" - "!verl/trainer/main_*.py" - "!verl/trainer/fsdp_sft_trainer.py" - "!recipe/**" @@ -97,7 +98,6 @@ jobs: clean: true - name: Install the current repository run: | - pip install -r requirements-npu.txt pip install --no-deps -e .[test] pip install mlflow pytest-asyncio - name: Check final pip list diff --git a/.github/workflows/reward_model_sglang.yml b/.github/workflows/reward_model_sglang.yml index c9a4e9804a0..965996e9c43 100644 --- a/.github/workflows/reward_model_sglang.yml +++ b/.github/workflows/reward_model_sglang.yml @@ -83,7 +83,6 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 30 # Increase this timeout value as needed env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" @@ -106,20 +105,17 @@ jobs: python3 examples/data_preprocess/gsm8k.py --local_dataset_path 
${HOME}/models/hf_data/gsm8k --local_dir ${HOME}/data/gsm8k - name: Running sglang generative reward model tests on 8 L20 GPUs run: | - unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY ROLLOUT_NAME=sglang pytest -s -x tests/experimental/reward_loop/test_reward_model_genrm.py - name: Running sglang discriminative reward model tests on 8 L20 GPUs run: | - unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY ROLLOUT_NAME=sglang pytest -s -x tests/experimental/reward_loop/test_reward_model_disrm.py - - name: Running sglang agent loop with reward manager tests on 8 L20 GPUs - run: | - unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY - ROLLOUT_NAME=sglang pytest -s -x tests/experimental/reward_loop/test_agent_reward_loop_standalone.py - - name: Running sglang agent loop with reward model colocate tests on 8 L20 GPUs - run: | - unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY - ROLLOUT_NAME=sglang pytest -s -x tests/experimental/reward_loop/test_agent_reward_loop_colocate.py + # FIXME(@yyDing1): broken + # - name: Running sglang agent loop with reward manager tests on 8 L20 GPUs + # run: | + # ROLLOUT_NAME=sglang pytest -s -x tests/experimental/reward_loop/test_agent_reward_loop_standalone.py + # - name: Running sglang agent loop with reward model colocate tests on 8 L20 GPUs + # run: | + # ROLLOUT_NAME=sglang pytest -s -x tests/experimental/reward_loop/test_agent_reward_loop_colocate.py cleanup: runs-on: ubuntu-latest diff --git a/.github/workflows/reward_model_vllm.yml b/.github/workflows/reward_model_vllm.yml index aebde06984f..0c99e62951d 100644 --- a/.github/workflows/reward_model_vllm.yml +++ b/.github/workflows/reward_model_vllm.yml @@ -59,7 +59,7 @@ permissions: contents: read env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm017.dev2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm018.dev1" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" jobs: @@ -83,7 +83,6 @@ jobs: runs-on: 
["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 30 # Increase this timeout value as needed env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" @@ -105,21 +104,22 @@ jobs: python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k --local_dir ${HOME}/data/gsm8k - name: Running vllm generative reward model tests on 8 L20 GPUs run: | - unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY ROLLOUT_NAME=vllm pytest -s -x tests/experimental/reward_loop/test_reward_model_genrm.py - name: Running vllm discriminative reward model tests on 8 L20 GPUs run: | - unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY ROLLOUT_NAME=vllm pytest -s -x tests/experimental/reward_loop/test_reward_model_disrm.py - name: Running vllm agent loop with reward manager tests on 8 L20 GPUs run: | - unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY ROLLOUT_NAME=vllm pytest -s -x tests/experimental/reward_loop/test_agent_reward_loop_standalone.py - name: Running vllm agent loop with reward model colocate tests on 8 L20 GPUs run: | - unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY ROLLOUT_NAME=vllm pytest -s -x tests/experimental/reward_loop/test_agent_reward_loop_colocate.py + - name: Running vllm agent loop with image reward manager tests on 2 L20 GPUs + run: | + pip3 install python-Levenshtein + unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY + ROLLOUT_NAME=vllm pytest -s -x tests/experimental/reward_loop/test_visual_reward_manager.py cleanup: runs-on: ubuntu-latest diff --git a/.github/workflows/reward_model_vllm_ascend.yml b/.github/workflows/reward_model_vllm_ascend.yml index b57aa97c73b..676144a0ef2 100644 --- a/.github/workflows/reward_model_vllm_ascend.yml +++ b/.github/workflows/reward_model_vllm_ascend.yml @@ -85,7 +85,6 @@ jobs: clean: true - name: Install the current repository run: | - pip install -r 
requirements-npu.txt pip install --no-deps -e .[test] - name: Check final pip list run: | diff --git a/.github/workflows/sanity.yml b/.github/workflows/sanity.yml index ac7532d2f04..b9ec384f400 100644 --- a/.github/workflows/sanity.yml +++ b/.github/workflows/sanity.yml @@ -76,33 +76,11 @@ jobs: pip3 install -r requirements.txt pip3 install -r requirements-test.txt pip3 install --no-deps -e . + # Most sanity checks now run via pre-commit (see .pre-commit-config.yaml + # and .github/workflows/pre-commit.yml). Only checks that need the full + # verl installation or CI-only context remain here. - name: Run sanity test run: | pytest -s -x tests/special_sanity - - name: Run license test - run: | - python3 tests/special_sanity/check_license.py --directories . - - name: Assert naming convention - run: | - if grep -rIn --exclude-dir=.git --exclude-dir=.github --exclude-dir=venv --exclude-dir=__pycache__ 'veRL' .; then - echo "Please use verl instead of veRL in the codebase" - exit 1 - fi - - name: Assert SGLang naming convention - run: | - if grep -rIn --exclude-dir=.git --exclude-dir=.github --exclude-dir=venv --exclude-dir=__pycache__ --exclude=ascend_sglang_best_practices.rst -E 'Sglang|sgLang|sglAng|sglaNg|sglanG' .; then - echo "Please use SGLang or sglang as the formal name of SGLang rollout engine" - exit 1 - fi - - name: Validate test folder structure - run: python3 tests/special_sanity/validate_structure.py - name: Assert documentation requirement for functions run: python3 tests/special_sanity/validate_imported_docs.py - - name: Assert device api usage in verl/verl - run: python3 tests/special_sanity/check_device_api_usage.py --directory ./verl - - name: Assert documentation time info - run: python3 tests/special_sanity/check_docs_time_info.py - - name: Check docstrings for specified files - run: python3 tests/special_sanity/check_docstrings.py - - name: Check DataProto for specified folders - run: python3 tests/special_sanity/check_dataproto_usage.py -d 
./verl/workers/engine diff --git a/.github/workflows/sgl.yml b/.github/workflows/sgl.yml index bc0c0bb7f4a..1286d8a23f6 100644 --- a/.github/workflows/sgl.yml +++ b/.github/workflows/sgl.yml @@ -99,7 +99,6 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 35 # Increase this timeout value as needed env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" @@ -130,7 +129,6 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 35 # Increase this timeout value as needed env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" diff --git a/.github/workflows/vllm.yml b/.github/workflows/vllm.yml index d358349f72c..8f0ad1626bd 100644 --- a/.github/workflows/vllm.yml +++ b/.github/workflows/vllm.yml @@ -72,7 +72,7 @@ permissions: contents: read env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm017.dev2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm018.dev1" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" jobs: @@ -96,7 +96,6 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 35 # Increase this timeout value as needed env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" @@ -109,7 +108,6 @@ jobs: run: | pip3 install -r requirements-test.txt pip3 install --no-deps -e . 
- pip3 install --upgrade "transformers<5.0" # - name: Download Model to Use # run: | # hf download Qwen/Qwen2.5-0.5B-Instruct --local-dir ${HOME}/models/Qwen/Qwen2.5-0.5B-Instruct @@ -123,7 +121,7 @@ jobs: python3 examples/data_preprocess/gsm8k.py --local_dataset_path ${HOME}/models/hf_data/gsm8k - name: Test the latest vLLM Rollout async with agent loop run: | - ROLLOUT_NAME=vllm pytest -svvv tests/experimental/agent_loop + ROLLOUT_NAME=vllm pytest -svvv tests/experimental/agent_loop -m "not vllm_omni" - name: Test vllm server abort functionality run: | pytest tests/workers/rollout/rollout_vllm/test_vllm_abort.py -v -s @@ -133,7 +131,6 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 35 # Increase this timeout value as needed env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" @@ -147,7 +144,6 @@ jobs: pip3 install pytest-asyncio pip3 install -r requirements-test.txt pip3 install --no-deps -e . 
- pip3 install --upgrade "transformers<5.0" pip3 install cupy-cuda12x==13.6.0 - name: Test vLLM ServerAdapter with Checkpoint Engine (NCCL) run: | diff --git a/.github/workflows/vllm_omni.yml b/.github/workflows/vllm_omni.yml index ee9365e265a..0b23be14a9d 100644 --- a/.github/workflows/vllm_omni.yml +++ b/.github/workflows/vllm_omni.yml @@ -46,6 +46,8 @@ on: - "**/*.py" # Other entrypoints - "!examples/**" + + - "examples/flowgrpo_trainer/**" - "!tests/**" - "!verl/trainer/main_*.py" - "!verl/trainer/fsdp_sft_trainer.py" @@ -58,8 +60,8 @@ on: # Entrypoints - ".github/workflows/vllm_omni.yml" - "tests/workers/rollout/rollout_vllm/test_vllm_omni_generate.py" + - "tests/experimental/agent_loop/test_diffusion_agent_loop.py" - "verl/workers/rollout/vllm_rollout/vllm_omni_async_server.py" - - "verl/models/diffusion/**" # Cancel jobs on the same ref if a new one is triggered concurrency: @@ -71,7 +73,7 @@ permissions: contents: read env: - IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm017.dev2" + IMAGE: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm018.dev1" DYNAMIC_RUNNER_ENDPOINT: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" jobs: @@ -95,7 +97,6 @@ jobs: runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"] timeout-minutes: 35 # Increase this timeout value as needed env: - HTTP_PROXY: ${{ secrets.PROXY_HTTP }} HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }} NO_PROXY: "localhost,127.0.0.1,hf-mirror.com" HF_ENDPOINT: "https://hf-mirror.com" @@ -110,11 +111,15 @@ jobs: pip3 install --no-deps -e . 
- name: Install vllm-omni run: | - pip3 install git+https://github.com/vllm-project/vllm-omni.git@3158e9d28f1286119756604e9c40292cee4808d3 + pip3 install git+https://github.com/vllm-project/vllm-omni.git@a90a769 - name: Test vLLM Omni generate run: | ray stop --force pytest tests/workers/rollout/rollout_vllm/test_vllm_omni_generate.py -v -s + - name: Test diffusion agent loop + run: | + ray stop --force + pytest tests/experimental/agent_loop/test_diffusion_agent_loop.py -v -s cleanup: runs-on: ubuntu-latest diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7f836f50ff9..7417fe74b55 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -20,24 +20,48 @@ repos: language: script pass_filenames: false - - repo: local - hooks: + - id: check-docs-time-info + name: Check docs Last updated info + entry: python3 tests/special_sanity/check_docs_time_info.py + language: python + pass_filenames: false + - id: check-docstrings name: Check doc string coverage entry: python3 tests/special_sanity/check_docstrings.py language: python pass_filenames: false - - repo: local - hooks: - id: check-license name: Check license - entry: python3 tests/special_sanity/check_license.py --directories examples scripts tests verl setup.py + entry: python3 tests/special_sanity/check_license.py --directories . 
language: python pass_filenames: false - - repo: local - hooks: + - id: check-device-api-usage + name: Check device API usage + entry: python3 tests/special_sanity/check_device_api_usage.py --directory ./verl + language: python + pass_filenames: false + + - id: check-dataproto-usage + name: Check DataProto usage + entry: python3 tests/special_sanity/check_dataproto_usage.py -d ./verl/workers/engine + language: python + pass_filenames: false + + - id: validate-structure + name: Validate test structure + entry: python3 tests/special_sanity/validate_structure.py + language: python + pass_filenames: false + + - id: check-naming-conventions + name: Check naming conventions + entry: sh -c 'fail=0; if grep -rIn --exclude-dir=.git --exclude-dir=.github --exclude-dir=venv --exclude-dir=__pycache__ --exclude=.pre-commit-config.yaml "veRL" .; then echo "Please use verl instead of veRL"; fail=1; fi; if grep -rIn --exclude-dir=.git --exclude-dir=.github --exclude-dir=venv --exclude-dir=__pycache__ --exclude=ascend_sglang_best_practices.rst --exclude=.pre-commit-config.yaml -E "Sglang|sgLang|sglAng|sglaNg|sglanG" .; then echo "Please use SGLang or sglang"; fail=1; fi; exit $fail' + language: system + pass_filenames: false + - id: compileall name: Compile all python files entry: sh -c 'PYTHONWARNINGS=error python3 -m compileall -q . -x "(^|[\\/])(\.venv|venv|\.git)([\\/]|$)"' diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 00000000000..f44706f26ed --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,87 @@ +# Agent Instructions for verl + +> These instructions apply to **all** AI-assisted contributions to `verl-project/verl`. +> Breaching these guidelines can result in automatic banning. + +## 1. 
Contribution Policy (Mandatory) + +### Duplicate-work checks + +Before proposing a PR, run these checks: + +```bash +gh issue view --repo verl-project/verl --comments +gh pr list --repo verl-project/verl --state open --search " in:body" +gh pr list --repo verl-project/verl --state open --search "" +``` + +- If an open PR already addresses the same fix, do not open another. +- If your approach is materially different, explain the difference in the issue. + +### No low-value busywork PRs + +Do not open one-off PRs for tiny edits (single typo, isolated style change, one mutable default, etc.). Mechanical cleanups are acceptable only when bundled with substantive work. + +### Accountability + +- Pure code-agent PRs are **not allowed**. A human submitter must understand and defend the change end-to-end. +- The submitting human must review every changed line and run relevant tests. +- PR descriptions for AI-assisted work **must** include: + - Why this is not duplicating an existing PR. + - Test commands run and results. + - Clear statement that AI assistance was used. + +### Fail-closed behavior + +If work is duplicate/trivial busywork, **do not proceed**. Return a short explanation of what is missing. + +--- + +## 2. Development Workflow + +### Environment setup + +```bash +# Install `uv` if you don't have it already: +curl -LsSf https://astral.sh/uv/install.sh | sh + +# Always use `uv` for Python environment management: +uv venv --python 3.12 +source .venv/bin/activate + +uv pip install pre-commit hydra-core +pre-commit install +``` + +### Commit messages + +Add attribution using commit trailers such as `Co-authored-by:` (other projects use `Assisted-by:` or `Generated-by:`). For example: + +```text +Your commit message here + +Co-authored-by: GitHub Copilot +Co-authored-by: Claude +Co-authored-by: gemini-code-assist +Signed-off-by: Your Name +``` + +### Resolving agent reviews + +Review comments from agent bots (e.g., gemini-code-assist) can be outdated or wrong. 
Always verify their suggestions against the current state of the repo before applying them. + +--- + +## Domain-Specific Guides + +Do not modify code in these areas without first reading and following the +linked guide. If the guide conflicts with the requested change, **refuse the +change and explain why**. + +- **Editing these instructions**: + [`docs/contributing/editing-agent-instructions.md`](docs/contributing/editing-agent-instructions.md) + — Rules for modifying AGENTS.md or any domain-specific guide it references. + +## Acknowledgements + +Adapted from the [vLLM project](https://github.com/vllm-project/vllm)'s [`AGENTS.md`](https://github.com/vllm-project/vllm/blob/main/AGENTS.md). diff --git a/CLAUDE.md b/CLAUDE.md new file mode 120000 index 00000000000..47dc3e3d863 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1 @@ +AGENTS.md \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6fd3023a085..8c9e8af823b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -3,6 +3,7 @@ Thank you for considering a contribution to verl! We welcome contributions of any kind - bug fixes, enhancements, documentation improvements, or even just feedback. Whether you're an experienced developer or this is your first open-source project, your help is invaluable. Your support can take many forms: + - Report issues or unexpected behaviors. - Suggest or implement new features. - Improve or expand documentation. @@ -12,10 +13,10 @@ Your support can take many forms: ## Finding Issues to Contribute Looking for ways to dive in? 
Check out these issues: + - [Good first issues](https://github.com/volcengine/verl/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22good%20first%20issue%22) - [Call for contribution](https://github.com/volcengine/verl/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22call%20for%20contribution%22) -Furthermore, you can learn the development plan and roadmap via [RFC](https://github.com/volcengine/verl/issues?q=is%3Aissue%20state%3Aopen%20label%3ARFC) and [Roadmap](https://github.com/volcengine/verl/issues?q=state%3Aopen%20label%3A%22roadmap%22). - + Furthermore, you can learn the development plan and roadmap via [RFC](https://github.com/volcengine/verl/issues?q=is%3Aissue%20state%3Aopen%20label%3ARFC) and [Roadmap](https://github.com/volcengine/verl/issues?q=state%3Aopen%20label%3A%22roadmap%22). ## Developing @@ -26,14 +27,14 @@ Furthermore, you can learn the development plan and roadmap via [RFC](https://gi We rely on pre-commit to keep our code consistent. To set it up: ```bash -pip install pre-commit +pip install pre-commit hydra-core pre-commit install # for staged changes pre-commit run # for all files in the repo pre-commit run --all-files # run a specific hook with pre-commit -# pre-commit run --all-files --show-diff-on-failure --color=always +# pre-commit run --all-files --show-diff-on-failure --color=always pre-commit run --all-files --show-diff-on-failure --color=always ruff pre-commit run --all-files --show-diff-on-failure --color=always autogen-trainer-cfg ``` @@ -41,6 +42,7 @@ pre-commit run --all-files --show-diff-on-failure --color=always autogen-trainer ## Testing Our test suites run on GitHub Actions. 
Check these workflows for details: + - [GPU unit tests](https://github.com/volcengine/verl/blob/main/.github/workflows/gpu_unit_tests.yml) - [CPU unit tests](https://github.com/volcengine/verl/blob/main/.github/workflows/cpu_unit_tests.yml) - [vLLM tests](https://github.com/volcengine/verl/blob/main/.github/workflows/vllm.yml) @@ -55,6 +57,7 @@ If possible, please add CI test(s) for your new feature: 3. Minimize the workload of the test script(s) (see existing scripts for examples). ## Building the Docs + ``` # Ensure verl is on your PYTHONPATH, e.g.: pip install -e .[test] @@ -70,16 +73,25 @@ make html # Preview locally python -m http.server -d _build/html/ ``` + Open your browser at http://localhost:8000 to explore the docs. ## Pull Requests & Code Reviews Thanks for submitting a PR! To streamline reviews: + - Follow our Pull Request Template for title format and checklist. - Adhere to our pre-commit lint rules and ensure all checks pass. - Update docs for any user-facing changes. - Add or update tests in the CI workflows, or explain why tests aren't applicable. +## AI-Assisted Contributions + +See + +- [`AGENTS.md`](AGENTS.md) for rules that all AI coding agents must follow +- [`editing-agent-instructions.md`](docs/contributing/editing-agent-instructions.md) for guidelines on editing agent instructions. + ## License See the [LICENSE](https://github.com/volcengine/verl/blob/main/LICENSE) file for full details. @@ -87,4 +99,3 @@ See the [LICENSE](https://github.com/volcengine/verl/blob/main/LICENSE) file for ## Thank You We appreciate your contributions to verl. Your efforts help make the project stronger and more user-friendly. Happy coding! 
- diff --git a/README.md b/README.md index 071c65a63e8..5820d5b5af4 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,8 @@ verl is fast with: ## News +- [2026/04] verl's Megatron backend LoRA and router replay support is showcased at [PyTorch Conference Europe 2026](https://pytorchconferenceeu2026.sched.com/event/2Juce/optimizing-reinforcement-learning-at-trillion-parameter-scale-songlin-jiang-aalto-university-mind-lab). +- [2026/03] verl is presented at NVIDIA GTC26: [session#1](https://www.nvidia.com/en-us/on-demand/session/gtc26-S81829/), [session#2](https://www.nvidia.com/en-us/on-demand/session/gtc26-S81620/) - [2026/01] verl has been migrated to the [verl-project](https://github.com/verl-project) - [2026/01] verl first meetup was successfully held in Shanghai on 01/10, hosted by Volcengine and NVIDIA, the slides has been uploaded to [verl-data](https://github.com/verl-project/verl-data). - [2026/01] The `recipe` directory has been migrated to a dedicated repository: [verl-recipe](https://github.com/verl-project/verl-recipe) and added as a submodule. See https://github.com/volcengine/verl/pull/4795. It can be used as it was after `git submodule update --init --recursive recipe`. Note that [`transfer_queue`](verl/experimental/transfer_queue), [`fully_async_policy`](verl/experimental/fully_async_policy), [`one_step_off_policy`](verl/experimental/one_step_off_policy) and [`vla`](verl/experimental/vla) are kept under [`verl/experimental`](verl/experimental) since they are planned to be merged into the main library. Use them through `verl.experimental.{module}`. 
diff --git a/config/plain.yaml b/config/plain.yaml new file mode 100644 index 00000000000..fe0c6d14d73 --- /dev/null +++ b/config/plain.yaml @@ -0,0 +1,129 @@ +hydra: + searchpath: + - file://verl/trainer/config + +defaults: + - ppo_megatron_trainer + - _self_ + +algorithm: + adv_estimator: grpo + use_kl_in_reward: false + +data: + train_files: /from_s3/dataset/train.parquet + val_files: /from_s3/dataset/test.parquet + train_batch_size: 128 + max_prompt_length: 512 + max_response_length: 2048 + filter_overlong_prompts: true + truncation: error + return_raw_chat: true + +actor_rollout_ref: + model: + path: /from_s3/models + use_fused_kernels: false # true + use_remove_padding: false + trust_remote_code: true + + actor: + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 2048 + ppo_mini_batch_size: 16 + ppo_micro_batch_size_per_gpu: 1 + optim: + lr: 1e-6 + use_kl_loss: false + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + entropy_coeff: 0 + + # USE_LEGACY_WORKER_IMPL=enable path from the bash snippet + router_replay: + # mode: R3 + mode: disabled + record_file: null # Path for recording routing decisions + replay_file: null # Path for replaying recorded decisions + + megatron: + param_offload: true + optimizer_offload: true + grad_offload: true + pipeline_model_parallel_size: 1 + # tensor_model_parallel_size: 4 + # expert_model_parallel_size: 4 + # expert_tensor_parallel_size: 2 + + tensor_model_parallel_size: 1 + expert_model_parallel_size: 1 + expert_tensor_parallel_size: 1 + + use_mbridge: true + override_transformer_config: + # moe_enable_deepep: true + # moe_token_dispatcher_type: flex + apply_rope_fusion: false # true + bias_activation_fusion: false # true + # moe_router_dtype: fp32 + recompute_method: uniform + recompute_granularity: full + recompute_num_layers: 1 + gradient_accumulation_fusion: false # true + moe_permute_fusion: false + +# Using a large number of experts (e.g. >=32) without fp32 routing. 
Consider enabling moe_router_dtype for better numerical stability. [repeated 7x across cluster] +# UserWarning: moe_enable_deepep is deprecated.Please use --moe-flex-dispatcher-backend=deepep instead. [repeated 7x across cluster] + + rollout: + name: vllm + mode: async + temperature: 1.0 + tensor_model_parallel_size: 1 + gpu_memory_utilization: 0.75 + n: 8 + + enable_chunked_prefill: true # not tested yet with true + enable_prefix_caching: true # not tested yet with true + enforce_eager: false # not tested yet with false + + max_num_batched_tokens: 1024 + skip_tokenizer_init: true + enable_rollout_routing_replay: false # not tested yet with true + log_prob_use_dynamic_bsz: false + log_prob_micro_batch_size_per_gpu: 1 + log_prob_max_token_len_per_gpu: 2048 + + load_format: "safetensors" + + disable_log_stats: false + prometheus: + enable: true + + speculative_decoding: + enable: true + + method: EAGLE3 + + num_steps: 1 + num_draft_tokens: 4 + + draft_model_path: /from_s3/eagle_vllm + draft_tensor_parallel_size: 1 + + +trainer: + critic_warmup: 0 + logger: ["console", "clearml"] + project_name: verl_megatron_moe + experiment_name: cuda-graphs-eagle3-4tokens-qwen3-lighteval-MATH + nnodes: 1 + n_gpus_per_node: 8 + save_freq: -1 + test_freq: 10 + total_training_steps: 50000 + balance_batch: false + use_legacy_worker_impl: enable + val_before_train: false + log_val_generations: 10 + # rollout_data_dir: /from_s3/train_rollout_data/ diff --git a/docker/Dockerfile.stable.sglang b/docker/Dockerfile.stable.sglang index 35a2926f824..6838efbd632 100644 --- a/docker/Dockerfile.stable.sglang +++ b/docker/Dockerfile.stable.sglang @@ -50,4 +50,4 @@ RUN apt-get update && \ libcudnn9-cuda-12=9.16.0.29-1 \ libcudnn9-dev-cuda-12=9.16.0.29-1 \ libcudnn9-headers-cuda-12=9.16.0.29-1 && \ - rm -rf /var/lib/apt/lists/* + rm -rf /var/lib/apt/lists/* \ No newline at end of file diff --git a/docker/Dockerfile.stable.vllm b/docker/Dockerfile.stable.vllm index f3e399d2add..e64ebb98bcd 100644 --- 
a/docker/Dockerfile.stable.vllm +++ b/docker/Dockerfile.stable.vllm @@ -1,9 +1,13 @@ -# vllm017 +# vllm018 -FROM nvidia/cuda:12.9.1-devel-ubuntu22.04 +FROM nvidia/cuda:12.9.1-devel-ubuntu24.04 ARG DEBIAN_FRONTEND=noninteractive ARG PIP_NO_CACHE_DIR=1 +# PEP 668: Ubuntu 24.04 blocks system-wide pip installs; override for Docker +ENV PIP_BREAK_SYSTEM_PACKAGES=1 + +RUN sed -i "s@http://.*archive.ubuntu.com@https://mirrors.tuna.tsinghua.edu.cn@g" /etc/apt/sources.list.d/ubuntu.sources RUN apt-get update && apt-get install -y \ git \ @@ -16,7 +20,6 @@ RUN apt-get update && apt-get install -y \ numactl \ software-properties-common \ vim && \ - add-apt-repository ppa:deadsnakes/ppa -y && \ apt-get update && \ apt-get install -y \ python3.12 \ @@ -30,11 +33,9 @@ RUN wget https://bootstrap.pypa.io/get-pip.py && \ RUN ln -sf /usr/bin/python3.12 /usr/bin/python3 && \ ln -sf /usr/bin/python3.12 /usr/bin/python -RUN pip install torch==2.10.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu129 - -RUN pip install vllm==0.17.0 +RUN pip install torch==2.10.0 torchvision==0.25.0 torchaudio==2.10.0 --index-url https://download.pytorch.org/whl/cu129 -RUN pip install pybind11 +RUN pip install pybind11 wheel RUN wget https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/cuda-keyring_1.1-1_all.deb && \ dpkg -i cuda-keyring_1.1-1_all.deb && \ @@ -42,20 +43,18 @@ RUN wget https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_6 apt-get -y install cudnn && \ rm -rf /var/lib/apt/lists/* -RUN pip install nvidia-mathdx +RUN pip install nvidia-mathdx ninja -RUN MAX_JOBS=128 pip install -v --disable-pip-version-check --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git +RUN MAX_JOBS=256 pip install -v --disable-pip-version-check --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" 
git+https://github.com/NVIDIA/apex.git RUN export NVTE_FRAMEWORK=pytorch && \ - MAX_JOBS=128 \ + MAX_JOBS=256 \ NVTE_BUILD_THREADS_PER_JOB=4 \ pip3 install --resume-retries 999 --no-build-isolation git+https://github.com/NVIDIA/TransformerEngine.git@release_v2.12 -# RUN pip install --upgrade transformers tokenizers - RUN pip install codetiming mathruler pylatexenc qwen_vl_utils cachetools pytest-asyncio -RUN export FLASH_ATTENTION_FORCE_BUILD="TRUE" && MAX_JOBS=16 pip install --no-build-isolation flash_attn==2.8.3 +RUN export FLASH_ATTENTION_FORCE_BUILD="TRUE" && MAX_JOBS=32 pip install --no-build-isolation flash_attn==2.8.3 RUN NSIGHT_VERSION=2025.6.1_2025.6.1.190-1_$(if [ "$(uname -m)" = "aarch64" ]; then echo "arm64"; else echo "amd64"; fi) && \ wget https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_6/nsight-systems-${NSIGHT_VERSION}.deb && \ @@ -90,15 +89,19 @@ RUN git clone -b hybrid-ep https://github.com/deepseek-ai/DeepEP.git && \ export CPATH=/usr/local/cuda/targets/x86_64-linux/include/cccl:$CPATH && \ python setup.py install +RUN pip install vllm==0.18.0 + RUN pip3 install --no-deps trl==0.27.0 RUN pip3 install nvtx matplotlib liger_kernel -RUN pip install -U git+https://github.com/ISEEKYAN/mbridge.git +RUN pip install -U git+https://github.com/ISEEKYAN/mbridge.git@641a5a0 RUN pip install --no-deps git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.16.0 -RUN pip install git+https://github.com/volcengine/verl.git@v0.7.0 && \ +RUN pip install transformers==5.3.0 + +RUN pip install git+https://github.com/volcengine/verl.git@v0.7.1 && \ pip uninstall -y verl RUN apt-get update && apt-get install -y curl \ @@ -109,4 +112,4 @@ RUN apt-get update && \ libcudnn9-cuda-12=9.16.0.29-1 \ libcudnn9-dev-cuda-12=9.16.0.29-1 \ libcudnn9-headers-cuda-12=9.16.0.29-1 && \ - rm -rf /var/lib/apt/lists/* + rm -rf /var/lib/apt/lists/* \ No newline at end of file diff --git a/docker/ascend/Dockerfile.ascend.sglang_8.3.rc1_a2 
b/docker/ascend/Dockerfile.ascend.sglang_8.3.rc1_a2 index 7f61e48e4cf..d1224623095 100644 --- a/docker/ascend/Dockerfile.ascend.sglang_8.3.rc1_a2 +++ b/docker/ascend/Dockerfile.ascend.sglang_8.3.rc1_a2 @@ -72,7 +72,7 @@ RUN pip install -e MindSpeed && \ # Prepare and install verl (update frequently) RUN git clone --recursive https://github.com/volcengine/verl.git && \ cd verl && pip install -r requirements-npu.txt && pip install -v -e . && cd .. && \ - pip install ray==2.46.0 click==8.2.1 cachetools && \ + pip install ray==2.46.0 click==8.2.1 cachetools setuptools==80.10.2 nvtx && \ # Clear extra files rm -rf /tmp/* /var/tmp/* && \ pip cache purge diff --git a/docker/ascend/Dockerfile.ascend.sglang_8.3.rc1_a3 b/docker/ascend/Dockerfile.ascend.sglang_8.3.rc1_a3 index 2e765610802..3f874d43000 100644 --- a/docker/ascend/Dockerfile.ascend.sglang_8.3.rc1_a3 +++ b/docker/ascend/Dockerfile.ascend.sglang_8.3.rc1_a3 @@ -70,7 +70,7 @@ RUN pip install -e MindSpeed && \ # Prepare and install verl (update frequently) RUN git clone --recursive https://github.com/volcengine/verl.git && \ cd verl && pip install -r requirements-npu.txt && pip install -v -e . && cd .. && \ - pip install ray==2.46.0 click==8.2.1 cachetools && \ + pip install ray==2.46.0 click==8.2.1 cachetools setuptools==80.10.2 nvtx && \ # Clear extra files rm -rf /tmp/* /var/tmp/* && \ pip cache purge diff --git a/docs/advance/nvfp4_qat.md b/docs/advance/nvfp4_qat.md new file mode 100644 index 00000000000..f7c2f19a23e --- /dev/null +++ b/docs/advance/nvfp4_qat.md @@ -0,0 +1,87 @@ +# NVFP4 QAT (Quantization-Aware Training) in verl + +Last updated: 04/02/2026 + +verl supports NVFP4 Quantization-Aware Training (QAT), which applies fake quantization during training so the model learns to tolerate NVFP4 quantization error. At rollout time, weights are packed into real NVFP4 format for vLLM inference. This closes the precision gap between training and inference, preventing KL divergence explosion. 
+ +| Training Backend | Training Precision | Rollout Precision | vLLM Quant Method | +|---|---|---|---| +| **FSDP** | BF16 + fake quantization | NVFP4 W4A16 | `compressed-tensors` | +| **Megatron** | BF16 + fake quantization | NVFP4 W4A16 | `modelopt` | + +> [!TIP] +> For ready-to-run scripts, environment setup, and experimental results, see the [QAT recipe](https://github.com/verl-project/verl-recipe/tree/main/qat). + +--- + +## Key Configuration + +### FSDP Backend + +Configured under `actor_rollout_ref.actor.fsdp_config.qat`: + +```yaml +actor_rollout_ref: + actor: + fsdp_config: + qat: + enable: true + mode: "w4a16" + group_size: 16 + ignore_patterns: + - "lm_head" + - "embed_tokens" + - "re:.*mlp.gate$" + quantization_config_path: "recipe/qat/config/nvfp4_w4a16.json" +``` + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `fsdp_config.qat.enable` | Enable QAT | `False` | +| `fsdp_config.qat.mode` | Quantization mode | `"w4a16"` | +| `fsdp_config.qat.group_size` | Quantization group size | `16` | +| `fsdp_config.qat.ignore_patterns` | Layers to skip. Supports `re:` prefix for regex, otherwise substring match | `["lm_head", "embed_tokens", "re:.*mlp.gate$"]` | +| `fsdp_config.qat.quantization_config_path` | vLLM quantization config JSON path | Required | + +### Megatron Backend + +Configured under `actor_rollout_ref.actor.megatron.qat`: + +```yaml +actor_rollout_ref: + actor: + megatron: + qat: + enable: true + mode: "w4a16" + group_size: 16 + ignore_patterns: + - "lm_head" + - "*mlp.gate" + quantization_config_path: "recipe/qat/config/nvfp4_w4a16_megatron.json" +``` + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `megatron.qat.enable` | Enable QAT | `False` | +| `megatron.qat.mode` | Quantization mode | `"w4a16"` | +| `megatron.qat.group_size` | Quantization group size | `16` | +| `megatron.qat.ignore_patterns` | Layers to skip. 
Uses `fnmatch` glob syntax | `["lm_head", "*mlp.gate"]` | +| `megatron.qat.quantization_config_path` | vLLM quantization config JSON path | Required | + +--- + +## Support Matrix + +- NVFP4 W4A16 (weight-only FP4 quantization) +- Dense models and MoE models +- FSDP and Megatron training backends +- Full quantization and FFN-only quantization strategies +- Verified on Qwen3-8B-Base and Qwen3-30B-A3B-Base + +--- + +## Notes + +- FSDP backend has scalability limitations for very large models. For large-scale training, use the Megatron backend. +- FSDP uses `re:` prefix regex for `ignore_patterns`, while Megatron uses `fnmatch` glob syntax. The two are not interchangeable. diff --git a/docs/advance/ppo_lora.rst b/docs/advance/ppo_lora.rst index 8a6f5186a67..dce2edc4046 100644 --- a/docs/advance/ppo_lora.rst +++ b/docs/advance/ppo_lora.rst @@ -65,7 +65,7 @@ Megatron Backend Usage Guide You need to install and enable Megatron-Bridge for Megatron LoRA support. -Make sure you use Megatron-Bridge later than 0.2.0, and we recommended using `this commit `_ or later for proper support, and use the following settings to enable Megatron-Bridge: +Make sure you use Megatron-Bridge later than 0.2.0, and we recommended using `this commit `_ or later for proper support, and use the following settings to enable Megatron-Bridge: - ``actor_rollout_ref.actor.megatron.use_mbridge=True`` - ``actor_rollout_ref.actor.megatron.vanilla_mbridge=False`` diff --git a/docs/advance/rollout_skip.rst b/docs/advance/rollout_skip.rst index 1839beed3e4..6ed661d2bad 100644 --- a/docs/advance/rollout_skip.rst +++ b/docs/advance/rollout_skip.rst @@ -1,61 +1,73 @@ RolloutSkip Function Usage Documentation ======================================== -Last updated: 08/01/2025. 
+Last updated: 2026-03-25 Applicable Scenarios -------------------- +The RolloutSkip utility accelerates RL training by caching and reusing pre-generated rollout data, +avoiding redundant sequence generation during debugging, replay, or fixed-experiment runs. -The RolloutSkip functionality is designed to accelerate the rollout process in reinforcement learning training by caching and reusing previously generated sequences. This feature is particularly useful when: +It is suitable for: -1. You need to repeatedly run experiments with the same configuration - -2. You want to save time by avoiding redundant sequence generation to come close to the optimal policy +1. Re-running experiments with the same configuration +2. Speeding up training by skipping repeated generation +3. Reproducing rollout results in debugging API and Usage Example ---------------------- -2.1 Trainer Adaptation -~~~~~~~~~~~~~~~~~~~~~~ - -Both`RayDAPOTrainer()` (in `verl/recipe/dapo/dapo_ray_trainer.py`) and `RayPPOTrainer()`(in `verl/trainer/ppo/ray_trainer.py``) have already been adapted. +Trainer Adaptation +~~~~~~~~~~~~~~~~~~ +RolloutSkip is already supported in ``RayDAPOTrainer`` and ``RayPPOTrainer``. -This is an example of how to patch rollout_skip in RayPPOTrainer. +Example integration: .. code-block:: python - #* Import the RolloutSkip class from verl.utils.rollout_skip import RolloutSkip - ... - class RayPPOTrainer: - ... - def fit(self): - ... + # Inside trainer.fit() + rollout_skip = RolloutSkip(self.config, self.async_rollout_manager) + rollout_skip.wrap_generate_sequences() + - #* Add code as follow: - rollout_skip = RolloutSkip(self.config, self.actor_rollout_wg) - rollout_skip.wrap_generate_sequences() +Basic Configuration +~~~~~~~~~~~~~~~~~~~ +Add these parameters to enable RolloutSkip: - ... +.. code-block:: bash - for epoch in range(self.config.trainer.total_epochs): - for batch_dict in self.train_dataloader: - ... 
+ actor_rollout_ref.rollout.skip.enable=True + actor_rollout_ref.rollout.skip.dump_dir=/path/to/rollout_dump + actor_rollout_ref.rollout.skip.max_dump_step=10 -2.2 Basic Configuration -~~~~~~~~~~~~~~~~~~~~~~~ -Then, you should add the following parameters to your config to enable the RolloutSkip feature: +Configuration Parameters +------------------------ +- **skip.enable**: Enable or disable RolloutSkip. +- **skip.dump_dir**: Root directory to save cached rollout data. +- **skip.max_dump_step**: Maximum number of steps to cache. -.. code-block:: bash - actor_rollout_ref.rollout.skip_rollout=True \ - actor_rollout_ref.rollout.skip_dump_dir="/tmp/rollout_dump" \ +Cached Directory Structure +-------------------------- +The directory structure is automatically generated to isolate different experiments: + +.. code-block:: text + {dump_dir}/{exp_name}_{project_name}/ + └── GBS{gbs}_N{n}_in{prompt_len}_out{response_len}/ + ├── train_step__gen_step.txt + ├── genstep_000001/ + │ ├── new_batch.dp + │ ├── gen_batch.dp + │ └── meta.json + └── genstep_000002/ -Note: -1. The `skip_dump_dir` is the directory where the cached sequences will be stored. Ensure that this directory is writable and accessible by your training process. And make sure that `skip_dump_dir` is not relative path because ray will store the data in `/tmp/ray/session_/` and the relative path will not be found in the worker. -2. The dumped data path follows this naming pattern `{experiment_name}_{project_name}_TrainGBS{train_gbs}__InferGBS{gen_gbs}__N{n}`, once you change the `experiment_name`, `project_name`, `train_gbs`, `gen_gbs`, or `n`, the cached data will be stored in a new directory. 
+Each ``genstep_*`` folder contains: +- ``new_batch.dp``: Input prompt batch +- ``gen_batch.dp``: Generated response batch +- ``meta.json``: Step metadata \ No newline at end of file diff --git a/docs/algo/rollout_corr.md b/docs/algo/rollout_corr.md index 066919b67d5..0096f3bd3ac 100644 --- a/docs/algo/rollout_corr.md +++ b/docs/algo/rollout_corr.md @@ -170,7 +170,7 @@ For advanced customization or YAML-based configs: algorithm: rollout_correction: rollout_is: token # IS weights: "token", "sequence", or null - rollout_is_threshold: 2.0 # Upper threshold for IS weights + rollout_is_threshold: 2.0 # TIS upper bound, or "0.5_5.0" for IcePop rollout_is_batch_normalize: false # Batch normalize IS weights to mean=1.0 rollout_rs: null # Rejection sampling: comma-separated canonical options (e.g. "token_k1,seq_max_k2") rollout_rs_threshold: null # Threshold spec: float(s) or "lower_upper" string(s) @@ -234,13 +234,15 @@ Importance sampling weights aggregation level: All IS weights are safety-bounded to [exp(-20), exp(20)] ≈ [2e-9, 5e8] -### `rollout_is_threshold` (float) +### `rollout_is_threshold` (str or float) -Upper threshold for IS weight truncation. Default: `2.0` +Threshold specification for IS weighting. 
Default: `2.0` -- Truncates IS weights via `.clamp(max=rollout_is_threshold)` (TIS: Truncated Importance Sampling) +- Single float or float-like string: TIS via `.clamp(max=rollout_is_threshold)` +- `"lower_upper"` string such as `"0.5_5.0"`: IcePop, zero weights outside `[lower, upper]` - Applied to IS weights for variance reduction - Separate from rejection sampling (controlled by `rollout_rs` parameters) +- Unlike `rollout_rs`, IcePop does not modify `response_mask`; it only changes the IS coefficients ### `rollout_is_batch_normalize` (bool) diff --git a/docs/amd_tutorial/amd_build_dockerfile_page.rst b/docs/amd_tutorial/amd_build_dockerfile_page.rst index fc462c17fbd..1021b3581e1 100644 --- a/docs/amd_tutorial/amd_build_dockerfile_page.rst +++ b/docs/amd_tutorial/amd_build_dockerfile_page.rst @@ -771,8 +771,8 @@ slurm_script.sh critic.model.path=$MODEL_PATH \ critic.model.enable_gradient_checkpointing=False \ critic.ppo_micro_batch_size_per_gpu=8 \ - critic.model.fsdp_config.param_offload=False \ - critic.model.fsdp_config.optimizer_offload=False \ + critic.fsdp.param_offload=False \ + critic.fsdp.optimizer_offload=False \ algorithm.kl_ctrl.kl_coef=0.0001 \ trainer.critic_warmup=0 \ trainer.logger='["console","wandb"]' \ diff --git a/docs/ascend_tutorial/contribution_guide/ascend_ci_guide_zh.rst b/docs/ascend_tutorial/contribution_guide/ascend_ci_guide_zh.rst index 0cc9b0dee48..87171b0f1b1 100644 --- a/docs/ascend_tutorial/contribution_guide/ascend_ci_guide_zh.rst +++ b/docs/ascend_tutorial/contribution_guide/ascend_ci_guide_zh.rst @@ -46,7 +46,7 @@ NPU 相关的工作流主要包括: **Note** - {HOME}是root + ${HOME}是root gpu用例中权重在~/models/路径下,如需适配可以用软链接,``ln -s /root/.cache/models ~/models`` @@ -120,8 +120,7 @@ NPU 相关的工作流主要包括: clean: true - name: Install dependencies run: | - pip install -r requirements-npu.txt - pip install -e . + pip install --no-deps -e . 
- name: Verify environment run: pip list # 以下为具体测试步骤(根据需求定制) diff --git a/docs/ascend_tutorial/examples/ascend_performance_analysis_guide.md b/docs/ascend_tutorial/examples/ascend_performance_analysis_guide.md index d1dc4fb218d..e3ea1870de1 100644 --- a/docs/ascend_tutorial/examples/ascend_performance_analysis_guide.md +++ b/docs/ascend_tutorial/examples/ascend_performance_analysis_guide.md @@ -16,11 +16,11 @@ Last updated: 02/24/2026. ![rl_data_stream](https://github.com/chengminhua/verl_data/raw/main/MindStudio_Insight_use/rl_data_stream.png) -## profilling工具使能 +## profiling工具使能 ### 使能方法 -使能和配置教程可参考:[verl/docs/ascend_tutorial/profiling/ascend_profiling_zh.rst at main · verl-project/verl](https://github.com/verl-project/verl/raw/main/docs/ascend_tutorial/profiling/ascend_profiling_zh.rst) +使能和配置教程可参考:[verl/docs/ascend_tutorial/profiling/ascend_profiling_zh.rst at main · verl-project/verl](https://github.com/verl-project/verl/blob/main/docs/ascend_tutorial/profiling/ascend_profiling_zh.rst) ## 性能分析方法论 @@ -38,7 +38,6 @@ Last updated: 02/24/2026. - **操作**:通过MindStudio Insight直接查看MSTX打点数据,观察Rollout阶段不同DP Rank的负载均衡情况 - **价值**:快速识别负载不均问题 - - **效果展示:** ![Load_Balancing_Analysis](https://github.com/chengminhua/verl_data/raw/main/MindStudio_Insight_use/Load_Balancing_Analysis.gif) @@ -57,9 +56,7 @@ Last updated: 02/24/2026. 
#### 性能分析 - **操作**:可通过 MindStudio Insight Windows 或 Linux 版本加载 Profiling 数据 - - **价值**:MindStudio Insight 支持分析任务调度效率、算子执行性能、计算资源利用率、集合通信性能等。其 Timeline 视图具备任务拆解与 Overlap 分析功能(**为 MindStudio 独有核心特性,在 NV 及其他竞品中不具备,是 AI 调优的必备工具**),并支持鼠标交互式分析。 - - **效果展示**: ![performance%20analysis](https://github.com/chengminhua/verl_data/raw/main/MindStudio_Insight_use/performance%20analysis.png) @@ -116,16 +113,14 @@ host bound是指CPU任务量综合大于NPU,导致NPU执行出现空泡的现 ### 3.算子性能初诊 -需要利用`".\ASCEND_PROFILER_OUTPUT\operator_details.csv"`来做分析,从而判断算子识否有性能问题。 +需要利用 `".\ASCEND_PROFILER_OUTPUT\operator_details.csv"`来做分析,从而判断算子是否有性能问题。 -Profiling工具会统计这些流水线在不同核上的平均繁忙时间(xxx_time),与最慢核的完整kernel耗时(task_duration)做除法,得到流水线利用率(xxx_ratio)。这些流水线之间虽然互有依赖,且搬运类流水线会互抢带宽,但算子只要设计得当,是可以做到互相掩盖的。因此我们可以初步认为,**当算子的执行耗时大到一定程度上,算子应当在某一条流水线上形成bound**,即利用率要高到一定程度。经验上,在单算子耗时达到50μ时,就可以认为算子应当在bound流水线上,达成80%+的占用率了。 +Profiling工具会统计这些流水线在不同核上的平均繁忙时间(xxx_time),与最慢核的完整kernel耗时(task_duration)做除法,得到流水线利用率(xxx_ratio)。这些流水线之间虽然互有依赖,且搬运类流水线会互抢带宽,但算子只要设计得当,是可以做到互相掩盖的。因此我们可以初步认为,**当算子的执行耗时大到一定程度上,算子应当在某一条流水线上形成bound**,即利用率要高到一定程度。经验上,在单算子耗时达到50μs时,就可以认为算子应当在bound流水线上,达成80%+的占用率了。 以下图为例,第一行是一个FA算子,第二行是一个Matmul算子,FA在vec流水线上达到了88.1%的利用率,Matmul算子在mac流水线上达到了89.8%的利用率,他们的性能可以认为是合格的。 ![Operator%20performance](https://github.com/chengminhua/verl_data/raw/main/MindStudio_Insight_use/Operator%20performance.png) - - ### 4.亲和shape调整 对于一个模型而言,超参是我们控制不了的,但我们可以控制并发度、权重格式、切分策略等因素来迎合算子,使其发挥出最大的性能,这一节主要从算子搬运效率和负载均衡两个方面出发,讨论模型侧值得尝试的调整方向。 @@ -148,7 +143,7 @@ mte2是一个自身效率严重受shape影响的流水线。要想让mte2保证 首先,我们明确出当前NPU卡是多少核的,如果不清楚,跑出来的profiling里都是20,40这样的数,就说明是20核,反之是24核。这里我的24核其实是代表了一个cube和两个vector组成的小组,我们可以认为是一个cube作为主核,带了两个vector作为从核。如果一个算子是纯vector算子,那么就不再有组的概念,40或48个vector核会作为主核直接独立去拿逻辑任务。 
-对于LLM中的vector算子,它的一种常见分核策略有可能是分在最高维,也就是batch维,常见于对低维(也叫尾轴)有规约操作的norm类、动态量化类等算子;另一种是整体拍平,允许算子切分的非常细的算子,如elementwse算子。对于第一种,我们就可以在模型侧关注它的负载均衡问题。例如我们打48batch,而硬件却是个40个vector核,那这40个核会循环2次,第二次有多数的核会无事可做,这个batch数就可以认为是不友好的。如果将batch打到64或80,性能可以预见会是无损的。同样的情况下,如果是48核的卡,那我们可以认为这就是个非常友好的batch数。 +对于LLM中的vector算子,它的一种常见分核策略有可能是分在最高维,也就是batch维,常见于对低维(也叫尾轴)有规约操作的norm类、动态量化类等算子;另一种是整体拍平,允许算子切分的非常细的算子,如elementwise算子。对于第一种,我们就可以在模型侧关注它的负载均衡问题。例如我们打48batch,而硬件却是个40个vector核,那这40个核会循环2次,第二次有多数的核会无事可做,这个batch数就可以认为是不友好的。如果将batch打到64或80,性能可以预见会是无损的。同样的情况下,如果是48核的卡,那我们可以认为这就是个非常友好的batch数。 对于cube类算子,它常见的分核策略是以base快去切分M和N(K轴是累加轴,对它分核会引入确定性问题)。最常见的分块是baseM=128,baseN=256。在decode阶段,我们的耗时基本可以看做都是在搬权重,这是因为激活的M极小,M方向大概率只分了一块,那么右矩阵就只需要搬一次。所以我们在M≤128的范围内可以尽情提高M,对性能都基本是无损的,如果M大于128,可以认为(128, 256]是下一个性能分档。 除了M外,N轴切分的任务也影响算子亲和性,以deepseekR1中的MLA预处理为例,它会使用同一个激活(shape为[batch_size, 7168])与两个权重做矩阵乘(shape为[7168, 1536]和[7168, 576])。在batch_size打不大的情况下,即使baseN缩短为128,N轴都不能用满核数,所以此时这两个矩阵乘各自的耗时,会约等于将他们权重N轴拼起来乘(shape为[7168, 2112])的矩阵乘的耗时。如果仅考虑模型竞争力,我们更希望对这两个权重做合并,否则两个小的矩阵乘带宽利用率都会非常差。 @@ -156,14 +151,3 @@ mte2是一个自身效率严重受shape影响的流水线。要想让mte2保证 对于Attention算子,它常见的分核策略是q_seqlen、batch_size和kv_headnum。增量阶段q_seqlen会以MTP和GQA倍数做合并,但是通常也不会大过128,划分不出第二个任务,那么并行度基本就是batch_size * kv_headnum。 总的来说,我们可以依据shape信息和算子类别,对算子是否有负载均衡问题作出识别,从而对我们切分策略选择,最高吞吐量的batch策略作出预判。 - - - - - - - - - - - diff --git a/docs/ascend_tutorial/examples/ascend_retool_best_pratice.rst b/docs/ascend_tutorial/examples/ascend_retool_best_pratice.rst index 99ca29cf5ae..5938f916793 100644 --- a/docs/ascend_tutorial/examples/ascend_retool_best_pratice.rst +++ b/docs/ascend_tutorial/examples/ascend_retool_best_pratice.rst @@ -27,10 +27,10 @@ Retool论文参考([Retool](https://arxiv.org/pdf/2504.11536)) ============ ============================================================ software version ============ ============================================================ -Python ``>= 3.10, <3.12`` -CANN ``== 8.3.RC1`` -torch ``== 2.7.1`` -torch_npu ``== 2.7.1`` +Python ``>=3.10, 
<3.12`` +CANN ``==8.3.RC1`` +torch ``==2.7.1`` +torch_npu ``==2.7.1`` verl ``v0.6.1 commitId=d62da4950573d7a4b7ef2362337952e7ab59e78d`` vllm ``v0.11.0`` vllm-ascend ``v0.11.0-dev`` @@ -82,8 +82,8 @@ transformers ``4.57.6`` .. code-block:: bash python3 -m verl.model_merger merge --backend fsdp \ - --local_dir ${DATASETS}/checkpoint/multiturn-sft-qwen-2.5-7b-instruct/global_step_372 \ - --target_dir ${DATASETS}/checkpoint/multiturn-sft-qwen-2.5-7b-instruct/global_step_372/huggingface + --local_dir /PATH/TO/checkpoint/multiturn-sft-qwen-2.5-7b-instruct/global_step_372 \ + --target_dir /PATH/TO/checkpoint/multiturn-sft-qwen-2.5-7b-instruct/global_step_372/huggingface 2.代码沙箱准备 diff --git a/docs/ascend_tutorial/examples/ascend_sglang_best_practices.rst b/docs/ascend_tutorial/examples/ascend_sglang_best_practices.rst index 32c21535aa4..4bea9de6761 100644 --- a/docs/ascend_tutorial/examples/ascend_sglang_best_practices.rst +++ b/docs/ascend_tutorial/examples/ascend_sglang_best_practices.rst @@ -27,8 +27,7 @@ SGLang 是当前主流的高性能开源推理引擎, 昇腾已经全面原生 环境构建 ----------------------------------- -我们在quickstart中提供了两种构建环境的方法, 1.从镜像文件DockerFile进行构建 2.从自定义Conda环境进行构建 - +我们在 `quickstart `_中提供了两种构建环境的方法, 1.从镜像文件DockerFile进行构建 2.从自定义Conda环境进行构建 在本实践中, 我们额外指定verl 的commit id 以避免引入其他问题 .. code-block:: bash @@ -61,7 +60,7 @@ DAPO-Math-17k: https://huggingface.co/datasets/BytedTsinghua-SIA/DAPO-Math-17k .. 
code-block:: bash - actor_rollout_ref.actor.megatron.use_dist_checkpointing=False + actor_rollout_ref.actor.megatron.use_dist_checkpointing=False \ actor_rollout_ref.actor.megatron.use_mbridge=True `Qwen2.5-32B`_ @@ -124,7 +123,7 @@ DAPO-Math-17k: https://huggingface.co/datasets/BytedTsinghua-SIA/DAPO-Math-17k export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050 export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050 export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1 - export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8 + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 # 修改为当前需要跑的用例路径 DEFAULT_SH="./run_*.sh" echo "Use $DEFAULT_SH" @@ -198,7 +197,7 @@ SOCKET_IFNAME, HCCL_SOCKET_IFNAME, GLOO_SOCKET_IFNAME: 修改为对应通信网 3.模型评估 ^^^^^^^^^^^ -不同模型步骤一致,仅以Qwen3-30b为例列举 +不同模型步骤一致,仅以Qwen3-30B为例列举 我们通过 AISBenchmark 评估模型,该工具支持vllm/sglang多种推理后端的评估 diff --git a/docs/ascend_tutorial/examples/dapo_multi_model_optimization_practice.md b/docs/ascend_tutorial/examples/dapo_multi_model_optimization_practice.md index 4c51ce60164..32072d3c485 100644 --- a/docs/ascend_tutorial/examples/dapo_multi_model_optimization_practice.md +++ b/docs/ascend_tutorial/examples/dapo_multi_model_optimization_practice.md @@ -64,16 +64,16 @@ reward_model.overlong_buffer.penalty_factor=${overlong_penalty_factor} #惩罚 ## 安装基础环境 -| software | version| -| --- | --- | -| Python| >= 3.10, <3.12 | -| CANN | == 8.3.RC1 | -| torch | == 2.7.1 | -| torch_npu | == 2.7.1 | -| verl | main分支 commitId=252d76908b903ad8fb6969eb3a5e5f873c95ea2b | -| vllm | v0.11.0 | -| vllm-ascend | v0.11.0-dev| -| transformers | 4.57.3| +| software | version | +| ------------ | ---------------------------------------------------------- | +| Python | >=3.10, <3.12 | +| CANN | ==8.3.RC1 | +| torch | ==2.7.1 | +| torch_npu | ==2.7.1 | +| verl | main分支 commitId=252d76908b903ad8fb6969eb3a5e5f873c95ea2b | +| vllm | v0.11.0 | +| vllm-ascend | v0.11.0-dev | +| transformers | 4.57.3 | 在本实践中, 我们通过指定 verl 的commit id 以避免引入其他问题 ``` @@ -91,7 +91,7 @@ git checkout main 
Geometry3k 数据集是由加利福尼亚大学洛杉矶分校与浙江大学联合研发的几何领域专用数据集,核心面向视觉问答(VQA)任务展开研究与模型训练。该数据集总计包含 3002 个样本,采用图像和文本两种模态数据形式构建,其中文本模态涵盖各类几何问题描述,图像则以可视化图表呈现问题中的几何图形信息,包括三角形、圆形、四边形等基础几何形状,以及不同图形间的位置、嵌套、相交等关联关系。可以从Hugging Face库下载对应的原始数据集:[Geometry3k ](https://huggingface.co/datasets/hiyouga/geometry3k) -```python +```shell # 下载原始数据并预处理 python ./examples/data_preprocess/geo3k.py --local_dir=./data/geo3k ``` @@ -124,7 +124,7 @@ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2 #### OpenEuler 操作系统 -执行如下命令重操作系统源安装jemalloc +执行如下命令通过操作系统源安装jemalloc ```shell yum install jemalloc diff --git a/docs/ascend_tutorial/examples/gspo_optimization_practice.md b/docs/ascend_tutorial/examples/gspo_optimization_practice.md index 0d3419be518..5c3c5e96a98 100644 --- a/docs/ascend_tutorial/examples/gspo_optimization_practice.md +++ b/docs/ascend_tutorial/examples/gspo_optimization_practice.md @@ -10,25 +10,25 @@ GSPO通过将优化颗粒度从**token级**提升到**sequence级**,规避了G 想要成功在verl仓库中成功调用到GSPO算法,需要进行如下的必要配置 -~~~python +```python # 核心算法配置 -algorithm.adv_estimator=grpo \ # 使用GRPO优势估计器 -algorithm.use_kl_in_reward=False \ # 不在奖励中添加KL惩罚 +algorithm.adv_estimator=grpo \ # 使用GRPO优势估计器 +algorithm.use_kl_in_reward=False \ # 不在奖励中添加KL惩罚 # GSPO策略损失模式 actor_rollout_ref.actor.policy_loss.loss_mode=gspo \ # 启用GSPO策略损失 # 极小裁剪范围(GSPO特色) -actor_rollout_ref.actor.clip_ratio_low=0.0003 \ # 裁剪下界,论文推荐值 -actor_rollout_ref.actor.clip_ratio_high=0.0004 \ # 裁剪上界,论文推荐值 +actor_rollout_ref.actor.clip_ratio_low=0.0003 \ # 裁剪下界,论文推荐值 +actor_rollout_ref.actor.clip_ratio_high=0.0004 \ # 裁剪上界,论文推荐值 # KL配置(GSPO不使用KL loss) -actor_rollout_ref.actor.use_kl_loss=False \ # 禁用KL损失 -actor_rollout_ref.actor.kl_loss_coef=0.0 \ # KL损失系数设为0 +actor_rollout_ref.actor.use_kl_loss=False \ # 禁用KL损失 +actor_rollout_ref.actor.kl_loss_coef=0.0 \ # KL损失系数设为0 # 序列级损失聚合模式(GSPO核心) -actor_rollout_ref.actor.loss_agg_mode=seq-mean-token-mean \ # 序列级平均,GSPO论文推荐 +actor_rollout_ref.actor.loss_agg_mode=seq-mean-token-mean \ # 序列级平均,GSPO论文推荐 # 批次配置 
actor_rollout_ref.rollout.n=16 \ # 每个prompt生成16个响应(组采样) -~~~ +``` -一般选择入口函数为`verl.trainer.main_ppo` +一般选择入口函数为 `verl.trainer.main_ppo` ## 基础环境 @@ -38,10 +38,10 @@ actor_rollout_ref.rollout.n=16 \ # 每个prompt生成16个响 | software | version | | ------------ | ---------------------------------------------------------- | -| Python | >= 3.10, <3.12 | -| CANN | == 8.3.RC1 | -| torch | == 2.7.1 | -| torch_npu | == 2.7.1 | +| Python | >=3.10, <3.12 | +| CANN | ==8.3.RC1 | +| torch | ==2.7.1 | +| torch_npu | ==2.7.1 | | verl | main分支 commitId=252d76908b903ad8fb6969eb3a5e5f873c95ea2b | | vllm | v0.11.0 | | vllm-ascend | v0.11.0-dev | @@ -49,12 +49,12 @@ actor_rollout_ref.rollout.n=16 \ # 每个prompt生成16个响 在本实践中, 我们通过指定 verl 的commit id 以避免引入其他问题 -~~~bash +```bash cd verl git checkout 252d76908b903ad8fb6969eb3a5e5f873c95ea2b # 指定相应的recipe版本 git submodule update --init --recursive recipe -~~~ +``` ### 权重获取 @@ -62,13 +62,13 @@ git submodule update --init --recursive recipe ### 数据集准备 -~~~bash +```bash # 下载math-17k数据集 git clone https://huggingface.co/datasets/BytedTsinghua-SIA/DAPO-Math-17k # 下载AIME_2024测试数据集 git clone https://huggingface.co/datasets/Maxwell-Jia/AIME_2024 -~~~ +``` ### jemalloc安装 @@ -93,7 +93,7 @@ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2 #### OpenEuler 操作系统 -执行如下命令重操作系统源安装jemalloc +执行如下命令通过操作系统源安装jemalloc ```shell yum install jemalloc @@ -120,7 +120,7 @@ export LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libjemalloc.so.2 针对本实践提供的多机任务,可用下面的脚本拉起 -~~~bash +```bash pkill -9 python ray stop --force rm -rf /tmp/ray @@ -196,9 +196,9 @@ else fi sleep 600 -~~~ +``` -DEFAULT_SH:修改为训练所用配置 sh 文件路径。在此案例中修改为 [Qwen2.5-32B](https://github.com/volcengine/verl/blob/main/examples/gspo_trainer/run_qwen3_32b_gspo_npu.sh) 路径。 +DEFAULT_SH:修改为训练所用配置 sh 文件路径。在此案例中修改为 [Qwen3-32B](https://github.com/volcengine/verl/blob/main/examples/gspo_trainer/run_qwen3_32b_gspo_npu.sh) 路径。 NNODES 和 NPUS_PER_NODE:修改为使用节点数量和每个节点 NPU 数量。在此案例中分别为4和16。 @@ -218,14 +218,14 @@ ifconfig |grep "$(hostname 
-I |awk '{print $1}'|awk -F '.' '{print $0}')" -B 1|a #### 动态bsz -~~~bash +```bash actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) / sp_size)) infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) / sp_size)) -~~~ +``` **这个优化点主要调整上面这两个参数,不过需要注意这两个参数调整的太大会导致OOM** -**主要调整**`actor_ppo_max_token_len`,调大了会降低训练的耗时,调整`infer_ppo_max_token_len`没有明显的收益,可以不动 +**主要调整** `actor_ppo_max_token_len`,调大了会降低训练的耗时,调整 `infer_ppo_max_token_len`没有明显的收益,可以不动 **这两个参数的作用介绍如下:** @@ -238,16 +238,16 @@ infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) / sp_size)) #### ACLgraph+FULL_DECODE_ONLY -推理算子下发方面的优化,平均能有`15%~20%`左右的性能收益。 +推理算子下发方面的优化,平均能有 `15%~20%`左右的性能收益。 先看单开**ACLgraph**,如下: -~~~bash +```bash # 开启ACLgraph+FULL_DECODE_ONLY(注意:当设置此参数为False时,TASK_QUEUE_ENABLE必须设置为1,不然会报错) -actor_rollout_ref.rollout.enforce_eager=False +actor_rollout_ref.rollout.enforce_eager=False \ actor_rollout_ref.rollout.engine_kwargs.vllm.compilation_config.cudagraph_capture_sizes='[8,16,32,64,128]' \ -actor_rollout_ref.rollout.engine_kwargs.vllm.compilation_config.cudagraph_mode='FULL_DECODE_ONLY' \ -~~~ +actor_rollout_ref.rollout.engine_kwargs.vllm.compilation_config.cudagraph_mode='FULL_DECODE_ONLY' +``` `FULL_DECODE_ONLY`开启成功后有如下输出: @@ -279,7 +279,7 @@ cudagraph_capture_sizes设置的值对应的是批大小,这里的批大小不 #### AIV -打开方式:设置`export HCCL_OP_EXPANSION_MODE="AIV"` +打开方式:设置 `export HCCL_OP_EXPANSION_MODE="AIV"` HCCL_OP_EXPANSION_MODE环境变量用于配置通信算法的编排展开位置,支持如下取值: @@ -298,7 +298,7 @@ HCCL_OP_EXPANSION_MODE环境变量用于配置通信算法的编排展开位置 - 每个task调用runtime接口,下发到device的rtsqueue - STARS从rstqueue上顺序拿取task - 根据task类型分别调用掉SDMA和RDMA引擎。 - **单算子瓶颈**:hostbound 每个task提交是2~5us,一个通信算子有几百个task,单算子场景不会在device上缓存,下发一个执行一个 + **单算子瓶颈**:hostbound 每个task提交是2~5us,一个通信算子有几百个task,单算子场景不会在device上缓存,下发一个执行一个 ##### AICpu机制展开 @@ -370,46 +370,43 @@ TASK_QUEUE_ENABLE,下发优化,图模式设置为1(即开启图模式的 ### verl框架参数设置 -主要是内存方面的一些设置开关(注意,这个里面的优化都或多或少会导致吞吐量有一定程度的劣化) +以下是内存方面的一些设置开关(注意,这个里面的优化都或多或少会导致吞吐量有一定程度的劣化) -~~~bash +```bash # 梯度检查点 
(Gradient Checkpointing) # 作用: 通过重新计算激活值来节省显存,以计算换内存。在前向传播时不保存中间激活值,反向传播时重新计算,可以显著降低显存占用,允许使用更大的batch size。 -actor_rollout_ref.model.enable_gradient_checkpointing=True +actor_rollout_ref.model.enable_gradient_checkpointing=True \ # 参数卸载 (Parameter Offload) # 作用: 将模型参数卸载到CPU内存,训练时再加载回GPU。 -actor_rollout_ref.actor.fsdp_config.param_offload=${offload} # True -actor_rollout_ref.ref.fsdp_config.param_offload=${offload} # True +actor_rollout_ref.actor.fsdp_config.param_offload=True \ +actor_rollout_ref.ref.fsdp_config.param_offload=True \ # 优化器状态卸载 (Optimizer Offload) # 作用: 将优化器状态(如Adam的动量)卸载到CPU。优化器状态通常占用大量显存(对于Adam,每个参数需要额外8字节),卸载可以节省显存。 -actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} # True +actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \ # 释放推理引擎缓存 (Free Cache Engine) # 作用: 在训练阶段释放推理引擎的KV cache和权重。这是3D-HybridEngine的核心优化,允许在同一GPU上交替进行推理和训练,显著降低显存需求。 -actor_rollout_ref.rollout.free_cache_engine=True +actor_rollout_ref.rollout.free_cache_engine=True \ # 熵计算优化 # entropy_checkpointing: 在训练时对熵计算启用重计算,降低显存峰值 # entropy_from_logits_with_chunking: 分块处理logits张量(如2048 tokens一组),避免一次性加载整个[bsz*seq_len, vocab]张量 -actor_rollout_ref.actor.entropy_checkpointing=True -actor_rollout_ref.ref.entropy_checkpointing=True -actor_rollout_ref.actor.entropy_from_logits_with_chunking=True -actor_rollout_ref.ref.entropy_from_logits_with_chunking=True +actor_rollout_ref.actor.entropy_checkpointing=True \ +actor_rollout_ref.ref.entropy_checkpointing=True \ +actor_rollout_ref.actor.entropy_from_logits_with_chunking=True \ +actor_rollout_ref.ref.entropy_from_logits_with_chunking=True \ # 推理引擎显存配置 # gpu_memory_utilization: 控制vLLM使用的GPU显存比例(0.90 = 90%) # enforce_eager=False: 启用CUDA graphs加速推理,但会占用额外显存 -actor_rollout_ref.rollout.gpu_memory_utilization=0.90 -actor_rollout_ref.rollout.enforce_eager=False -~~~ +actor_rollout_ref.rollout.gpu_memory_utilization=0.90 \ +actor_rollout_ref.rollout.enforce_eager=False \ +``` ## NPU调优参考文章 环境变量相关:[环境变量列表-Ascend Extension for 
PyTorch6.0.0-昇腾社区](https://www.hiascend.com/document/detail/zh/Pytorch/600/apiref/Envvariables/Envir_001.html) 社区性能调优教程:[性能调优流程-Ascend Extension for PyTorch6.0.0-昇腾社区](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0001.html) - - - diff --git a/docs/ascend_tutorial/examples/run_qwen3_32B_megatron_1k_256k_npu.md b/docs/ascend_tutorial/examples/run_qwen3_32B_megatron_1k_256k_npu.md index 18f029b99c8..c8c16623161 100644 --- a/docs/ascend_tutorial/examples/run_qwen3_32B_megatron_1k_256k_npu.md +++ b/docs/ascend_tutorial/examples/run_qwen3_32B_megatron_1k_256k_npu.md @@ -62,8 +62,8 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=8 \ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=1 \ actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ - actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \ - +actor_rollout_ref.actor.megatron.override_transformer_config.context_parallel_size=${CP} \ + actor_rollout_ref.actor.megatron.context_parallel_size=${cp_size} \ + +actor_rollout_ref.actor.megatron.override_transformer_config.context_parallel_size=${cp_size} \ +actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn=True \ +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=1 \ +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True \ diff --git a/docs/ascend_tutorial/faq/faq.rst b/docs/ascend_tutorial/faq/faq.rst index 3540f8cfdde..ed8e9b55590 100644 --- a/docs/ascend_tutorial/faq/faq.rst +++ b/docs/ascend_tutorial/faq/faq.rst @@ -1 +1,99 @@ -Last updated: 03/16/2026. +NPU 常见问题解答 +================ + +Last updated: 03/26/2026. + +本文档总结了在 NPU 上执行 VERL 训练和推理时遇到的常见问题及解决方案。 + +环境配置问题 +------------ + +### Q1: NPU 设备不可见怎么办? + +**问题现象**:torch_npu.npu.is_available() 返回 False + +**解决方案**: + +.. 
code-block:: bash
+
+   # 检查设备可见性
+   echo $ASCEND_RT_VISIBLE_DEVICES
+
+   # 设置可见设备并禁用ray自动设置
+   export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+   export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
+
+   # 检查驱动状态
+   npu-smi info
+
+调试和诊断
+----------
+
+**Q1: 如何启用 NPU 性能分析?**
+
+使用 VERL 内置的 profiler:
+
+.. code-block:: shell
+
+    actor_rollout_ref.actor.profiler.tool_config.npu.discrete=true \
+    actor_rollout_ref.actor.profiler.tool_config.npu.contents=npu,cpu \
+    actor_rollout_ref.actor.profiler.tool_config.npu.level=1 \
+    actor_rollout_ref.actor.profiler.tool_config.npu.analysis=true
+
+**Q2: 如何排查 NPU 训练失败的问题?**
+
+**排查步骤**:
+
+1. 检查环境变量配置
+2. 验证设备可见性
+3. 检查 CANN 版本兼容性
+4. 查看日志中的具体错误信息
+5. 使用最小化示例复现问题
+
+**启用详细日志**:
+
+.. code-block:: bash
+
+   # VERL 框架日志
+   export VERL_LOGGING_LEVEL=DEBUG
+
+   # 昇腾 NPU 日志(0=DEBUG, 1=INFO, 2=WARNING, 3=ERROR)
+   export ASCEND_GLOBAL_LOG_LEVEL=0
+   export ASCEND_SLOG_PRINT_TO_STDOUT=1
+
+   # HCCL 通信日志
+   export HCCL_DEBUG=INFO
+
+常见错误信息
+------------
+
+**Q1: "torch_npu detected, but NPU device is not available or visible"**
+
+**原因**:NPU 驱动未正确安装或设备不可见
+
+**解决方案**:检查驱动安装状态和 ASCEND_RT_VISIBLE_DEVICES 设置
+
+**Q2: "KeyError: decoder.layers.0.self_attention.q_layernorm.weight"**
+
+**原因**:MindSpeed版本过低
+
+**解决方案**:切换MindSpeed至 2.3.0_core_r0.12.1
+
+参考资料
+--------
+
+- `NPU 性能优化指南 <../perf/perf_tuning_on_ascend.rst>`_
+- `NPU 快速开始指南 <../start/install.rst>`_
+- `NPU CI 指南 <../contribution_guide/ascend_ci_guide_zh.rst>`_
+- Ascend NPU 文档: https://www.hiascend.com/document
+- CANN 工具包文档: https://www.hiascend.com/software/cann
+
+获取更多帮助
+------------
+
+如果以上 FAQ 无法解决您的问题,请:
+
+1. 查看完整的错误日志
+2. 在 GitHub Issues 中搜索类似问题
+3. 提供详细的错误信息和环境配置
+4. 
提供最小可复现示例 \ No newline at end of file diff --git a/docs/ascend_tutorial/features/ascend_backend_features.md b/docs/ascend_tutorial/features/ascend_backend_features.md index 9e81213c79f..3fb7b1b2e1a 100644 --- a/docs/ascend_tutorial/features/ascend_backend_features.md +++ b/docs/ascend_tutorial/features/ascend_backend_features.md @@ -30,7 +30,7 @@ Last updated: 03/03/2026. | `ep_size`| `actor_rollout_ref.rollout.expert_parallel_size`|EP并行度| | `node_rank`| `无,根据实际实例和卡数自动计算` |实例中的节点排序| | `load_format`| `actor_rollout_ref.rollout.load_format` |要加载的模型权重格式| -| `disable_log_stats`| `actor_rollout_ref.rollout.disable_log_stats`|记录抢占请求的累积数量 | +| `disable_log_stats`| `actor_rollout_ref.rollout.disable_log_stats`|控制是否记录 rollout 统计日志 | | `nnodes `| `无,根据实际实例和卡数自动计算` | 每个实例包含的节点数量` | | `trust_remote_code`| `actor_rollout_ref.model.trust_remote_code`|是否允许在 Hub 上定义自定义模型,并将其写入自己的建模文件中| | `max_num_seqs` | `actor_rollout_ref.rollout.max_num_seqs` |正在运行的请求的最大数量| @@ -272,3 +272,5 @@ class MindSpeedFeature: | `actor_rollout_ref.actor.megatron.override_transformer_config.use_fused_swiglu` |是否使用融合swiglu,默认值为False| | `actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_first_pipeline_stage` |第一个pipeline stage 的层数,默认值为none| | `actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage` |最后一个pipeline stage 的层数,默认值为none| + +注:`actor_rollout_ref.actor.megatron.use_mbridge` 与 `actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size` (VPP) 暂不支持同时开启。由于 verl 默认开启 mbridge, 使用 VPP 参数时请手动将 `actor_rollout_ref.actor.megatron.use_mbridge` 置为 False。 \ No newline at end of file diff --git a/docs/ascend_tutorial/features/ascend_consistency.rst b/docs/ascend_tutorial/features/ascend_consistency.rst index 72c79a37318..2ea994b8a82 100644 --- a/docs/ascend_tutorial/features/ascend_consistency.rst +++ b/docs/ascend_tutorial/features/ascend_consistency.rst @@ -29,8 +29,6 @@ Last updated: 11/17/2025. 
在单卡无通信情况下: -- HCCL和LCCL通信下: - - export CLOSE_MATMUL_K_SHIFT=1 - export ATB_MATMUL_SHUFFLE_K_ENABLE=0 - export VLLM_ENABLE_V1_MULTIPROCESSING=0 diff --git a/docs/ascend_tutorial/profiling/precision_debugger.md b/docs/ascend_tutorial/profiling/precision_debugger.md new file mode 100644 index 00000000000..5368c80282b --- /dev/null +++ b/docs/ascend_tutorial/profiling/precision_debugger.md @@ -0,0 +1,367 @@ +# Precision Debugger (msprobe) in verl + +Last updated: 03/28/2026. + +This guide explains how to collect precision data in verl using the +`msprobe` PrecisionDebugger. + +## Prerequisites + +* Install `msprobe` in the training environment. +* Prepare a `config.json` for msprobe (see examples below). +* Enable profiler for the roles you want to collect. PrecisionDebugger only + runs on roles whose `profiler.enable=True` and whose rank matches + `profiler.ranks`. + +## Configuration + +PrecisionDebugger is integrated through verl's unified profiler interface. +You configure it in two places: + +* **Global profiling control** via `global_profiler` in the trainer config. +* **Role profiling control** via each role's `profiler` block. + +### Global profiling control + +In `global_profiler`, set the profiler tool to `precision_debugger` and +configure the msprobe-specific options under `global_tool_config`. + +```yaml +global_profiler: + tool: precision_debugger + steps: [1, 2, 5] + save_path: "outputs/profile" # optional, not used by msprobe + global_tool_config: + precision_debugger: + _target_: verl.utils.profiler.config.PrecisionDebuggerToolConfig + enable: True + config_path: /path/to/config.json + data_dir: outputs/precision_debug + steps: [1, 2, 5] + stages: + - actor_update + - actor_compute_log_prob + - ref_compute_log_prob + - compute_values + - critic_update + - compute_rm_score + strict: False +``` + +Notes: + +* `steps` in `global_profiler` controls the step window for start/stop. +* `precision_debugger.steps` provides an extra filter. 
If both are set, + the intersection is applied. +* `data_dir` is the root directory for dumps. The actual path is + `{data_dir}/step_{global_step}/{stage}`. +* `save_path` is ignored by msprobe. + +### Role profiling control + +Enable profiling for the roles you want to collect: + +```yaml +actor_rollout_ref: + actor: + profiler: + enable: True + all_ranks: False + ranks: [0] + ref: + profiler: + enable: True + all_ranks: False + ranks: [0] +critic: + profiler: + enable: True + all_ranks: False + ranks: [0] +``` + +If you want to provide an explicit per-role msprobe config, add +`profiler.tool_config.precision_debugger` for each enabled role. In practice, +the most important fields are: + +* `config_path` +* `data_dir` +* `steps` +* `stages` +* `strict` + +## Supported stages + +PrecisionDebugger collects data from the following stages: + +* `actor_update` +* `actor_compute_log_prob` +* `ref_compute_log_prob` +* `compute_values` +* `critic_update` +* `compute_rm_score` + +Rollout generation is intentionally skipped (`rollout_generate` is ignored). + +The current integration is designed for training-side stages. 
In a typical PPO +run, the most common useful combinations are: + +* actor/ref only: + `actor_compute_log_prob`, `ref_compute_log_prob`, `actor_update` +* actor/ref/critic: + `actor_compute_log_prob`, `ref_compute_log_prob`, `compute_values`, + `critic_update`, `actor_update` + +## msprobe config.json examples + +Example for `task: statistics`: + +```json +{ + "task": "statistics", + "dump_path": "/home/data_dump", + "rank": [], + "step": [], + "level": "L1", + "async_dump": false, + "statistics": { + "scope": [], + "list": [], + "tensor_list": [], + "data_mode": ["all"], + "summary_mode": "statistics" + } +} +``` + +Example for `task: tensor`: + +```json +{ + "task": "tensor", + "dump_path": "/home/data_dump", + "rank": [], + "step": [], + "level": "L1", + "async_dump": false, + "tensor": { + "scope": [], + "list": [], + "data_mode": ["all"], + "bench_path": "/home/bench_data_dump", + "summary_mode": "md5", + "diff_nums": 5 + } +} +``` + +## Minimal example + +The following example enables PrecisionDebugger on steps `1` and `2` for rank +`0`: + +```yaml +global_profiler: + tool: precision_debugger + steps: [1, 2] + global_tool_config: + precision_debugger: + _target_: verl.utils.profiler.config.PrecisionDebuggerToolConfig + enable: True + config_path: /path/to/dump_config.json + data_dir: outputs/precision_debug + steps: [1, 2] + stages: + - actor_compute_log_prob + - ref_compute_log_prob + - actor_update + strict: False + +actor_rollout_ref: + actor: + profiler: + enable: True + ranks: [0] + ref: + profiler: + enable: True + ranks: [0] +``` + +## Output layout + +Verl organizes PrecisionDebugger output by training global step and stage. +Inside each stage directory, msprobe creates its own `step*/rank*` layout. 
+ +Example: + +```text +outputs/precision_debug/ + step_1/ + actor_compute_log_prob/step0/rank0/dump.json + actor_update/step0/rank0/dump.json + ref_compute_log_prob/step0/rank0/dump.json + step_2/ + actor_compute_log_prob/step0/rank0/dump.json + actor_update/step0/rank0/dump.json + ref_compute_log_prob/step0/rank0/dump.json +``` + +Observed output from a real run: + +* Outer `step_` directories are created by verl. +* Inner `step0/rank0/dump.json` directories are created by msprobe. +* With the current integration, each profiled stage is collected in an + independent dump session, so stage-local output typically lands in `step0`. + +## How results are written + +The verl integration wraps each profiled stage with: + +* `debugger.start(model=...)` +* execute the stage +* `debugger.stop()` +* `service.reset_status()` if the msprobe runtime exposes it + +Verl does **not** manually call `debugger.step()` in the current integration. +Instead, each stage writes to its own dump directory and resets msprobe runtime +status after `stop()` to avoid stale `dump.json` cache growth across stages. + +For L0 collection, PrecisionDebugger must bind to the actual model used in the +stage. The profiler resolves the model inside +`verl/utils/profiler/precision_debugger_profile.py` and supports both legacy +workers and the newer model-engine worker path. 
+
+## Overhead and disk usage
+
+Below are measurements from a real PPO run on Ascend with:
+
+* model: `Qwen2-0.5B`
+* profiled steps: `[1, 2]`
+* rank: `0`
+* stages:
+  * L1: `actor_compute_log_prob`, `ref_compute_log_prob`, `actor_update`
+  * L0: `actor_compute_log_prob`, `ref_compute_log_prob`, `compute_values`,
+    `critic_update`, `actor_update`
+
+### Time overhead
+
+| Run | Model | Profiled steps | Measured step time |
+|---|---|---:|---:|
+| Baseline | `Qwen2-0.5B` | None | about `16-18 s/step` in steady state |
+| L0 | `Qwen2-0.5B` | `step 1` | `66.81 s` |
+| L0 | `Qwen2-0.5B` | `step 2` | `48.78 s` |
+| L0 | `Qwen2-0.5B` | non-profiled later steps | about `17 s/step` |
+| L1 | `Qwen2-0.5B` | `step 1` | `177.35 s` |
+| L1 | `Qwen2-0.5B` | `step 2` | `161.80 s` |
+| L1 | `Qwen2-0.5B` | non-profiled later steps | about `17 s/step` |
+
+In this experiment, profiled L0 steps were about `3x-4x` slower than the
+baseline steady-state step time, and profiled L1 steps were about `9x-10x`
+slower. Non-profiled later steps remained close to baseline in both cases.
+
+In general, PrecisionDebugger should be treated as a heavy-weight precision
+debugging tool rather than a lightweight profiler. For larger models or broader
+stage coverage, it is common for profiled steps to become tens of times slower
+than baseline.
+ +### Disk usage + +| Level | Model | Stages | Scope | Disk usage | +|---|---|---|---|---:| +| L1 | `Qwen2-0.5B` | `actor_compute_log_prob`, `ref_compute_log_prob`, `actor_update` | total for `step_1` and `step_2` | `21 MB` | +| L1 | `Qwen2-0.5B` | `actor_compute_log_prob`, `ref_compute_log_prob`, `actor_update` | per step | about `11 MB` | +| L1 | `Qwen2-0.5B` | `actor_update` | per step | about `5.1-5.2 MB` | +| L1 | `Qwen2-0.5B` | `actor_compute_log_prob` | per step | about `2.6 MB` | +| L1 | `Qwen2-0.5B` | `ref_compute_log_prob` | per step | about `2.6 MB` | +| L0 | `Qwen2-0.5B` | `actor_compute_log_prob`, `ref_compute_log_prob`, `actor_update` | total for `step_1` and `step_2` | `8.8 MB` | +| L0 | `Qwen2-0.5B` | `actor_compute_log_prob`, `ref_compute_log_prob`, `actor_update` | per step | about `4.4 MB` | +| L0 | `Qwen2-0.5B` | `actor_update` | per step | about `2.5 MB` | +| L0 | `Qwen2-0.5B` | `actor_compute_log_prob` | per step | about `1.1 MB` | +| L0 | `Qwen2-0.5B` | `ref_compute_log_prob` | per step | about `0.86-0.87 MB` | + +In this experiment, total L1 disk usage was about `2.4x` the L0 disk usage for +the measured actor/ref stage set. 
+ +These numbers depend on: + +* selected stages +* number of profiled steps +* dump level and task +* model shape and sequence length + +## How to analyze results + +At minimum, check: + +* which `step_` directory was generated +* which stage directories exist under that step +* whether `dump.json` exists under `step0/rank0` + +For downstream analysis, use standard msprobe tools such as: + +* `msprobe compare` +* `msprobe visualization` + +Example compare usage: + +```bash +msprobe compare \ + --target-path /path/to/target_dump/dump.json \ + --golden-path /path/to/golden_dump/dump.json +``` + +You can compare: + +* the same stage across two runs +* different global steps of the same stage +* different ranks when multi-rank collection is enabled + +For more advanced analysis workflows, refer to the official msprobe +documentation for compare and visualization commands. + +## Usage notes + +* Verl integrates PrecisionDebugger through `DistProfiler.annotate` wrappers on + training stages. +* PrecisionDebugger is automatically discrete: each profiled stage is + collected in an independent `start -> stop -> reset_status` session. It does + not currently expose the unified profiler `discrete` configuration used by + tools such as `nsys` or `npu`. +* `global_steps` is read from batch `meta_info` or from worker attributes. +* If `strict` is `True`, missing msprobe or unknown stages raise errors. +* If a stage prints `PrecisionDebugger model not resolved`, that stage ran + normally but no dump was collected because verl could not bind msprobe to a + valid model object. +* Because dump cost is high, prefer collecting a small number of representative + steps first, then narrow the stage set if necessary. 
+ +## Troubleshooting + +### No dump directory is generated + +Check: + +* `global_profiler.tool=precision_debugger` +* `global_profiler.steps` contains the target step +* role profiler is enabled for the target role +* current rank is included in `profiler.ranks` +* msprobe is installed in the training environment + +### `PrecisionDebugger model not resolved` + +This means the stage was reached, but verl could not find the actual model used +by that worker. The stage itself still runs, but dump is skipped. This usually +indicates: + +* a new worker path was introduced and profiler model resolution needs to be + updated +* the role or engine backend differs from the paths currently supported by the + resolver + +### `dump.json` keeps growing unexpectedly + +If `stop()` is called without resetting msprobe runtime state, cached dump data +may continue to accumulate across stage invocations. The current verl +integration resets msprobe runtime status after `stop()` when the service API +supports it. diff --git a/docs/ascend_tutorial/quick_start/ascend_quick_start.rst b/docs/ascend_tutorial/quick_start/ascend_quick_start.rst index 89745815d42..1bd85207c9b 100644 --- a/docs/ascend_tutorial/quick_start/ascend_quick_start.rst +++ b/docs/ascend_tutorial/quick_start/ascend_quick_start.rst @@ -26,10 +26,10 @@ Atlas 800T A3 ----------------------------------- -DockerFile镜像构建 & 获取 & 使用 +Dockerfile镜像构建 & 获取 & 使用 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -如需要通过 DockerFile 构建镜像,或希望使用基于 verl 构建的镜像,请参考 `文档 `_ +如需要通过 Dockerfile 构建镜像,或希望使用基于 verl 构建的镜像,请参考 `文档 `_ 如果想直接获取镜像,请前往`quay.io/ascend/verl `_ 进行获取,镜像中已包含基础环境和依赖软件包。 安装基础环境 @@ -100,7 +100,7 @@ DockerFile镜像构建 & 获取 & 使用 git clone --depth 1 --branch v0.13.0 https://github.com/vllm-project/vllm.git cd vllm && pip install -r requirements/build.txt - VLLM_TARGET_DEVICE=empty pip install -v -e. && cd .. + VLLM_TARGET_DEVICE=empty pip install -v -e . && cd .. 3. 
vllm-ascend 源码安装指令: @@ -123,9 +123,9 @@ MindSpeed 源码安装指令: cd MindSpeed && git checkout 2.3.0_core_r0.12.1 && cd .. git clone --depth 1 --branch core_v0.12.1 https://github.com/NVIDIA/Megatron-LM.git - # 安装 MindSpeed & Megatron - pip install -e MindSpeed + # 安装 Megatron & MindSpeed pip install -e Megatron-LM + pip install -e MindSpeed # 安装 mbridge pip install mbridge @@ -139,6 +139,36 @@ MindSpeed 对应 Megatron-LM 后端使用场景,使用方式如下: 3. 更多特性信息可参考 `MindSpeed & verl 文档 `_ 。 +新增 MindSpeed-LLM 训练后端支持 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +如需使能 MindSpeed-LLM 训练后端,需要额外下载 MindSpeed-LLM 。 +需要注意的是,MindSpeed-LLM 训练后端依赖 MindSpeed-LLM master 分支、 MindSpeed master 分支以及 Megatron-LM core_v0.12.1 分支。 +MindSpeed-LLM 及相关依赖的源码安装指令: + + .. code-block:: bash + + # 下载 MindSpeed-LLM、 MindSpeed 和 Megatron-LM + git clone https://gitcode.com/Ascend/MindSpeed-LLM.git + git clone https://gitcode.com/Ascend/MindSpeed.git + git clone --depth 1 --branch core_v0.12.1 https://github.com/NVIDIA/Megatron-LM.git + + # 配置环境变量 + export PYTHONPATH=$PYTHONPATH:your path/Megatron-LM + export PYTHONPATH=$PYTHONPATH:your path/MindSpeed + export PYTHONPATH=$PYTHONPATH:your path/MindSpeed-LLM + + # 安装 mbridge + pip install mbridge + +MindSpeed-LLM 对应 Megatron-LM 后端使用场景,使用方式如下: + + 1. 使能 verl worker 模型 ``strategy`` 配置为 ``mindspeed`` ,例如 ``actor_rollout_ref.actor.strategy=mindspeed``。 + + 2. MindSpeed-LLM 自定义入参可通过 ``llm_kwargs`` 参数传入,例如对 MOE 模型开启 GMM 特性可使用 ``+actor_rollout_ref.actor.mindspeed.llm_kwargs.moe_grouped_gemm=True``。 + + 3. 
更多特性信息可参考 `MindSpeed-LLM 内的特性文档 `_ 。 + + 安装verl ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -232,45 +262,49 @@ verl 中昇腾暂不支持生态库如下: **表1** RL类算法 - +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ - | algorithm | model | download link | actor.strategy | rollout.name | shell location | hardware | - +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ - | GRPO | Qwen2.5-7B-instruct |`7B `_ | FSDP | vllm-ascend |`qwen2_5_7b_grpo_npu `_ | Atlas 200T A2 Box16 | - +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ - | GRPO | Qwen2.5-32B-instruct |`32B `_ | FSDP | vllm-ascend |`qwen2_5_32b_grpo_npu `_ | Atlas 200T A2 Box16 | - +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ - | GRPO | Qwen2.5-VL-3B-instruct |`3B `_ | FSDP | vllm-ascend |`qwen2_5_vl_3b_npu `_ | Atlas 200T A2 Box16 | - 
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ - | GRPO | Qwen2.5-VL-7B-instruct |`7B `_ | FSDP | vllm-ascend |`qwen2_5_vl_7b_npu `_ | Atlas 200T A2 Box16 | - +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ - | GRPO | Qwen2.5-VL-32B-instruct |`32B `_ | FSDP | vllm-ascend |`qwen2_5_vl_32b_npu `_ | Atlas 200T A2 Box16 | - +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ - | GRPO | Qwen3-4B |`4B `_ | FSDP | vllm-ascend |`qwen3-4B_npu `_ | Atlas 800T A3 | - +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ - | GRPO | Qwen3-8B |`8B `_ | FSDP | vllm-ascend |`qwen3_8b_vllm_npu `_ | Atlas 200T A2 Box16 | - 
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ - | GRPO | Qwen3-8B |`8B `_ | FSDP | sglang |`qwen3_8b_sglang_npu `_ | Atlas 200T A2 Box16 | - +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ - | GRPO | Qwen3-32B |`32B `_ | FSDP | vllm-ascend |`qwen3-32B_npu `_ | Atlas 200T A2 Box16 | - +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ - | GRPO | DeepSeekv3-671B |`671B `_ | Megatron | vllm-ascend |`deepseek_v3_megatron_npu `_ | Atlas 200T A2 Box16 | - +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ - | DAPO | Qwen2.5-7B-instruct |`7B `_ | FSDP | vllm-ascend |`qwen2.5_7b_npu `_ | Atlas 200T A2 Box16 | - 
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ - | DAPO | Qwen2.5-32B |`32B `_ | FSDP | vllm-ascend |`qwen2.5_32b_npu `_ | Atlas 200T A2 Box16 | - +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ - | DAPO | Qwen3-8B-base |`8B `_ | FSDP | vllm-ascend |`qwen3_8b_npu `_ | Atlas 200T A2 Box16 | - +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ - | DAPO | Qwen3-14B-base |`14B `_ | FSDP | vllm-ascend |`qwen3_14b_npu `_ | Atlas 200T A2 Box16 | - +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ - | DAPO | Qwen3-30B-A3B-base |`30B `_ | FSDP | vllm-ascend |`qwen3_30b_fsdp_npu `_ | Atlas 200T A2 Box16 | - 
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ - | DAPO | Qwen3-30B-A3B-base |`30B `_ | Megatron | vllm-ascend |`qwen3_30b_megatron_npu `_ | Atlas 200T A2 Box16 | - +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ - | PPO | Qwen3-8B |`8B `_ | FSDP | vllm-ascend |`qwen3_8b_ppo_npu `_ | Atlas 900 A2 PODc | - +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ - | One_Step_Off_Policy | Qwen3-8B |`8B `_ | FSDP2 | vllm-ascend |`qwen3_8b_fsdp2_npu `_ | Atlas 800T A3 | - +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | algorithm | model | 
download link | actor.strategy | rollout.name | shell location | hardware | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | GRPO | Qwen2.5-7B-instruct |`7B `_ | FSDP | vllm-ascend |`qwen2_5_7b_grpo_npu `_ | Atlas 200T A2 Box16 | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | GRPO | Qwen2.5-32B-instruct |`32B `_ | FSDP | vllm-ascend |`qwen2_5_32b_grpo_npu `_ | Atlas 200T A2 Box16 | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | GRPO | Qwen2.5-VL-3B-instruct |`3B `_ | FSDP | vllm-ascend |`qwen2_5_vl_3b_npu `_ | Atlas 200T A2 Box16 | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | GRPO | Qwen2.5-VL-7B-instruct |`7B `_ | FSDP | vllm-ascend |`qwen2_5_vl_7b_npu `_ | Atlas 200T A2 Box16 | + 
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | GRPO | Qwen2.5-VL-32B-instruct |`32B `_ | FSDP | vllm-ascend |`qwen2_5_vl_32b_npu `_ | Atlas 200T A2 Box16 | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | GRPO | Qwen3-4B |`4B `_ | FSDP | vllm-ascend |`qwen3-4B_npu `_ | Atlas 800T A3 | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | GRPO | Qwen3-8B |`8B `_ | FSDP | vllm-ascend |`qwen3_8b_vllm_npu `_ | Atlas 200T A2 Box16 | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | GRPO | Qwen3-8B |`8B `_ | FSDP | sglang |`qwen3_8b_sglang_npu `_ | Atlas 200T A2 Box16 | + 
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | GRPO | Qwen3-32B |`32B `_ | FSDP | vllm-ascend |`qwen3-32B_npu `_ | Atlas 200T A2 Box16 | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | GRPO | Qwen3-30B-A3B |`30B `_ | MindSpeed_LLM | sglang |`qwen3_30b_mindspeedllm_npu `_ | Atlas 800T A3 | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | GRPO | Qwen3-32B |`32B `_ | MindSpeed_LLM | sglang |`qwen3_32b_mindspeedllm_npu `_ | Atlas 800T A3 | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | GRPO | DeepSeekv3-671B |`671B `_ | Megatron | vllm-ascend |`deepseek_v3_megatron_npu `_ | Atlas 200T A2 Box16 | + 
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | DAPO | Qwen2.5-7B-instruct |`7B `_ | FSDP | vllm-ascend |`qwen2.5_7b_npu `_ | Atlas 200T A2 Box16 | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | DAPO | Qwen2.5-32B |`32B `_ | FSDP | vllm-ascend |`qwen2.5_32b_npu `_ | Atlas 200T A2 Box16 | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | DAPO | Qwen3-8B-base |`8B `_ | FSDP | vllm-ascend |`qwen3_8b_npu `_ | Atlas 200T A2 Box16 | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | DAPO | Qwen3-14B-base |`14B `_ | FSDP | vllm-ascend |`qwen3_14b_npu `_ | Atlas 200T A2 Box16 | + 
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | DAPO | Qwen3-30B-A3B-base |`30B `_ | FSDP | vllm-ascend |`qwen3_30b_fsdp_npu `_ | Atlas 200T A2 Box16 | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | DAPO | Qwen3-30B-A3B-base |`30B `_ | Megatron | vllm-ascend |`qwen3_30b_megatron_npu `_ | Atlas 200T A2 Box16 | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | PPO | Qwen3-8B |`8B `_ | FSDP | vllm-ascend |`qwen3_8b_ppo_npu `_ | Atlas 900 A2 PODc | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | One_Step_Off_Policy | Qwen3-8B |`8B `_ | FSDP2 | vllm-ascend |`qwen3_8b_fsdp2_npu `_ | Atlas 800T A3 | + 
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+-----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ **表2** SFT类算法 diff --git a/docs/ascend_tutorial/quick_start/ascend_sglang_quick_start.rst b/docs/ascend_tutorial/quick_start/ascend_sglang_quick_start.rst index ac9b69daf68..7f9037b1236 100644 --- a/docs/ascend_tutorial/quick_start/ascend_sglang_quick_start.rst +++ b/docs/ascend_tutorial/quick_start/ascend_sglang_quick_start.rst @@ -20,21 +20,21 @@ Atlas 800T A3 关键支持版本 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -+-----------+-----------------+ -| software | version | -+===========+=================+ -| Python | == 3.11 | -+-----------+-----------------+ -| HDK | >= 25.3.RC1 | -+-----------+-----------------+ -| CANN | >= 8.3.RC1 | -+-----------+-----------------+ -| torch | >= 2.7.1 | -+-----------+-----------------+ -| torch_npu | >= 2.7.1.post2 | -+-----------+-----------------+ -| sglang | v0.5.8 | -+-----------+-----------------+ ++-----------+----------------+ +| software | version | ++===========+================+ +| Python | ==3.11 | ++-----------+----------------+ +| HDK | >=25.3.RC1 | ++-----------+----------------+ +| CANN | >=8.3.RC1 | ++-----------+----------------+ +| torch | >=2.7.1 | ++-----------+----------------+ +| torch_npu | >=2.7.1.post2 | ++-----------+----------------+ +| sglang | v0.5.8 | ++-----------+----------------+ 从 Docker 镜像进行安装 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -47,11 +47,11 @@ Atlas 800T A3 异构计算架构CANN(Compute Architecture for Neural Networks)是昇腾针对AI场景推出的异构计算架构, 为了使训练和推理引擎能够利用更好、更快的硬件支持, 我们需要安装以下 `先决条件 `_ -+-----------+-------------+ -| HDK | >= 25.3.RC1 | -+-----------+-------------+ -| CANN | >= 8.3.RC1 | -+-----------+-------------+ ++-----------+------------+ +| HDK | >=25.3.RC1 | 
++-----------+------------+ +| CANN | >=8.3.RC1 | ++-----------+------------+ 安装完成后请激活环境 .. code-block:: bash @@ -139,14 +139,14 @@ vllm后端推理脚本转换为sglang, 需要添加修改以下参数 .. code-block:: bash #必须 - actor_rollout_ref.rollout.name=sglang - +actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend="ascend" + actor_rollout_ref.rollout.name=sglang \ + +actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend="ascend" \ #可选 #使能推理EP,详细使用方法见 https://github.com/sgl-project/sgl-kernel-npu/blob/main/python/deep_ep/README_CN.md - ++actor_rollout_ref.rollout.engine_kwargs.sglang.deepep_mode="auto" - ++actor_rollout_ref.rollout.engine_kwargs.sglang.moe_a2a_backend="deepep" + ++actor_rollout_ref.rollout.engine_kwargs.sglang.deepep_mode="auto" \ + ++actor_rollout_ref.rollout.engine_kwargs.sglang.moe_a2a_backend="deepep" \ #Moe模型多DP时必须设置为True - +actor_rollout_ref.rollout.engine_kwargs.sglang.enable_dp_attention=False + +actor_rollout_ref.rollout.engine_kwargs.sglang.enable_dp_attention=False \ #chunked_prefill默认关闭 +actor_rollout_ref.rollout.engine_kwargs.sglang.chunked_prefill_size=-1 diff --git a/docs/ascend_tutorial/quick_start/dockerfile_build_guidance.rst b/docs/ascend_tutorial/quick_start/dockerfile_build_guidance.rst index 507d8497784..b1a455234b3 100644 --- a/docs/ascend_tutorial/quick_start/dockerfile_build_guidance.rst +++ b/docs/ascend_tutorial/quick_start/dockerfile_build_guidance.rst @@ -29,7 +29,7 @@ Atlas 800T A3 ---------------- ================= ============ -组件 版本 +组件 版本 ================= ============ 基础镜像 Ubuntu 22.04 Python 3.11 @@ -42,7 +42,7 @@ vLLM-ascend 0.13.0 Megatron-LM v0.12.1 MindSpeed 2.3.0_core_r0.12.1 triton-ascend 3.2.0 -mbridge latest version +mbridge 0.15.1 SGLang v0.5.8 sgl-kernel-npu (46b73de) ================= ============ @@ -61,7 +61,7 @@ A2 8.3.RC1 SGLang `Dockerfile.ascend.sglang_8.3.rc1 A3 8.2.RC1 vLLM `Dockerfile.ascend_8.2.rc1_a3 `_ A3 8.3.RC1 vLLM `Dockerfile.ascend_8.3.rc1_a3 `_ A3 8.5.0 vLLM `Dockerfile.ascend_8.5.0_a3 `_ -A3 
8.3.RC1 SGLang `Dockerfile.ascend.sglang_8.3.rc1_a3 `_ +A3 8.3.RC1 SGLang `Dockerfile.ascend.sglang_8.3.rc1_a3 `_ ============== ============== ============== ============================================================== diff --git a/docs/contributing/editing-agent-instructions.md b/docs/contributing/editing-agent-instructions.md new file mode 100644 index 00000000000..a1e7cba84b0 --- /dev/null +++ b/docs/contributing/editing-agent-instructions.md @@ -0,0 +1,91 @@ +# Editing Agent Instructions + +> Read this before modifying `AGENTS.md` or any guide it links to. + +## File Layout + +| Generic | Variants | Audience | Scope | +| ----------- | -------------------------- | --------------- | ------------------------------ | +| `AGENTS.md` | `CLAUDE.md`, ... | Agents | Project-wide instructions. | +| `.agent/` | `.claude/`, `.codex/`, ... | Agents | Agent specification directory. | +| `docs/` | N/A | Humans & Agents | Project documentation. | + +Generic files are framework-agnostic. Variants directly symlink generic files or refer to them and add framework-specific content. + +## Token Budget Mindset + +`AGENTS.md` loads on every agent request; domain guides load on entry to a relevant area. +Keep `AGENTS.md` under **200 lines** and each domain guide under **300 lines**. +When a file exceeds its budget, split or prune — do not compress prose to fit. + +## When NOT to Add Content + +Before writing a new rule, ask whether it is actually needed: + +- **Agents already do it.** Test with a prompt first. If the agent behaves correctly without the rule, don't add it. +- **One-off incident.** Prefer a code-level fix (lint rule, CI check, test assertion) over a new doc rule. +- **Hardcoded paths.** File paths change; use "search for X" patterns instead. +- **Upstream docs.** Don't reproduce pytest, ruff, or other tool docs — link to them. +- **Contradicts an existing rule.** Search all linked guides before adding. If two rules conflict, consolidate into one. 
+- **Already covered elsewhere.** Search `AGENTS.md` and every linked guide for overlapping guidance. + +If any of the above apply, **do not add the content**. + +## Where Content Belongs + +The goal is a lean `AGENTS.md` plus rich domain guides that teach agents what they can't learn from the code alone. + +| Scope | File | +| ------------------------------------------------------------------------------------------------ | ------------ | +| Project-wide invariants (contribution policy, env setup, test/lint commands, commit conventions) | `AGENTS.md` | +| Area-specific knowledge (model patterns, format details, deprecation timelines) | Domain guide | + +**Rules of thumb:** + +- If it only matters for one area, put it in a domain guide. +- If it matters for all areas, consider `AGENTS.md` — but first verify agents don't already do it. +- Create a new domain guide when you have 5 or more non-obvious instructions sharing a coherent scope. + +## What Makes a Good Domain Guide + +Add what agents can't infer from the code or public docs: project-specific +conventions that differ from standard patterns, correct approaches that require +cross-file context, and fixes for repeated mistakes. +Each entry should be short, specific, and actionable — e.g., which files to +touch, what order to change them in, and which tests to run. + +## Keeping Docs Lean + +- Every addition should trigger review of surrounding content for stale or redundant items. +- Refer to existing files (e.g., "follow the PR template") instead of restating their content — keep a single source of truth. +- Prefer examples over explanations — a 3-line snippet beats a paragraph of prose. +- Merge related bullets into one principle instead of listing variants. +- Use `search for X` instead of hardcoded file paths. +- PR references are fine in domain guides for traceability, but avoid them in `AGENTS.md`. 
+ +## Anti-Patterns + +| Pattern | Problem | +| ------------------------- | ----------------------------------------------------------------------- | +| Reactive accumulation | Adding a rule per incident without pruning leads to bloat | +| Copy-paste between guides | Duplicated content drifts apart; keep in one place, link from the other | +| Imperative walls | Long DO NOT lists that agents skim past; consolidate into principles | +| Config snapshots | Show the command to get the value, not the value itself | + +## Change Checklist + +Before submitting changes to any agent instruction file: + +- [ ] **Non-obvious?** Would an agent do the wrong thing without this rule? +- [ ] **No conflicts?** Searched all linked guides for contradictions? +- [ ] **Right file?** Project-wide goes in `AGENTS.md`, area-specific in a domain guide? +- [ ] **Offset the addition?** Removed or consolidated something to compensate? +- [ ] **Under budget?** `AGENTS.md` < 200 lines, domain guides < 300 lines? +- [ ] **No hardcoded paths?** Uses "search for X" where paths may change? +- [ ] **Tested?** Verified that an agent actually follows the new instruction? + +## Acknowledgements + +This guide is adapted from the [vLLM project](https://github.com/vllm-project/vllm)'s [`editing-agent-instructions.md`](https://github.com/vllm-project/vllm/blob/main/docs/contributing/editing-agent-instructions.md). 
+ +Last updated: 04/01/2026 diff --git a/docs/examples/config.rst b/docs/examples/config.rst index b1053903510..a91b67c5a98 100644 --- a/docs/examples/config.rst +++ b/docs/examples/config.rst @@ -131,7 +131,7 @@ Actor/Rollout/Reference Policy # Rollout Correction (corrects distribution mismatch between rollout and training) rollout_correction: rollout_is: token # IS weights - rollout_is_threshold: 2.0 # Upper threshold for IS weights + rollout_is_threshold: 2.0 # TIS upper bound, or "0.5_5.0" for IcePop rollout_rs: null # Rejection sampling rollout_rs_threshold: null # RS upper threshold use_torch_compile: True # False to disable torch compile diff --git a/docs/examples/gsm8k_example.rst b/docs/examples/gsm8k_example.rst index 7b89d1943d1..101b6c0f192 100644 --- a/docs/examples/gsm8k_example.rst +++ b/docs/examples/gsm8k_example.rst @@ -164,8 +164,8 @@ The script of run_deepseek7b_llm.sh critic.model.path=deepseek-ai/deepseek-llm-7b-chat \ critic.model.enable_gradient_checkpointing=True \ critic.ppo_micro_batch_size_per_gpu=32 \ - critic.model.fsdp_config.param_offload=False \ - critic.model.fsdp_config.optimizer_offload=False \ + critic.fsdp.param_offload=False \ + critic.fsdp.optimizer_offload=False \ algorithm.kl_ctrl.kl_coef=0.001 \ trainer.critic_warmup=0 \ trainer.logger='["console","wandb"]' \ diff --git a/docs/index.rst b/docs/index.rst index 381d3a6bad9..bd1fa5dc981 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -108,6 +108,7 @@ verl is fast with: perf/verl_profiler_system.md perf/nsight_profiling.md perf/torch_profiling.md + ascend_tutorial/profiling/precision_debugger.md .. toctree:: :maxdepth: 1 @@ -138,6 +139,7 @@ verl is fast with: data/transfer_queue.md advance/grafana_prometheus.md advance/fp8.md + advance/nvfp4_qat.md advance/async-on-policy-distill advance/mtp.md @@ -184,6 +186,12 @@ verl is fast with: faq/faq +.. toctree:: + :maxdepth: 1 + :caption: Contributing + + contributing/editing-agent-instructions.md + .. 
toctree:: :maxdepth: 1 :caption: Development Notes diff --git a/docs/start/multinode.rst b/docs/start/multinode.rst index 4dd7d174aa4..47c2bcb3f0c 100644 --- a/docs/start/multinode.rst +++ b/docs/start/multinode.rst @@ -796,8 +796,8 @@ slurm_script.sh critic.model.path=$MODEL_PATH \ critic.model.enable_gradient_checkpointing=False \ critic.ppo_micro_batch_size_per_gpu=8 \ - critic.model.fsdp_config.param_offload=False \ - critic.model.fsdp_config.optimizer_offload=False \ + critic.fsdp.param_offload=False \ + critic.fsdp.optimizer_offload=False \ algorithm.kl_ctrl.kl_coef=0.0001 \ trainer.critic_warmup=0 \ trainer.logger='["console","wandb"]' \ diff --git a/docs/workers/sglang_worker.rst b/docs/workers/sglang_worker.rst index 08cc48a075d..6edb8871730 100644 --- a/docs/workers/sglang_worker.rst +++ b/docs/workers/sglang_worker.rst @@ -73,8 +73,8 @@ We use Qwen/Qwen2-7B-Instruct on the gsm8k dataset for a simple test. critic.optim.lr=1e-5 \ critic.model.path=Qwen/Qwen2-7B-Instruct \ critic.ppo_micro_batch_size_per_gpu=4 \ - critic.model.fsdp_config.param_offload=True \ - critic.model.fsdp_config.optimizer_offload=True \ + critic.fsdp.param_offload=True \ + critic.fsdp.optimizer_offload=True \ algorithm.kl_ctrl.kl_coef=0.001 \ trainer.logger=console \ trainer.val_before_train=False \ @@ -224,8 +224,8 @@ You can see that the cluster has two nodes with 16 GPUs: critic.model.path=meta-llama/Llama-3.1-8B-Instruct \ critic.model.enable_gradient_checkpointing=True \ critic.ppo_micro_batch_size=16 \ - critic.model.fsdp_config.param_offload=True \ - critic.model.fsdp_config.optimizer_offload=True \ + critic.fsdp.param_offload=True \ + critic.fsdp.optimizer_offload=True \ algorithm.kl_ctrl.kl_coef=0.001 \ trainer.critic_warmup=0 \ trainer.logger=console \ diff --git a/examples/flowgrpo_trainer/diffusers/qwen_image.py b/examples/flowgrpo_trainer/diffusers/qwen_image.py new file mode 100644 index 00000000000..3905fbcb3c8 --- /dev/null +++ 
b/examples/flowgrpo_trainer/diffusers/qwen_image.py @@ -0,0 +1,146 @@ +# Copyright 2026 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Qwen-Image diffusion model implementation for FlowGRPO training. +""" + +from typing import Optional + +import numpy as np +import torch +from diffusers.models.transformers.transformer_qwenimage import QwenImageTransformer2DModel +from diffusers.pipelines.qwenimage.pipeline_qwenimage import calculate_shift +from tensordict import TensorDict + +from verl.models.diffusers_model import DiffusionModelBase +from verl.utils import tensordict_utils as tu +from verl.utils.device import get_device_name +from verl.workers.config import DiffusionModelConfig + +from ..scheduler import FlowMatchSDEDiscreteScheduler + + +@DiffusionModelBase.register("QwenImagePipeline") +class QwenImage(DiffusionModelBase): + @classmethod + def build_scheduler(cls, model_config: DiffusionModelConfig): + scheduler = FlowMatchSDEDiscreteScheduler.from_pretrained( + pretrained_model_name_or_path=model_config.local_path, subfolder="scheduler" + ) + cls.set_timesteps(scheduler, model_config, get_device_name()) + return scheduler + + @classmethod + def set_timesteps(cls, scheduler: FlowMatchSDEDiscreteScheduler, model_config: DiffusionModelConfig, device: str): + vae_scale_factor = 8 + latent_height, latent_width = ( + model_config.height // vae_scale_factor // 2, + model_config.width // vae_scale_factor // 2, + ) + 
num_inference_steps = model_config.num_inference_steps + sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) + mu = calculate_shift( + latent_height * latent_width, + scheduler.config.get("base_image_seq_len", 256), + scheduler.config.get("max_image_seq_len", 4096), + scheduler.config.get("base_shift", 0.5), + scheduler.config.get("max_shift", 1.15), + ) + scheduler.set_timesteps(num_inference_steps, device=device, sigmas=sigmas, mu=mu) + + @classmethod + def prepare_model_inputs( + cls, + module: QwenImageTransformer2DModel, + model_config: DiffusionModelConfig, + latents: torch.Tensor, + timesteps: torch.Tensor, + prompt_embeds: torch.Tensor, + prompt_embeds_mask: torch.Tensor, + negative_prompt_embeds: torch.Tensor, + negative_prompt_embeds_mask: torch.Tensor, + micro_batch: TensorDict, + step: int, + ) -> tuple[dict, dict]: + height = tu.get_non_tensor_data(data=micro_batch, key="height", default=None) + width = tu.get_non_tensor_data(data=micro_batch, key="width", default=None) + vae_scale_factor = tu.get_non_tensor_data(data=micro_batch, key="vae_scale_factor", default=None) + img_shapes = [[(1, height // vae_scale_factor // 2, width // vae_scale_factor // 2)]] * latents.shape[0] + + guidance_scale = model_config.extra_configs.get("guidance_scale", None) + if getattr(module.config, "guidance_embeds", False): + guidance = torch.full([1], guidance_scale, device=timesteps.device, dtype=torch.float32) + else: + guidance = None + + hidden_states = latents[:, step] + timestep = timesteps[:, step] / 1000.0 + + model_inputs = { + "hidden_states": hidden_states, + "timestep": timestep, + "guidance": guidance, + "encoder_hidden_states_mask": prompt_embeds_mask, + "encoder_hidden_states": prompt_embeds, + "img_shapes": img_shapes, + "return_dict": False, + } + + negative_model_inputs = { + "hidden_states": hidden_states, + "timestep": timestep, + "guidance": guidance, + "encoder_hidden_states_mask": negative_prompt_embeds_mask, + 
"encoder_hidden_states": negative_prompt_embeds, + "img_shapes": img_shapes, + "return_dict": False, + } + + return model_inputs, negative_model_inputs + + @classmethod + def forward_and_sample_previous_step( + cls, + module: QwenImageTransformer2DModel, + scheduler: FlowMatchSDEDiscreteScheduler, + model_config: DiffusionModelConfig, + model_inputs: dict[str, torch.Tensor], + negative_model_inputs: Optional[dict[str, torch.Tensor]], + scheduler_inputs: Optional[TensorDict | dict[str, torch.Tensor]], + step: int, + ): + assert scheduler_inputs is not None + latents = scheduler_inputs["all_latents"] + timesteps = scheduler_inputs["all_timesteps"] + + noise_pred = module(**model_inputs)[0] + true_cfg_scale = model_config.extra_configs.get("true_cfg_scale", 1.0) + if true_cfg_scale > 1.0: + assert negative_model_inputs is not None + neg_noise_pred = module(**negative_model_inputs)[0] + comb_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred) + cond_norm = torch.norm(noise_pred, dim=-1, keepdim=True) + noise_norm = torch.norm(comb_pred, dim=-1, keepdim=True) + noise_pred = comb_pred * (cond_norm / noise_norm) + + _, log_prob, prev_sample_mean, std_dev_t = scheduler.sample_previous_step( + sample=latents[:, step].float(), + model_output=noise_pred.float(), + timestep=timesteps[:, step], + noise_level=model_config.extra_configs.get("noise_level", None), + prev_sample=latents[:, step + 1].float(), + sde_type=model_config.extra_configs.get("sde_type", None), + return_logprobs=True, + ) + return log_prob, prev_sample_mean, std_dev_t diff --git a/examples/flowgrpo_trainer/reward_fn.py b/examples/flowgrpo_trainer/reward_fn.py new file mode 100644 index 00000000000..a9f6bcf4a4a --- /dev/null +++ b/examples/flowgrpo_trainer/reward_fn.py @@ -0,0 +1,152 @@ +# Copyright 2026 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +from typing import Optional + +import aiohttp +import numpy as np +import torch +from openai.types.chat import ChatCompletion +from PIL import Image +from transformers import PreTrainedTokenizer + + +async def chat_complete(router_address: str, chat_complete_request: dict): + url = f"http://{router_address}/v1/chat/completions" + try: + timeout = aiohttp.ClientTimeout(total=None) + session = aiohttp.ClientSession(timeout=timeout) + async with session.post(url, json=chat_complete_request) as resp: + output = await resp.text() + output = json.loads(output) + return ChatCompletion(**output) + except Exception as e: + raise e + finally: + await session.close() + + +async def compute_score_ocr( + data_source: str, + solution_image: Image.Image | np.ndarray | torch.Tensor, + ground_truth: str, + extra_info: dict, + reward_router_address: str, + reward_model_tokenizer: PreTrainedTokenizer = None, + model_name: Optional[str] = None, +): + """ + Compute the image OCR score via a generative reward model. + + The function takes in the image and converts it to base64 format, + and sends it to a generative reward model (GRM) through a specified router address. + The GRM processes the image and returns a response containing the recognized text. + The function then compares the recognized text with the ground truth + using Levenshtein distance to compute an OCR score between 0 and 1, where 1 indicates a perfect match. + + Args: + data_source (str): The source dataset identifier. Unused here but kept for interface consistency. 
+ solution_image (Image.Image | np.ndarray | torch.Tensor): The solution image or video to be evaluated. + ground_truth (str): The ground truth text for comparison. + extra_info (dict): Additional information needed for scoring. Unused here but kept for interface consistency. + reward_router_address (str): The address of the router to send the image for GRM processing. + reward_model_tokenizer (PreTrainedTokenizer, optional): Tokenizer for the reward model, unused here. + model_name (str, optional): The name or path of the GRM to use for processing the image. Defaults to None. + + Returns: + dict: A dictionary containing the computed score, and the raw response from the GRM. + """ + import re + + import Levenshtein + + from verl.utils.experimental.reward_utils import pil_image_to_base64 + from verl.utils.ray_utils import get_event_loop + + frame_interval = extra_info.get("frame_interval", 1) + if solution_image.ndim == 3: # image + solution_image = solution_image.unsqueeze(0) + elif solution_image.ndim == 4: # video + solution_image = solution_image[::frame_interval] + + scores = [] + for image in solution_image: + # preprocess image to base64 + if isinstance(image, torch.Tensor): + image = image.float().permute(1, 2, 0).cpu().numpy() + if isinstance(image, np.ndarray): + assert image.shape[-1] == 3, "must be in HWC format" + image = (image * 255).round().clip(0, 255).astype(np.uint8) + image = Image.fromarray(image) + assert isinstance(image, Image.Image) + + image_base64 = await get_event_loop().run_in_executor(None, pil_image_to_base64, image) + + # prepare chat template + grm_prompt = ( + "Please output only the text content from the image without any additional descriptions or formatting." 
+ ) + query = [ + { + "type": "image_url", + "image_url": {"url": image_base64}, + }, + {"type": "text", "text": grm_prompt}, + ] + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + { + "role": "user", + "content": query, + }, + ] + + # TODO: make sampling params configurable + sampling_params = {"temperature": 0.7, "top_p": 0.8, "max_tokens": 4096} + + model_name = model_name or os.path.expanduser("~/models/tiny-random/qwen3-vl") + chat_complete_request = { + "messages": messages, + "model": model_name, + **sampling_params, + } + result = await chat_complete( + router_address=reward_router_address, + chat_complete_request=chat_complete_request, + ) + grm_response = result.choices[0].message.content + + # compute OCR score + text = grm_response + # remove any nonvisible characters and convert to lowercase + gt = re.sub(r"\s+", "", ground_truth).lower() + text = re.sub(r"\s+", "", text).lower() + if gt in text: + dist = 0 + else: + dist = Levenshtein.distance(text, gt) + + # recognized many unrelated characters, only add one character penalty + dist = min(dist, len(gt)) + if len(gt) > 0: + score = 1 - dist / len(gt) + else: + # If ground truth is empty, score is 1.0 only if the OCR text is also empty. + score = 1.0 if len(text) == 0 else 0.0 + scores.append(score) + score = sum(scores) / len(scores) + + return {"score": score, "genrm_response": grm_response} diff --git a/examples/flowgrpo_trainer/scheduler/__init__.py b/examples/flowgrpo_trainer/scheduler/__init__.py new file mode 100644 index 00000000000..11b9a43ee80 --- /dev/null +++ b/examples/flowgrpo_trainer/scheduler/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2026 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .scheduling_flow_match_sde_discrete import FlowMatchSDEDiscreteScheduler + +__all__ = ["FlowMatchSDEDiscreteScheduler"] diff --git a/examples/vllm_omni/scheduling_flow_match_sde_discrete.py b/examples/flowgrpo_trainer/scheduler/scheduling_flow_match_sde_discrete.py similarity index 100% rename from examples/vllm_omni/scheduling_flow_match_sde_discrete.py rename to examples/flowgrpo_trainer/scheduler/scheduling_flow_match_sde_discrete.py diff --git a/examples/vllm_omni/pipeline_qwenimage.py b/examples/flowgrpo_trainer/vllm_omni/pipeline_qwenimage.py similarity index 99% rename from examples/vllm_omni/pipeline_qwenimage.py rename to examples/flowgrpo_trainer/vllm_omni/pipeline_qwenimage.py index 906f38bb197..6459d7e5af5 100644 --- a/examples/vllm_omni/pipeline_qwenimage.py +++ b/examples/flowgrpo_trainer/vllm_omni/pipeline_qwenimage.py @@ -25,7 +25,7 @@ from vllm_omni.diffusion.request import OmniDiffusionRequest from vllm_omni.diffusion.utils.tf_utils import get_transformer_config_kwargs -from .scheduling_flow_match_sde_discrete import FlowMatchSDEDiscreteScheduler +from ..scheduler import FlowMatchSDEDiscreteScheduler def _maybe_to_cpu(v): diff --git a/examples/grpo_trainer/run_qwen2-7b_math_megatron_lora.sh b/examples/grpo_trainer/run_qwen2-7b_math_megatron_lora.sh index fdc80592e1a..dd5c1e8bf59 100644 --- a/examples/grpo_trainer/run_qwen2-7b_math_megatron_lora.sh +++ b/examples/grpo_trainer/run_qwen2-7b_math_megatron_lora.sh @@ -3,7 +3,7 @@ set -xeuo pipefail # Need to install Megatron-Bridge # NOTE: Make sure you use Megatron-Bridge later than 
0.2.0 -# (Recommend https://github.com/NVIDIA-NeMo/Megatron-Bridge/commit/83a7c1134c562d8c6decd10a1f0a6e6a7a8a3a44 or later) +# (Recommend https://github.com/NVIDIA-NeMo/Megatron-Bridge/commit/6259ae83c735c4412796fc5cfb4c9607b949ae29 or later) # for proper MoE LoRA support. # For Megatron communication/computation overlapping diff --git a/examples/grpo_trainer/run_qwen2-7b_math_megatron_trtllm.sh b/examples/grpo_trainer/run_qwen2-7b_math_megatron_trtllm.sh index a2af228faf7..fac9658a5d6 100644 --- a/examples/grpo_trainer/run_qwen2-7b_math_megatron_trtllm.sh +++ b/examples/grpo_trainer/run_qwen2-7b_math_megatron_trtllm.sh @@ -36,7 +36,7 @@ MATH_TEST_PATH=${DATADIR}/math/test.parquet TRAIN_FILES="['$GSM8K_TRAIN_PATH', '$MATH_TRAIN_PATH']" TEST_FILES="['$GSM8K_TEST_PATH', '$MATH_TEST_PATH']" -USE_FUSED_KERNELS=True +USE_FUSED_KERNELS=False # ----- # Launch diff --git a/examples/grpo_trainer/run_qwen3-30b_dapo_megatron_fp8_trtllm.sh b/examples/grpo_trainer/run_qwen3-30b_dapo_megatron_fp8_trtllm.sh index 051e14e54ca..1b4a7688a3b 100644 --- a/examples/grpo_trainer/run_qwen3-30b_dapo_megatron_fp8_trtllm.sh +++ b/examples/grpo_trainer/run_qwen3-30b_dapo_megatron_fp8_trtllm.sh @@ -77,7 +77,7 @@ python3 -m verl.trainer.main_ppo --config-path=config \ actor_rollout_ref.hybrid_engine=True \ actor_rollout_ref.model.path=${MODEL_PATH} \ actor_rollout_ref.model.use_fused_kernels=True \ - actor_rollout_ref.model.use_remove_padding=False \ + actor_rollout_ref.model.use_remove_padding=True \ actor_rollout_ref.actor.optim.lr=1e-5 \ actor_rollout_ref.actor.ppo_mini_batch_size=16 \ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ diff --git a/examples/grpo_trainer/run_qwen3-32b_npu.sh b/examples/grpo_trainer/run_qwen3-32b_npu.sh index ea4883f9516..6384a564a6a 100644 --- a/examples/grpo_trainer/run_qwen3-32b_npu.sh +++ b/examples/grpo_trainer/run_qwen3-32b_npu.sh @@ -45,6 +45,7 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.ref.use_torch_compile=False \ 
actor_rollout_ref.rollout.enable_chunked_prefill=True \ actor_rollout_ref.rollout.max_num_batched_tokens=32768 \ + actor_rollout_ref.rollout.checkpoint_engine.update_weights_bucket_megabytes=4096 \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger=['console','tensorboard'] \ diff --git a/examples/grpo_trainer/run_qwen3-32b_sglang_mindspeedllm_npu.sh b/examples/grpo_trainer/run_qwen3-32b_sglang_mindspeedllm_npu.sh new file mode 100644 index 00000000000..ef514724e61 --- /dev/null +++ b/examples/grpo_trainer/run_qwen3-32b_sglang_mindspeedllm_npu.sh @@ -0,0 +1,233 @@ +#!/bin/bash +set -xeuo pipefail +# Project Configuration +project_name='GRPO-Qwen3-32B-BASE-MATH' +exp_name='GRPO-Qwen3-32B-BASE-MindSpeedLLM-SGLang' + +# Necessary env +export HCCL_CONNECT_TIMEOUT=1500 +export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050 +export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050 + +export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1 +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 + +export DISABLE_L2_CACHE=1 +export TASK_QUEUE_ENABLE=1 + +# Node Info +NNODES=${NNODES:-1} +NPUS_PER_NODE=${NPUS_PER_NODE:-16} + +# Model Weights Paths +MODEL_PATH=Qwen/Qwen3-32B +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} + +# File System Paths +TRAIN_FILE=$RAY_DATA_HOME/gsm8k/train.parquet +TEST_FILE=$RAY_DATA_HOME/gsm8k/test.parquet +# Data Length Configuration +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 2)) + +# Training Batch Configuration +train_prompt_bsz=16 +train_prompt_mini_bsz=16 +n_resp_per_prompt=8 +micro_batch_size=1 + +# Algorithm Configuration +adv_estimator=grpo +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=True +kl_loss_coef=0.001 + +# Performance and Memory Management Configuration +all_offload=True +use_dynamic_bsz=False +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length))) 
+infer_ppo_max_token_len=$(((max_prompt_length + max_response_length))) + +# Megatron Parallelism Configuration +train_tp=4 +train_pp=4 +train_cp=1 + +# SGLang Generation Configuration +gen_tp=4 +gen_dp=1 +gpu_memory_utilization=0.5 +max_model_len=$((max_prompt_length + max_response_length)) +max_num_batched_tokens=$(((max_prompt_length + max_response_length) * 1)) + +# Data Configuration +DATA_CONFIG=( + # File Paths + data.train_files="${TRAIN_FILE}" + data.val_files="${TEST_FILE}" + # Data Structure + data.prompt_key=prompt + # Batch and Length Configuration + data.train_batch_size=${train_prompt_bsz} + data.max_prompt_length=${max_prompt_length} + data.max_response_length=${max_response_length} + # Preprocessing + data.filter_overlong_prompts=False + data.truncation='left' +) + +# Model Configuration +MODEL_CONFIG=( + # Model Path + actor_rollout_ref.model.path="${MODEL_PATH}" + # Model Processing + actor_rollout_ref.model.use_remove_padding=True +) + +# Reinforcement Learning Algorithm Configuration +ALGORITHM_CONFIG=( + # Advantage Estimation + algorithm.adv_estimator=${adv_estimator} + # KL Divergence Control + algorithm.use_kl_in_reward=${use_kl_in_reward} + algorithm.kl_ctrl.kl_coef=${kl_coef} +) + +ACTOR_CONFIG=( + # Core Runtime Settings + actor_rollout_ref.actor.use_torch_compile=False + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} + # Loss Function Configuration + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} + actor_rollout_ref.actor.entropy_coeff=0 + # PPO Training Parameters + actor_rollout_ref.actor.ppo_epochs=1 + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${micro_batch_size} + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} + # Optimizer Settings + actor_rollout_ref.actor.optim.lr=1e-6 + # Megatron Parallelism Strategy + 
actor_rollout_ref.actor.mindspeed.tensor_model_parallel_size=${train_tp} + actor_rollout_ref.actor.mindspeed.pipeline_model_parallel_size=${train_pp} + actor_rollout_ref.actor.mindspeed.context_parallel_size=${train_cp} + # Memory Optimization + actor_rollout_ref.actor.mindspeed.param_offload=${all_offload} + actor_rollout_ref.actor.mindspeed.optimizer_offload=${all_offload} + actor_rollout_ref.actor.mindspeed.grad_offload=${all_offload} + # Model Weights Management + actor_rollout_ref.actor.mindspeed.use_mbridge=True + actor_rollout_ref.actor.mindspeed.vanilla_mbridge=True + # Transformer Architecture Optimizations + actor_rollout_ref.actor.mindspeed.llm_kwargs.spec='[mindspeed_llm.tasks.models.spec.qwen3_spec, layer_spec]' + actor_rollout_ref.actor.mindspeed.llm_kwargs.seq_length=${max_model_len} + actor_rollout_ref.actor.mindspeed.llm_kwargs.micro_batch_size=${micro_batch_size} + +actor_rollout_ref.actor.mindspeed.llm_kwargs.num_query_groups=8 + +actor_rollout_ref.actor.mindspeed.llm_kwargs.recompute_method=uniform + +actor_rollout_ref.actor.mindspeed.llm_kwargs.recompute_granularity=full + +actor_rollout_ref.actor.mindspeed.llm_kwargs.recompute_num_layers=1 + +actor_rollout_ref.actor.mindspeed.llm_kwargs.overlap_grad_reduce=True + +actor_rollout_ref.actor.mindspeed.llm_kwargs.overlap_param_gather=True +) + +REF_CONFIG=( + # Core Runtime Settings + actor_rollout_ref.ref.use_torch_compile=False + # Log Probability Inference + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${micro_batch_size} + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} + # Megatron Parallelism Strategy + actor_rollout_ref.ref.mindspeed.tensor_model_parallel_size=${train_tp} + actor_rollout_ref.ref.mindspeed.pipeline_model_parallel_size=${train_pp} + actor_rollout_ref.ref.mindspeed.context_parallel_size=${train_cp} + # Memory Optimization + 
actor_rollout_ref.ref.mindspeed.param_offload=${all_offload} + # Model Weights Management + actor_rollout_ref.ref.mindspeed.use_mbridge=True + actor_rollout_ref.ref.mindspeed.vanilla_mbridge=True +) + +ROLLOUT_CONFIG=( + # Rollout Engine + actor_rollout_ref.rollout.name=sglang + +actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend="ascend" + # Generation Parameters + actor_rollout_ref.rollout.n=${n_resp_per_prompt} + actor_rollout_ref.rollout.top_p=1.0 + actor_rollout_ref.rollout.top_k=-1 + actor_rollout_ref.rollout.temperature=1.0 + # Log Probability Inference + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${micro_batch_size} + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} + # Memory Management + actor_rollout_ref.rollout.gpu_memory_utilization=${gpu_memory_utilization} + # Parallelism Strategy + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} + actor_rollout_ref.rollout.data_parallel_size=${gen_dp} + +actor_rollout_ref.rollout.engine_kwargs.sglang.enable_dp_attention=False + # Performance Optimization + +actor_rollout_ref.rollout.engine_kwargs.sglang.chunked_prefill_size=-1 + actor_rollout_ref.rollout.enforce_eager=False + # Validation Generation + actor_rollout_ref.rollout.val_kwargs.n=1 + actor_rollout_ref.rollout.val_kwargs.do_sample=True + actor_rollout_ref.rollout.val_kwargs.top_p=1.0 + actor_rollout_ref.rollout.val_kwargs.top_k=-1 + actor_rollout_ref.rollout.val_kwargs.temperature=1.0 +) + +TRAINER_CONFIG=( + # Logger Configuration + trainer.logger='["console"]' + # Project Settings + trainer.project_name="${project_name}" + trainer.experiment_name="${exp_name}" + # Hardware Configuration + trainer.nnodes="${NNODES}" + trainer.n_gpus_per_node="${NPUS_PER_NODE}" + trainer.device='npu' + # Training Schedule + trainer.total_epochs=15 + trainer.val_before_train=False + trainer.test_freq=-1 + 
trainer.save_freq=-1 + # Checkpoint Directory + trainer.default_local_dir="${CKPTS_DIR}" + trainer.use_legacy_worker_impl=disable +) + +# profiling configuration +PROF_CONFIG=( + global_profiler.tool=npu + global_profiler.steps=null + global_profiler.save_path=/profpath + actor_rollout_ref.actor.profiler.enable=True + actor_rollout_ref.actor.profiler.ranks="[0]" + actor_rollout_ref.actor.profiler.all_ranks=False + actor_rollout_ref.actor.profiler.tool_config.npu.discrete=True + actor_rollout_ref.actor.profiler.tool_config.npu.contents=['npu','cpu'] + actor_rollout_ref.actor.profiler.tool_config.npu.level=level0 + actor_rollout_ref.actor.profiler.tool_config.npu.analysis=True + actor_rollout_ref.rollout.profiler.enable=True + actor_rollout_ref.rollout.profiler.ranks="[0]" + actor_rollout_ref.rollout.profiler.all_ranks=False +) + +python3 -m verl.trainer.main_ppo \ + --config-path=config \ + --config-name='ppo_trainer.yaml' \ + model_engine=mindspeed \ + "${DATA_CONFIG[@]}" \ + "${MODEL_CONFIG[@]}" \ + "${ACTOR_CONFIG[@]}" \ + "${REF_CONFIG[@]}" \ + "${ROLLOUT_CONFIG[@]}" \ + "${ALGORITHM_CONFIG[@]}" \ + "${TRAINER_CONFIG[@]}" \ + "${PROF_CONFIG[@]}" \ + "$@" diff --git a/examples/grpo_trainer/run_qwen3_235b_256k_megatron_npu.sh b/examples/grpo_trainer/run_qwen3_235b_256k_megatron_npu.sh new file mode 100644 index 00000000000..a5ae5e7ed7c --- /dev/null +++ b/examples/grpo_trainer/run_qwen3_235b_256k_megatron_npu.sh @@ -0,0 +1,207 @@ +#!/bin/bash +# set -xeuo pipefail + +## !!!!!!!supplement!!!!!! 
+## This script can be used for inference in 256 K and 128 K + +ulimit -n 32768 + +# Project Configuration +project_name='GRPO-Qwen3-235B-A22B-Instruct-MATH' +exp_name='GRPO-Qwen3-235B-A22B-Instruct-Megatron-vLLM' + +# Node Info +NNODES=${NNODES:-16} +NPUS_PER_NODE=${NPUS_PER_NODE:-16} + +# Model Weights Paths +# MODEL_PATH=/mnt/weight/Qwen3-235B-A22B +MODEL_PATH=${WORK_DIR}/Qwen3-235B-A22B-Instruct-2507 +MCORE_MODEL_PATH=${WORK_DIR}/Qwen3-235B-A22B-Instruct-2507-Mcore +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} + +# File System Paths +TRAIN_FILE=${WORK_DIR}/gsm8k/train.parquet +TEST_FILE=${WORK_DIR}/gsm8k/test.parquet + +# Data Configuration +max_prompt_length=$((1024 * 1)) +max_response_length=$((1024 * 255)) + +# Algorithm Configuration +adv_estimator=grpo +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=True +kl_loss_coef=0.001 + +# Training Batch Configuration +train_prompt_bsz=4 +n_resp_per_prompt=4 +train_prompt_mini_bsz=4 + +# Performance and Memory Related Configuration +all_offload=True +use_dynamic_bsz=False +actor_ppo_max_token_len=$((max_prompt_length + max_response_length)) +infer_ppo_max_token_len=$((max_prompt_length + max_response_length)) +optimizer_offload_fraction=1 + +# Megatron Configuration +train_tp=2 +train_ep=16 +train_etp=1 +train_pp=16 +train_cp=8 + +# vLLM Configuration +gen_tp=4 +gen_dp=32 +gen_ep=128 +gpu_memory_utilization=0.7 +max_model_len=$((max_prompt_length + max_response_length)) +max_num_batched_tokens=2048 + +# Pipeline Layer Configuration +first_layer=5 +last_layer=5 + +# Data Configuration +DATA_ARGS=( + data.train_files="${TRAIN_FILE}" + data.val_files="${TEST_FILE}" + data.prompt_key=prompt + data.train_batch_size=${train_prompt_bsz} + data.max_prompt_length=${max_prompt_length} + data.max_response_length=${max_response_length} + data.filter_overlong_prompts=False + data.truncation='left' +) + +# Model Configuration +MODEL_ARGS=( + 
actor_rollout_ref.model.path="${MODEL_PATH}" + actor_rollout_ref.model.use_remove_padding=True +) + +# RL Algorithm Configuration +ALGORITHM_ARGS=( + algorithm.adv_estimator=${adv_estimator} + algorithm.use_kl_in_reward=${use_kl_in_reward} + algorithm.kl_ctrl.kl_coef=${kl_coef} +) + +# Actor Model Configuration +ACTOR_ARGS=( + actor_rollout_ref.actor.use_torch_compile=False + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} + actor_rollout_ref.actor.entropy_coeff=0 + actor_rollout_ref.actor.ppo_epochs=1 + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} + actor_rollout_ref.actor.kl_loss_type=low_var_kl + actor_rollout_ref.actor.optim.clip_grad=1.0 + actor_rollout_ref.actor.optim.lr_warmup_steps=10 + actor_rollout_ref.actor.optim.weight_decay=0.1 + actor_rollout_ref.actor.optim.lr=1e-6 + +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=${optimizer_offload_fraction} + +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True + +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} + actor_rollout_ref.actor.megatron.context_parallel_size=${train_cp} + actor_rollout_ref.actor.megatron.expert_model_parallel_size=${train_ep} + actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${train_etp} + actor_rollout_ref.actor.megatron.param_offload=${all_offload} + actor_rollout_ref.actor.megatron.optimizer_offload=${all_offload} + actor_rollout_ref.actor.megatron.grad_offload=${all_offload} + actor_rollout_ref.actor.megatron.use_dist_checkpointing=True + 
actor_rollout_ref.actor.megatron.use_mbridge=False + actor_rollout_ref.actor.megatron.dist_checkpointing_path=${MCORE_MODEL_PATH} + +actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn=True + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 + +actor_rollout_ref.actor.megatron.override_transformer_config.context_parallel_size=${train_cp} + +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_first_pipeline_stage=${first_layer} + +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=${last_layer} + +actor_rollout_ref.actor.megatron.override_transformer_config.normalization=RMSNorm + +actor_rollout_ref.actor.megatron.override_transformer_config.use_fused_rmsnorm=True + +actor_rollout_ref.actor.megatron.override_transformer_config.swiglu=True + +actor_rollout_ref.actor.megatron.override_transformer_config.use_fused_swiglu=True + +actor_rollout_ref.actor.megatron.override_transformer_config.use_distributed_optimizer=True + +actor_rollout_ref.actor.megatron.override_transformer_config.sequence_parallel=True +) + +# Reference Model Configuration +REF_ARGS=( + actor_rollout_ref.ref.use_torch_compile=False + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} + actor_rollout_ref.ref.megatron.context_parallel_size=${train_cp} + actor_rollout_ref.ref.megatron.expert_model_parallel_size=${train_ep} + actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${train_etp} + 
actor_rollout_ref.ref.megatron.param_offload=${all_offload} + actor_rollout_ref.ref.megatron.use_mbridge=False + actor_rollout_ref.ref.megatron.use_dist_checkpointing=True + actor_rollout_ref.ref.megatron.dist_checkpointing_path=${MCORE_MODEL_PATH} + ++actor_rollout_ref.ref.megatron.override_transformer_config.use_flash_attn=True + +actor_rollout_ref.ref.megatron.override_transformer_config.sequence_parallel=True +) + +# Rollout Configuration +ROLLOUT_ARGS=( + actor_rollout_ref.rollout.name=vllm + actor_rollout_ref.rollout.n=${n_resp_per_prompt} + actor_rollout_ref.rollout.max_num_seqs=16 + actor_rollout_ref.rollout.top_p=1.0 + actor_rollout_ref.rollout.top_k=-1 + actor_rollout_ref.rollout.temperature=1.0 + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} + actor_rollout_ref.rollout.gpu_memory_utilization=${gpu_memory_utilization} + actor_rollout_ref.rollout.max_num_batched_tokens=${max_num_batched_tokens} + actor_rollout_ref.rollout.max_model_len=${max_model_len} + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} + actor_rollout_ref.rollout.data_parallel_size=${gen_dp} + actor_rollout_ref.rollout.expert_parallel_size=${gen_ep} + actor_rollout_ref.rollout.enable_chunked_prefill=True + actor_rollout_ref.rollout.enable_prefix_caching=True + actor_rollout_ref.rollout.enforce_eager=True + actor_rollout_ref.rollout.free_cache_engine=True +) + +# Trainer Configuration +TRAINER_ARGS=( + trainer.logger='["console","tensorboard"]' + trainer.project_name="${project_name}" + trainer.experiment_name="${exp_name}" + trainer.nnodes="${NNODES}" + trainer.n_gpus_per_node="${NPUS_PER_NODE}" + trainer.device='npu' + trainer.total_epochs=15 + trainer.val_before_train=False + trainer.test_freq=-1 + trainer.save_freq=-1 + trainer.default_local_dir="${CKPTS_DIR}" +) + +python3 -m verl.trainer.main_ppo \ + 
--config-path=config \ + --config-name='ppo_megatron_trainer.yaml' \ + "${DATA_ARGS[@]}" \ + "${MODEL_ARGS[@]}" \ + "${ACTOR_ARGS[@]}" \ + "${REF_ARGS[@]}" \ + "${ROLLOUT_ARGS[@]}" \ + "${ALGORITHM_ARGS[@]}" \ + "${TRAINER_ARGS[@]}" \ + "$@" | tee logs/run_qwen3moe-wy_235b_grpo_megatron_vllm_npu_$(date +%Y%m%d_%H%M%S).log diff --git a/examples/grpo_trainer/run_qwen3_235b_megatron_npu.sh b/examples/grpo_trainer/run_qwen3_235b_megatron_npu.sh index bccb48fa8a9..fb6b8852733 100644 --- a/examples/grpo_trainer/run_qwen3_235b_megatron_npu.sh +++ b/examples/grpo_trainer/run_qwen3_235b_megatron_npu.sh @@ -91,7 +91,6 @@ python3 -m verl.trainer.main_ppo --config-path=config --config-name='ppo_megatr actor_rollout_ref.rollout.data_parallel_size=${gen_dp} \ actor_rollout_ref.rollout.expert_parallel_size=64 \ actor_rollout_ref.rollout.name=vllm \ - +actor_rollout_ref.rollout.enable_expert_parallel=True \ actor_rollout_ref.actor.megatron.param_offload=${offload} \ actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \ actor_rollout_ref.actor.megatron.grad_offload=${offload} \ @@ -122,8 +121,8 @@ python3 -m verl.trainer.main_ppo --config-path=config --config-name='ppo_megatr +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \ +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \ +actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn=True \ - +actor_rollout_ref.ref.megatron.override_transformer_config.use_flash_attn=True \ - actor_rollout_ref.rollout.enforce_eager=True \ + actor_rollout_ref.ref.megatron.override_transformer_config.use_flash_attn=True \ + actor_rollout_ref.rollout.enforce_eager=False \ trainer.device=npu \ +actor_rollout_ref.rollout.engine_kwargs.vllm.compilation_config.cudagraph_capture_sizes="[8, 16, 32, 64, 128]" \ +actor_rollout_ref.rollout.engine_kwargs.vllm.compilation_config.cudagraph_mode="FULL_DECODE_ONLY" 2>&1 | tee "logs/verl_qwen3_235b_sy$(date 
+%Y%m%d_%H%M).log" diff --git a/examples/grpo_trainer/run_qwen3_5-122b-a10b-megatron.sh b/examples/grpo_trainer/run_qwen3_5-122b-a10b-megatron.sh new file mode 100644 index 00000000000..909744be72a --- /dev/null +++ b/examples/grpo_trainer/run_qwen3_5-122b-a10b-megatron.sh @@ -0,0 +1,189 @@ +#!/usr/bin/env bash +# Qwen3.5-122B-A10B MoE GRPO RL with Megatron (four nodes, 8 GPUs, H20, 96G, geo3k dataset) +# Using verlai/verl:vllm017.latest docker image +# Requirements: +# - 32 GPUs (96GB each, e.g. 4x8 H20) +# - Additional packages on top of the base image: +# pip install --upgrade transformers +# pip install flash-linear-attention +# pip install -U git+https://github.com/ISEEKYAN/mbridge.git +# - Megatron-LM==0.16.0 +# +# Qwen3.5 architecture notes: +# Qwen3.5 uses Gated Delta Net (GDN) linear attention which currently does +# NOT support packed sequences (THD format) in Megatron-LM. Therefore: +# - model.use_remove_padding=False (deprecated option, will be removed in the future forces bshd compute format) +# - actor.megatron.use_remove_padding=False (forces bshd compute format) +# - actor.use_dynamic_bsz=False (required for bshd mode) +# +# Once Megatron-LM adds THD support for Qwen3.5 GDN, use_remove_padding +# can be set to True for better performance. 
+# +# Tested parallelism config (32 GPUs / 4 node): +# TP=2 PP=2 CP=1 EP=8 ETP=1 GEN_TP=8 +# + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export VLLM_USE_V1=1 +export VLLM_ALLREDUCE_USE_SYMM_MEM=0 +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + +set -xeuo pipefail +unset http_proxy +unset https_proxy +# download geo3k dataset +hf download tyzhu/geo3k --repo-type dataset --local-dir $HOME/data/geo3k + +test_files=${test_files:-$HOME/data/geo3k/test.parquet} +train_files=${train_files:-$HOME/data/geo3k/train.parquet} +HF_MODEL_PATH=${HF_MODEL_PATH:-"Qwen/Qwen3.5-122B-A10B"} + +save_contents="['model', 'extra', 'optimizer']" + +project_name=${project_name:-'verl_grpo_qwen3_5_122b_geo3k'} +exp_name=${exp_name:-'qwen3_5_122b_megatron'} + +rollout_backend="vllm" + +save_path=${save_path:-"Qwen/Qwen3.5-122B/verl_checkpoint"} +save_freq=50 + +train_batch_size=128 +max_prompt_length=3240 +max_response_length=4096 +adv_estimator=${adv_estimator:-grpo} + +TP=${TP:-2} +PP=${PP:-2} +CP=${CP:-1} +EP=${EP:-8} +ETP=${ETP:-1} +GEN_TP=${GEN_TP:-8} +ACTOR_VPP=${ACTOR_VPP:-null} +ALL_OFFLOAD=${ALL_OFFLOAD:-True} + +NODE_GPU_NUM=${NODE_GPU_NUM:-8} +NODES_NUM=${NODES_NUM:-4} + +########################### Parameter Arrays ########################### + +ACTOR=( + actor_rollout_ref.actor.optim.lr=1e-6 + actor_rollout_ref.actor.ppo_mini_batch_size=64 + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=8192 + actor_rollout_ref.actor.use_dynamic_bsz=False + actor_rollout_ref.actor.use_kl_loss=False + actor_rollout_ref.actor.kl_loss_coef=0.01 + actor_rollout_ref.actor.kl_loss_type=low_var_kl + actor_rollout_ref.actor.entropy_coeff=0 + actor_rollout_ref.actor.megatron.vanilla_mbridge=True + actor_rollout_ref.actor.megatron.use_mbridge=True + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${TP} + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${PP} + actor_rollout_ref.actor.megatron.context_parallel_size=${CP} + 
actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP} + actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP} + actor_rollout_ref.actor.megatron.param_offload=${ALL_OFFLOAD} + actor_rollout_ref.actor.megatron.optimizer_offload=${ALL_OFFLOAD} + actor_rollout_ref.actor.megatron.grad_offload=${ALL_OFFLOAD} + actor_rollout_ref.actor.megatron.dtype=bfloat16 + actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=$ACTOR_VPP + actor_rollout_ref.actor.megatron.use_remove_padding=False + actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full + actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform + actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_load_balancing_type=\"none\" + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True + +actor_rollout_ref.actor.megatron.override_transformer_config.moe_grouped_gemm=True + +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=False + +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=1 + +actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True + +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True + +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True + actor_rollout_ref.actor.use_torch_compile=True + actor_rollout_ref.actor.checkpoint.save_contents="${save_contents}" + +) + + +ROLLOUT=( + actor_rollout_ref.rollout.name=${rollout_backend} + actor_rollout_ref.rollout.tensor_model_parallel_size=${GEN_TP} + actor_rollout_ref.rollout.gpu_memory_utilization=0.66 + actor_rollout_ref.rollout.n=6 + actor_rollout_ref.rollout.mode=async + actor_rollout_ref.rollout.dtype=bfloat16 + 
actor_rollout_ref.rollout.checkpoint_engine.update_weights_bucket_megabytes=4096 + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=False + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=8192 + +actor_rollout_ref.rollout.engine_kwargs.vllm.max_model_len=15768 +) + +REF=( + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=False + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=8192 + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${TP} + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${PP} + actor_rollout_ref.ref.megatron.context_parallel_size=${CP} + actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP} + actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP} + actor_rollout_ref.ref.megatron.param_offload=${ALL_OFFLOAD} +) + +MODEL=( + actor_rollout_ref.model.path=$HF_MODEL_PATH + actor_rollout_ref.model.enable_gradient_checkpointing=True + actor_rollout_ref.model.use_remove_padding=False +) + +ACTOR_ROLLOUT_REF_COMMON=( + actor_rollout_ref.nccl_timeout=10800 +) + +ALGORITHM=( + algorithm.adv_estimator=${adv_estimator} + algorithm.use_kl_in_reward=False +) + + +DATA=( + data.train_files=$train_files + data.val_files=$test_files + data.train_batch_size=$train_batch_size + data.max_prompt_length=$max_prompt_length + data.max_response_length=$max_response_length + data.truncation='right' + data.filter_overlong_prompts=True + data.filter_overlong_prompts_workers=64 +) + +TRAINER=( + trainer.logger=['console','tensorboard'] + trainer.project_name=$project_name + trainer.experiment_name=$exp_name + trainer.n_gpus_per_node=$NODE_GPU_NUM + trainer.nnodes=$NODES_NUM + trainer.save_freq=$save_freq + trainer.default_local_dir=${save_path} + trainer.test_freq=10 + trainer.val_before_train=False + trainer.total_epochs=20 +) + +########################### Launch ########################### +export 
HYDRA_FULL_ERROR=1 +PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \ + --config-path=config \ + --config-name='ppo_megatron_trainer.yaml' \ + "${ALGORITHM[@]}" \ + "${DATA[@]}" \ + "${MODEL[@]}" \ + "${ACTOR_ROLLOUT_REF_COMMON[@]}" \ + "${TRAINER[@]}" \ + "${ROLLOUT[@]}" \ + "${ACTOR[@]}" \ + "${REF[@]}" \ + "$@" diff --git a/examples/grpo_trainer/run_qwen3_5_27b_vllm_fsdp.sh b/examples/grpo_trainer/run_qwen3_5_27b_vllm_fsdp.sh new file mode 100644 index 00000000000..19e48101c3c --- /dev/null +++ b/examples/grpo_trainer/run_qwen3_5_27b_vllm_fsdp.sh @@ -0,0 +1,85 @@ +# dependency: vllm==0.18.0, transformers@ +set -x + +project_name='GRPO-Qwen3_5' +exp_name='GRPO-Qwen3_5-27B' +gen_tp=4 +sp_size=1 +ENGINE=${1:-vllm} +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3.5-27B"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/geo3k/train.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/geo3k/test.parquet"} +WORKING_DIR=${WORKING_DIR:-"${PWD}"} +RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} + +start_time=$(date +%Y%m%d)_$(date +%H%M%S) + +mkdir -p logs +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=grpo \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.train_batch_size=64 \ + data.max_prompt_length=1024 \ + data.max_response_length=2048 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.image_key=images \ + data.shuffle=False \ + actor_rollout_ref.model.path=${MODEL_PATH} \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=16 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.entropy_coeff=0 \ + 
actor_rollout_ref.actor.kl_loss_coef=0.01 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.use_torch_compile=False \ + actor_rollout_ref.actor.strategy=fsdp2 \ + actor_rollout_ref.ref.strategy=fsdp2 \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=8 \ + actor_rollout_ref.actor.fsdp_config.reshard_after_forward=True \ + actor_rollout_ref.ref.fsdp_config.reshard_after_forward=True \ + actor_rollout_ref.actor.fsdp_config.entropy_checkpointing=True \ + actor_rollout_ref.actor.entropy_from_logits_with_chunking=True \ + actor_rollout_ref.actor.fsdp_config.offload_policy=True \ + actor_rollout_ref.actor.use_dynamic_bsz=False \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=$sp_size \ + actor_rollout_ref.actor.fsdp_config.param_offload=True \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + actor_rollout_ref.ref.entropy_from_logits_with_chunking=True \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=$sp_size \ + actor_rollout_ref.ref.use_torch_compile=False \ + actor_rollout_ref.ref.fsdp_config.offload_policy=True \ + actor_rollout_ref.rollout.name=$ENGINE \ + actor_rollout_ref.rollout.ignore_eos=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + actor_rollout_ref.rollout.n=5 \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=8192 \ + actor_rollout_ref.rollout.free_cache_engine=True \ + actor_rollout_ref.rollout.enforce_eager=False \ + actor_rollout_ref.rollout.enable_prefix_caching=False \ + actor_rollout_ref.rollout.checkpoint_engine.update_weights_bucket_megabytes=6144 \ + algorithm.use_kl_in_reward=False \ + trainer.use_legacy_worker_impl=auto \ + trainer.critic_warmup=0 \ + 
trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.balance_batch=False \ + trainer.resume_from_path=checkpoints/ \ + trainer.val_before_train=True \ + trainer.save_freq=5 \ + trainer.test_freq=5 \ + trainer.total_epochs=15 $@ 2>&1 | tee logs/qwen3_5-27b-${start_time}.log \ No newline at end of file diff --git a/examples/grpo_trainer/run_qwen3_5_27b_vllm_fsdp_npu.sh b/examples/grpo_trainer/run_qwen3_5_27b_vllm_fsdp_npu.sh new file mode 100644 index 00000000000..f47c6943213 --- /dev/null +++ b/examples/grpo_trainer/run_qwen3_5_27b_vllm_fsdp_npu.sh @@ -0,0 +1,85 @@ +# dependency: vllm==0.18.0, vllm-ascend@<54879467>, transformers@ +set -x + +project_name='GRPO-Qwen3_5' +exp_name='GRPO-Qwen3_5-27B' +gen_tp=4 +sp_size=1 +ENGINE=${1:-vllm} +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3.5-27B"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/geo3k/train.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/geo3k/test.parquet"} +WORKING_DIR=${WORKING_DIR:-"${PWD}"} +RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} + +start_time=$(date +%Y%m%d)_$(date +%H%M%S) + +mkdir -p logs +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=grpo \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.train_batch_size=64 \ + data.max_prompt_length=1024 \ + data.max_response_length=2048 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.image_key=images \ + data.shuffle=False \ + actor_rollout_ref.model.path=${MODEL_PATH} \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=16 \ + 
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.kl_loss_coef=0.01 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.use_torch_compile=False \ + actor_rollout_ref.actor.strategy=fsdp2 \ + actor_rollout_ref.ref.strategy=fsdp2 \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=16 \ + actor_rollout_ref.actor.fsdp_config.reshard_after_forward=True \ + actor_rollout_ref.ref.fsdp_config.reshard_after_forward=True \ + actor_rollout_ref.actor.fsdp_config.entropy_checkpointing=True \ + actor_rollout_ref.actor.entropy_from_logits_with_chunking=True \ + actor_rollout_ref.actor.fsdp_config.offload_policy=True \ + actor_rollout_ref.actor.use_dynamic_bsz=False \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=$sp_size \ + actor_rollout_ref.actor.fsdp_config.param_offload=True \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + actor_rollout_ref.ref.entropy_from_logits_with_chunking=True \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=$sp_size \ + actor_rollout_ref.ref.use_torch_compile=False \ + actor_rollout_ref.ref.fsdp_config.offload_policy=True \ + actor_rollout_ref.rollout.name=$ENGINE \ + actor_rollout_ref.rollout.ignore_eos=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + actor_rollout_ref.rollout.n=5 \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=8192 \ + actor_rollout_ref.rollout.free_cache_engine=True \ + actor_rollout_ref.rollout.enforce_eager=False \ + actor_rollout_ref.rollout.enable_prefix_caching=False \ + 
actor_rollout_ref.rollout.checkpoint_engine.update_weights_bucket_megabytes=6144 \ + algorithm.use_kl_in_reward=False \ + trainer.use_legacy_worker_impl=auto \ + trainer.critic_warmup=0 \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node=16 \ + trainer.nnodes=1 \ + trainer.balance_batch=False \ + trainer.resume_from_path=checkpoints/ \ + trainer.val_before_train=True \ + trainer.save_freq=5 \ + trainer.test_freq=5 \ + trainer.total_epochs=15 $@ 2>&1 | tee logs/qwen3_5-27b-${start_time}.log \ No newline at end of file diff --git a/examples/grpo_trainer/run_qwen3_5_35b_vllm_fsdp.sh b/examples/grpo_trainer/run_qwen3_5_35b_vllm_fsdp.sh new file mode 100644 index 00000000000..3387df4a820 --- /dev/null +++ b/examples/grpo_trainer/run_qwen3_5_35b_vllm_fsdp.sh @@ -0,0 +1,85 @@ +# dependency: vllm==0.18.0, transformers@ +set -x + +project_name='GRPO-Qwen3_5' +exp_name='GRPO-Qwen3_5-35B' +gen_tp=4 +sp_size=1 +ENGINE=${1:-vllm} +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3.5-35B-A3B"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/geo3k/train.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/geo3k/test.parquet"} +WORKING_DIR=${WORKING_DIR:-"${PWD}"} +RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} + +start_time=$(date +%Y%m%d)_$(date +%H%M%S) + +mkdir -p logs +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=grpo \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.train_batch_size=64 \ + data.max_prompt_length=1024 \ + data.max_response_length=2048 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + data.image_key=images \ + data.shuffle=False \ + actor_rollout_ref.model.path=${MODEL_PATH} \ + actor_rollout_ref.model.use_remove_padding=True \ + 
actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=16 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.kl_loss_coef=0.01 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.use_torch_compile=False \ + actor_rollout_ref.actor.strategy=fsdp2 \ + actor_rollout_ref.ref.strategy=fsdp2 \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=8 \ + actor_rollout_ref.actor.fsdp_config.reshard_after_forward=True \ + actor_rollout_ref.ref.fsdp_config.reshard_after_forward=True \ + actor_rollout_ref.actor.fsdp_config.entropy_checkpointing=True \ + actor_rollout_ref.actor.entropy_from_logits_with_chunking=True \ + actor_rollout_ref.actor.fsdp_config.offload_policy=True \ + actor_rollout_ref.actor.use_dynamic_bsz=False \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=$sp_size \ + actor_rollout_ref.actor.fsdp_config.param_offload=True \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + actor_rollout_ref.ref.entropy_from_logits_with_chunking=True \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=$sp_size \ + actor_rollout_ref.ref.use_torch_compile=False \ + actor_rollout_ref.ref.fsdp_config.offload_policy=True \ + actor_rollout_ref.rollout.name=$ENGINE \ + actor_rollout_ref.rollout.ignore_eos=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + actor_rollout_ref.rollout.n=5 \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=8192 \ + actor_rollout_ref.rollout.free_cache_engine=True \ + 
actor_rollout_ref.rollout.enforce_eager=False \ + actor_rollout_ref.rollout.enable_prefix_caching=False \ + actor_rollout_ref.rollout.checkpoint_engine.update_weights_bucket_megabytes=6144 \ + algorithm.use_kl_in_reward=False \ + trainer.use_legacy_worker_impl=auto \ + trainer.critic_warmup=0 \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.balance_batch=False \ + trainer.resume_from_path=checkpoints/ \ + trainer.val_before_train=True \ + trainer.save_freq=5 \ + trainer.test_freq=5 \ + trainer.total_epochs=15 $@ 2>&1 | tee logs/qwen3_5-35b-${start_time}.log \ No newline at end of file diff --git a/examples/grpo_trainer/run_qwen3_5_35b_vllm_fsdp_npu.sh b/examples/grpo_trainer/run_qwen3_5_35b_vllm_fsdp_npu.sh new file mode 100644 index 00000000000..f0e7f62ba78 --- /dev/null +++ b/examples/grpo_trainer/run_qwen3_5_35b_vllm_fsdp_npu.sh @@ -0,0 +1,85 @@ +# dependency: vllm==0.18.0, vllm-ascend@<54879467>, transformers@ +set -x + +project_name='GRPO-Qwen3_5' +exp_name='GRPO-Qwen3_5-35B' +gen_tp=4 +sp_size=1 +ENGINE=${1:-vllm} +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3.5-35B-A3B"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/geo3k/train.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/geo3k/test.parquet"} +WORKING_DIR=${WORKING_DIR:-"${PWD}"} +RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} + +start_time=$(date +%Y%m%d)_$(date +%H%M%S) + +mkdir -p logs +python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=grpo \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.train_batch_size=64 \ + data.max_prompt_length=1024 \ + data.max_response_length=2048 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + 
data.image_key=images \ + data.shuffle=False \ + actor_rollout_ref.model.path=${MODEL_PATH} \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=16 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.kl_loss_coef=0.01 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.actor.use_torch_compile=False \ + actor_rollout_ref.actor.strategy=fsdp2 \ + actor_rollout_ref.ref.strategy=fsdp2 \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=16 \ + actor_rollout_ref.actor.fsdp_config.reshard_after_forward=True \ + actor_rollout_ref.ref.fsdp_config.reshard_after_forward=True \ + actor_rollout_ref.actor.fsdp_config.entropy_checkpointing=True \ + actor_rollout_ref.actor.entropy_from_logits_with_chunking=True \ + actor_rollout_ref.actor.fsdp_config.offload_policy=True \ + actor_rollout_ref.actor.use_dynamic_bsz=False \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=$sp_size \ + actor_rollout_ref.actor.fsdp_config.param_offload=True \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + actor_rollout_ref.ref.entropy_from_logits_with_chunking=True \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=$sp_size \ + actor_rollout_ref.ref.use_torch_compile=False \ + actor_rollout_ref.ref.fsdp_config.offload_policy=True \ + actor_rollout_ref.rollout.name=$ENGINE \ + actor_rollout_ref.rollout.ignore_eos=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + actor_rollout_ref.rollout.n=5 \ + 
actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_batched_tokens=8192 \ + actor_rollout_ref.rollout.free_cache_engine=True \ + actor_rollout_ref.rollout.enforce_eager=False \ + actor_rollout_ref.rollout.enable_prefix_caching=False \ + actor_rollout_ref.rollout.checkpoint_engine.update_weights_bucket_megabytes=6144 \ + algorithm.use_kl_in_reward=False \ + trainer.use_legacy_worker_impl=auto \ + trainer.critic_warmup=0 \ + trainer.logger=['console','tensorboard'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node=16 \ + trainer.nnodes=1 \ + trainer.balance_batch=False \ + trainer.resume_from_path=checkpoints/ \ + trainer.val_before_train=True \ + trainer.save_freq=5 \ + trainer.test_freq=5 \ + trainer.total_epochs=15 $@ 2>&1 | tee logs/qwen3_5-35b-${start_time}.log \ No newline at end of file diff --git a/examples/grpo_trainer/run_qwen3moe-30b_megatron_lora.sh b/examples/grpo_trainer/run_qwen3moe-30b_megatron_lora.sh index a5e111f8b8e..2bedbd66672 100644 --- a/examples/grpo_trainer/run_qwen3moe-30b_megatron_lora.sh +++ b/examples/grpo_trainer/run_qwen3moe-30b_megatron_lora.sh @@ -3,7 +3,7 @@ set -xeuo pipefail # Need to install Megatron-Bridge # NOTE: Make sure you use Megatron-Bridge later than 0.2.0 -# (Recommend https://github.com/NVIDIA-NeMo/Megatron-Bridge/commit/83a7c1134c562d8c6decd10a1f0a6e6a7a8a3a44 or later) +# (Recommend https://github.com/NVIDIA-NeMo/Megatron-Bridge/commit/6259ae83c735c4412796fc5cfb4c9607b949ae29 or later) # for proper MoE LoRA support. 
# For Megatron communication/computation overlapping diff --git a/examples/grpo_trainer/run_qwen3moe-30b_sglang_mindspeedllm_npu.sh b/examples/grpo_trainer/run_qwen3moe-30b_sglang_mindspeedllm_npu.sh new file mode 100644 index 00000000000..ee4a8f2559e --- /dev/null +++ b/examples/grpo_trainer/run_qwen3moe-30b_sglang_mindspeedllm_npu.sh @@ -0,0 +1,245 @@ +#!/bin/bash +set -xeuo pipefail +# Project Configuration +project_name='GRPO-Qwen3-30b-A3B-BASE-MATH' +exp_name='GRPO-Qwen3-30B-A3B-BASE-MindSpeedLLM-SGLang' + +# Necessary env +export HCCL_CONNECT_TIMEOUT=1500 +export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050 +export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050 + +export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1 +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 + +export DISABLE_L2_CACHE=1 +export TASK_QUEUE_ENABLE=1 + +# Node Info +NNODES=${NNODES:-1} +NPUS_PER_NODE=${NPUS_PER_NODE:-16} + +# Model Weights Paths +MODEL_PATH=Qwen/Qwen3-30B-A3B +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} + +# File System Paths +TRAIN_FILE=$RAY_DATA_HOME/gsm8k/train.parquet +TEST_FILE=$RAY_DATA_HOME/gsm8k/test.parquet +# Data Length Configuration +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 2)) + +# Training Batch Configuration +train_prompt_bsz=16 +train_prompt_mini_bsz=16 +n_resp_per_prompt=8 +micro_batch_size=1 + +# Algorithm Configuration +adv_estimator=grpo +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=True +kl_loss_coef=0.001 + +# Performance and Memory Management Configuration +all_offload=True +use_dynamic_bsz=False +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length))) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length))) + +# Megatron Parallelism Configuration +train_tp=4 +train_ep=4 +train_etp=1 +train_pp=4 +train_cp=1 + +# SGLang Generation Configuration +gen_tp=4 +gen_dp=1 +gen_ep=1 +gpu_memory_utilization=0.5 
+max_model_len=$((max_prompt_length + max_response_length)) +max_num_batched_tokens=$(((max_prompt_length + max_response_length) * 1)) + +# Data Configuration +DATA_CONFIG=( + # File Paths + data.train_files="${TRAIN_FILE}" + data.val_files="${TEST_FILE}" + # Data Structure + data.prompt_key=prompt + # Batch and Length Configuration + data.train_batch_size=${train_prompt_bsz} + data.max_prompt_length=${max_prompt_length} + data.max_response_length=${max_response_length} + # Preprocessing + data.filter_overlong_prompts=False + data.truncation='left' +) + +# Model Configuration +MODEL_CONFIG=( + # Model Path + actor_rollout_ref.model.path="${MODEL_PATH}" + # Model Processing + actor_rollout_ref.model.use_remove_padding=True +) + +# Reinforcement Learning Algorithm Configuration +ALGORITHM_CONFIG=( + # Advantage Estimation + algorithm.adv_estimator=${adv_estimator} + # KL Divergence Control + algorithm.use_kl_in_reward=${use_kl_in_reward} + algorithm.kl_ctrl.kl_coef=${kl_coef} +) + +ACTOR_CONFIG=( + # Core Runtime Settings + actor_rollout_ref.actor.use_torch_compile=False + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} + # Loss Function Configuration + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} + actor_rollout_ref.actor.entropy_coeff=0 + # PPO Training Parameters + actor_rollout_ref.actor.ppo_epochs=1 + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${micro_batch_size} + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} + # Optimizer Settings + actor_rollout_ref.actor.optim.lr=1e-6 + # Megatron Parallelism Strategy + actor_rollout_ref.actor.mindspeed.tensor_model_parallel_size=${train_tp} + actor_rollout_ref.actor.mindspeed.pipeline_model_parallel_size=${train_pp} + actor_rollout_ref.actor.mindspeed.context_parallel_size=${train_cp} + 
actor_rollout_ref.actor.mindspeed.expert_model_parallel_size=${train_ep} + actor_rollout_ref.actor.mindspeed.expert_tensor_parallel_size=${train_etp} + # Memory Optimization + actor_rollout_ref.actor.mindspeed.param_offload=${all_offload} + actor_rollout_ref.actor.mindspeed.optimizer_offload=${all_offload} + actor_rollout_ref.actor.mindspeed.grad_offload=${all_offload} + # Model Weights Management + actor_rollout_ref.actor.mindspeed.use_mbridge=True + actor_rollout_ref.actor.mindspeed.vanilla_mbridge=True + # Transformer Architecture Optimizations + actor_rollout_ref.actor.mindspeed.llm_kwargs.spec='[mindspeed_llm.tasks.models.spec.qwen3_spec, layer_spec]' + actor_rollout_ref.actor.mindspeed.llm_kwargs.seq_length=${max_model_len} + actor_rollout_ref.actor.mindspeed.llm_kwargs.micro_batch_size=${micro_batch_size} + +actor_rollout_ref.actor.mindspeed.llm_kwargs.num_query_groups=4 + +actor_rollout_ref.actor.mindspeed.llm_kwargs.recompute_method=uniform + +actor_rollout_ref.actor.mindspeed.llm_kwargs.recompute_granularity=full + +actor_rollout_ref.actor.mindspeed.llm_kwargs.recompute_num_layers=1 + # MOE + +actor_rollout_ref.actor.mindspeed.llm_kwargs.moe_router_load_balancing_type=aux_loss + +actor_rollout_ref.actor.mindspeed.llm_kwargs.moe_permutation_async_comm=True + +actor_rollout_ref.actor.mindspeed.llm_kwargs.moe_token_dispatcher_type=alltoall + +actor_rollout_ref.actor.mindspeed.llm_kwargs.moe_aux_loss_coeff=0.001 + +actor_rollout_ref.actor.mindspeed.llm_kwargs.moe_grouped_gemm=True +) + +REF_CONFIG=( + # Core Runtime Settings + actor_rollout_ref.ref.use_torch_compile=False + # Log Probability Inference + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${micro_batch_size} + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} + # Megatron Parallelism Strategy + actor_rollout_ref.ref.mindspeed.tensor_model_parallel_size=${train_tp} + 
actor_rollout_ref.ref.mindspeed.pipeline_model_parallel_size=${train_pp} + actor_rollout_ref.ref.mindspeed.context_parallel_size=${train_cp} + actor_rollout_ref.ref.mindspeed.expert_model_parallel_size=${train_ep} + actor_rollout_ref.ref.mindspeed.expert_tensor_parallel_size=${train_etp} + # Memory Optimization + actor_rollout_ref.ref.mindspeed.param_offload=${all_offload} + # Model Weights Management + actor_rollout_ref.ref.mindspeed.use_mbridge=True + actor_rollout_ref.ref.mindspeed.vanilla_mbridge=True +) + +ROLLOUT_CONFIG=( + # Rollout Engine + actor_rollout_ref.rollout.name=sglang + +actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend="ascend" + # Generation Parameters + actor_rollout_ref.rollout.n=${n_resp_per_prompt} + actor_rollout_ref.rollout.top_p=1.0 + actor_rollout_ref.rollout.top_k=-1 + actor_rollout_ref.rollout.temperature=1.0 + # Log Probability Inference + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${micro_batch_size} + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} + # Memory Management + actor_rollout_ref.rollout.gpu_memory_utilization=${gpu_memory_utilization} + # Parallelism Strategy + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} + actor_rollout_ref.rollout.data_parallel_size=${gen_dp} + actor_rollout_ref.rollout.expert_parallel_size=${gen_ep} + +actor_rollout_ref.rollout.engine_kwargs.sglang.enable_dp_attention=False + # Performance Optimization + +actor_rollout_ref.rollout.engine_kwargs.sglang.chunked_prefill_size=-1 + actor_rollout_ref.rollout.enforce_eager=False + # Validation Generation + actor_rollout_ref.rollout.val_kwargs.n=1 + actor_rollout_ref.rollout.val_kwargs.do_sample=True + actor_rollout_ref.rollout.val_kwargs.top_p=1.0 + actor_rollout_ref.rollout.val_kwargs.top_k=-1 + actor_rollout_ref.rollout.val_kwargs.temperature=1.0 +) + +TRAINER_CONFIG=( + # Logger Configuration + 
trainer.logger='["console"]' + # Project Settings + trainer.project_name="${project_name}" + trainer.experiment_name="${exp_name}" + # Hardware Configuration + trainer.nnodes="${NNODES}" + trainer.n_gpus_per_node="${NPUS_PER_NODE}" + trainer.device='npu' + # Training Schedule + trainer.total_epochs=15 + trainer.val_before_train=False + trainer.test_freq=-1 + trainer.save_freq=-1 + # Checkpoint Directory + trainer.default_local_dir="${CKPTS_DIR}" + trainer.use_legacy_worker_impl=disable +) + +# profiling configuration +PROF_CONFIG=( + global_profiler.tool=npu + global_profiler.steps=null + global_profiler.save_path=/profpath + actor_rollout_ref.actor.profiler.enable=True + actor_rollout_ref.actor.profiler.ranks="[0]" + actor_rollout_ref.actor.profiler.all_ranks=False + actor_rollout_ref.actor.profiler.tool_config.npu.discrete=True + actor_rollout_ref.actor.profiler.tool_config.npu.contents=['npu','cpu'] + actor_rollout_ref.actor.profiler.tool_config.npu.level=level0 + actor_rollout_ref.actor.profiler.tool_config.npu.analysis=True + actor_rollout_ref.rollout.profiler.enable=True + actor_rollout_ref.rollout.profiler.ranks="[0]" + actor_rollout_ref.rollout.profiler.all_ranks=False +) + +python3 -m verl.trainer.main_ppo \ + --config-path=config \ + --config-name='ppo_trainer.yaml' \ + model_engine=mindspeed \ + "${DATA_CONFIG[@]}" \ + "${MODEL_CONFIG[@]}" \ + "${ACTOR_CONFIG[@]}" \ + "${REF_CONFIG[@]}" \ + "${ROLLOUT_CONFIG[@]}" \ + "${ALGORITHM_CONFIG[@]}" \ + "${TRAINER_CONFIG[@]}" \ + "${PROF_CONFIG[@]}" \ + "$@" diff --git a/examples/on_policy_distillation_trainer/run_qwen_gsm8k.sh b/examples/on_policy_distillation_trainer/run_qwen_gsm8k.sh index 6e3b961b1f5..5f0901052ec 100644 --- a/examples/on_policy_distillation_trainer/run_qwen_gsm8k.sh +++ b/examples/on_policy_distillation_trainer/run_qwen_gsm8k.sh @@ -7,7 +7,7 @@ ROLLOUT_NAME="vllm" # sglang or vllm FAMILY="Qwen" STUDENT_MODEL=Qwen2.5-0.5B -TEACHER_MODEL=Qwen2.5-0.5B +TEACHER_MODEL=Qwen2.5-3B-Instruct # 
USE_POLICY_GRADIENT=False # DISTILLATION_LOSS_MODE="k3" diff --git a/examples/on_policy_distillation_trainer/run_qwen_gsmk8k_megatron.sh b/examples/on_policy_distillation_trainer/run_qwen_gsmk8_megatron.sh similarity index 100% rename from examples/on_policy_distillation_trainer/run_qwen_gsmk8k_megatron.sh rename to examples/on_policy_distillation_trainer/run_qwen_gsmk8_megatron.sh diff --git a/examples/ppo_trainer/run_deepseek7b_llm.sh b/examples/ppo_trainer/run_deepseek7b_llm.sh index 6a93a75b403..1af6359f71b 100644 --- a/examples/ppo_trainer/run_deepseek7b_llm.sh +++ b/examples/ppo_trainer/run_deepseek7b_llm.sh @@ -27,8 +27,8 @@ python3 -m verl.trainer.main_ppo \ critic.model.path=deepseek-ai/deepseek-llm-7b-chat \ critic.model.enable_gradient_checkpointing=True \ critic.ppo_micro_batch_size_per_gpu=32 \ - critic.model.fsdp_config.param_offload=False \ - critic.model.fsdp_config.optimizer_offload=False \ + critic.fsdp.param_offload=False \ + critic.fsdp.optimizer_offload=False \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger='["console","wandb"]' \ diff --git a/examples/ppo_trainer/run_deepseek7b_llm_modelscope.sh b/examples/ppo_trainer/run_deepseek7b_llm_modelscope.sh index eb6dc79234a..66e2e2c78b4 100644 --- a/examples/ppo_trainer/run_deepseek7b_llm_modelscope.sh +++ b/examples/ppo_trainer/run_deepseek7b_llm_modelscope.sh @@ -28,8 +28,8 @@ python3 -m verl.trainer.main_ppo \ critic.model.path=deepseek-ai/deepseek-llm-7b-chat \ critic.model.enable_gradient_checkpointing=True \ critic.ppo_micro_batch_size_per_gpu=32 \ - critic.model.fsdp_config.param_offload=False \ - critic.model.fsdp_config.optimizer_offload=False \ + critic.fsdp.param_offload=False \ + critic.fsdp.optimizer_offload=False \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger='["console","wandb"]' \ diff --git a/examples/ppo_trainer/run_deepseek7b_llm_pfppo.sh b/examples/ppo_trainer/run_deepseek7b_llm_pfppo.sh index 312c6b50b78..b63fee59466 
100644 --- a/examples/ppo_trainer/run_deepseek7b_llm_pfppo.sh +++ b/examples/ppo_trainer/run_deepseek7b_llm_pfppo.sh @@ -31,8 +31,8 @@ python3 -m verl.trainer.main_ppo \ critic.model.path=deepseek-ai/deepseek-llm-7b-chat \ critic.model.enable_gradient_checkpointing=True \ critic.ppo_micro_batch_size_per_gpu=32 \ - critic.model.fsdp_config.param_offload=False \ - critic.model.fsdp_config.optimizer_offload=False \ + critic.fsdp.param_offload=False \ + critic.fsdp.optimizer_offload=False \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger='["console","wandb"]' \ diff --git a/examples/ppo_trainer/run_deepseek7b_llm_sandbox_fusion.sh b/examples/ppo_trainer/run_deepseek7b_llm_sandbox_fusion.sh index d8544ebe484..670e4f9cff0 100644 --- a/examples/ppo_trainer/run_deepseek7b_llm_sandbox_fusion.sh +++ b/examples/ppo_trainer/run_deepseek7b_llm_sandbox_fusion.sh @@ -30,8 +30,8 @@ python3 -m verl.trainer.main_ppo \ critic.model.path=deepseek-ai/deepseek-llm-7b-chat \ critic.model.enable_gradient_checkpointing=True \ critic.ppo_micro_batch_size_per_gpu=32 \ - critic.model.fsdp_config.param_offload=False \ - critic.model.fsdp_config.optimizer_offload=False \ + critic.fsdp.param_offload=False \ + critic.fsdp.optimizer_offload=False \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger='["console","wandb"]' \ diff --git a/examples/ppo_trainer/run_deepseek7b_llm_sp2.sh b/examples/ppo_trainer/run_deepseek7b_llm_sp2.sh index 3cb8a852b5f..0eb9a82aa28 100644 --- a/examples/ppo_trainer/run_deepseek7b_llm_sp2.sh +++ b/examples/ppo_trainer/run_deepseek7b_llm_sp2.sh @@ -29,8 +29,8 @@ python3 -m verl.trainer.main_ppo \ critic.model.path=deepseek-ai/deepseek-llm-7b-chat \ critic.model.enable_gradient_checkpointing=True \ critic.ppo_micro_batch_size_per_gpu=64 \ - critic.model.fsdp_config.param_offload=False \ - critic.model.fsdp_config.optimizer_offload=False \ + critic.fsdp.param_offload=False \ + critic.fsdp.optimizer_offload=False \ 
algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger='["console","wandb"]' \ diff --git a/examples/ppo_trainer/run_gemma.sh b/examples/ppo_trainer/run_gemma.sh index b015275c134..3ac49ede32f 100644 --- a/examples/ppo_trainer/run_gemma.sh +++ b/examples/ppo_trainer/run_gemma.sh @@ -26,8 +26,8 @@ python3 -m verl.trainer.main_ppo \ critic.model.path=google/gemma-2-2b-it \ critic.model.enable_gradient_checkpointing=False \ critic.ppo_micro_batch_size_per_gpu=4 \ - critic.model.fsdp_config.param_offload=False \ - critic.model.fsdp_config.optimizer_offload=False \ + critic.fsdp.param_offload=False \ + critic.fsdp.optimizer_offload=False \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger='["console","wandb"]' \ diff --git a/examples/ppo_trainer/run_qwen2-7b_rm.sh b/examples/ppo_trainer/run_qwen2-7b_rm.sh index d2ababea8a1..bc0db041ef1 100644 --- a/examples/ppo_trainer/run_qwen2-7b_rm.sh +++ b/examples/ppo_trainer/run_qwen2-7b_rm.sh @@ -51,8 +51,8 @@ python3 -m verl.trainer.main_ppo \ critic.model.path="$HOME/models/Qwen2-7B-Instruct" \ critic.model.enable_gradient_checkpointing=True \ critic.ppo_micro_batch_size_per_gpu=32 \ - critic.model.fsdp_config.param_offload=False \ - critic.model.fsdp_config.optimizer_offload=False \ + critic.fsdp.param_offload=False \ + critic.fsdp.optimizer_offload=False \ reward.num_workers=8 \ reward.reward_model.enable=True \ reward.reward_model.model_path="$HOME/models/FsfairX-LLaMA3-RM-v0.1" \ diff --git a/examples/ppo_trainer/run_qwen2-7b_rm_reward_loop_colocate.sh b/examples/ppo_trainer/run_qwen2-7b_rm_reward_loop_colocate.sh index 83888dd65a8..7b626269e82 100644 --- a/examples/ppo_trainer/run_qwen2-7b_rm_reward_loop_colocate.sh +++ b/examples/ppo_trainer/run_qwen2-7b_rm_reward_loop_colocate.sh @@ -42,8 +42,8 @@ python3 -m verl.trainer.main_ppo \ critic.model.path="$HOME/models/Qwen2.5-3B-Instruct" \ critic.model.enable_gradient_checkpointing=True \ critic.ppo_micro_batch_size_per_gpu=32 \ 
- critic.model.fsdp_config.param_offload=False \ - critic.model.fsdp_config.optimizer_offload=False \ + critic.fsdp.param_offload=False \ + critic.fsdp.optimizer_offload=False \ reward.num_workers=8 \ reward.reward_model.enable=True \ reward.reward_model.model_path="$HOME/models/Skywork-Reward-V2-Llama-3.2-3B" \ diff --git a/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance.sh b/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance.sh index fc4151ff64f..dc0e76621ae 100644 --- a/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance.sh +++ b/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance.sh @@ -38,8 +38,8 @@ python3 -m verl.trainer.main_ppo \ critic.model.enable_gradient_checkpointing=True \ critic.use_dynamic_bsz=True \ critic.ppo_max_token_len_per_gpu=98304 \ - critic.model.fsdp_config.param_offload=False \ - critic.model.fsdp_config.optimizer_offload=False \ + critic.fsdp.param_offload=False \ + critic.fsdp.optimizer_offload=False \ reward.num_workers=8 \ reward.reward_model.enable=True \ reward.reward_model.model_path=sfairXC/FsfairX-LLaMA3-RM-v0.1\ diff --git a/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_fused_kernels.sh b/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_fused_kernels.sh index b4c70905207..5433171ed99 100644 --- a/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_fused_kernels.sh +++ b/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_fused_kernels.sh @@ -42,8 +42,8 @@ python3 -m verl.trainer.main_ppo \ critic.model.enable_gradient_checkpointing=True \ critic.use_dynamic_bsz=True \ critic.ppo_max_token_len_per_gpu=98304 \ - critic.model.fsdp_config.param_offload=False \ - critic.model.fsdp_config.optimizer_offload=False \ + critic.fsdp.param_offload=False \ + critic.fsdp.optimizer_offload=False \ reward.num_workers=8 \ reward.reward_model.enable=True \ reward.reward_model.model_path=sfairXC/FsfairX-LLaMA3-RM-v0.1 \ diff --git a/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_nsys.sh b/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_nsys.sh index 
0fafc4fdfd0..661ed7c3aba 100644 --- a/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_nsys.sh +++ b/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance_nsys.sh @@ -48,8 +48,8 @@ python3 -m verl.trainer.main_ppo \ critic.ppo_micro_batch_size_per_gpu=2 \ critic.use_dynamic_bsz=True \ critic.ppo_max_token_len_per_gpu=98304 \ - critic.model.fsdp_config.param_offload=False \ - critic.model.fsdp_config.optimizer_offload=False \ + critic.fsdp.param_offload=False \ + critic.fsdp.optimizer_offload=False \ critic.profiler.enable=True \ critic.profiler.ranks=$PROFILE_RANKS \ critic.profiler.all_ranks=$PROFILE_RANKS_ALL \ diff --git a/examples/ppo_trainer/run_qwen2-7b_seq_balance.sh b/examples/ppo_trainer/run_qwen2-7b_seq_balance.sh index f055ea5d4fd..d18870eb082 100644 --- a/examples/ppo_trainer/run_qwen2-7b_seq_balance.sh +++ b/examples/ppo_trainer/run_qwen2-7b_seq_balance.sh @@ -43,8 +43,8 @@ python3 -m verl.trainer.main_ppo \ critic.model.path=Qwen/Qwen2-7B-Instruct \ critic.model.enable_gradient_checkpointing=True \ critic.ppo_max_token_len_per_gpu=98304 \ - critic.model.fsdp_config.param_offload=False \ - critic.model.fsdp_config.optimizer_offload=False \ + critic.fsdp.param_offload=False \ + critic.fsdp.optimizer_offload=False \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger='["console","wandb"]' \ diff --git a/examples/ppo_trainer/run_qwen2-7b_sglang_seq_balance.sh b/examples/ppo_trainer/run_qwen2-7b_sglang_seq_balance.sh index 5108e8b5dd9..9ac44da1d49 100644 --- a/examples/ppo_trainer/run_qwen2-7b_sglang_seq_balance.sh +++ b/examples/ppo_trainer/run_qwen2-7b_sglang_seq_balance.sh @@ -36,8 +36,8 @@ python3 -m verl.trainer.main_ppo \ critic.model.path=Qwen/Qwen2-7B-Instruct \ critic.model.enable_gradient_checkpointing=True \ critic.ppo_max_token_len_per_gpu=98304 \ - critic.model.fsdp_config.param_offload=False \ - critic.model.fsdp_config.optimizer_offload=False \ + critic.fsdp.param_offload=False \ + critic.fsdp.optimizer_offload=False \ 
algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger='["console","wandb"]' \ diff --git a/examples/ppo_trainer/run_qwen2.5-32b.sh b/examples/ppo_trainer/run_qwen2.5-32b.sh index 58037658500..5ee892acbdf 100644 --- a/examples/ppo_trainer/run_qwen2.5-32b.sh +++ b/examples/ppo_trainer/run_qwen2.5-32b.sh @@ -36,8 +36,8 @@ python3 -m verl.trainer.main_ppo \ critic.model.path=Qwen/Qwen2.5-32B-Instruct \ critic.model.enable_gradient_checkpointing=False \ critic.ppo_micro_batch_size_per_gpu=8 \ - critic.model.fsdp_config.param_offload=False \ - critic.model.fsdp_config.optimizer_offload=False \ + critic.fsdp.param_offload=False \ + critic.fsdp.optimizer_offload=False \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger='["console","wandb"]' \ diff --git a/examples/ppo_trainer/run_qwen2.5-3b_rm_reward_loop_colocate.sh b/examples/ppo_trainer/run_qwen2.5-3b_rm_reward_loop_colocate.sh index 867b1edf5b0..08480cc059e 100644 --- a/examples/ppo_trainer/run_qwen2.5-3b_rm_reward_loop_colocate.sh +++ b/examples/ppo_trainer/run_qwen2.5-3b_rm_reward_loop_colocate.sh @@ -42,8 +42,8 @@ python3 -m verl.trainer.main_ppo \ critic.model.path="$HOME/models/Qwen2.5-3B-Instruct" \ critic.model.enable_gradient_checkpointing=True \ critic.ppo_micro_batch_size_per_gpu=32 \ - critic.model.fsdp_config.param_offload=False \ - critic.model.fsdp_config.optimizer_offload=False \ + critic.fsdp.param_offload=False \ + critic.fsdp.optimizer_offload=False \ reward.num_workers=8 \ reward.reward_model.enable=True \ reward.reward_model.model_path="$HOME/models/Skywork-Reward-V2-Llama-3.2-3B" \ diff --git a/examples/ppo_trainer/run_qwen3-8b_npu.sh b/examples/ppo_trainer/run_qwen3-8b_npu.sh index 97b9cd6e8aa..bb210ca613b 100644 --- a/examples/ppo_trainer/run_qwen3-8b_npu.sh +++ b/examples/ppo_trainer/run_qwen3-8b_npu.sh @@ -38,8 +38,8 @@ python3 -m verl.trainer.main_ppo \ critic.model.enable_gradient_checkpointing=True \ critic.ppo_micro_batch_size_per_gpu=1 \ 
critic.ulysses_sequence_parallel_size=2 \ - critic.model.fsdp_config.param_offload=True \ - critic.model.fsdp_config.optimizer_offload=True \ + critic.fsdp.param_offload=True \ + critic.fsdp.optimizer_offload=True \ critic.use_dynamic_bsz=True \ trainer.critic_warmup=0 \ trainer.logger=console \ diff --git a/examples/sft/gsm8k/run_qwen3_8b_sft_peft_sp2_npu.sh b/examples/sft/gsm8k/run_qwen3_8b_sft_peft_sp2_npu.sh index c8e3fa7fdc3..7dfe4e5e270 100644 --- a/examples/sft/gsm8k/run_qwen3_8b_sft_peft_sp2_npu.sh +++ b/examples/sft/gsm8k/run_qwen3_8b_sft_peft_sp2_npu.sh @@ -16,6 +16,8 @@ torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \ data.train_files=$HOME/data/gsm8k/train.parquet \ data.val_files=$HOME/data/gsm8k/test.parquet \ data.micro_batch_size_per_gpu=64 \ + data.messages_key=messages \ + data.ignore_input_ids_mismatch=True \ optim.lr=1e-4 \ engine=fsdp \ engine.ulysses_sequence_parallel_size=2 \ diff --git a/examples/split_placement/run_deepseek7b_llm.sh b/examples/split_placement/run_deepseek7b_llm.sh index 473dcccdd9b..96973f080cf 100644 --- a/examples/split_placement/run_deepseek7b_llm.sh +++ b/examples/split_placement/run_deepseek7b_llm.sh @@ -24,8 +24,8 @@ python3 main_ppo_split.py \ critic.model.path=deepseek-ai/deepseek-llm-7b-chat \ critic.model.enable_gradient_checkpointing=False \ critic.ppo_micro_batch_size_per_gpu=8 \ - critic.model.fsdp_config.param_offload=False \ - critic.model.fsdp_config.optimizer_offload=False \ + critic.fsdp.param_offload=False \ + critic.fsdp.optimizer_offload=False \ algorithm.use_kl_in_reward=False \ trainer.critic_warmup=0 \ trainer.logger='["console","wandb"]' \ diff --git a/requirements-npu.txt b/requirements-npu.txt index fada1f839c2..579b76a90ad 100644 --- a/requirements-npu.txt +++ b/requirements-npu.txt @@ -1,5 +1,6 @@ # requirements.txt records the full set of dependencies for development accelerate +bytecode codetiming datasets dill diff --git a/scripts/generate_trainer_config.sh 
b/scripts/generate_trainer_config.sh index c4c89cdbdba..157060ad184 100755 --- a/scripts/generate_trainer_config.sh +++ b/scripts/generate_trainer_config.sh @@ -5,9 +5,10 @@ set -euox pipefail # Define config specifications: "config_name:output_file:config_arg" CONFIG_SPECS=( "ppo_trainer:_generated_ppo_trainer.yaml:" - "ppo_megatron_trainer:_generated_ppo_megatron_trainer.yaml:--config-name=ppo_megatron_trainer.yaml" + "ppo_trainer:_generated_ppo_megatron_trainer.yaml:model_engine=megatron" "ppo_trainer:_generated_ppo_veomni_trainer.yaml:model_engine=veomni" "ppo_trainer:_generated_ppo_torchtitan_trainer.yaml:model_engine=torchtitan" + "diffusion_trainer:_generated_diffusion_trainer.yaml:--config-name=diffusion_trainer.yaml" ) generate_config() { diff --git a/tests/experimental/agent_loop/conftest.py b/tests/experimental/agent_loop/conftest.py new file mode 100644 index 00000000000..5afbd0d76e9 --- /dev/null +++ b/tests/experimental/agent_loop/conftest.py @@ -0,0 +1,28 @@ +# Copyright 2026 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import pytest + + +def pytest_configure(config): + config.addinivalue_line("markers", "vllm_omni: requires the vllm-omni package") + + +def pytest_collection_modifyitems(config, items): + try: + import vllm_omni # noqa: F401 + except ImportError: + skip = pytest.mark.skip(reason="vllm-omni not installed") + for item in items: + if "vllm_omni" in item.keywords: + item.add_marker(skip) diff --git a/tests/experimental/agent_loop/test_agent_loop_extra_fields_schema_on_cpu.py b/tests/experimental/agent_loop/test_agent_loop_extra_fields_schema_on_cpu.py index 5df7977423b..232c2161a2d 100644 --- a/tests/experimental/agent_loop/test_agent_loop_extra_fields_schema_on_cpu.py +++ b/tests/experimental/agent_loop/test_agent_loop_extra_fields_schema_on_cpu.py @@ -219,7 +219,11 @@ async def test_agent_loop_extra_fields_schema_stable_for_training_concat_on_cpu( ) # Mimic two "worker chunks" and concatenate as in training. - dummy_worker = type("_DummyWorker", (), {"reward_loop_worker_handles": None, "distillation_enabled": False})() + dummy_worker = type( + "_DummyWorker", + (), + {"reward_loop_worker_handles": None, "distillation_enabled": False, "stream_teacher_with_rollout": False}, + )() merged = AgentLoopWorker._postprocess( dummy_worker, inputs=[internal_a], @@ -256,6 +260,7 @@ class _DummyWorker: _compute_score = AgentLoopWorker._compute_score _compute_teacher_logprobs = AgentLoopWorker._compute_teacher_logprobs distillation_enabled = False + stream_teacher_with_rollout = False def __init__(self): self.tokenizer = _FakeTokenizer() diff --git a/tests/experimental/agent_loop/test_diffusion_agent_loop.py b/tests/experimental/agent_loop/test_diffusion_agent_loop.py new file mode 100644 index 00000000000..4e4e732b819 --- /dev/null +++ b/tests/experimental/agent_loop/test_diffusion_agent_loop.py @@ -0,0 +1,141 @@ +# Copyright 2026 Bytedance Ltd. 
and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +import numpy as np +import pytest +import ray +from omegaconf import DictConfig, open_dict + +from verl.experimental.agent_loop.agent_loop import AgentLoopManager +from verl.protocol import DataProto + +pytestmark = pytest.mark.vllm_omni + + +@pytest.fixture +def init_config() -> DictConfig: + from hydra import compose, initialize_config_dir + + with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")): + config = compose(config_name="diffusion_trainer") + + model_path = os.path.expanduser("~/models/tiny-random/Qwen-Image") + config.actor_rollout_ref.model.path = model_path + config.actor_rollout_ref.model.tokenizer_path = os.path.join(model_path, "tokenizer") + config.actor_rollout_ref.rollout.name = "vllm_omni" + config.actor_rollout_ref.rollout.mode = "async" + config.actor_rollout_ref.rollout.enforce_eager = True + config.actor_rollout_ref.rollout.n = 4 + config.actor_rollout_ref.rollout.num_inference_steps = 10 + config.actor_rollout_ref.rollout.calculate_log_probs = True + config.actor_rollout_ref.rollout.agent.num_workers = 2 + config.actor_rollout_ref.rollout.agent.default_agent_loop = "diffusion_single_turn_agent" + tokenizer_max_length = 1024 + prompt_template_encode_start_idx = 34 + max_length = tokenizer_max_length + prompt_template_encode_start_idx + + with open_dict(config.actor_rollout_ref.model.extra_configs): + 
config.actor_rollout_ref.model.extra_configs.true_cfg_scale = 4.0 + config.actor_rollout_ref.model.extra_configs.max_sequence_length = max_length + config.actor_rollout_ref.model.extra_configs.noise_level = 1.0 + config.actor_rollout_ref.model.extra_configs.sde_window_size = 2 + config.actor_rollout_ref.model.extra_configs.sde_window_range = [0, 5] + + config.actor_rollout_ref.rollout.nnodes = 1 + + qwen_pipeline = "examples.flowgrpo_trainer.vllm_omni.pipeline_qwenimage.QwenImagePipelineWithLogProb" + config.actor_rollout_ref.rollout.engine_kwargs.vllm_omni = {"custom_pipeline": qwen_pipeline} + config.reward.reward_manager.name = "image" + config.trainer.n_gpus_per_node = 4 + + config.data.apply_chat_template_kwargs = dict(max_length=max_length, padding=True, truncation=True) + config.data.max_prompt_length = max_length + config.actor_rollout_ref.rollout.max_model_len = max_length + + # TODO (mike): test with TP later + config.actor_rollout_ref.rollout.tensor_model_parallel_size = 1 + return config + + +def test_single_turn(init_config): + ray.init( + runtime_env={ + "env_vars": { + "TOKENIZERS_PARALLELISM": "true", + "NCCL_DEBUG": "WARN", + "VLLM_LOGGING_LEVEL": "INFO", + } + } + ) + try: + agent_loop_manager = AgentLoopManager.create(init_config) + + system_prompt = ( + "Describe the image by detailing the color, shape, size, texture, quantity, text, " + "spatial relationships of the objects and background:" + ) + user_prompts = ["A photo of cute cat with long fur and big eyes.", "A photo of cute dog with short hair."] + + raw_prompts = [] + for user_prompt in user_prompts: + raw_prompts.append( + [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt}, + ] + ) + + raw_negative_prompts = [] + for user_prompt in user_prompts: + raw_negative_prompts.append( + [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": " "}, + ] + ) + + batch = DataProto( + non_tensor_batch={ + "raw_prompt": 
np.array(raw_prompts), + "raw_negative_prompt": np.array(raw_negative_prompts), + "data_source": np.array(["jpeg_compressibility"] * len(raw_prompts)), + "reward_model": np.array([{"style": "rule", "ground_truth": ""}] * len(raw_prompts)), + }, + ) + n = init_config.actor_rollout_ref.rollout.n + batch = batch.repeat(n) + result = agent_loop_manager.generate_sequences(prompts=batch) + assert len(result) == len(raw_prompts) * n + + expected_batch_keys = [ + "responses", + "all_latents", + "all_timesteps", + "prompt_embeds", + "prompt_embeds_mask", + "input_ids", + "attention_mask", + "rollout_log_probs", + ] + for key in expected_batch_keys: + assert key in result.batch, f"Key {key} not found in result batch with keys {list(result.batch.keys())}." + + # check turns + num_turns = result.non_tensor_batch["__num_turns__"] + assert np.all(num_turns == 2) + + print("Test passed!") + finally: + ray.shutdown() diff --git a/tests/experimental/reward_loop/test_visual_reward_manager.py b/tests/experimental/reward_loop/test_visual_reward_manager.py new file mode 100644 index 00000000000..fd75baaccf6 --- /dev/null +++ b/tests/experimental/reward_loop/test_visual_reward_manager.py @@ -0,0 +1,146 @@ +# Copyright 2026 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import ray +import torch +from hydra import compose, initialize_config_dir + +from verl.experimental.reward_loop import RewardLoopManager +from verl.protocol import DataProto +from verl.utils import hf_tokenizer + + +def create_data_samples(tokenizer, data_source="ocr") -> DataProto: + prompts = ['a photo of displaying "OCR"'] * 3 + responses = [torch.randn((3, 512, 512))] * 3 + data_source = [data_source] * len(responses) + reward_info = [{"ground_truth": "OCR"}] * len(responses) + extra_info = [{}] * len(responses) + + responses = torch.stack(responses) + prompt_length = 1024 + pad_token_id = tokenizer.pad_token_id + prompt_ids = [] + for prompt in prompts: + prompt_tokens = tokenizer.encode(prompt) + padded_prompt = [pad_token_id] * (prompt_length - len(prompt_tokens)) + prompt_tokens + prompt_ids.append(torch.tensor(padded_prompt)) + prompt_ids = torch.stack(prompt_ids) + + data = DataProto.from_dict( + tensors={ + "input_ids": prompt_ids, + "responses": responses, + }, + non_tensors={ + "data_source": data_source, + "reward_model": reward_info, + "extra_info": extra_info, + }, + ) + return data + + +def test_reward_model_genrm(): + ray.init( + runtime_env={ + "env_vars": { + "TOKENIZERS_PARALLELISM": "true", + "NCCL_DEBUG": "WARN", + "VLLM_LOGGING_LEVEL": "INFO", + "VLLM_USE_V1": "1", + } + } + ) + with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")): + config = compose(config_name="ppo_trainer") + + rollout_model_name = os.path.expanduser("~/models/tiny-random/Qwen-Image") + reward_model_name = os.path.expanduser("~/models/tiny-random/qwen3-vl") + + config.actor_rollout_ref.model.path = rollout_model_name + config.actor_rollout_ref.model.tokenizer_path = os.path.join(rollout_model_name, "tokenizer") + config.reward.custom_reward_function.path = "examples/flowgrpo_trainer/reward_fn.py" + config.reward.custom_reward_function.name = "compute_score_ocr" + config.reward.num_workers = 1 + config.reward.reward_manager.name = 
"visual" + config.reward.reward_model.enable = True + config.reward.reward_model.enable_resource_pool = True + config.reward.reward_model.n_gpus_per_node = 2 + config.reward.reward_model.nnodes = 1 + config.reward.reward_model.model_path = reward_model_name + config.reward.reward_model.rollout.name = os.getenv("ROLLOUT_NAME", "vllm") + config.reward.reward_model.rollout.gpu_memory_utilization = 0.9 + config.reward.reward_model.rollout.tensor_model_parallel_size = 2 + config.reward.reward_model.rollout.skip_tokenizer_init = False + config.reward.reward_model.rollout.prompt_length = 2048 + config.reward.reward_model.rollout.response_length = 32 + + # 1. init reward model manager + reward_loop_manager = RewardLoopManager(config) + + # 2. init test data + rollout_tokenizer = hf_tokenizer(config.actor_rollout_ref.model.tokenizer_path) + data = create_data_samples(rollout_tokenizer) + + # 3. generate responses + outputs = reward_loop_manager.compute_rm_score(data) + + for idx, output in enumerate(outputs): + print(f"GRM Response {idx}:\n{output.non_tensor_batch['genrm_response']}\n") + print(f"Score:\n{output.non_tensor_batch['score']}\n") + print("=" * 50 + "\n") + + ray.shutdown() + + +def test_rule_reward(): + ray.init( + runtime_env={ + "env_vars": { + "TOKENIZERS_PARALLELISM": "true", + "NCCL_DEBUG": "WARN", + "VLLM_LOGGING_LEVEL": "INFO", + "VLLM_USE_V1": "1", + } + } + ) + with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")): + config = compose(config_name="ppo_trainer") + + rollout_model_name = os.path.expanduser("~/models/tiny-random/Qwen-Image") + + config.actor_rollout_ref.model.path = rollout_model_name + config.actor_rollout_ref.model.tokenizer_path = os.path.join(rollout_model_name, "tokenizer") + config.reward.num_workers = 1 + config.reward.reward_manager.name = "visual" + config.reward.reward_model.enable = False + + # 1. init reward model manager + reward_loop_manager = RewardLoopManager(config) + + # 2. 
init test data + rollout_tokenizer = hf_tokenizer(config.actor_rollout_ref.model.tokenizer_path) + data = create_data_samples(rollout_tokenizer, data_source="jpeg_compressibility") + + # 3. generate responses + outputs = reward_loop_manager.compute_rm_score(data) + + for idx, output in enumerate(outputs): + print(f"Rule-based Reward Score:\n{output.batch['rm_scores']}\n") + print("=" * 50 + "\n") + + ray.shutdown() diff --git a/tests/models/test_diffusers_fsdp_engine.py b/tests/models/test_diffusers_fsdp_engine.py new file mode 100644 index 00000000000..b7e7681def5 --- /dev/null +++ b/tests/models/test_diffusers_fsdp_engine.py @@ -0,0 +1,206 @@ +# Copyright 2026 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from functools import partial + +import numpy as np +import pytest +import ray +import torch + +from verl import DataProto +from verl.models.diffusers_model import build_scheduler +from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup +from verl.utils import tensordict_utils as tu +from verl.workers.config import DiffusionModelConfig, FSDPActorConfig, TrainingWorkerConfig +from verl.workers.engine_workers import TrainingWorker +from verl.workers.utils.losses import diffusion_loss +from verl.workers.utils.padding import embeds_padding_2_no_padding + +EXTERNAL_LIB = "examples.flowgrpo_trainer.diffusers.qwen_image" + + +def create_training_config(model_type, strategy, device_count, model): + if device_count == 1: + cp = fsdp_size = 1 + else: + cp = 1 # TODO (mike): diffusers backend does not support SP currently. + fsdp_size = 4 + path = os.path.expanduser(model) + tokenizer_path = os.path.join(path, "tokenizer") + model_config = DiffusionModelConfig(path=path, tokenizer_path=tokenizer_path, external_lib=EXTERNAL_LIB) + + if strategy in ["fsdp", "fsdp2"]: + from hydra import compose, initialize_config_dir + + from verl.utils.config import omega_conf_to_dataclass + + with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config/model")): + cfg = compose( + config_name="diffusion_model", + overrides=[ + "path=" + path, + "tokenizer_path=" + tokenizer_path, + "external_lib=" + EXTERNAL_LIB, + "lora_rank=8", + "lora_alpha=16", + "+extra_configs.true_cfg_scale=4.0", + "+extra_configs.sde_type=sde", + "+extra_configs.noise_level=1.2", + ], + ) + model_config: DiffusionModelConfig = omega_conf_to_dataclass(cfg) + + with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config/actor")): + cfg = compose( + config_name="dp_actor", + overrides=[ + "strategy=" + strategy, + "clip_ratio=0.0001", + "clip_ratio_high=5.0", + "ppo_mini_batch_size=4", + "ppo_micro_batch_size_per_gpu=4", + "optim.lr=1e-4", + 
"optim.weight_decay=0.0001", + "fsdp_config.param_offload=False", + "fsdp_config.optimizer_offload=False", + "fsdp_config.model_dtype='bfloat16'", + "fsdp_config.dtype='bfloat16'", + "+fsdp_config.mixed_precision.param_dtype='bfloat16'", + "fsdp_config.forward_only=False", + "fsdp_config.fsdp_size=" + str(fsdp_size), + "fsdp_config.ulysses_sequence_parallel_size=" + str(cp), + "policy_loss.loss_mode='flow_grpo'", + ], + ) + actor_config: FSDPActorConfig = omega_conf_to_dataclass(cfg) + + engine_config = actor_config.engine + optimizer_config = actor_config.optim + checkpoint_config = actor_config.checkpoint + else: + raise NotImplementedError(f"strategy {strategy} is not supported") + + training_config = TrainingWorkerConfig( + model_type=model_type, + model_config=model_config, + engine_config=engine_config, + optimizer_config=optimizer_config, + checkpoint_config=checkpoint_config, + ) + return training_config, actor_config + + +def create_data_samples(num_device: int, model_config: DiffusionModelConfig) -> DataProto: + from tensordict import TensorDict + + scheduler = build_scheduler(model_config) + + batch_size = 8 * num_device + seq_len = 64 + latent_dim = 64 + encoder_latent_dim = 32 + vae_scale_factor = 8 + height, width = 512, 512 + latent_height, latent_width = height // vae_scale_factor // 2, width // vae_scale_factor // 2 + num_diffusion_steps = 10 + timesteps = scheduler.timesteps[None].repeat(batch_size, 1) + + torch.manual_seed(1) + np.random.seed(1) + + batch = TensorDict( + { + "response_mask": torch.ones((batch_size, num_diffusion_steps)), + "old_log_probs": torch.randn((batch_size, num_diffusion_steps)), + "advantages": torch.randn((batch_size, num_diffusion_steps)), + "all_latents": torch.randn((batch_size, num_diffusion_steps + 1, latent_height * latent_width, latent_dim)), + "all_timesteps": timesteps, + "prompt_embeds": torch.randn((batch_size, seq_len, encoder_latent_dim)), + "prompt_embeds_mask": torch.ones((batch_size, seq_len), 
dtype=torch.int32), + "negative_prompt_embeds": torch.randn((batch_size, seq_len, encoder_latent_dim)), + "negative_prompt_embeds_mask": torch.ones((batch_size, seq_len), dtype=torch.int32), + }, + batch_size=batch_size, + ) + data = DataProto(batch=batch) + data.meta_info["micro_batch_size_per_gpu"] = 4 + data.meta_info["height"] = height + data.meta_info["width"] = width + data.meta_info["vae_scale_factor"] = vae_scale_factor + data.meta_info["gradient_accumulation_steps"] = 1 + + return data + + +@pytest.mark.parametrize("strategy", ["fsdp", "fsdp2"]) +def test_diffusers_fsdp_engine(strategy): + # Create configs + ray.init() + device_count = torch.cuda.device_count() + training_config, actor_config = create_training_config( + model_type="diffusion_model", + strategy=strategy, + device_count=device_count, + model="~/models/tiny-random/Qwen-Image", + ) + # init model + ray_cls_with_init = RayClassWithInitArgs(cls=ray.remote(TrainingWorker), config=training_config) + resource_pool = RayResourcePool(process_on_nodes=[device_count]) + wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=ray_cls_with_init) # TrainigWorker + wg.reset() + + # forward only without loss function + data_td = create_data_samples(device_count, training_config.model_config).to_tensordict() + data_td = embeds_padding_2_no_padding(data_td) + tu.assign_non_tensor( + data_td, + compute_loss=False, + height=training_config.model_config.get("height", 512), + width=training_config.model_config.get("width", 512), + vae_scale_factor=training_config.model_config.get("vae_scale_factor", 8), + ) + output = wg.infer_batch(data_td) + output_dict = output.get() + + for key in ["log_probs", "metrics"]: + assert key in output_dict + + # forward and backward with loss function + # set loss function + loss_fn = partial(diffusion_loss, config=actor_config) + wg.set_loss_fn(loss_fn) + + # train batch + data_td = create_data_samples(device_count, training_config.model_config).to_tensordict() + 
data_td = embeds_padding_2_no_padding(data_td) + ppo_mini_batch_size = 4 + ppo_epochs = actor_config.ppo_epochs + seed = 42 + shuffle = actor_config.shuffle + tu.assign_non_tensor( + data_td, + global_batch_size=ppo_mini_batch_size * device_count, + mini_batch_size=ppo_mini_batch_size * device_count, + epochs=ppo_epochs, + seed=seed, + dataloader_kwargs={"shuffle": shuffle}, + ) + output = wg.train_mini_batch(data_td) + output_dict = output.get() + + assert "metrics" in output_dict.keys() + + ray.shutdown() diff --git a/tests/models/test_engine.py b/tests/models/test_engine.py index 9878ece4d06..2f63a7242dd 100644 --- a/tests/models/test_engine.py +++ b/tests/models/test_engine.py @@ -33,6 +33,12 @@ Qwen3MoeConfig, ) +try: + from transformers.core_model_loading import revert_weight_conversion +except ImportError: + revert_weight_conversion = None + pass + from verl import DataProto from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup from verl.trainer.config import CheckpointConfig @@ -331,7 +337,7 @@ def test_critic_engine(strategy): # update again # create critic config critic_config = CriticConfig( - strategy=strategy, rollout_n=1, ppo_micro_batch_size_per_gpu=-1, model_config=config.model_config + strategy=strategy, rollout_n=1, ppo_micro_batch_size_per_gpu=-1, model=config.model_config ) value_loss_ = partial(value_loss, config=critic_config) wg.set_loss_fn(value_loss_) @@ -410,7 +416,10 @@ def _worker(rank: int, world_size: int, rendezvous_file: str, strategy: str, mod # get per tensor parameter per_tensor_params, _ = engine.get_per_tensor_param() - ref_state_dict = ref_model.state_dict() + if strategy == "megatron" and revert_weight_conversion is not None: + ref_state_dict = revert_weight_conversion(ref_model, ref_model.state_dict()) + else: + ref_state_dict = ref_model.state_dict() # load ground truth and compare for key, value in per_tensor_params: diff --git a/tests/special_e2e/generation/run_gen_qwen05.sh 
b/tests/special_e2e/generation/run_gen_qwen05.sh deleted file mode 100755 index 61c55b157cd..00000000000 --- a/tests/special_e2e/generation/run_gen_qwen05.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env bash -# Tested with 1 & 4 GPUs -set -xeuo pipefail - -MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct} - -NGPUS_PER_NODE=${NGPUS_PER_NODE:-4} -OUTPUT_PATH=${OUTPUT_PATH:-$HOME/data/gen/qwen_05_gen_test.parquet} -GEN_TP=${GEN_TP:-2} # Default tensor parallel size to 2 - -python3 -m verl.trainer.main_generation \ - trainer.nnodes=1 \ - trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - data.path="${HOME}/data/gsm8k/test.parquet" \ - data.prompt_key=prompt \ - data.n_samples=1 \ - data.output_path="${OUTPUT_PATH}" \ - model.path="${MODEL_ID}" \ - +model.trust_remote_code=True \ - rollout.temperature=1.0 \ - rollout.top_k=50 \ - rollout.top_p=0.7 \ - rollout.prompt_length=2048 \ - rollout.response_length=1024 \ - rollout.tensor_model_parallel_size="${GEN_TP}" \ - rollout.gpu_memory_utilization=0.8 diff --git a/tests/special_e2e/generation/run_gen_qwen05_server.sh b/tests/special_e2e/generation/run_gen_qwen05_server.sh deleted file mode 100644 index 0d55b167de6..00000000000 --- a/tests/special_e2e/generation/run_gen_qwen05_server.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env bash -# Tested with 1 & 4 GPUs -set -xeuo pipefail - -MODEL_ID=${MODEL_ID:-$HOME/models/Qwen/Qwen2.5-0.5B-Instruct} -NGPUS_PER_NODE=${NGPUS_PER_NODE:-8} -OUTPUT_PATH=${OUTPUT_PATH:-$HOME/data/gen/qwen_05_gen_test.parquet} -GEN_TP=${GEN_TP:-2} # Default tensor parallel size to 2 - -python3 -m verl.trainer.main_generation_server \ - trainer.nnodes=1 \ - trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \ - actor_rollout_ref.model.path="${MODEL_ID}" \ - actor_rollout_ref.model.trust_remote_code=True \ - actor_rollout_ref.rollout.temperature=1.0 \ - actor_rollout_ref.rollout.top_k=50 \ - actor_rollout_ref.rollout.top_p=0.7 \ - actor_rollout_ref.rollout.prompt_length=2048 \ - 
actor_rollout_ref.rollout.response_length=1024 \ - actor_rollout_ref.rollout.tensor_model_parallel_size="${GEN_TP}" \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \ - actor_rollout_ref.rollout.name=vllm \ - actor_rollout_ref.rollout.n=4 \ - data.train_files="${HOME}/data/gsm8k/test.parquet" \ - data.prompt_key=prompt \ - +data.output_path="${OUTPUT_PATH}" \ diff --git a/tests/special_e2e/ppo_trainer/run_function_reward.sh b/tests/special_e2e/ppo_trainer/run_function_reward.sh index 90d9aa9c948..50cbb539aef 100644 --- a/tests/special_e2e/ppo_trainer/run_function_reward.sh +++ b/tests/special_e2e/ppo_trainer/run_function_reward.sh @@ -22,8 +22,8 @@ RETURN_RAW_CHAT="True" SKIP_TOKENIZER_INIT="True" GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.7} -ACTOR_FSDP_PARAM_OFFLOAD=${ACTOR_FSDP_PARAM_OFFLOAD:-False} -ACTOR_FSDP_OPTIMIZER_OFFLOAD=${ACTOR_FSDP_OPTIMIZER_OFFLOAD:-False} +ACTOR_FSDP_PARAM_OFFLOAD=${ACTOR_FSDP_PARAM_OFFLOAD:-True} +ACTOR_FSDP_OPTIMIZER_OFFLOAD=${ACTOR_FSDP_OPTIMIZER_OFFLOAD:-True} REF_FSDP_PARAM_OFFLOAD=${REF_FSDP_PARAM_OFFLOAD:-True} RM_PAD=${RM_PAD:-True} FUSED_KERNELS=${FUSED_KERNELS:-False} @@ -64,9 +64,9 @@ fi train_traj_micro_bsz_per_gpu=2 # b n_resp_per_prompt=4 # g -train_traj_micro_bsz=$((train_traj_micro_bsz_per_gpu * NUM_GPUS)) # b * n +train_traj_micro_bsz=$((train_traj_micro_bsz_per_gpu * 1)) # b * n train_traj_mini_bsz=$((train_traj_micro_bsz * 2)) # 2 * b * n -train_prompt_mini_bsz=$((train_traj_mini_bsz * n_resp_per_prompt)) # 2 * b * n / g +train_prompt_mini_bsz=$((train_traj_mini_bsz * 2)) # 2 * b * n / g train_prompt_bsz=$((train_prompt_mini_bsz * 2)) # 4 * b * n / g reward_fn_name=null @@ -117,6 +117,7 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.actor.policy_loss.loss_mode="${LOSS_MODE}" \ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ 
actor_rollout_ref.rollout.name="${ENGINE}" \ actor_rollout_ref.rollout.mode="${ROLLOUT_MODE}" \ actor_rollout_ref.rollout.load_format=${LOAD_FORMAT} \ @@ -131,8 +132,8 @@ python3 -m verl.trainer.main_ppo \ critic.model.path="${MODEL_PATH}" \ critic.model.enable_gradient_checkpointing=False \ critic.ppo_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \ - critic.model.fsdp_config.param_offload=False \ - critic.model.fsdp_config.optimizer_offload=False \ + critic.fsdp.param_offload=True \ + critic.fsdp.optimizer_offload=True \ reward.custom_reward_function.path="${reward_fn_file_path}"\ reward.custom_reward_function.name="${reward_fn_name}"\ algorithm.use_kl_in_reward="${USE_KL}" \ diff --git a/tests/special_e2e/ppo_trainer/run_model_reward.sh b/tests/special_e2e/ppo_trainer/run_model_reward.sh deleted file mode 100644 index 4c3af3ca8a2..00000000000 --- a/tests/special_e2e/ppo_trainer/run_model_reward.sh +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -NUM_GPUS=${NUM_GPUS:-8} - -MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B} -MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}} -#hf download "${MODEL_ID}" --local-dir "${MODEL_PATH}" - -TRAIN_FILES=${TRAIN_FILES:-$HOME/data/gsm8k/train.parquet} -VAL_FILES=${VAL_FILES:-$HOME/data/gsm8k/test.parquet} - -RM_PAD=${RM_PAD:-True} -FUSED_KERNELS=${FUSED_KERNELS:-False} -FUSED_KERNEL_BACKEND=${FUSED_KERNEL_BACKEND:-torch} # or 'triton' for triton backend -SP_SIZE=${SP_SIZE:-1} -SEQ_BALANCE=${SEQ_BALANCE:-False} -LIGER=${LIGER:-False} -# Validation -VAL_BEFORE_TRAIN=${VAL_BEFORE_TRAIN:-False} -TEST_FREQ=${TEST_FREQ:--1} -# Save & Resume -RESUME_MODE=${RESUME_MODE:-disable} -SAVE_FREQ=${SAVE_FREQ:--1} -TOTAL_TRAIN_STEPS=${TOTAL_TRAIN_STEPS:-1} - -train_traj_micro_bsz_per_gpu=2 # b -n_resp_per_prompt=4 # g - -train_traj_micro_bsz=$((train_traj_micro_bsz_per_gpu * NUM_GPUS)) # b * n -train_traj_mini_bsz=$((train_traj_micro_bsz * 2)) # 2 * b * n -train_prompt_mini_bsz=$((train_traj_mini_bsz * 
n_resp_per_prompt)) # 2 * b * n / g -train_prompt_bsz=$((train_prompt_mini_bsz * 2)) # 4 * b * n / g - -train_max_token_num_per_gpu=32768 -infer_max_token_num_per_gpu=32768 - -exp_name="$(basename "${MODEL_ID,,}")-model-reward-minimal" - -python3 -m verl.trainer.main_ppo \ - algorithm.adv_estimator=gae \ - data.train_files="${TRAIN_FILES}" \ - data.val_files="${VAL_FILES}" \ - data.train_batch_size=${train_prompt_bsz} \ - data.max_prompt_length=512 \ - data.max_response_length=512 \ - data.return_raw_chat=True \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.model.use_liger="${LIGER}" \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.model.use_remove_padding="${RM_PAD}" \ - actor_rollout_ref.model.use_fused_kernels=${FUSED_KERNELS} \ - actor_rollout_ref.model.fused_kernel_options.impl_backend=${FUSED_KERNEL_BACKEND} \ - actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.use_dynamic_bsz="${SEQ_BALANCE}" \ - actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${train_max_token_num_per_gpu} \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \ - actor_rollout_ref.actor.ulysses_sequence_parallel_size="${SP_SIZE}" \ - actor_rollout_ref.actor.fsdp_config.param_offload=False \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ - actor_rollout_ref.actor.use_kl_loss=False \ - actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_max_token_num_per_gpu} \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \ - actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ - actor_rollout_ref.rollout.name=vllm \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \ - actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_max_token_num_per_gpu} \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \ 
- critic.optim.lr=1e-5 \ - critic.ulysses_sequence_parallel_size="${SP_SIZE}" \ - critic.model.use_remove_padding="${RM_PAD}" \ - critic.optim.lr_warmup_steps_ratio=0.05 \ - critic.model.path="${MODEL_PATH}" \ - critic.model.enable_gradient_checkpointing=False \ - critic.use_dynamic_bsz="${SEQ_BALANCE}" \ - critic.ppo_max_token_len_per_gpu=${train_max_token_num_per_gpu} \ - critic.ppo_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \ - critic.model.fsdp_config.param_offload=False \ - critic.model.fsdp_config.optimizer_offload=False \ - reward.num_workers=8 \ - reward.reward_model.enable=True \ - reward.reward_model.model_path="${MODEL_PATH}" \ - reward.reward_model.rollout.gpu_memory_utilization=0.8 \ - reward.reward_model.rollout.tensor_model_parallel_size=1 \ - reward.reward_model.rollout.prompt_length=1024 \ - reward.reward_model.rollout.response_length=512 \ - algorithm.use_kl_in_reward=False \ - trainer.critic_warmup=0 \ - trainer.logger=console \ - trainer.project_name='verl-test' \ - trainer.experiment_name="${exp_name}" \ - trainer.nnodes=1 \ - trainer.n_gpus_per_node="${NUM_GPUS}" \ - trainer.val_before_train="${VAL_BEFORE_TRAIN}" \ - trainer.test_freq="${VAL_BEFORE_TRAIN}" \ - trainer.save_freq="${SAVE_FREQ}" \ - trainer.resume_mode="${RESUME_MODE}" \ - trainer.total_epochs=2 \ - trainer.total_training_steps="${TOTAL_TRAIN_STEPS}" $@ diff --git a/tests/special_e2e/ppo_trainer/run_single_gpu.sh b/tests/special_e2e/ppo_trainer/run_single_gpu.sh deleted file mode 100644 index 7e8615a24fb..00000000000 --- a/tests/special_e2e/ppo_trainer/run_single_gpu.sh +++ /dev/null @@ -1,24 +0,0 @@ -PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \ - data.train_files=$HOME/data/gsm8k/train.parquet \ - data.val_files=$HOME/data/gsm8k/test.parquet \ - data.train_batch_size=256 \ - data.max_prompt_length=512 \ - data.max_response_length=256 \ - actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - 
actor_rollout_ref.actor.ppo_mini_batch_size=64 \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ - critic.optim.lr=1e-5 \ - critic.model.path=Qwen/Qwen2.5-0.5B-Instruct \ - critic.ppo_micro_batch_size_per_gpu=4 \ - algorithm.kl_ctrl.kl_coef=0.001 \ - trainer.logger=console \ - trainer.val_before_train=False \ - trainer.n_gpus_per_node=1 \ - trainer.nnodes=1 \ - actor_rollout_ref.rollout.name=hf \ - trainer.total_training_steps=2 \ No newline at end of file diff --git a/tests/special_e2e/ppo_trainer/run_single_gpu_with_engine.sh b/tests/special_e2e/ppo_trainer/run_single_gpu_with_engine.sh deleted file mode 100644 index bd081fd88cf..00000000000 --- a/tests/special_e2e/ppo_trainer/run_single_gpu_with_engine.sh +++ /dev/null @@ -1,25 +0,0 @@ -PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \ - data.train_files=$HOME/data/gsm8k/train.parquet \ - data.val_files=$HOME/data/gsm8k/test.parquet \ - data.train_batch_size=256 \ - data.max_prompt_length=512 \ - data.max_response_length=256 \ - actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.ppo_mini_batch_size=64 \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ - critic.optim.lr=1e-5 \ - critic.model.path=Qwen/Qwen2.5-0.5B-Instruct \ - critic.ppo_micro_batch_size_per_gpu=4 \ - algorithm.kl_ctrl.kl_coef=0.001 \ - trainer.logger=['console'] \ - trainer.val_before_train=False \ - trainer.n_gpus_per_node=1 \ - trainer.nnodes=1 \ - 
actor_rollout_ref.rollout.name=hf \ - trainer.use_legacy_worker_impl=disable \ - trainer.total_training_steps=2 diff --git a/tests/special_e2e/run_dapo.sh b/tests/special_e2e/run_dapo.sh deleted file mode 100644 index ff08985b9e8..00000000000 --- a/tests/special_e2e/run_dapo.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -NUM_GPUS=${NUM_GPUS:-8} - -MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct} -MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}} -#hf download "${MODEL_ID}" --local-dir "${MODEL_PATH}" - -adv_estimator=grpo - -kl_coef=0.0 -use_kl_in_reward=False -use_kl_loss=False -kl_loss_coef=0.0 - -clip_ratio_low=0.2 -clip_ratio_high=0.28 - -max_prompt_length=1024 -max_response_length=2048 -enable_overlong_buffer=True -overlong_buffer_len=128 -overlong_penalty_factor=1.0 - -loss_agg_mode="token-mean" - -enable_filter_groups=True -filter_groups_metric=seq_reward -max_num_gen_batches=10 - -train_traj_micro_bsz_per_gpu=2 # b -n_resp_per_prompt=4 # g - -train_traj_micro_bsz=$((train_traj_micro_bsz_per_gpu * NUM_GPUS)) # b * n -train_traj_mini_bsz=$((train_traj_micro_bsz * 2)) # 2 * b * n -train_prompt_mini_bsz=$((train_traj_mini_bsz * n_resp_per_prompt)) # 2 * b * n / g -train_prompt_bsz=$((train_prompt_mini_bsz * 2)) # 4 * b * n / g - -gen_prompt_bsz=$((train_prompt_bsz * 4)) - -exp_name="$(basename "${MODEL_ID,,}")-dapo-minimal" - -python3 -m recipe.dapo.main_dapo \ - data.train_files="${HOME}/data/gsm8k/train.parquet" \ - data.val_files="${HOME}/data/gsm8k/test.parquet" \ - reward.reward_manager.name=dapo \ - algorithm.adv_estimator=${adv_estimator} \ - algorithm.use_kl_in_reward=${use_kl_in_reward} \ - algorithm.kl_ctrl.kl_coef=${kl_coef} \ - actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ - actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ - actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ - actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ - data.max_prompt_length=${max_prompt_length} \ - 
data.max_response_length=${max_response_length} \ - reward.overlong_buffer.enable=${enable_overlong_buffer} \ - reward.overlong_buffer.len=${overlong_buffer_len} \ - reward.overlong_buffer.penalty_factor=${overlong_penalty_factor} \ - actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ - data.train_batch_size=${train_prompt_bsz} \ - data.gen_batch_size=${gen_prompt_bsz} \ - algorithm.filter_groups.enable=${enable_filter_groups} \ - algorithm.filter_groups.metric=${filter_groups_metric} \ - algorithm.filter_groups.max_num_gen_batches=${max_num_gen_batches} \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.model.use_remove_padding=True \ - actor_rollout_ref.model.use_fused_kernels=True \ - actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ - actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \ - actor_rollout_ref.actor.fsdp_config.param_offload=False \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \ - actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ - actor_rollout_ref.rollout.name=vllm \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \ - actor_rollout_ref.ref.fsdp_config.param_offload=True \ - trainer.logger=console \ - trainer.project_name='verl-test' \ - trainer.experiment_name="${exp_name}" \ - trainer.n_gpus_per_node=${NUM_GPUS} \ - trainer.nnodes=1 \ - trainer.save_freq=-1 \ - trainer.total_epochs=2 \ - trainer.resume_mode=disable \ - trainer.val_before_train=False \ - trainer.total_training_steps=1 $@ diff --git a/tests/special_e2e/run_fully_async_policy.sh b/tests/special_e2e/run_fully_async_policy.sh index 9ab5bf0b6d1..754a2baecf3 100644 --- 
a/tests/special_e2e/run_fully_async_policy.sh +++ b/tests/special_e2e/run_fully_async_policy.sh @@ -185,20 +185,14 @@ elif [ "${ACTOR_STRATEGY}" == "megatron" ]; then echo "Running fully async training with Megatron strategy..." # Megatron specific parameters gen_tp=2 - train_tp=1 + train_tp=2 train_pp=2 ref_offload=True - actor_offload=False + actor_offload=True + common_params+=( + actor_rollout_ref.rollout.gpu_memory_utilization=0.60 + ) - if [ -n "$device_name" ] && [ "$device_name" == "npu" ]; then - train_tp=2 - actor_offload=True - common_params+=( - # Todo The checkpoint_engine.backend should be unified to nccl - # actor_rollout_ref.rollout.checkpoint_engine.backend='hccl' - actor_rollout_ref.rollout.gpu_memory_utilization=0.60 - ) - fi python3 -m verl.experimental.fully_async_policy.fully_async_main \ --config-path=config \ --config-name='fully_async_ppo_megatron_trainer.yaml' \ @@ -207,8 +201,8 @@ elif [ "${ACTOR_STRATEGY}" == "megatron" ]; then critic.strategy=megatron \ actor_rollout_ref.actor.optim.lr_decay_steps=10000000 \ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \ actor_rollout_ref.actor.megatron.param_offload=${actor_offload} \ actor_rollout_ref.actor.megatron.optimizer_offload=${actor_offload} \ actor_rollout_ref.actor.megatron.grad_offload=${actor_offload} \ diff --git a/tests/special_e2e/run_geo3k_fsdp_sgl_multiturn_w_tool.sh b/tests/special_e2e/run_geo3k_fsdp_sgl_multiturn_w_tool.sh deleted file mode 100644 index b7cc1261ee2..00000000000 --- a/tests/special_e2e/run_geo3k_fsdp_sgl_multiturn_w_tool.sh +++ /dev/null @@ -1,58 +0,0 @@ -# run on 8xH100 -# make sure your current working directory is the root of the project - -set -x - -#hf download Qwen/Qwen2.5-VL-3B-Instruct --local-dir 
$HOME/models/Qwen/Qwen2.5-VL-3B-Instruct - -ulimit -n 65535 - -PROJECT_DIR="$(pwd)" -CONFIG_PATH="$PROJECT_DIR/examples/sglang_multiturn/config" -FSDP_STRATEGY=${FSDP_STRATEGY:-fsdp} - -python3 -m verl.trainer.main_ppo \ - --config-path="$CONFIG_PATH" \ - --config-name='geo3k_multiturn_grpo' \ - algorithm.adv_estimator=grpo \ - data.train_batch_size=64 \ - data.max_prompt_length=2048 \ - data.max_response_length=2048 \ - data.filter_overlong_prompts=True \ - data.truncation='error' \ - data.return_raw_chat=True \ - actor_rollout_ref.model.path=$HOME/models/Qwen/Qwen2.5-VL-3B-Instruct \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.model.use_remove_padding=True \ - actor_rollout_ref.actor.ppo_mini_batch_size=64 \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \ - actor_rollout_ref.actor.use_kl_loss=True \ - actor_rollout_ref.actor.kl_loss_coef=0.001 \ - actor_rollout_ref.actor.kl_loss_type=low_var_kl \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.model.enable_gradient_checkpointing=True \ - actor_rollout_ref.actor.strategy=$FSDP_STRATEGY \ - actor_rollout_ref.actor.fsdp_config.param_offload=False \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ - actor_rollout_ref.rollout.name=sglang \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \ - actor_rollout_ref.rollout.n=8 \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=8 \ - actor_rollout_ref.ref.strategy=$FSDP_STRATEGY \ - actor_rollout_ref.ref.fsdp_config.param_offload=True \ - algorithm.use_kl_in_reward=False \ - trainer.critic_warmup=0 \ - trainer.logger=console \ - trainer.project_name='geo3k_async_rl' \ - trainer.experiment_name=qwen2.5-vl-3b_function_rm-geo3k-sgl-multi-w-tool-$FSDP_STRATEGY-rebased-0619-verify-n8 \ - trainer.n_gpus_per_node=8 \ - trainer.nnodes=1 \ - trainer.save_freq=-1 \ - trainer.test_freq=-1 
\ - data.train_files=$HOME/data/geo3k_verl_sgl_multi_turn_preprocessed/train.parquet \ - data.val_files=$HOME/data/geo3k_verl_sgl_multi_turn_preprocessed/test.parquet \ - actor_rollout_ref.rollout.multi_turn.tool_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/tool_config/geo3k_tool_config.yaml" \ - trainer.val_before_train=False \ - trainer.total_training_steps=1 $@ \ No newline at end of file diff --git a/tests/special_e2e/run_grpo_lora_with_merge.sh b/tests/special_e2e/run_grpo_lora_with_merge.sh deleted file mode 100644 index 4f5fd5d5b24..00000000000 --- a/tests/special_e2e/run_grpo_lora_with_merge.sh +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env bash -# -# An e2e test script for testing the GRPO LoRA training process -# and processing the generated checkpoint using the merge_model.py script. - -set -xeuo pipefail - -MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B} -MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}} -if [ ! -d "$MODEL_PATH" ]; then - echo "Downloading model to ${MODEL_PATH}..." -# hf download "${MODEL_ID}" --local-dir "${MODEL_PATH}" -else - echo "Model directory ${MODEL_PATH} already exists, skip downloading." -fi - - -BATCH_SIZE=16 -EXP_NAME="qwen2.5_0.5b_grpo_lora" -# step 1. 
train model with grpo-lora for 1 step -python3 -m verl.trainer.main_ppo \ - algorithm.adv_estimator=grpo \ - data.train_files=$HOME/data/gsm8k/train.parquet \ - data.val_files=$HOME/data/gsm8k/test.parquet \ - data.train_batch_size=${BATCH_SIZE} \ - data.max_prompt_length=512 \ - data.max_response_length=1024 \ - data.filter_overlong_prompts=True \ - data.truncation='error' \ - data.shuffle=False \ - actor_rollout_ref.model.path=${MODEL_PATH} \ - actor_rollout_ref.model.use_shm=True \ - actor_rollout_ref.model.lora_rank=64 \ - actor_rollout_ref.model.lora_alpha=32 \ - actor_rollout_ref.actor.optim.lr=3e-6 \ - actor_rollout_ref.model.use_remove_padding=True \ - actor_rollout_ref.actor.ppo_mini_batch_size=${BATCH_SIZE} \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ - actor_rollout_ref.actor.use_kl_loss=True \ - actor_rollout_ref.actor.kl_loss_coef=0.001 \ - actor_rollout_ref.actor.kl_loss_type=low_var_kl \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.model.enable_gradient_checkpointing=True \ - actor_rollout_ref.actor.fsdp_config.param_offload=False \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=40 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ - actor_rollout_ref.rollout.name=vllm \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ - actor_rollout_ref.rollout.n=5 \ - actor_rollout_ref.rollout.load_format=safetensors \ - actor_rollout_ref.rollout.layered_summon=True \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=40 \ - actor_rollout_ref.ref.fsdp_config.param_offload=True \ - algorithm.use_kl_in_reward=False \ - trainer.critic_warmup=0 \ - trainer.logger='["console","wandb"]' \ - trainer.project_name='verl_grpo_example_gsm8k' \ - trainer.experiment_name=${EXP_NAME} \ - trainer.n_gpus_per_node=8 \ - trainer.nnodes=1 \ - trainer.total_training_steps=1 \ - trainer.save_freq=1 \ - trainer.test_freq=5 \ - trainer.total_epochs=1 
$@ - -# step 2. merge model -python3 -m verl.model_merger merge \ - --backend fsdp \ - --local_dir checkpoints/verl_grpo_example_gsm8k/${EXP_NAME}/global_step_1/actor/ \ - --target_dir checkpoints/verl_grpo_example_gsm8k/${EXP_NAME}/global_step_1/actor/hf - -# step 3. assert -# make sure adapter_model.safetensors exists and its size is larger than 1MB -file_path="checkpoints/verl_grpo_example_gsm8k/${EXP_NAME}/global_step_1/actor/hf/lora_adapter/adapter_model.safetensors" - -if [ ! -f "$file_path" ]; then - echo "Error: File $file_path does not exist!" - exit 1 -fi - -file_size=$(stat -c %s "$file_path") - -min_size_mb=1 -min_size=$((min_size_mb * 1024 * 1024)) # 1MB = 1048576 bytes - -if [ "$file_size" -lt "$min_size" ]; then - echo "Error: File $file_path is too small! Current size: $((file_size/1024))KB, Required: ${min_size_mb}MB" - exit 1 -fi - -echo "Check passed: File exists and size is $(($file_size/1024/1024))MB" -exit 0 diff --git a/tests/special_e2e/run_gsm8k_fsdp_sgl_multiturn_sf_tool.sh b/tests/special_e2e/run_gsm8k_fsdp_sgl_multiturn_sf_tool.sh deleted file mode 100644 index b03515b9920..00000000000 --- a/tests/special_e2e/run_gsm8k_fsdp_sgl_multiturn_sf_tool.sh +++ /dev/null @@ -1,62 +0,0 @@ -# run on 8xH20 -# make sure your current working directory is the root of the project - -set -x - - -export PYTHONUNBUFFERED=1 -export RAY_DEDUP_LOGS=0 -export RUST_BACKTRACE=1 -export HYDRA_FULL_ERROR=1 - -ulimit -n 65535 - -PROJECT_DIR="$(pwd)" -CONFIG_PATH="$PROJECT_DIR/examples/sglang_multiturn/config" - -python3 -m verl.trainer.main_ppo \ - --config-path="$CONFIG_PATH" \ - --config-name='gsm8k_multiturn_sf_grpo' \ - algorithm.adv_estimator=grpo \ - data.train_batch_size=128 \ - data.max_prompt_length=2048 \ - data.max_response_length=16384 \ - data.filter_overlong_prompts=False \ - data.truncation='error' \ - data.return_raw_chat=True \ - data.train_files=$HOME/data/retool_dapo/train.parquet \ - data.val_files=$HOME/data/retool_aime2024/train.parquet \ - 
actor_rollout_ref.model.path=Qwen/Qwen3-4B \ - actor_rollout_ref.actor.use_dynamic_bsz=True \ - actor_rollout_ref.model.use_remove_padding=True \ - actor_rollout_ref.model.use_liger=False \ - actor_rollout_ref.model.enable_gradient_checkpointing=True \ - +actor_rollout_ref.model.enable_activation_offload=True \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.ppo_mini_batch_size=128 \ - actor_rollout_ref.actor.ulysses_sequence_parallel_size=1 \ - actor_rollout_ref.actor.ppo_max_token_len_per_gpu=32768 \ - actor_rollout_ref.actor.use_kl_loss=False \ - actor_rollout_ref.actor.kl_loss_coef=0.0 \ - actor_rollout_ref.actor.kl_loss_type=low_var_kl \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.actor.fsdp_config.param_offload=True \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \ - actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ - actor_rollout_ref.rollout.name=sglang \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \ - actor_rollout_ref.rollout.n=8 \ - actor_rollout_ref.rollout.multi_turn.tool_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/tool_config/sandbox_fusion_tool_config.yaml" \ - actor_rollout_ref.ref.fsdp_config.param_offload=True \ - algorithm.use_kl_in_reward=False \ - trainer.critic_warmup=0 \ - trainer.logger='["console","wandb"]' \ - trainer.project_name='retool_async_rl' \ - trainer.experiment_name='qwen3-4b_function_rm-retool-async-sgl-no-sft-n8-v2505271300' \ - trainer.val_before_train=False \ - trainer.n_gpus_per_node=8 \ - trainer.nnodes=1 \ - trainer.save_freq=100 \ - trainer.test_freq=20 \ - trainer.total_training_steps=1000 \ - trainer.total_epochs=1 $@ \ No newline at end of file diff --git a/tests/special_e2e/run_gsm8k_fsdp_sgl_multiturn_w_tool.sh b/tests/special_e2e/run_gsm8k_fsdp_sgl_multiturn_w_tool.sh deleted file mode 100644 index 109f6760b28..00000000000 --- a/tests/special_e2e/run_gsm8k_fsdp_sgl_multiturn_w_tool.sh +++ /dev/null @@ -1,58 +0,0 @@ -# run on 
8xH100 -# make sure your current working directory is the root of the project - -set -x - -#hf download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen/Qwen2.5-3B-Instruct - -ulimit -n 65535 - -PROJECT_DIR="$(pwd)" -CONFIG_PATH="$PROJECT_DIR/examples/sglang_multiturn/config" -FSDP_STRATEGY=${FSDP_STRATEGY:-fsdp} - -python3 -m verl.trainer.main_ppo \ - --config-path="$CONFIG_PATH" \ - --config-name='gsm8k_multiturn_grpo' \ - algorithm.adv_estimator=grpo \ - data.train_batch_size=256 \ - data.max_prompt_length=1024 \ - data.max_response_length=1024 \ - data.filter_overlong_prompts=True \ - data.truncation='error' \ - data.return_raw_chat=True \ - actor_rollout_ref.model.path=$HOME/models/Qwen/Qwen2.5-3B-Instruct \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.model.use_remove_padding=True \ - actor_rollout_ref.actor.ppo_mini_batch_size=256 \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ - actor_rollout_ref.actor.use_kl_loss=True \ - actor_rollout_ref.actor.kl_loss_coef=0.001 \ - actor_rollout_ref.actor.kl_loss_type=low_var_kl \ - actor_rollout_ref.actor.entropy_coeff=0 \ - actor_rollout_ref.model.enable_gradient_checkpointing=True \ - actor_rollout_ref.actor.strategy=$FSDP_STRATEGY \ - actor_rollout_ref.actor.fsdp_config.param_offload=False \ - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ - actor_rollout_ref.rollout.name=sglang \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \ - actor_rollout_ref.rollout.n=8 \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \ - actor_rollout_ref.ref.strategy=$FSDP_STRATEGY \ - actor_rollout_ref.ref.fsdp_config.param_offload=True \ - algorithm.use_kl_in_reward=False \ - trainer.critic_warmup=0 \ - trainer.logger=console \ - trainer.project_name='gsm8k_async_rl' \ - 
trainer.experiment_name=qwen2.5-3b_function_rm-gsm8k-sgl-multi-w-tool-$FSDP_STRATEGY-rebased-0427-verify-n16 \ - trainer.n_gpus_per_node=8 \ - trainer.nnodes=1 \ - trainer.save_freq=-1 \ - trainer.test_freq=-1 \ - data.train_files=$HOME/data/gsm8k_verl_sgl_multi_turn_preprocessed/train.parquet \ - data.val_files=$HOME/data/gsm8k_verl_sgl_multi_turn_preprocessed/test.parquet \ - actor_rollout_ref.rollout.multi_turn.tool_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/tool_config/gsm8k_tool_config.yaml" \ - trainer.val_before_train=False \ - trainer.total_training_steps=1 $@ diff --git a/tests/special_e2e/run_one_step_off_policy.sh b/tests/special_e2e/run_one_step_off_policy.sh index 814c8917f48..df195062f0f 100755 --- a/tests/special_e2e/run_one_step_off_policy.sh +++ b/tests/special_e2e/run_one_step_off_policy.sh @@ -142,6 +142,9 @@ if [ "${ACTOR_STRATEGY}" == "fsdp2" ]; then python3 -m verl.experimental.one_step_off_policy.main_ppo \ "${common_params[@]}" \ actor_rollout_ref.actor.fsdp_config.strategy=fsdp2 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \ critic.strategy=fsdp2 \ actor_rollout_ref.actor.grad_clip=1.0 \ actor_rollout_ref.model.use_remove_padding=True \ @@ -161,8 +164,8 @@ elif [ "${ACTOR_STRATEGY}" == "megatron" ]; then echo "Running with Megatron strategy..." 
# Megatron specific parameters gen_tp=2 - train_tp=1 - train_pp=2 + train_tp=2 + train_pp=3 ref_offload=True actor_offload=False @@ -171,8 +174,6 @@ elif [ "${ACTOR_STRATEGY}" == "megatron" ]; then # Todo The checkpoint_engine.backend should be unified to nccl # actor_rollout_ref.rollout.checkpoint_engine.backend='hccl' actor_rollout_ref.rollout.gpu_memory_utilization=0.70 - trainer.n_gpus_per_node=4 - rollout.n_gpus_per_node=4 actor_rollout_ref.model.use_remove_padding=True \ actor_rollout_ref.model.enable_gradient_checkpointing=True \ actor_rollout_ref.actor.use_dynamic_bsz=True \ @@ -189,9 +190,9 @@ elif [ "${ACTOR_STRATEGY}" == "megatron" ]; then "${common_params[@]}" \ actor_rollout_ref.actor.strategy=megatron \ critic.strategy=megatron \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \ actor_rollout_ref.actor.megatron.param_offload=${actor_offload} \ actor_rollout_ref.actor.megatron.optimizer_offload=${actor_offload} \ actor_rollout_ref.actor.megatron.grad_offload=${actor_offload} \ diff --git a/tests/special_e2e/run_ppo_trainer_megatron.sh b/tests/special_e2e/run_ppo_trainer_megatron.sh index 28e83b5af67..f10031b334e 100644 --- a/tests/special_e2e/run_ppo_trainer_megatron.sh +++ b/tests/special_e2e/run_ppo_trainer_megatron.sh @@ -46,9 +46,9 @@ forward_max_token_len_per_gpu=${FWD_MAX_TOKEN_LEN:-4800} train_traj_micro_bsz_per_gpu=${MICRO_BSZ:-2} # b n_resp_per_prompt=4 # g -train_traj_micro_bsz=$((train_traj_micro_bsz_per_gpu * NUM_GPUS)) # b * n +train_traj_micro_bsz=$((train_traj_micro_bsz_per_gpu * 1)) # b * n train_traj_mini_bsz=$((train_traj_micro_bsz * 2)) # 2 * b * n -train_prompt_mini_bsz=$((train_traj_mini_bsz * n_resp_per_prompt)) 
# 2 * b * n / g +train_prompt_mini_bsz=$((train_traj_mini_bsz * 2)) # 2 * b * n / g train_prompt_bsz=$((train_prompt_mini_bsz * 2)) # 4 * b * n / g LORA_RANK=${LORA_RANK:-0} @@ -91,7 +91,7 @@ CRITIC_TP=${CRITIC_TP:-$TRAIN_TP} CRITIC_EP=${CRITIC_EP:-$COMMON_EP} CRITIC_ETP=${CRITIC_ETP:-$COMMON_ETP} -ALL_OFFLOAD=${ALL_OFFLOAD:-False} +ALL_OFFLOAD=${ALL_OFFLOAD:-True} COMMON_PARAM_OFFLOAD=${COMMON_PARAM_OFFLOAD:-$ALL_OFFLOAD} COMMON_GRAD_OFFLOAD=${COMMON_GRAD_OFFLOAD:-$ALL_OFFLOAD} COMMON_OPTIMIZER_OFFLOAD=${COMMON_OPTIMIZER_OFFLOAD:-$ALL_OFFLOAD} @@ -104,7 +104,7 @@ CRITIC_PARAM_OFFLOAD=${CRITIC_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD} CRITIC_GRAD_OFFLOAD=${CRITIC_GRAD_OFFLOAD:-$COMMON_GRAD_OFFLOAD} CRITIC_OPTIMIZER_OFFLOAD=${CRITIC_OPTIMIZER_OFFLOAD:-$COMMON_OPTIMIZER_OFFLOAD} RM_PARAM_OFFLOAD=${RM_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD} -USE_MBRIDGE=${USE_MBRIDGE:-False} +USE_MBRIDGE=${USE_MBRIDGE:-True} VANILLA_MBRIDGE=${VANILLA_MBRIDGE:-True} VALUE_VANILLA_MBRIDGE=${VALUE_VANILLA_MBRIDGE:-$VANILLA_MBRIDGE} USE_FUSED_KERNELS=${USE_FUSED_KERNELS:-False} diff --git a/tests/special_e2e/run_ppo_trainer_torchtitan.sh b/tests/special_e2e/run_ppo_trainer_torchtitan.sh deleted file mode 100644 index 1d744830f46..00000000000 --- a/tests/special_e2e/run_ppo_trainer_torchtitan.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env bash -set -xeuo pipefail - -# Download model if not exists -MODEL_ID=${MODEL_ID:-Qwen/Qwen3-0.6B} -MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}} -#huggingface-cli download "${MODEL_ID}" --local-dir "${MODEL_PATH}" - -VAL_BEFORE_TRAIN=${VAL_BEFORE_TRAIN:-False} -NUM_GPUS=${NUM_GPUS:-1} -FSDP_SIZE=${FSDP_SIZE:-1} -TP_SIZE=${TP_SIZE:-1} -CP_SIZE=${CP_SIZE:-1} -EP_SIZE=${EP_SIZE:-1} -VERL_EXP_NAME=${VERL_EXP_NAME:-Titan_Qwen3_30B_A3B_DP8_EP8} -MAX_PROMPT_LENGTH=${MAX_PROMPT_LENGTH:-512} -MAX_RESPONSE_LENGTH=${MAX_RESPONSE_LENGTH:-2048} -MAX_SEQ_LEN=${MAX_SEQ_LEN:-$((MAX_PROMPT_LENGTH + MAX_RESPONSE_LENGTH))} - -python3 -m verl.trainer.main_ppo \ - 
model_engine=torchtitan \ - algorithm.adv_estimator=grpo \ - data.train_files=$HOME/data/gsm8k/train.parquet \ - data.val_files=$HOME/data/gsm8k/test.parquet \ - data.train_batch_size=8 \ - data.max_prompt_length="${MAX_PROMPT_LENGTH}" \ - data.max_response_length="${MAX_RESPONSE_LENGTH}" \ - data.seed=42 \ - actor_rollout_ref.model.path="${MODEL_PATH}" \ - actor_rollout_ref.model.use_remove_padding=True \ - actor_rollout_ref.actor.optim.lr=1e-6 \ - actor_rollout_ref.actor.optim.min_lr_factor=1.0 \ - actor_rollout_ref.actor.ppo_mini_batch_size=4 \ - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ - actor_rollout_ref.actor.torchtitan.data_parallel_shard_size="${FSDP_SIZE}" \ - actor_rollout_ref.actor.torchtitan.tensor_parallel_size="${TP_SIZE}" \ - actor_rollout_ref.actor.torchtitan.context_parallel_size="${CP_SIZE}" \ - actor_rollout_ref.actor.torchtitan.expert_parallel_size="${EP_SIZE}" \ - actor_rollout_ref.actor.torchtitan.attn_type=flex \ - actor_rollout_ref.actor.torchtitan.use_torch_compile=False \ - actor_rollout_ref.actor.torchtitan.param_offload=True \ - actor_rollout_ref.actor.torchtitan.optimizer_offload=True \ - actor_rollout_ref.actor.torchtitan.max_seq_len="${MAX_SEQ_LEN}" \ - actor_rollout_ref.ref.torchtitan.max_seq_len="${MAX_SEQ_LEN}" \ - actor_rollout_ref.ref.torchtitan.use_torch_compile=False \ - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \ - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \ - actor_rollout_ref.rollout.tensor_model_parallel_size=8 \ - actor_rollout_ref.rollout.enable_chunked_prefill=False \ - actor_rollout_ref.rollout.name=vllm \ - actor_rollout_ref.rollout.gpu_memory_utilization=0.35 \ - actor_rollout_ref.rollout.free_cache_engine=True \ - actor_rollout_ref.rollout.enforce_eager=True \ - actor_rollout_ref.rollout.n=8 \ - actor_rollout_ref.rollout.max_model_len="${MAX_SEQ_LEN}" \ - critic.optim.lr=1e-5 \ - critic.model.path="${MODEL_PATH}" \ - critic.ppo_micro_batch_size_per_gpu=2 \ - 
algorithm.kl_ctrl.kl_coef=0.001 \ - trainer.use_legacy_worker_impl=disable \ - trainer.logger=['console','file','wandb'] \ - trainer.project_name='verl_grpo_example_gsm8k_0302' \ - trainer.experiment_name="${VERL_EXP_NAME}" \ - trainer.val_before_train="${VAL_BEFORE_TRAIN}" \ - trainer.log_val_generations=1 \ - trainer.test_freq=1 \ - trainer.n_gpus_per_node="${NUM_GPUS}" \ - trainer.nnodes=1 \ - trainer.total_training_steps=100 $@ diff --git a/tests/special_e2e/run_test.sh b/tests/special_e2e/run_test.sh deleted file mode 100644 index c4421c61849..00000000000 --- a/tests/special_e2e/run_test.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -set -xeuo pipefail - -# Get the configuration name and engine name from arguments -CONFIG_NAME="$1" -ENGINE="${2:-vllm}" - -# Download model if needed -#hf download Qwen/Qwen2.5-0.5B --local-dir "$HOME/models/Qwen/Qwen2.5-0.5B" - -# Run the training with the specified configuration -python3 -m verl.trainer.main_ppo \ - --config-name "$CONFIG_NAME" "$@" \ No newline at end of file diff --git a/tests/special_npu/nightly_ci_ascend/run_dapo_moonlight-16b_megatron_npu.sh b/tests/special_npu/nightly_ci_ascend/run_dapo_moonlight-16b_megatron_npu.sh new file mode 100644 index 00000000000..a3d0c3f00bc --- /dev/null +++ b/tests/special_npu/nightly_ci_ascend/run_dapo_moonlight-16b_megatron_npu.sh @@ -0,0 +1,189 @@ +set -x + +project_name='moonlight' +exp_name='exp' +adv_estimator=grpo +use_kl_in_reward=False +kl_penalty="kl" +kl_coef=0.0 +use_kl_loss=True +kl_loss_coef=0.001 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 +max_prompt_length=$((1024 * 1)) +max_response_length=$((1024 * 2)) +enable_overlong_buffer=True +overlong_buffer_len=$((1024 * 1)) +overlong_penalty_factor=1.0 + +loss_agg_mode="token-mean" +train_prompt_bsz=32 +n_resp_per_prompt=16 +train_prompt_mini_bsz=32 +train_ppo_micro_batch_size_per_gpu=2 +infer_ppo_micro_batch_size_per_gpu=2 + +# Paths +MODEL_ID=${MODEL_ID:-moonshotai/Moonlight-16B-A3B-Instruct} 
+MODEL_PATH=${MODEL_PATH:-${HOME}/.cache/models/${MODEL_ID}} + +TRAIN_FILE=$HOME/data/gsm8k/train.parquet +TEST_FILE=$HOME/data/gsm8k/test.parquet + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 1)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3)) + +optimizer_offload_fraction=1 + +COMMON_PP=${COMMON_PP:-4} +COMMON_VPP=${COMMON_VPP:-null} +COMMON_CP=${COMMON_CP:-1} +COMMON_TP=${COMMON_TP:-2} +COMMON_EP=${COMMON_EP:-1} +COMMON_ETP=${COMMON_ETP:-2} + +TRAIN_TP=${TRAIN_TP:-$COMMON_TP} +INFER_TP=${INFER_TP:-4} +ACTOR_PP=${ACTOR_PP:-$COMMON_PP} +ACTOR_VPP=${ACTOR_VPP:-$COMMON_VPP} +ACTOR_CP=${ACTOR_CP:-$COMMON_CP} +ACTOR_TP=${ACTOR_TP:-$TRAIN_TP} +ACTOR_EP=${ACTOR_EP:-$COMMON_EP} +ACTOR_ETP=${ACTOR_ETP:-$COMMON_ETP} +ROLLOUT_TP=${ROLLOUT_TP:-$INFER_TP} +REF_PP=${REF_PP:-$COMMON_PP} +REF_VPP=${REF_VPP:-$COMMON_VPP} +REF_CP=${REF_CP:-$COMMON_CP} +REF_TP=${REF_TP:-$TRAIN_TP} +REF_EP=${REF_EP:-$COMMON_EP} +REF_ETP=${REF_ETP:-$COMMON_ETP} +CRITIC_PP=${CRITIC_PP:-$COMMON_PP} +CRITIC_VPP=${CRITIC_VPP:-$COMMON_VPP} +CRITIC_CP=${CRITIC_CP:-$COMMON_CP} +CRITIC_TP=${CRITIC_TP:-$TRAIN_TP} +CRITIC_EP=${CRITIC_EP:-$COMMON_EP} +CRITIC_ETP=${CRITIC_ETP:-$COMMON_ETP} +RM_PP=${RM_PP:-$COMMON_PP} +RM_VPP=${RM_VPP:-$COMMON_VPP} +RM_CP=${RM_CP:-$COMMON_CP} +RM_TP=${RM_TP:-$TRAIN_TP} +RM_EP=${RM_EP:-$COMMON_EP} +RM_ETP=${RM_ETP:-$COMMON_ETP} + +USE_MBRIDGE=True +USE_DIST_CKPT=False + +first_layer=7 +last_layer=6 +python3 -m recipe.dapo.main_dapo \ + --config-path=config \ + --config-name="dapo_megatron_trainer" \ + data.shuffle=False \ + data.validation_shuffle=False \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + 
data.train_batch_size=${train_prompt_bsz} \ + data.trust_remote_code=True \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_penalty=${kl_penalty} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.model.trust_remote_code=True \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.policy_loss.loss_mode=vanilla \ + algorithm.filter_groups.enable=False \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_fused_kernels=False \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${train_ppo_micro_batch_size_per_gpu} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.actor.optim.lr=3e-6 \ + +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_first_pipeline_stage=$first_layer \ + +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=$last_layer \ + +actor_rollout_ref.actor.megatron.override_transformer_config.tensor_model_parallel_size=${ACTOR_TP} \ + +actor_rollout_ref.actor.megatron.override_transformer_config.multi_latent_attention=True \ + +actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn=True \ + actor_rollout_ref.actor.megatron.param_offload=True \ + actor_rollout_ref.actor.megatron.optimizer_offload=True \ + actor_rollout_ref.actor.megatron.grad_offload=True \ + actor_rollout_ref.ref.megatron.param_offload=True \ + 
++actor_rollout_ref.actor.megatron.override_transformer_config.attention_backend=fused \ + actor_rollout_ref.actor.megatron.use_mbridge=$USE_MBRIDGE \ + actor_rollout_ref.actor.megatron.use_dist_checkpointing=$USE_DIST_CKPT \ + actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${ACTOR_TP} \ + actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${ACTOR_PP} \ + actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=${ACTOR_VPP} \ + actor_rollout_ref.actor.megatron.context_parallel_size=${ACTOR_CP} \ + actor_rollout_ref.actor.megatron.expert_model_parallel_size=${ACTOR_EP} \ + actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ACTOR_ETP} \ + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \ + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \ + +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.rollout.load_format=safetensors \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${infer_ppo_micro_batch_size_per_gpu} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.65 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${INFER_TP} \ + actor_rollout_ref.rollout.data_parallel_size=1 \ + actor_rollout_ref.rollout.enable_chunked_prefill=False \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + 
actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.calculate_log_probs=True \ + actor_rollout_ref.rollout.enforce_eager=True \ + actor_rollout_ref.rollout.free_cache_engine=True \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${infer_ppo_micro_batch_size_per_gpu} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.ref.megatron.use_dist_checkpointing=${USE_DIST_CKPT} \ + actor_rollout_ref.ref.megatron.param_offload=True \ + actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${REF_TP} \ + actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${REF_PP} \ + actor_rollout_ref.ref.megatron.virtual_pipeline_model_parallel_size=${REF_VPP} \ + actor_rollout_ref.ref.megatron.context_parallel_size=${REF_CP} \ + actor_rollout_ref.ref.megatron.expert_model_parallel_size=${REF_EP} \ + actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${REF_ETP} \ + reward_model.reward_manager=dapo \ + +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward_model.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console'] \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.val_before_train=False \ + trainer.balance_batch=False \ + trainer.test_freq=-1 \ + trainer.save_freq=-1 \ + trainer.resume_mode=auto \ + trainer.log_val_generations=10 \ + trainer.total_training_steps=15 2>&1 | tee /root/.cache/nightly_log/moonlight/dapo_moonlight16b_megatron_npu-$(date +%Y%m%d_%H%M).log \ No newline at end of file diff --git 
a/tests/special_npu/nightly_ci_ascend/run_grpo_qwen25-7b-instruct_fsdp_npu.sh b/tests/special_npu/nightly_ci_ascend/run_grpo_qwen25-7b-instruct_fsdp_npu.sh index c2abcb0098d..c379c77bea9 100644 --- a/tests/special_npu/nightly_ci_ascend/run_grpo_qwen25-7b-instruct_fsdp_npu.sh +++ b/tests/special_npu/nightly_ci_ascend/run_grpo_qwen25-7b-instruct_fsdp_npu.sh @@ -46,4 +46,4 @@ python3 -m verl.trainer.main_ppo \ trainer.nnodes=1 \ trainer.save_freq=-1 \ trainer.test_freq=-1 \ - trainer.total_training_steps=15 2>&1 | tee /root/.cache/grpo_qwen25-7b-instruct_fsdp_npu.log \ No newline at end of file + trainer.total_training_steps=15 2>&1 | tee /root/.cache/nightly_log/qwen25-7b/grpo_qwen25-7b-instruct_fsdp_npu-$(date +%Y%m%d_%H%M).log \ No newline at end of file diff --git a/tests/special_npu/nightly_ci_ascend/run_grpo_qwen25-vl-3b-instruct_fsdp_npu.sh b/tests/special_npu/nightly_ci_ascend/run_grpo_qwen25-vl-3b-instruct_fsdp_npu.sh index 594a0a9749f..4ebd8034bb0 100644 --- a/tests/special_npu/nightly_ci_ascend/run_grpo_qwen25-vl-3b-instruct_fsdp_npu.sh +++ b/tests/special_npu/nightly_ci_ascend/run_grpo_qwen25-vl-3b-instruct_fsdp_npu.sh @@ -52,4 +52,4 @@ python3 -m verl.trainer.main_ppo \ trainer.nnodes=1 \ trainer.save_freq=-1 \ trainer.test_freq=-1 \ - trainer.total_training_steps=15 2>&1 | tee /root/.cache/grpo_qwen25-vl-3b-instruct_fsdp_npu.log \ No newline at end of file + trainer.total_training_steps=15 2>&1 | tee /root/.cache/nightly_log/qwen25-vl-3b/grpo_qwen25-vl-3b-instruct_fsdp_npu-$(date +%Y%m%d_%H%M).log \ No newline at end of file diff --git a/tests/special_npu/nightly_ci_ascend/run_ppo_qwen3-8b_fsdp_npu.sh b/tests/special_npu/nightly_ci_ascend/run_ppo_qwen3-8b_fsdp_npu.sh index 1213328f17f..16df63ecc28 100644 --- a/tests/special_npu/nightly_ci_ascend/run_ppo_qwen3-8b_fsdp_npu.sh +++ b/tests/special_npu/nightly_ci_ascend/run_ppo_qwen3-8b_fsdp_npu.sh @@ -43,8 +43,8 @@ python3 -m verl.trainer.main_ppo \ critic.model.enable_gradient_checkpointing=True \ 
critic.ppo_micro_batch_size_per_gpu=1 \ critic.ulysses_sequence_parallel_size=2 \ - critic.model.fsdp_config.param_offload=True \ - critic.model.fsdp_config.optimizer_offload=True \ + critic.fsdp.param_offload=True \ + critic.fsdp.optimizer_offload=True \ critic.use_dynamic_bsz=True \ trainer.critic_warmup=0 \ trainer.logger=console \ @@ -57,4 +57,4 @@ python3 -m verl.trainer.main_ppo \ trainer.val_before_train=False \ trainer.max_actor_ckpt_to_keep=1 \ trainer.max_critic_ckpt_to_keep=1 \ - trainer.total_training_steps=15 2>&1 | tee /root/.cache/ppo_qwen3-8b_fsdp_npu.log \ No newline at end of file + trainer.total_training_steps=15 2>&1 | tee /root/.cache/nightly_log/qwen3-8b-ppo/ppo_qwen3-8b_fsdp_npu-$(date +%Y%m%d_%H%M).log \ No newline at end of file diff --git a/tests/special_npu/run_qwen3_06b_ppo.sh b/tests/special_npu/run_qwen3_06b_ppo.sh index 04bd6dbb6e4..d3844414db5 100644 --- a/tests/special_npu/run_qwen3_06b_ppo.sh +++ b/tests/special_npu/run_qwen3_06b_ppo.sh @@ -37,8 +37,8 @@ python3 -m verl.trainer.main_ppo \ critic.model.enable_gradient_checkpointing=True \ critic.ppo_micro_batch_size_per_gpu=1 \ critic.ulysses_sequence_parallel_size=2 \ - critic.model.fsdp_config.param_offload=True \ - critic.model.fsdp_config.optimizer_offload=True \ + critic.fsdp.param_offload=True \ + critic.fsdp.optimizer_offload=True \ critic.use_dynamic_bsz=True \ trainer.critic_warmup=0 \ trainer.logger='["console"]' \ diff --git a/tests/special_npu/run_qwen3_30b_grpo_mindspeedllm.sh b/tests/special_npu/run_qwen3_30b_grpo_mindspeedllm.sh new file mode 100644 index 00000000000..917d1f75413 --- /dev/null +++ b/tests/special_npu/run_qwen3_30b_grpo_mindspeedllm.sh @@ -0,0 +1,268 @@ +set -x + +# Project Configuration +project_name='GRPO-Qwen3-30B-BASE-TEST' +exp_name='GRPO-Qwen3-30B-BASE-MindSpeedLLM-SGLang' + +# Necessary env +export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1 +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050 
+export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050 + +# Node Info +NNODES=${NNODES:-1} +NPUS_PER_NODE=${NPUS_PER_NODE:-8} + +# Model Weights Paths +MODEL_ID=${MODEL_ID:-Qwen/Qwen3-30B-A3B-Instruct-2507} +MODEL_PATH=${MODEL_PATH:-${HOME}/.cache/models/${MODEL_ID}} + +# use dummy model +if [[ "$USE_DUMMY_MODEL" == "True" ]]; then + DUMMY_MODEL_PATH=${DUMMY_MODEL_PATH:-${HOME}/models_dummy/${MODEL_ID}} + if [ -z "${DUMMY_MODEL_CONFIG_PATH}" ]; then + echo "[ERROR] DUMMY_MODEL_CONFIG_PATH not set" + exit 1 + fi + + # make sure the path is empty + if [[ -d $DUMMY_MODEL_PATH && $DUMMY_MODEL_PATH != "/" ]]; then + rm -rf $DUMMY_MODEL_PATH + fi + + # init model + python scripts/init_random_model.py \ + --hf_model_path "${MODEL_PATH}" \ + --new_config_path "${DUMMY_MODEL_CONFIG_PATH}" \ + --output_path "${DUMMY_MODEL_PATH}" + + # replace model path + MODEL_PATH=$DUMMY_MODEL_PATH +fi + +# File System Paths +TRAIN_FILE=$HOME/data/gsm8k/train.parquet +TEST_FILE=$HOME/data/gsm8k/test.parquet +# Data Length Configuration +max_prompt_length=$((512)) +max_response_length=$((128)) + +# Training Batch Configuration +train_prompt_bsz=16 +train_prompt_mini_bsz=16 +n_resp_per_prompt=2 +micro_batch_size=1 + +# Algorithm Configuration +adv_estimator=grpo +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=True +kl_loss_coef=0.001 + +# Performance and Memory Management Configuration +all_offload=True +use_dynamic_bsz=False +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length))) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length))) + +# Megatron Parallelism Configuration +train_tp=4 +train_ep=4 +train_etp=1 +train_pp=2 +train_cp=1 + +# SGLang Generation Configuration +gen_tp=4 +gen_dp=1 +gen_ep=1 +gpu_memory_utilization=0.5 +max_model_len=$((max_prompt_length + max_response_length)) +max_num_batched_tokens=$(((max_prompt_length + max_response_length) * 1)) + +# Data Configuration +DATA_CONFIG=( + # File Paths + data.train_files="${TRAIN_FILE}" + 
data.val_files="${TEST_FILE}" + # Data Structure + data.prompt_key=prompt + # Batch and Length Configuration + data.train_batch_size=${train_prompt_bsz} + data.max_prompt_length=${max_prompt_length} + data.max_response_length=${max_response_length} + # Preprocessing + data.filter_overlong_prompts=True + data.truncation='left' +) + +# Model Configuration +MODEL_CONFIG=( + # Model Path + actor_rollout_ref.model.path="${MODEL_PATH}" + # Model Processing + actor_rollout_ref.model.use_remove_padding=True +) + +# Reinforcement Learning Algorithm Configuration +ALGORITHM_CONFIG=( + # Advantage Estimation + algorithm.adv_estimator=${adv_estimator} + # KL Divergence Control + algorithm.use_kl_in_reward=${use_kl_in_reward} + algorithm.kl_ctrl.kl_coef=${kl_coef} +) + +ACTOR_CONFIG=( + # Core Runtime Settings + actor_rollout_ref.actor.use_torch_compile=False + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} + # Loss Function Configuration + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} + actor_rollout_ref.actor.entropy_coeff=0 + # PPO Training Parameters + actor_rollout_ref.actor.ppo_epochs=1 + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${micro_batch_size} + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} + # Optimizer Settings + actor_rollout_ref.actor.optim.lr=1e-6 + # Megatron Parallelism Strategy + actor_rollout_ref.actor.mindspeed.tensor_model_parallel_size=${train_tp} + actor_rollout_ref.actor.mindspeed.pipeline_model_parallel_size=${train_pp} + actor_rollout_ref.actor.mindspeed.expert_model_parallel_size=${train_ep} + actor_rollout_ref.actor.mindspeed.expert_tensor_parallel_size=${train_etp} + # Memory Optimization + actor_rollout_ref.actor.mindspeed.param_offload=${all_offload} + actor_rollout_ref.actor.mindspeed.optimizer_offload=${all_offload} + 
actor_rollout_ref.actor.mindspeed.grad_offload=${all_offload} + # Model Weights Management + actor_rollout_ref.actor.mindspeed.use_mbridge=True + actor_rollout_ref.actor.mindspeed.vanilla_mbridge=True + # Transformer Architecture Optimizations + actor_rollout_ref.actor.mindspeed.llm_kwargs.spec='[mindspeed_llm.tasks.models.spec.qwen3_spec, layer_spec]' + actor_rollout_ref.actor.mindspeed.llm_kwargs.seq_length=${max_model_len} + actor_rollout_ref.actor.mindspeed.llm_kwargs.micro_batch_size=${micro_batch_size} + +actor_rollout_ref.actor.mindspeed.llm_kwargs.num_query_groups=4 + +actor_rollout_ref.actor.mindspeed.llm_kwargs.recompute_method=uniform + +actor_rollout_ref.actor.mindspeed.llm_kwargs.recompute_granularity=full + +actor_rollout_ref.actor.mindspeed.llm_kwargs.recompute_num_layers=1 + # MOE + +actor_rollout_ref.actor.mindspeed.llm_kwargs.moe_router_load_balancing_type=aux_loss + +actor_rollout_ref.actor.mindspeed.llm_kwargs.moe_permutation_async_comm=True + +actor_rollout_ref.actor.mindspeed.llm_kwargs.moe_token_dispatcher_type=alltoall + +actor_rollout_ref.actor.mindspeed.llm_kwargs.moe_aux_loss_coeff=0.001 + +actor_rollout_ref.actor.mindspeed.llm_kwargs.moe_grouped_gemm=True + +actor_rollout_ref.actor.mindspeed.llm_kwargs.fix_router=True +) + +REF_CONFIG=( + # Core Runtime Settings + actor_rollout_ref.ref.use_torch_compile=False + # Log Probability Inference + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${micro_batch_size} + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} + # Megatron Parallelism Strategy + actor_rollout_ref.ref.mindspeed.tensor_model_parallel_size=${train_tp} + actor_rollout_ref.ref.mindspeed.pipeline_model_parallel_size=${train_pp} + actor_rollout_ref.ref.mindspeed.expert_model_parallel_size=${train_ep} + actor_rollout_ref.ref.mindspeed.expert_tensor_parallel_size=${train_etp} + # Memory Optimization + 
actor_rollout_ref.ref.mindspeed.param_offload=${all_offload} + # Model Weights Management + actor_rollout_ref.ref.mindspeed.use_mbridge=True + actor_rollout_ref.ref.mindspeed.vanilla_mbridge=True +) + +ROLLOUT_CONFIG=( + # Rollout Engine + actor_rollout_ref.rollout.name=sglang + +actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend="ascend" + # Generation Parameters + actor_rollout_ref.rollout.n=${n_resp_per_prompt} + actor_rollout_ref.rollout.top_p=1.0 + actor_rollout_ref.rollout.top_k=-1 + actor_rollout_ref.rollout.temperature=1.0 + # Log Probability Inference + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${micro_batch_size} + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} + # Memory Management + actor_rollout_ref.rollout.gpu_memory_utilization=${gpu_memory_utilization} + # Parallelism Strategy + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} + actor_rollout_ref.rollout.data_parallel_size=${gen_dp} + actor_rollout_ref.rollout.expert_parallel_size=${gen_ep} + +actor_rollout_ref.rollout.engine_kwargs.sglang.enable_dp_attention=False + # Performance Optimization + +actor_rollout_ref.rollout.engine_kwargs.sglang.chunked_prefill_size=-1 + actor_rollout_ref.rollout.enforce_eager=False + # Validation Generation + actor_rollout_ref.rollout.val_kwargs.n=1 + actor_rollout_ref.rollout.val_kwargs.do_sample=True + actor_rollout_ref.rollout.val_kwargs.top_p=1.0 + actor_rollout_ref.rollout.val_kwargs.top_k=-1 + actor_rollout_ref.rollout.val_kwargs.temperature=1.0 +) + +TRAINER_CONFIG=( + # Logger Configuration + trainer.logger='["console"]' + # Project Settings + trainer.project_name="${project_name}" + trainer.experiment_name="${exp_name}" + # Hardware Configuration + trainer.nnodes="${NNODES}" + trainer.n_gpus_per_node="${NPUS_PER_NODE}" + trainer.device='npu' + # Training Schedule + trainer.total_epochs=1 + 
trainer.val_before_train=False + trainer.test_freq=-1 + trainer.save_freq=-1 + trainer.total_training_steps=1 + trainer.use_legacy_worker_impl=disable +) + +# profiling configuration +PROF_CONFIG=( + global_profiler.tool=npu + global_profiler.steps=null + global_profiler.save_path=/profpath + actor_rollout_ref.actor.profiler.enable=True + actor_rollout_ref.actor.profiler.ranks="[0]" + actor_rollout_ref.actor.profiler.all_ranks=False + actor_rollout_ref.actor.profiler.tool_config.npu.discrete=True + actor_rollout_ref.actor.profiler.tool_config.npu.contents=['npu','cpu'] + actor_rollout_ref.actor.profiler.tool_config.npu.level=level0 + actor_rollout_ref.actor.profiler.tool_config.npu.analysis=True + actor_rollout_ref.rollout.profiler.enable=True + actor_rollout_ref.rollout.profiler.ranks="[0]" + actor_rollout_ref.rollout.profiler.all_ranks=False +) + +python3 -m verl.trainer.main_ppo \ + --config-path=config \ + --config-name='ppo_trainer.yaml' \ + model_engine=mindspeed \ + "${DATA_CONFIG[@]}" \ + "${MODEL_CONFIG[@]}" \ + "${ACTOR_CONFIG[@]}" \ + "${REF_CONFIG[@]}" \ + "${ROLLOUT_CONFIG[@]}" \ + "${ALGORITHM_CONFIG[@]}" \ + "${TRAINER_CONFIG[@]}" \ + "${PROF_CONFIG[@]}" \ + "$@" + +# clean up +if [[ "$USE_DUMMY_MODEL" == "True" ]]; then + rm -rf $DUMMY_MODEL_PATH + if [[ "$USE_DIST_CKPT" == "True" ]]; then + rm -rf $DIST_CKPT_PATH + fi +fi \ No newline at end of file diff --git a/tests/special_npu/run_qwen3_8b_grpo_mindspeedllm.sh b/tests/special_npu/run_qwen3_8b_grpo_mindspeedllm.sh new file mode 100644 index 00000000000..59aff60b592 --- /dev/null +++ b/tests/special_npu/run_qwen3_8b_grpo_mindspeedllm.sh @@ -0,0 +1,223 @@ +set -x + +# Project Configuration +project_name='GRPO-Qwen3-8B-BASE-TEST' +exp_name='GRPO-Qwen3-8B-BASE-MindSpeedLLM-SGLang' + +# Necessary env +export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1 +export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050 +export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050 
+ +# Node Info +NNODES=${NNODES:-1} +NPUS_PER_NODE=${NPUS_PER_NODE:-8} + +# Model Weights Paths +MODEL_ID=${MODEL_ID:-Qwen/Qwen3-8B} +MODEL_PATH=${MODEL_PATH:-${HOME}/.cache/models/${MODEL_ID}} + +# File System Paths +TRAIN_FILE=$HOME/data/gsm8k/train.parquet +TEST_FILE=$HOME/data/gsm8k/test.parquet +# Data Length Configuration +max_prompt_length=$((512)) +max_response_length=$((128)) + +# Training Batch Configuration +train_prompt_bsz=16 +train_prompt_mini_bsz=16 +n_resp_per_prompt=2 +micro_batch_size=1 + +# Algorithm Configuration +adv_estimator=grpo +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=True +kl_loss_coef=0.001 + +# Performance and Memory Management Configuration +all_offload=True +use_dynamic_bsz=False +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length))) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length))) + +# Megatron Parallelism Configuration +train_tp=4 +train_pp=2 + +# SGLang Generation Configuration +gen_tp=4 +gen_dp=1 +gpu_memory_utilization=0.5 +max_model_len=$((max_prompt_length + max_response_length)) +max_num_batched_tokens=$(((max_prompt_length + max_response_length) * 1)) + +# Data Configuration +DATA_CONFIG=( + # File Paths + data.train_files="${TRAIN_FILE}" + data.val_files="${TEST_FILE}" + # Data Structure + data.prompt_key=prompt + # Batch and Length Configuration + data.train_batch_size=${train_prompt_bsz} + data.max_prompt_length=${max_prompt_length} + data.max_response_length=${max_response_length} + # Preprocessing + data.filter_overlong_prompts=True + data.truncation='left' +) + +# Model Configuration +MODEL_CONFIG=( + # Model Path + actor_rollout_ref.model.path="${MODEL_PATH}" + # Model Processing + actor_rollout_ref.model.use_remove_padding=True +) + +# Reinforcement Learning Algorithm Configuration +ALGORITHM_CONFIG=( + # Advantage Estimation + algorithm.adv_estimator=${adv_estimator} + # KL Divergence Control + algorithm.use_kl_in_reward=${use_kl_in_reward} + 
algorithm.kl_ctrl.kl_coef=${kl_coef} +) + +ACTOR_CONFIG=( + # Core Runtime Settings + actor_rollout_ref.actor.use_torch_compile=False + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} + # Loss Function Configuration + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} + actor_rollout_ref.actor.entropy_coeff=0 + # PPO Training Parameters + actor_rollout_ref.actor.ppo_epochs=1 + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${micro_batch_size} + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} + # Optimizer Settings + actor_rollout_ref.actor.optim.lr=1e-6 + # Megatron Parallelism Strategy + actor_rollout_ref.actor.mindspeed.tensor_model_parallel_size=${train_tp} + actor_rollout_ref.actor.mindspeed.pipeline_model_parallel_size=${train_pp} + # Memory Optimization + actor_rollout_ref.actor.mindspeed.param_offload=${all_offload} + actor_rollout_ref.actor.mindspeed.optimizer_offload=${all_offload} + actor_rollout_ref.actor.mindspeed.grad_offload=${all_offload} + # Model Weights Management + actor_rollout_ref.actor.mindspeed.use_mbridge=True + actor_rollout_ref.actor.mindspeed.vanilla_mbridge=True + # Transformer Architecture Optimizations + actor_rollout_ref.actor.mindspeed.llm_kwargs.spec='[mindspeed_llm.tasks.models.spec.qwen3_spec, layer_spec]' + actor_rollout_ref.actor.mindspeed.llm_kwargs.seq_length=${max_model_len} + actor_rollout_ref.actor.mindspeed.llm_kwargs.micro_batch_size=${micro_batch_size} + +actor_rollout_ref.actor.mindspeed.llm_kwargs.num_query_groups=8 + +actor_rollout_ref.actor.mindspeed.llm_kwargs.recompute_method=uniform + +actor_rollout_ref.actor.mindspeed.llm_kwargs.recompute_granularity=full + +actor_rollout_ref.actor.mindspeed.llm_kwargs.recompute_num_layers=1 + +actor_rollout_ref.actor.mindspeed.llm_kwargs.overlap_grad_reduce=True + 
+actor_rollout_ref.actor.mindspeed.llm_kwargs.overlap_param_gather=True +) + +REF_CONFIG=( + # Core Runtime Settings + actor_rollout_ref.ref.use_torch_compile=False + # Log Probability Inference + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${micro_batch_size} + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} + # Megatron Parallelism Strategy + actor_rollout_ref.ref.mindspeed.tensor_model_parallel_size=${train_tp} + actor_rollout_ref.ref.mindspeed.pipeline_model_parallel_size=${train_pp} + # Memory Optimization + actor_rollout_ref.ref.mindspeed.param_offload=${all_offload} + # Model Weights Management + actor_rollout_ref.ref.mindspeed.use_mbridge=True + actor_rollout_ref.ref.mindspeed.vanilla_mbridge=True +) + +ROLLOUT_CONFIG=( + # Rollout Engine + actor_rollout_ref.rollout.name=sglang + +actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend="ascend" + # Generation Parameters + actor_rollout_ref.rollout.n=${n_resp_per_prompt} + actor_rollout_ref.rollout.top_p=1.0 + actor_rollout_ref.rollout.top_k=-1 + actor_rollout_ref.rollout.temperature=1.0 + # Log Probability Inference + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${micro_batch_size} + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} + # Memory Management + actor_rollout_ref.rollout.gpu_memory_utilization=${gpu_memory_utilization} + # Parallelism Strategy + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} + actor_rollout_ref.rollout.data_parallel_size=${gen_dp} + +actor_rollout_ref.rollout.engine_kwargs.sglang.enable_dp_attention=False + # Performance Optimization + +actor_rollout_ref.rollout.engine_kwargs.sglang.chunked_prefill_size=-1 + actor_rollout_ref.rollout.enforce_eager=False + # Validation Generation + actor_rollout_ref.rollout.val_kwargs.n=1 + 
actor_rollout_ref.rollout.val_kwargs.do_sample=True + actor_rollout_ref.rollout.val_kwargs.top_p=1.0 + actor_rollout_ref.rollout.val_kwargs.top_k=-1 + actor_rollout_ref.rollout.val_kwargs.temperature=1.0 +) + +TRAINER_CONFIG=( + # Logger Configuration + trainer.logger='["console"]' + # Project Settings + trainer.project_name="${project_name}" + trainer.experiment_name="${exp_name}" + # Hardware Configuration + trainer.nnodes="${NNODES}" + trainer.n_gpus_per_node="${NPUS_PER_NODE}" + trainer.device='npu' + # Training Schedule + trainer.total_epochs=1 + trainer.val_before_train=False + trainer.test_freq=-1 + trainer.save_freq=-1 + trainer.total_training_steps=1 + trainer.use_legacy_worker_impl=disable +) + +# profiling configuration +PROF_CONFIG=( + global_profiler.tool=npu + global_profiler.steps=null + global_profiler.save_path=/profpath + actor_rollout_ref.actor.profiler.enable=True + actor_rollout_ref.actor.profiler.ranks="[0]" + actor_rollout_ref.actor.profiler.all_ranks=False + actor_rollout_ref.actor.profiler.tool_config.npu.discrete=True + actor_rollout_ref.actor.profiler.tool_config.npu.contents=['npu','cpu'] + actor_rollout_ref.actor.profiler.tool_config.npu.level=level0 + actor_rollout_ref.actor.profiler.tool_config.npu.analysis=True + actor_rollout_ref.rollout.profiler.enable=True + actor_rollout_ref.rollout.profiler.ranks="[0]" + actor_rollout_ref.rollout.profiler.all_ranks=False +) + +python3 -m verl.trainer.main_ppo \ + --config-path=config \ + --config-name='ppo_trainer.yaml' \ + model_engine=mindspeed \ + "${DATA_CONFIG[@]}" \ + "${MODEL_CONFIG[@]}" \ + "${ACTOR_CONFIG[@]}" \ + "${REF_CONFIG[@]}" \ + "${ROLLOUT_CONFIG[@]}" \ + "${ALGORITHM_CONFIG[@]}" \ + "${TRAINER_CONFIG[@]}" \ + "${PROF_CONFIG[@]}" \ + "$@" diff --git a/tests/special_sanity/check_device_api_usage.py b/tests/special_sanity/check_device_api_usage.py index afbc012f6dc..3f285acc8f5 100644 --- a/tests/special_sanity/check_device_api_usage.py +++ 
b/tests/special_sanity/check_device_api_usage.py @@ -41,6 +41,7 @@ "verl/workers/engine/base.py", # appear in default device_name "verl/workers/engine/utils.py", # appear in enable_full_determinism "verl/workers/engine/fsdp/transformer_impl.py", # appear in default device_name + "verl/workers/engine/fsdp/diffusers_impl.py", # appear in default device_name "verl/workers/engine/veomni/transformer_impl.py", # appear in default device_name "verl/workers/engine/torchtitan/transformer_impl.py", # appear in default device_name "verl/workers/engine/torchtitan/utils.py", # appear in torch.cuda.empty_cache() diff --git a/tests/special_sanity/check_license.py b/tests/special_sanity/check_license.py index c5c562f6dac..93e57f8e3d1 100644 --- a/tests/special_sanity/check_license.py +++ b/tests/special_sanity/check_license.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import subprocess from argparse import ArgumentParser from pathlib import Path from typing import Iterable @@ -28,6 +29,7 @@ license_head_facebook = "Copyright (c) 2016- Facebook, Inc" license_head_meituan = "Copyright 2025 Meituan Ltd. and/or its affiliates" license_head_huawei = "Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved." +license_head_huawei_26 = "Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved." license_head_nvidia = "Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved." 
license_headers = [ license_head_bytedance, @@ -42,21 +44,29 @@ license_head_facebook, license_head_meituan, license_head_huawei, + license_head_huawei_26, license_head_nvidia, ] -def get_py_files(path_arg: Path) -> Iterable[Path]: +def _git_tracked_py_files() -> set[Path]: + """Return the set of .py files tracked by git (respects .gitignore).""" + result = subprocess.run(["git", "ls-files", "*.py", "**/*.py"], capture_output=True, text=True, check=True) + return {Path(line) for line in result.stdout.splitlines() if line} + + +def get_py_files(path_arg: Path, tracked: set[Path]) -> Iterable[Path]: """get py files under a dir. if already py file return it Args: path_arg (Path): path to scan for py files + tracked (set[Path]): set of git-tracked py files Returns: Iterable[Path]: list of py files """ if path_arg.is_dir(): - return path_arg.glob("**/*.py") + return (p for p in tracked if p == path_arg or path_arg in p.parents) elif path_arg.is_file() and path_arg.suffix == ".py": return [path_arg] return [] @@ -74,8 +84,9 @@ def get_py_files(path_arg: Path) -> Iterable[Path]: ) args = parser.parse_args() - # Collect all Python files from specified directories - pathlist = set(path for path_arg in args.directories for path in get_py_files(path_arg)) + # Collect all Python files from specified directories (only git-tracked files) + tracked = _git_tracked_py_files() + pathlist = set(path for path_arg in args.directories for path in get_py_files(path_arg, tracked)) for path in pathlist: # because path is object not string diff --git a/tests/special_sanity/test_config_docs.py b/tests/special_sanity/test_config_docs.py index b8dc7476245..c84b98d15e9 100644 --- a/tests/special_sanity/test_config_docs.py +++ b/tests/special_sanity/test_config_docs.py @@ -67,6 +67,7 @@ def test_trainer_config_doc(): "verl/trainer/config/ref/ref.yaml", "verl/trainer/config/ref/dp_ref.yaml", "verl/trainer/config/rollout/rollout.yaml", + "verl/trainer/config/rollout/diffusion_rollout.yaml", ] 
success = True for yaml_to_inspect in yamls_to_inspect: diff --git a/tests/trainer/config/legacy_ppo_megatron_trainer.yaml b/tests/trainer/config/legacy_ppo_megatron_trainer.yaml deleted file mode 100644 index 3dd0b8a38d6..00000000000 --- a/tests/trainer/config/legacy_ppo_megatron_trainer.yaml +++ /dev/null @@ -1,471 +0,0 @@ -data: - tokenizer: null - train_files: ~/data/rlhf/gsm8k/train.parquet - val_files: ~/data/rlhf/gsm8k/test.parquet - train_max_samples: -1 # set to -1 to use full dataset - val_max_samples: -1 # set to -1 to use full dataset - prompt_key: prompt - reward_fn_key: data_source - max_prompt_length: 512 - max_response_length: 512 - train_batch_size: 1024 - val_batch_size: null # DEPRECATED: Validation datasets are sent to inference engines as a whole batch, which will schedule the memory themselves - return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs - return_raw_chat: True - return_full_prompt: False - shuffle: True - seed: null # An integer seed to use when shuffling the data. If not set or set to `null`, the data shuffling will not be seeded, resulting in a different data order on each run. - filter_overlong_prompts: False # for large-scale dataset, filtering overlong prompts could be timeconsuming. You cat set the filter_overlong_prompts_workers to use multiprocessing to speed up. 
- filter_overlong_prompts_workers: 1 - truncation: error - trust_remote_code: False # main_ppo will check this config to determine whether to use remote code for tokenizer - custom_cls: - path: null - name: null - sampler: - class_path: null - class_name: null - dataloader_num_workers: 8 - return_multi_modal_inputs: True - -actor_rollout_ref: - hybrid_engine: True - nccl_timeout: 600 # seconds, default is 10 minutes for torch, you can set it to a larger value if you have long-running operations like 32B or 72B model using megatron - model: - path: ~/models/deepseek-llm-7b-chat - custom_chat_template: null - external_lib: null - override_config: - model_config: {} - moe_config: - freeze_moe_router: False - enable_gradient_checkpointing: True - gradient_checkpointing_kwargs: - ## Activation Checkpointing - activations_checkpoint_method: null # 'uniform', 'block'; not used with 'selective' - # 'uniform' divides the total number of transformer layers and checkpoints the input activation of each chunk - # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity - activations_checkpoint_granularity: null # 'selective' or 'full' - # 'full' will checkpoint the entire transformer layer and 'selective' only checkpoints memory intensive part of attention - activations_checkpoint_num_layers: null # not used with 'selective' - trust_remote_code: False - actor: - strategy: megatron # This is for backward-compatibility - ppo_mini_batch_size: 256 - ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu - ppo_micro_batch_size_per_gpu: null - use_dynamic_bsz: False - ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length} - use_torch_compile: True # False to disable torch compile - # pg_losses2 = -advantages * torch.clamp(ratio, 1 - cliprange_low, 1 + cliprange_high) - clip_ratio: 0.2 # default value if clip_ratio_low and clip_ratio_high are not specified - clip_ratio_low: 0.2 
- clip_ratio_high: 0.2 - clip_ratio_c: 3.0 # lower bound of the value for Dual-clip PPO from https://arxiv.org/pdf/1912.09729 - loss_agg_mode: "token-mean" # / "seq-mean-token-sum" / "seq-mean-token-mean" / "seq-mean-token-sum-norm" - # NOTE: "token-mean" is the default behavior - loss_scale_factor: null # Scale factor for "seq-mean-token-sum-norm" mode. If null, uses response_length. - entropy_coeff: 0 - use_kl_loss: False # True for GRPO - kl_loss_coef: 0.001 # for grpo - kl_loss_type: low_var_kl # for grpo - ppo_epochs: 1 - data_loader_seed: 42 - shuffle: False - policy_loss: # policy loss config - loss_mode: "vanilla" # Loss function mode: vanilla / clip-cov / kl-cov / gpg from https://arxiv.org/abs/2505.22617, - clip_cov_ratio: 0.0002 # Ratio of tokens to be clipped for clip-cov loss - clip_cov_lb: 1.0 # Lower bound for clip-cov loss - clip_cov_ub: 5.0 # Upper bound for clip-cov loss - kl_cov_ratio: 0.0002 # Ratio of tokens to be applied kl penalty for kl-cov loss - ppo_kl_coef: 0.1 # KL divergence penalty coefficient - optim: - optimizer: adam - lr: 1e-6 - clip_grad: 1.0 - total_training_steps: -1 # must be override by program - lr_warmup_init: 0.0 # initial learning rate for warmup, default to 0.0 - lr_warmup_steps: null # Prioritized. None, 0 or Negative values mean delegating to lr_warmup_steps_ratio. - lr_warmup_steps_ratio: 0. 
# the total steps will be injected during runtime - lr_decay_steps: null - lr_decay_style: constant # select from constant/linear/cosine/inverse_square_root - min_lr: 0.0 # minimum learning rate, default to 0.0 - weight_decay: 0.01 - weight_decay_incr_style: constant # select from constant/linear/cosine - lr_wsd_decay_style: exponential # select from constant/exponential/cosine - lr_wsd_decay_steps: null - use_checkpoint_opt_param_scheduler: False # use checkpoint optimizer parameter scheduler - megatron: - param_offload: False - grad_offload: False - optimizer_offload: False - tensor_model_parallel_size: 1 - expert_model_parallel_size: 1 - expert_tensor_parallel_size: null - pipeline_model_parallel_size: 1 - virtual_pipeline_model_parallel_size: null # change VPP interface for parallelism tests - context_parallel_size: 1 - sequence_parallel: True - use_distributed_optimizer: True - use_dist_checkpointing: False - dist_checkpointing_path: null - seed: 42 - override_transformer_config: {} # additional transformer config like: num_layers_in_first(/last)_pipeline_stage - use_mbridge: True - vanilla_mbridge: True - profile: # profile the actor model in `update_policy` - use_profile: False # open it when you want to profile the actor model - profile_ranks: null # list, you can specify the ranks to profile - step_start: -1 # start step in update_policy - step_end: -1 # end step - save_path: null # the path to save the profile result - load_weight: True - checkpoint: - async_save: False # save checkpoint asynchronously - # What to include in saved checkpoints - # with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space - save_contents: ['model', 'optimizer', 'extra'] - # For more flexibility, you can specify the contents to load from the checkpoint. 
- load_contents: ${actor_rollout_ref.actor.checkpoint.save_contents} - ref: - strategy: ${actor_rollout_ref.actor.strategy} - use_torch_compile: ${actor_rollout_ref.actor.use_torch_compile} - megatron: - param_offload: False - tensor_model_parallel_size: 1 - expert_model_parallel_size: 1 - expert_tensor_parallel_size: null - pipeline_model_parallel_size: 1 - virtual_pipeline_model_parallel_size: null # change VPP interface for parallelism tests - context_parallel_size: 1 - sequence_parallel: True - use_distributed_optimizer: True - use_dist_checkpointing: False - dist_checkpointing_path: null - seed: ${actor_rollout_ref.actor.megatron.seed} - override_transformer_config: ${actor_rollout_ref.actor.megatron.override_transformer_config} - use_mbridge: ${actor_rollout_ref.actor.megatron.use_mbridge} - vanilla_mbridge: ${actor_rollout_ref.actor.megatron.vanilla_mbridge} - profile: - use_profile: False - profile_ranks: null - step_start: -1 - step_end: -1 - save_path: null - load_weight: True - log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu - log_prob_micro_batch_size_per_gpu: null - log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} - rollout: - name: vllm - mode: async # sync: LLM, async: AsyncLLM - temperature: 1.0 - top_k: -1 # 0 for hf rollout, -1 for vllm rollout - top_p: 1 - prompt_length: ${data.max_prompt_length} # for xperf_gpt - response_length: ${data.max_response_length} - # for vllm rollout - dtype: bfloat16 # should align with FSDP - gpu_memory_utilization: 0.5 - ignore_eos: False - enforce_eager: False - free_cache_engine: True - load_format: dummy - tensor_model_parallel_size: 2 - max_num_batched_tokens: 8192 - max_model_len: null - max_num_seqs: 1024 - log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu - log_prob_micro_batch_size_per_gpu: null - 
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} - disable_log_stats: True - enable_chunked_prefill: True # could get higher throughput - # for hf rollout - do_sample: True - layer_name_map: - qkv_layer_name: qkv - gate_proj_layer_name: gate_up - # number of responses (i.e. num sample times) - n: 1 - engine_kwargs: # inference engine parameters, please refer vllm/sglang official doc for detail - vllm: {} - sglang: {} - val_kwargs: - # sampling parameters for validation - top_k: -1 # 0 for hf rollout, -1 for vllm rollout - top_p: 1.0 - temperature: 0 - n: 1 - do_sample: False # default eager for validation - - # Multi-turn interaction config for tools or chat. - multi_turn: - # set to True for multi-turn tool interaction tasks; should set rollout.name to sglang as well - enable: False - - # null for no limit (default max_length // 3) - max_assistant_turns: null - - # null for no tool - tool_config_path: null - - # null for no limit (default max_length // 3) - max_user_turns: null - - # max parallel call for tools in single turn - max_parallel_calls: 1 - - # max length of tool response - max_tool_response_length: 256 - - # truncate side of tool response: left, middle, right - tool_response_truncate_side: middle - - # null for no interaction - interaction_config_path: null - - # - When set to True, the model's default chat template is used for multi-turn rollout, which typically matches production behavior. - # - When set to False, the token ids recorded for training are used instead; unlike the default chat template, these always include the model's full output, - # which may contain additional content such as reasoning content. This maintains the consistency between training and rollout, but it will lead to longer prompts. 
- use_inference_chat_template: False - - # Tokenization is performed turn by turn and the resulting token ids are concatenated to form the full conversation. - # To ensure this matches the result of tokenizing the entire conversation at once, a sanity check is run at the end of each multi-turn rollout to compare the two sets of token ids. - # Some models are known to produce different tokenization results when tokenizing turn by turn vs. all at once. aThis behavior has already been validated for them. - # To reduce excessive warnings, you can turn off the sanity check for these models if you are using their default chat template: - # Qwen/QwQ-32B, Qwen/Qwen3-xxB - # - disable: disable tokenization sanity check - # - strict: enable strict tokenization sanity check (default) - # - ignore_strippable: ignore strippable tokens when checking tokenization sanity - tokenization_sanity_check_mode: strict - - # Format of the multi-turn interaction. Options: hermes, llama3_json, ... - format: hermes - - # [Experimental] agent loop based rollout configs - agent: - - # Number of agent loop workers - num_workers: 8 - - custom_async_server: - path: null - name: null - - # support logging rollout prob for debugging purpose - calculate_log_probs: False - # Nsight system profiler configs - profiler: - # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs - _target_: verl.utils.profiler.ProfilerConfig - discrete: False - all_ranks: False - ranks: [] - -critic: - rollout_n: ${actor_rollout_ref.rollout.n} - strategy: ${actor_rollout_ref.actor.strategy} - nccl_timeout: 600 # seconds, default is 10 minutes for torch, you can set it to a larger value if you have long-running operations like 32B or 72B model using megatron - optim: - optimizer: adam - lr: 1e-6 - clip_grad: 1.0 - total_training_steps: -1 # must be override by program - lr_warmup_init: 0.0 # initial learning rate for warmup, default to 0.0 - lr_warmup_steps: null # Prioritized. 
None, 0 or Negative values mean delegating to lr_warmup_steps_ratio. - lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime - lr_decay_steps: null - lr_decay_style: constant # select from constant/linear/cosine/inverse_square_root - min_lr: 0.0 # minimum learning rate, default to 0.0 - weight_decay: 0.01 - weight_decay_incr_style: constant # select from constant/linear/cosine - lr_wsd_decay_style: exponential # select from constant/exponential/cosine - lr_wsd_decay_steps: null - use_checkpoint_opt_param_scheduler: False # use checkpoint optimizer parameter scheduler - model: - path: ~/models/deepseek-llm-7b-chat - tokenizer_path: ${actor_rollout_ref.model.path} - override_config: - model_config: {} - moe_config: - freeze_moe_router: False - external_lib: ${actor_rollout_ref.model.external_lib} - trust_remote_code: False - enable_gradient_checkpointing: True - gradient_checkpointing_kwargs: - ## Activation Checkpointing - activations_checkpoint_method: null - activations_checkpoint_granularity: null - activations_checkpoint_num_layers: null - megatron: - param_offload: False - grad_offload: False - optimizer_offload: False - tensor_model_parallel_size: 1 - expert_model_parallel_size: 1 - expert_tensor_parallel_size: null - pipeline_model_parallel_size: 1 - virtual_pipeline_model_parallel_size: null # change VPP interface for parallelism tests - context_parallel_size: 1 - sequence_parallel: True - use_distributed_optimizer: True - use_dist_checkpointing: False - dist_checkpointing_path: null - seed: ${actor_rollout_ref.actor.megatron.seed} - override_transformer_config: ${actor_rollout_ref.actor.megatron.override_transformer_config} - use_mbridge: ${actor_rollout_ref.actor.megatron.use_mbridge} - vanilla_mbridge: ${actor_rollout_ref.actor.megatron.vanilla_mbridge} - load_weight: True - ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} - ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu - 
ppo_micro_batch_size_per_gpu: null - use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 - forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} - ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} - data_loader_seed: ${actor_rollout_ref.actor.data_loader_seed} - shuffle: ${actor_rollout_ref.actor.shuffle} - cliprange_value: 0.5 - loss_agg_mode: ${actor_rollout_ref.actor.loss_agg_mode} - checkpoint: - async_save: False # save checkpoint asynchronously - # What to include in saved checkpoints - # with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space - save_contents: ['model', 'optimizer', 'extra'] - load_contents: ${critic.checkpoint.save_contents} - # Nsight system profiler configs - profiler: - # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs - _target_: verl.utils.profiler.ProfilerConfig - discrete: False - all_ranks: False - ranks: [] -reward_model: - enable: False - strategy: ${actor_rollout_ref.actor.strategy} - nccl_timeout: 600 # seconds, default is 10 minutes for torch, you can set it to a larger value if you have long-running operations like 32B or 72B model using megatron - megatron: - param_offload: False - tensor_model_parallel_size: 1 - expert_model_parallel_size: 1 - expert_tensor_parallel_size: null - pipeline_model_parallel_size: 1 - virtual_pipeline_model_parallel_size: null # change VPP interface for parallelism tests - context_parallel_size: 1 - sequence_parallel: True - use_distributed_optimizer: False - use_dist_checkpointing: False - dist_checkpointing_path: null - seed: ${actor_rollout_ref.actor.megatron.seed} - override_transformer_config: {} - use_mbridge: ${actor_rollout_ref.actor.megatron.use_mbridge} - vanilla_mbridge: ${actor_rollout_ref.actor.megatron.vanilla_mbridge} - model: - input_tokenizer: ${actor_rollout_ref.model.path} # 
set this to null if the chat template is identical - path: ~/models/FsfairX-LLaMA3-RM-v0.1 - trust_remote_code: False - external_lib: ${actor_rollout_ref.model.external_lib} - load_weight: True - micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu - micro_batch_size_per_gpu: null - use_dynamic_bsz: ${critic.use_dynamic_bsz} - forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} - max_length: null - reward_manager: naive - launch_reward_fn_async: False # custom reward function executed async on CPU, during log_prob - sandbox_fusion: - url: null # faas url to run code in cloud sandbox - max_concurrent: 64 # max concurrent requests to sandbox - memory_limit_mb: 1024 # Max memory limit for each sandbox process in MB - # Nsight system profiler configs - profiler: - # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs - _target_: verl.utils.profiler.ProfilerConfig - discrete: False - all_ranks: False - ranks: [] - -custom_reward_function: - path: null - name: compute_score - -algorithm: - # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs - _target_: verl.trainer.config.AlgoConfig - gamma: 1.0 - lam: 1.0 - adv_estimator: gae - norm_adv_by_std_in_grpo: True - use_kl_in_reward: False - kl_penalty: kl # how to estimate kl divergence - kl_ctrl: - # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs - _target_: verl.trainer.config.KLControlConfig - type: fixed - kl_coef: 0.001 - horizon: 10000 - target_kl: 0.1 - use_pf_ppo: False - pf_ppo: - reweight_method: pow # ["pow", "max_min", "max_random"] - weight_pow: 2.0 - -trainer: - balance_batch: True - total_epochs: 30 - total_training_steps: null - profile_steps: null # [1,2,5] or [] or null - project_name: verl_examples - experiment_name: gsm8k - logger: ['console', 'wandb'] - log_val_generations: 0 - nnodes: 1 - n_gpus_per_node: 8 - save_freq: -1 - esi_redundant_time: 0 - - # 
auto: find the last ckpt to resume. If can't find, start from scratch - resume_mode: auto # or disable or resume_path if resume_from_path is set - resume_from_path: null - del_local_ckpt_after_load: False - val_before_train: True - test_freq: -1 - critic_warmup: 0 - default_hdfs_dir: null - default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} - max_actor_ckpt_to_keep: null - max_critic_ckpt_to_keep: null - # The timeout for ray worker group to wait for the register center to be ready - ray_wait_register_center_timeout: 300 - device: cuda - # see ppo_trainer.yaml for more details - controller_nsight_options: - trace: "cuda,nvtx,cublas,ucx" - cuda-memory-usage: "true" - cuda-graph-trace: "graph" - worker_nsight_options: - trace: "cuda,nvtx,cublas,ucx" - cuda-memory-usage: "true" - cuda-graph-trace: "graph" - capture-range: "cudaProfilerApi" - capture-range-end: null - kill: none - npu_profile: - options: - save_path: ./profiler_data - roles: ["all"] - level: level0 - with_memory: False - record_shapes: False - with_npu: True - with_cpu: True - with_module: False - with_stack: False - analysis: True - -ray_kwargs: - ray_init: - num_cpus: null # `None` means using all CPUs, which might cause hang if limited in systems like SLURM. Please set to a number allowed then. - timeline_json_file: null diff --git a/tests/trainer/config/legacy_ppo_trainer.yaml b/tests/trainer/config/legacy_ppo_trainer.yaml deleted file mode 100644 index 25919bd15d9..00000000000 --- a/tests/trainer/config/legacy_ppo_trainer.yaml +++ /dev/null @@ -1,1126 +0,0 @@ -# Format checks enforced on CI: -# 1. Comments must appear above each field. -# 2. There must be a blank line between each field. -# 3. Inline comments (after a field on the same line) are not allowed. -# 4. Indentation level is respected for nested fields. - -# dataset config -data: - - # Tokenizer class or path. If null, it will be inferred from the model. 
- tokenizer: null - - # Whether to use shared memory for data loading. - use_shm: False - - # Training set parquet. Can be a list or a single file. - # The program will read all files into memory, so it can't be too large (< 100GB). - # The path can be either a local path or an HDFS path. - # For HDFS path, we provide utils to download it to DRAM and convert it to a local path. - train_files: ~/data/rlhf/gsm8k/train.parquet - - # Validation parquet. Can be a list or a single file. - val_files: ~/data/rlhf/gsm8k/test.parquet - - # Maximum sample length to be used. - # Set to -1 to use full dataset, otherwise, randomly - # select the specified number of samples from train dataset - train_max_samples: -1 - - # Maximum sample length to be used. - # Set to -1 to use full dataset, otherwise, randomly - # select the specified number of samples from val dataset - val_max_samples: -1 - - # The field in the dataset where the prompt is located. Default is 'prompt'. - prompt_key: prompt - - # The field used to select the reward function (if using different ones per example). - reward_fn_key: data_source - - # Maximum prompt length. All prompts will be left-padded to this length. - # An error will be reported if the length is too long. - max_prompt_length: 512 - - # Maximum response length. Rollout in RL algorithms (e.g. PPO) generates up to this length. - max_response_length: 512 - - # Batch size sampled for one training iteration of different RL algorithms. - train_batch_size: 1024 - - # Batch size used during validation. Can be null. - val_batch_size: null - - # Whether to return the original input_ids without adding chat template. - # This is used when the reward model's chat template differs from the policy. - # If using a model-based RM with different templates, this should be True. - return_raw_input_ids: False - - # Whether to return the original chat (prompt) without applying chat template. 
- return_raw_chat: True - - # Whether to return the full prompt with chat template. - return_full_prompt: False - - # Whether to shuffle the data in the dataloader. - shuffle: True - - # An integer seed to use when shuffling the data. If not set or set to - # `null`, the data shuffling will not be seeded, resulting in a different data order on each run. - seed: null - - # num dataloader workers - dataloader_num_workers: 8 - - # Whether to shuffle the validation set. - validation_shuffle: False - - # Whether to filter overlong prompts. - filter_overlong_prompts: False - - # Number of workers for filtering overlong prompts. - # For large-scale datasets, filtering can be time-consuming. - # Use multiprocessing to speed up. Default is 1. - filter_overlong_prompts_workers: 1 - - # Truncate the input_ids or prompt if they exceed max_prompt_length. - # Options: 'error', 'left', or 'right'. Default is 'error'. - truncation: error - - # The field in the multi-modal dataset where the image is located. Default is 'images'. - image_key: images - - # The field in the multi-modal dataset where the video is located. - video_key: videos - - # If the remote tokenizer has a Python file, this flag determines whether to allow using it. - trust_remote_code: False - - # Optional: specify a custom dataset class path and name if overriding default loading behavior. - custom_cls: - - # The path to the file containing your customized dataset class. If not specified, pre-implemented dataset will be used. - path: null - - # The name of the dataset class within the specified file. - name: null - - # Whether to return multi-modal inputs in the dataset. Set to False if rollout generates new multi-modal inputs. - return_multi_modal_inputs: True - - # Data generation configuration for augmenting the dataset. - datagen: - - # The path to the file containing your customized data generation class. - # E.g. 
'pkg://verl.experimental.dynamic_dataset.dynamicgen_dataset' - path: null - - # The class name of the data generation class within the specified file. - # E.g. 'MockDataGenerator' - name: null - - # settings related to data sampler - sampler: - - # the path to the module containing a curriculum class which implements the - # AbstractSampler interface - class_path: null - - # the name of the curriculum class like `MySampler` - class_name: null - - # Additional kwargs when calling tokenizer.apply_chat_template - apply_chat_template_kwargs: {} - -# config for actor, rollout and reference model -actor_rollout_ref: - - # Whether it's a hybrid engine, currently only supports hybrid engine - hybrid_engine: true - - # common configs for the model - model: - - _target_: verl.workers.config.HFModelConfig - - # Huggingface model path. This can be either local path or HDFS path. - path: ~/models/deepseek-llm-7b-chat - - # Custom chat template for the model. - custom_chat_template: null - - # Whether to use shared memory (SHM) for accelerating the loading of model weights - use_shm: false - - # Additional Python packages to register huggingface models/tokenizers. - external_lib: null - - # Used to override model's original configurations, mainly dropout - override_config: {} - - # Enable gradient checkpointing for actor - enable_gradient_checkpointing: true - - # Enable activation offloading for actor - enable_activation_offload: false - - # Whether to remove padding tokens in inputs during training - use_remove_padding: true - - # Set to positive value to enable LoRA (e.g., 32) - lora_rank: 0 - - # LoRA scaling factor - lora_alpha: 16 - - # Target modules to apply LoRA. Options: "all-linear" (not recommended for VLMs) or - # [q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj] - target_modules: all-linear - - # Exclude modules from applying Lora. Similar usage to target_modules and Peft. 
- # Example: '.*visual.*' for excluding the ViT in Qwen2.5-VL, as currently vllm does not support ViT Lora. - exclude_modules: null - - # Whether to use Liger for linear layer fusion - use_liger: false - - # Whether to use custom fused kernels (e.g., FlashAttention, fused MLP) - use_fused_kernels: false - - # Options for fused kernels. If use_fused_kernels is true, this will be used. - fused_kernel_options: - - # Implementation backend for fused kernels. Options: "triton" or "torch". - impl_backend: torch - - # Whether to enable loading a remote code model - trust_remote_code: false - - # actor configs - actor: - - # fsdp, fsdp2 or megatron. fsdp backend used here. - strategy: fsdp - - # Split each sample into sub-batches of this size for PPO - ppo_mini_batch_size: 256 - - # [Deprecated] Global micro batch size - ppo_micro_batch_size: null - - # Local per-GPU micro batch size - ppo_micro_batch_size_per_gpu: null - - # Whether to automatically adjust batch size at runtime - use_dynamic_bsz: false - - # Max tokens per GPU in one PPO batch; affects gradient accumulation - # Typically it should be: n * ${data.max_prompt_length} + ${data.max_response_length} - ppo_max_token_len_per_gpu: 16384 - - # Gradient clipping for actor updates - grad_clip: 1.0 - - # PPO clip ratio - clip_ratio: 0.2 - - # Lower bound for asymmetric clipping (used in dual-clip PPO) - clip_ratio_low: 0.2 - - # Upper bound for asymmetric clipping (used in dual-clip PPO) - clip_ratio_high: 0.2 - - # policy loss config - policy_loss: - - # Loss function mode: vanilla / clip-cov / kl-cov /gpg from https://arxiv.org/abs/2505.22617 - loss_mode: "vanilla" - - # Ratio of tokens to be clipped for clip-cov loss - clip_cov_ratio: 0.0002 - - # Lower bound for clip-cov loss - clip_cov_lb: 1.0 - - # Upper bound for clip-cov loss - clip_cov_ub: 5.0 - - # Ratio of tokens to be applied kl penalty for kl-cov loss - kl_cov_ratio: 0.0002 - - # KL divergence penalty coefficient - ppo_kl_coef: 0.1 - - # Constant C in 
Dual-clip PPO; clips when advantage < 0 and ratio > C - clip_ratio_c: 3.0 - - # Loss aggregation mode: "token-mean", "seq-mean-token-sum", "seq-mean-token-mean", or "seq-mean-token-sum-norm" - loss_agg_mode: token-mean - - # Scale factor for "seq-mean-token-sum-norm" loss aggregation mode. - # If null, uses response_length. Set to a constant to ensure consistent normalization. - loss_scale_factor: null - - # Entropy regularization coefficient in PPO loss - entropy_coeff: 0 - - # Whether to use KL loss instead of KL reward penalty. True for GRPO - use_kl_loss: false - - # Whether to use torch.compile() - use_torch_compile: true - - # KL loss coefficient when use_kl_loss is enabled. For GRPO - kl_loss_coef: 0.001 - - # Type of KL divergence loss. Options: "kl"(k1), "abs", "mse"(k2), "low_var_kl"(k3), "full" - kl_loss_type: low_var_kl - - # Number of PPO epochs per batch - ppo_epochs: 1 - - # Shuffle training data across PPO epochs - shuffle: false - - # Sequence parallelism size for Ulysses-style model parallelism - ulysses_sequence_parallel_size: 1 - - # calculate entropy with chunking to reduce memory peak - entropy_from_logits_with_chunking: False - - # recompute entropy - entropy_checkpointing: False - - # checkpoint configs - checkpoint: - - # What to include in saved checkpoints - # with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space - save_contents: ['model', 'optimizer', 'extra'] - - # For more flexibility, you can specify the contents to load from the checkpoint. 
- load_contents: ${actor_rollout_ref.actor.checkpoint.save_contents} - - # optimizer configs - optim: - - # Learning rate - lr: 1e-6 - - # Warmup steps; negative value delegates to lr_warmup_steps_ratio - lr_warmup_steps: -1 - - # Warmup steps ratio (used if lr_warmup_steps is negative) - lr_warmup_steps_ratio: 0.0 - - # Minimum LR ratio for cosine schedule - min_lr_ratio: 0.0 - - # Number of cosine cycles in LR schedule - num_cycles: 0.5 - - # LR scheduler type: "constant" or "cosine" - lr_scheduler_type: constant - - # Total training steps (must be overridden at runtime) - total_training_steps: -1 - - # Weight decay - weight_decay: 0.01 - - # configs for FSDP - fsdp_config: - - # policy for wrapping the model - wrap_policy: - - # Minimum number of parameters to trigger wrapping a layer with FSDP - min_num_params: 0 - - # Whether to offload model parameters to CPU (trades speed for memory) - param_offload: false - - # Whether to offload optimizer state to CPU - optimizer_offload: false - - # Only for FSDP2: offload param/grad/optimizer during train - offload_policy: false - - # Only for FSDP2: Reshard after forward pass to reduce memory footprint - reshard_after_forward: true - - # Number of GPUs in each FSDP shard group; -1 means auto - fsdp_size: -1 - - # Only for FSDP1: FSDP1 configuration, prefetch the next forward-pass all-gather - # before the current forward computation. - forward_prefetch: False - - # Reference model config. - # Reference model will be enabled when actor.use_kl_loss or/and algorithm.use_kl_in_reward is/are True. - ref: - - # actor_rollout_ref.ref: FSDP config same as actor. For models larger than 7B, it’s recommended to turn on offload for ref by default - strategy: ${actor_rollout_ref.actor.strategy} - - # config for FSDP strategy - fsdp_config: - - # whether to offload parameters in FSDP - param_offload: False - - # whether to perform reshard after model forward to save memory. 
- # only for fsdp2, [True, False, int between 1 and fsdp_size] - reshard_after_forward: True - - # Only for FSDP1: FSDP1 configuration, prefetch the next forward-pass all-gather - # before the current forward computation. - forward_prefetch: False - - # the wrap policy for FSDP model - wrap_policy: - - # minimum number of params in a wrapped module - min_num_params: 0 - - # whether to enable torch.compile - use_torch_compile: ${actor_rollout_ref.actor.use_torch_compile} - - # [Will be deprecated, use log_prob_micro_batch_size_per_gpu] - # The batch size for one forward pass in the computation of log_prob. Global batch size. - log_prob_micro_batch_size: null - - # The batch size for one forward pass in the computation of log_prob. Local batch size per GPU. - log_prob_micro_batch_size_per_gpu: null - - # enable dynamic batch size (sequence packing) for log_prob computation - log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - - # the max token length per GPU - log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} - - # sequence parallel size - ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} - - # calculate entropy with chunking to reduce memory peak - entropy_from_logits_with_chunking: False - - # recompute entropy - entropy_checkpointing: False - - # Rollout model config. - rollout: - - # actor_rollout_ref.rollout.name: hf/vllm/sglang. - name: vllm - - # sync: LLM, async: AsyncLLM - mode: async - - # Sampling temperature for rollout. - temperature: 1.0 - - # Top-k sampling parameter. -1 for vLLM rollout, 0 for HF rollout. - top_k: -1 - - # Top-p sampling parameter. Default 1.0. - top_p: 1 - - - # typically the same as data max prompt length - prompt_length: ${data.max_prompt_length} - - # typically the same as data max response length - response_length: ${data.max_response_length} - - # for vllm rollout - # Rollout model parameters type. 
Align with actor model's FSDP/Megatron type. - dtype: bfloat16 - - # Fraction of GPU memory used by vLLM/SGLang for KV cache. - gpu_memory_utilization: 0.5 - - # Whether to ignore EOS and continue generating after EOS is hit. - ignore_eos: False - - # Whether to disable CUDA graph. Default True to allow cache freeing. - enforce_eager: False - - # Whether to free engine KVCache after generation. Set enforce_eager=True when enabled. - free_cache_engine: True - - # Which loader to use for rollout model weights: dummy_dtensor, hf, megatron, etc. - # safetensors (for huge model, and set use_shm=True); dummy_dtensor: randomly init model weight - load_format: dummy - - # for huge model, layered summon can save memory (prevent OOM) but make it slower - layered_summon: False - - # TP size for rollout. Only effective for vLLM. - tensor_model_parallel_size: 2 - - # max number of tokens in a batch - max_num_batched_tokens: 8192 - - # max length for rollout - max_model_len: null - - # max length of sequences - max_num_seqs: 1024 - - # [Will be deprecated, use log_prob_micro_batch_size_per_gpu] The batch size for one forward pass in the computation of log_prob. Global batch size. - log_prob_micro_batch_size: null - - # The batch size for one forward pass in the computation of log_prob. Local batch size per GPU. - log_prob_micro_batch_size_per_gpu: null - - # enable dynamic batch size (sequence packing) for log_prob computation - log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - - # max token length for log_prob computation - log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} - - # disable logging statistics - disable_log_stats: True - - # may get higher throughput when set to True. When activated, Please increase max_num_batched_tokens or decrease max_model_len. - enable_chunked_prefill: True - - # for hf rollout - # Whether to sample during training rollout. False uses greedy sampling. 
- do_sample: True - - # number of responses (i.e. num sample times). > 1 for grpo - n: 1 - - # Whether to wake up inference engine in multi-stage to reduce peak memory during training-rollout transition. - multi_stage_wake_up: false - - # Extra inference engine arguments, please refer vllm/sglang official doc for detail - engine_kwargs: - - # vllm engine config - vllm: {} - - # sglang engine config - sglang: {} - - # Sampling parameters used during validation. - val_kwargs: - - # sampling parameters for validation - # Top-k sampling parameter. -1 for vLLM rollout, 0 for HF rollout. - top_k: -1 - - # Top-p sampling parameter. Default 1.0. - top_p: 1.0 - - # Sampling temperature for rollout. - temperature: 0 - - # whether to repeat n times for validation - n: 1 - - # Whether to sample during training rollout. False uses greedy sampling. - do_sample: False - - # Multi-turn interaction config for tools or chat. - multi_turn: - - # set to True for multi-turn tool interaction tasks; should set rollout.name to sglang as well - enable: False - - # null for no limit (default max_length // 3) - max_assistant_turns: null - - # null for no tool - tool_config_path: null - - # null for no limit (default max_length // 3) - max_user_turns: null - - # max parallel call for tools in single turn - max_parallel_calls: 1 - - # max length of tool response - max_tool_response_length: 256 - - # truncate side of tool response: left, middle, right - tool_response_truncate_side: middle - - # null for no interaction - interaction_config_path: null - - # - When set to True, the model's default chat template is used for multi-turn rollout, which typically matches production behavior. - # - When set to False, the token ids recorded for training are used instead; unlike the default chat template, these always include the model's full output, - # which may contain additional content such as reasoning content. 
This maintains the consistency between training and rollout, but it will lead to longer prompts. - use_inference_chat_template: False - - # Tokenization is performed turn by turn and the resulting token ids are concatenated to form the full conversation. - # To ensure this matches the result of tokenizing the entire conversation at once, a sanity check is run at the end of each multi-turn rollout to compare the two sets of token ids. - # Some models are known to produce different tokenization results when tokenizing turn by turn vs. all at once. This behavior has already been validated for them. - # To reduce excessive warnings, you can turn off the sanity check for these models if you are using their default chat template: - # Qwen/QwQ-32B, Qwen/Qwen3-xxB - # - disable: disable tokenization sanity check - # - strict: enable strict tokenization sanity check (default) - # - ignore_strippable: ignore strippable tokens when checking tokenization sanity - tokenization_sanity_check_mode: strict - - # Format of the multi-turn interaction. Options: hermes, llama3_json, ... - format: hermes - - # support logging rollout prob for debugging purpose - calculate_log_probs: False - - # [Experimental] agent loop based rollout configs - agent: - - # Number of agent loop workers - num_workers: 8 - - # custom async server configs - custom_async_server: - - # Path to the custom async server implementation - path: null - - # Class name of the custom async server class (e.g. AsyncvLLMServer) - name: null - - # profiler configs - profiler: - - # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs - _target_: verl.utils.profiler.ProfilerConfig - - # True for each task has its own database, False for all tasks in one training step share one database. - discrete: False - - # Whether to profile all ranks. - all_ranks: False - - # The ranks that will be profiled. [] or [0,1,...] 
- ranks: [] - -# configs for the critic -critic: - - # Number of rollouts per update (mirrors actor rollout_n) - rollout_n: ${actor_rollout_ref.rollout.n} - - # fsdp or fsdp2 strategy used for critic model training - strategy: ${actor_rollout_ref.actor.strategy} - - # optimizer configs - optim: - - # Learning rate - lr: 1e-5 - - # Warmup steps ratio; total steps will be injected at runtime - lr_warmup_steps_ratio: 0. - - # Minimum LR ratio for cosine schedule - min_lr_ratio: 0.0 - - # LR scheduler type: "constant" or "cosine" - lr_scheduler_type: constant - - # Total training steps (must be overridden at runtime) - total_training_steps: -1 - - # Weight decay - weight_decay: 0.01 - - # model config for the critic - model: - - # Path to pretrained model weights - path: ~/models/deepseek-llm-7b-chat - - # Whether to use shared memory for loading the model - use_shm: False - - # Tokenizer path (defaults to actor's model path) - tokenizer_path: ${actor_rollout_ref.model.path} - - # Hugging Face config override - override_config: { } - - # External model implementation (optional) - external_lib: ${actor_rollout_ref.model.external_lib} - - # Enable gradient checkpointing to save memory - enable_gradient_checkpointing: True - - # Offload activations to CPU to reduce GPU memory usage - enable_activation_offload: False - - # Use remove padding optimization (saves compute) - use_remove_padding: False - - # Whether to trust remote code from Hugging Face models - trust_remote_code: ${actor_rollout_ref.model.trust_remote_code} - - # FSDP-specific config - fsdp_config: - - # Whether to offload model parameters to CPU - param_offload: False - - # Whether to offload optimizer state to CPU - optimizer_offload: False - - # Only for FSDP2: offload param/grad/optimizer during train - offload_policy: False - - # Only for FSDP2: Reshard after forward pass to reduce memory footprint - reshard_after_forward: True - - # Policy for wrapping layers with FSDP - wrap_policy: - - # Minimum 
number of parameters to trigger wrapping - min_num_params: 0 - - # Number of GPUs in each FSDP shard group; -1 means auto - fsdp_size: -1 - - # Only for FSDP1: FSDP1 configuration, prefetch the next forward-pass all-gather - # before the current forward computation. - forward_prefetch: False - - # Set to positive value to enable LoRA (e.g., 32) - lora_rank: 0 - - # LoRA scaling factor - lora_alpha: 16 - - # LoRA target modules: "all-linear" or list of linear projection layers - target_modules: all-linear - - # PPO mini-batch size per update - ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} - - # [Deprecated] Global micro batch size - ppo_micro_batch_size: null - - # Local per-GPU micro batch size - ppo_micro_batch_size_per_gpu: null - - # Forward-only batch size (global) - forward_micro_batch_size: ${critic.ppo_micro_batch_size} - - # Forward-only batch size (per GPU) - forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu} - - # Whether to automatically adjust batch size at runtime - use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} - - # Max tokens per GPU in one PPO batch (doubled for critic) - ppo_max_token_len_per_gpu: 32768 - - # Max token length per GPU in forward pass - forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} - - # Sequence parallelism size for Ulysses-style model parallelism - ulysses_sequence_parallel_size: 1 - - # Number of PPO epochs per batch - ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} - - # Shuffle training data across PPO epochs - shuffle: ${actor_rollout_ref.actor.shuffle} - - # Gradient clipping for critic updates - grad_clip: 1.0 - - # PPO value function clipping range - cliprange_value: 0.5 - - # Loss aggregation mode: "token-mean", "seq-mean-token-sum", or "seq-mean-token-mean" - loss_agg_mode: ${actor_rollout_ref.actor.loss_agg_mode} - - # checkpoint configs - checkpoint: - - # What to include in saved checkpoints - # with 'hf_model' you can save whole model as 
hf format, now only use sharded model checkpoint to save space - save_contents: ['model', 'optimizer', 'extra'] - - # What to include when loading checkpoints - load_contents: ${critic.checkpoint.save_contents} - - # profiler configs - # the corresponding dataclass is verl.utils.profiler.ProfilerConfig. - profiler: - - # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs - _target_: verl.utils.profiler.ProfilerConfig - - # True for each task has its own database, False for all tasks in one training step share one database. - discrete: False - - # Whether to profile all ranks. - all_ranks: False - - # The ranks that will be profiled. [] or [0,1,...] - ranks: [] - -# configs for the reward model -reward_model: - - # Whether to enable reward model. If False, we compute the reward only with the user-defined reward functions. - # In GSM8K and Math examples, we disable reward model. - # For RLHF alignment example using full_hh_rlhf, we utilize reward model to assess the responses. - # If False, the following parameters are not effective - enable: False - - # FSDP strategy: "fsdp" or "fsdp2" - strategy: ${actor_rollout_ref.actor.strategy} - - # model config for reward scoring - model: - - # Input tokenizer. If the reward model’s chat template is inconsistent with the policy, - # we need to first decode to plaintext, then apply the rm’s chat_template. - # Then score with RM. If chat_templates are consistent, it can be set to null. - input_tokenizer: ${actor_rollout_ref.model.path} - - # RM’s HDFS path or local path. Note that RM only supports AutoModelForSequenceClassification. - # Other model types need to define their own RewardModelWorker and pass it from the code. 
- path: ~/models/FsfairX-LLaMA3-RM-v0.1 - - # Whether to use shared memory for loading the model - use_shm: False - - # External model implementation (optional) - external_lib: ${actor_rollout_ref.model.external_lib} - - # Use remove padding optimization (saves compute) - use_remove_padding: False - - # Whether to use fused reward kernels for speedup - use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} - - # Whether to enable loading a remote code model, default to False - trust_remote_code: False - - # FSDP-specific config - fsdp_config: - - # Policy for wrapping layers with FSDP - wrap_policy: - - # Minimum number of parameters to trigger wrapping - min_num_params: 0 - - # Whether to offload model parameters to CPU - param_offload: False - - # Only for FSDP2: Reshard after forward pass to reduce memory footprint - reshard_after_forward: True - - # Number of GPUs in each FSDP shard group; -1 means auto - fsdp_size: -1 - - # Only for FSDP1: FSDP1 configuration, prefetch the next forward-pass all-gather - # before the current forward computation. - forward_prefetch: False - - # [Deprecated] Global micro batch size - micro_batch_size: null - - # Local per-GPU micro batch size - micro_batch_size_per_gpu: null - - # Maximum sequence length to process for scoring - max_length: null - - # Sequence parallelism size for Ulysses-style model parallelism - ulysses_sequence_parallel_size: 1 - - # Whether to dynamically adjust batch size at runtime - use_dynamic_bsz: ${critic.use_dynamic_bsz} - - # Maximum number of tokens per GPU in one forward pass - forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} - - # Reward Manager. This defines the mechanism of computing rule-based reward and handling different reward sources. - # Default is naive. If all verification functions are multiprocessing-safe, - # the reward manager can be set to prime for parallel verification. 
- reward_manager: naive - - # Whether to launch custom reward function asynchronously during log_prob - launch_reward_fn_async: False - - # Cloud/local sandbox fusion configuration for custom reward logic - sandbox_fusion: - - # Cloud/local function URL for sandbox execution - url: null - - # Max concurrent requests allowed to sandbox - max_concurrent: 64 - - # Max memory limit for each sandbox process in MB - memory_limit_mb: 1024 - - # profiler configs - profiler: - - # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs - _target_: verl.utils.profiler.ProfilerConfig - - # True for each task has its own database, False for all tasks in one training step share one database. - discrete: False - - # Whether to profile all ranks. - all_ranks: False - - # The ranks that will be profiled. [] or [0,1,...] - ranks: [] - -# custom reward function definition -custom_reward_function: - - # The path to the file containing your customized reward function. - # If not specified, pre-implemented reward functions will be used. - path: null - - # The name of the reward function within the specified file. Default is 'compute_score'. - name: compute_score - -# config for the algorithm -algorithm: - - # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs - _target_: verl.trainer.config.AlgoConfig - - # Discount factor for future rewards - gamma: 1.0 - - # Trade-off between bias and variance in the GAE estimator - lam: 1.0 - - # Advantage estimator type: "gae", "grpo", "reinforce_plus_plus", etc. 
- adv_estimator: gae - - # Whether to normalize advantages by std (specific to GRPO) - norm_adv_by_std_in_grpo: True - - # Whether to enable in-reward KL penalty - use_kl_in_reward: False - - # How to estimate KL divergence: "kl", "abs", "mse", "low_var_kl", or "full" - kl_penalty: kl - - # KL control configuration - kl_ctrl: - - # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs - _target_: verl.trainer.config.KLControlConfig - - # KL control type: "fixed" or "adaptive" - type: fixed - - # Initial coefficient for KL penalty - kl_coef: 0.001 - - # Horizon value for adaptive controller (if enabled) - horizon: 10000 - - # Target KL divergence (used for adaptive controller) - target_kl: 0.1 - - # Whether to enable preference feedback PPO - use_pf_ppo: False - - # Preference feedback PPO settings - pf_ppo: - - # Method for reweighting samples: "pow", "max_min", or "max_random" - reweight_method: pow - - # Power used for weight scaling in "pow" method - weight_pow: 2.0 - -# config for the trainer -trainer: - - # Whether to balance batch sizes across distributed workers - balance_batch: True - - # Number of epochs in training - total_epochs: 30 - - # Total training steps (can be set explicitly or derived from epochs) - total_training_steps: null - - # The steps that will be profiled. null means no profiling. null or [1,2,5,...] - profile_steps: null - - # controller Nvidia Nsight Systems Options. Must set when profile_steps is not None. - ## reference https://docs.nvidia.com/nsight-systems/UserGuide/index.html - ## reference https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html - controller_nsight_options: - - # Select the API(s) to be traced. - trace: "cuda,nvtx,cublas,ucx" - - # Track the GPU memory usage by CUDA kernels. Must be string type "true" or "false". - cuda-memory-usage: "true" - - # CUDA graphs will be traced as a whole - cuda-graph-trace: "graph" - - # worker Nvidia Nsight Systems Options. 
Must set when profile_steps is not None. - worker_nsight_options: - - # Select the API(s) to be traced. - trace: "cuda,nvtx,cublas,ucx" - - # Track the GPU memory usage by CUDA kernels. Must be string type "true" or "false". - cuda-memory-usage: "true" - - # CUDA graphs will be traced as a whole - cuda-graph-trace: "graph" - - # Profiling only in a range of torch.cuda.profiler.start and stop. Do not change this config. - capture-range: "cudaProfilerApi" - - # Specify the desired behavior when a capture range ends. - # In verl we need the torch.cuda.profiler.start/stop pair to repeat n times. - # valid values are "repeat-shutdown:n" or null. - # For normal whole step profiling, n = len(profile_steps); - # but for discrete profiling, n = len(profile_steps) * Number(subtasks). - # Or you can just leave it null and the program will use n = len(profile_steps) * 6; - capture-range-end: null - - # Send signal to the target application's process group. We let the program exit by itself. - kill: none - - # Config for npu profiler. Must set when profile_steps is not None and torch_npu is available. - npu_profile: - - # Options for the npu profiler - options: - - # Storage path of collected data. - save_path: ./profiler_data - - # The roles that will be profiled. Only takes effect in discrete mode. - # optional values: all, rollout_generate, actor_compute_log_prob, actor_update and ref_compute_log_prob. - # "all" means all roles will be profiled. - roles: ["all"] - - # Collection level, optional values: level_none, level0, level1, level2. - level: level0 - - # Whether to enable memory analysis. - with_memory: False - - # Whether to record tensor shape. - record_shapes: False - - # Whether to record Device-side performance data. - with_npu: True - - # Whether to record Host-side performance data. - with_cpu: True - - # Whether to record Python call stack information. - with_module: False - - # Whether to record operator call stack information. 
- with_stack: False - - # Whether to automatically parse the data. - analysis: True - - # Project name for experiment tracking (e.g., wandb) - project_name: verl_examples - - # Experiment name for run identification in tracking tools - experiment_name: gsm8k - - # Logging backends to use: "console", "wandb", etc. - logger: [ 'console', 'wandb' ] - - # Number of generations to log during validation - log_val_generations: 0 - - # Directory for logging rollout data; no dump if null - rollout_data_dir: null - - # Directory for logging validation data; no dump if null - validation_data_dir: null - - # Number of nodes used in the training - nnodes: 1 - - # Number of GPUs per node - n_gpus_per_node: 8 - - # Save frequency (by iteration) for model checkpoints - save_freq: -1 - - # ESI refers to the elastic server instance used during training, similar to the training plan. For example, - # if you purchase 10 hours of computing power, the ESI will automatically shut down after 10 hours of training. - # To ensure a checkpoint is saved before ESI shuts down, the system will start saving a checkpoint in advance. - # The advance time is calculated as: Advance Time = Longest historical step duration + Checkpoint save duration + esi_redundant_time. - # Here, esi_redundant_time is a user-defined value that further extends the advance time for added safety. 
- esi_redundant_time: 0 - - # Resume mode: "auto", "disable", or "resume_path" - # "auto": resume from last checkpoint if available - # "disable": start from scratch - # "resume_path": resume from a user-defined path - resume_mode: auto - - # Path to resume training from (only used when resume_mode is "resume_path") - resume_from_path: null - - # Whether to run validation before training begins - val_before_train: True - - # Whether to run validation only - val_only: False - - # Validation frequency (in training iterations) - test_freq: -1 - - # Number of iterations to warm up the critic before updating policy - critic_warmup: 0 - - # Default path to distributed filesystem for saving checkpoints - default_hdfs_dir: null - - # Whether to delete local checkpoints after loading - del_local_ckpt_after_load: False - - # Default local directory for saving checkpoints - default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} - - # Maximum number of actor checkpoints to keep - max_actor_ckpt_to_keep: null - - # Maximum number of critic checkpoints to keep - max_critic_ckpt_to_keep: null - - # Timeout (in seconds) for Ray worker to wait for registration - ray_wait_register_center_timeout: 300 - - # Device to run training on (e.g., "cuda", "cpu") - device: cuda - -# configs related to ray -ray_kwargs: - # configs related to ray initialization - ray_init: - - # Number of CPUs for Ray. Use a fixed number instead of null when using SLURM. - num_cpus: null - - # Path to save Ray timeline JSON for performance profiling - timeline_json_file: null diff --git a/tests/trainer/config/test_algo_config_on_cpu.py b/tests/trainer/config/test_algo_config_on_cpu.py deleted file mode 100644 index d08c949ee48..00000000000 --- a/tests/trainer/config/test_algo_config_on_cpu.py +++ /dev/null @@ -1,204 +0,0 @@ -# Copyright 2024 Bytedance Ltd. 
and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -import torch -from omegaconf import OmegaConf - -from verl.trainer.config import AlgoConfig, KLControlConfig -from verl.trainer.ppo.core_algos import ( - compute_gae_advantage_return, - compute_grpo_outcome_advantage, - get_adv_estimator_fn, -) -from verl.utils.config import omega_conf_to_dataclass - - -class TestAlgoConfig(unittest.TestCase): - """Test the AlgoConfig dataclass and its integration with core algorithms.""" - - def setUp(self): - """Set up test fixtures.""" - # Create a sample algorithm config as DictConfig (similar to what comes from YAML) - self.config_dict = { - "_target_": "verl.trainer.config.AlgoConfig", - "gamma": 0.99, - "lam": 0.95, - "adv_estimator": "gae", - "norm_adv_by_std_in_grpo": True, - "use_kl_in_reward": True, - "kl_penalty": "kl", - "kl_ctrl": { - "_target_": "verl.trainer.config.KLControlConfig", - "type": "adaptive", - "kl_coef": 0.002, - "horizon": 5000, - "target_kl": 0.05, - }, - "use_pf_ppo": True, - "pf_ppo": {"reweight_method": "max_min", "weight_pow": 3.0}, - } - self.omega_config = OmegaConf.create(self.config_dict) - - def test_dataclass_creation_from_dict(self): - """Test creating AlgoConfig from dictionary.""" - config = omega_conf_to_dataclass(self.config_dict) - - self.assertIsInstance(config, AlgoConfig) - self.assertEqual(config.gamma, 0.99) - self.assertEqual(config.lam, 0.95) - 
self.assertEqual(config.adv_estimator, "gae") - self.assertTrue(config.norm_adv_by_std_in_grpo) - self.assertTrue(config.use_kl_in_reward) - self.assertEqual(config.kl_penalty, "kl") - self.assertTrue(config.use_pf_ppo) - - def test_dataclass_creation_from_omega_config(self): - """Test creating AlgoConfig from OmegaConf DictConfig.""" - config = omega_conf_to_dataclass(self.omega_config) - - self.assertIsInstance(config, AlgoConfig) - self.assertEqual(config.gamma, 0.99) - self.assertEqual(config.lam, 0.95) - - def test_nested_configs(self): - """Test that nested configurations are properly converted.""" - config = omega_conf_to_dataclass(self.omega_config) - - # Test KL control config - self.assertIsInstance(config.kl_ctrl, KLControlConfig) - self.assertEqual(config.kl_ctrl.type, "adaptive") - self.assertEqual(config.kl_ctrl.kl_coef, 0.002) - self.assertEqual(config.kl_ctrl.horizon, 5000) - self.assertEqual(config.kl_ctrl.target_kl, 0.05) - - # Test PF PPO config - self.assertEqual(config.pf_ppo.get("reweight_method"), "max_min") - self.assertEqual(config.pf_ppo.get("weight_pow"), 3.0) - - def test_default_values(self): - """Test that default values are properly set.""" - minimal_config = {"gamma": 0.8} - config = omega_conf_to_dataclass(minimal_config, AlgoConfig) - - self.assertEqual(config.gamma, 0.8) - self.assertEqual(config.lam, 1.0) # default value - self.assertEqual(config.adv_estimator, "gae") # default value - self.assertTrue(config.norm_adv_by_std_in_grpo) # default value - self.assertFalse(config.use_kl_in_reward) # default value - self.assertEqual(config.kl_penalty, "kl") # default value - self.assertFalse(config.use_pf_ppo) # default value - - def test_get_method_backward_compatibility(self): - """Test the get method for backward compatibility.""" - config = omega_conf_to_dataclass(self.omega_config) - - # Test existing attribute - self.assertEqual(config.get("gamma"), 0.99) - self.assertEqual(config.get("gamma", 1.0), 0.99) - - # Test non-existing 
attribute - self.assertIsNone(config.get("non_existing")) - self.assertEqual(config.get("non_existing", "default"), "default") - - def test_post_init_nested_configs(self): - """Test that __post_init__ properly initializes nested configs when None.""" - # Create config without nested configs - minimal_config = AlgoConfig(gamma=0.9) - - # Check that nested configs are initialized - self.assertIsNotNone(minimal_config.kl_ctrl) - self.assertIsInstance(minimal_config.kl_ctrl, KLControlConfig) - assert not minimal_config.pf_ppo - - def test_config_init_from_yaml(self): - import os - - from hydra import compose, initialize_config_dir - - with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")): - cfg = compose(config_name="ppo_trainer") - algo_config = omega_conf_to_dataclass(cfg.algorithm) - from verl.trainer.config import AlgoConfig - - assert isinstance(algo_config, AlgoConfig) - - -class TestAlgoCompute(unittest.TestCase): - """Test the AlgoConfig dataclass and its integration with core algorithms.""" - - def setUp(self): - """Set up test fixtures.""" - self.algo_config = AlgoConfig( - gamma=0.99, - lam=0.95, - adv_estimator="gae", - norm_adv_by_std_in_grpo=True, - use_kl_in_reward=True, - kl_penalty="kl", - kl_ctrl=KLControlConfig(type="adaptive", kl_coef=0.002, horizon=5000, target_kl=0.05), - use_pf_ppo=True, - pf_ppo={"reweight_method": "max_min", "weight_pow": 3.0}, - ) - - def test_advantage_estimator_with_cfg(self): - """Test integration with advantage estimators from core_algos.""" - config = self.algo_config - - # Test GAE advantage estimator - adv_fn = get_adv_estimator_fn(config.adv_estimator) - self.assertIsNotNone(adv_fn) - - # Test with actual GAE computation - batch_size, seq_len = 2, 5 - token_level_rewards = torch.randn(batch_size, seq_len) - values = torch.randn(batch_size, seq_len) - response_mask = torch.ones(batch_size, seq_len) - - advantages, returns = compute_gae_advantage_return( - token_level_rewards=token_level_rewards, 
- values=values, - response_mask=response_mask, - gamma=config.gamma, - lam=config.lam, - ) - - self.assertEqual(advantages.shape, (batch_size, seq_len)) - self.assertEqual(returns.shape, (batch_size, seq_len)) - - def test_grpo_advantage_estimator_with_cfg(self): - """Test integration with GRPO advantage estimator.""" - grpo_config = AlgoConfig(adv_estimator="grpo", norm_adv_by_std_in_grpo=True) - - # Test GRPO advantage computation - batch_size, seq_len = 4, 3 - token_level_rewards = torch.tensor([[1.0, 0.5, 0.0], [2.0, 1.0, 0.0], [0.5, 0.2, 0.0], [1.5, 0.8, 0.0]]) - response_mask = torch.ones(batch_size, seq_len) - index = np.array([0, 0, 1, 1]) # Two groups - - advantages, returns = compute_grpo_outcome_advantage( - token_level_rewards=token_level_rewards, - response_mask=response_mask, - index=index, - norm_adv_by_std_in_grpo=grpo_config.norm_adv_by_std_in_grpo, - ) - - self.assertEqual(advantages.shape, (batch_size, seq_len)) - self.assertEqual(returns.shape, (batch_size, seq_len)) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/trainer/config/test_legacy_config_on_cpu.py b/tests/trainer/config/test_legacy_config_on_cpu.py deleted file mode 100644 index df77b9229e3..00000000000 --- a/tests/trainer/config/test_legacy_config_on_cpu.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright 2024 Bytedance Ltd. and/or its affiliates -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import unittest -import warnings - -from hydra import compose, initialize_config_dir -from hydra.core.global_hydra import GlobalHydra -from omegaconf import OmegaConf - -_BREAKING_CHANGES = [ - "critic.optim.lr", # mcore critic lr init value 1e-6 -> 1e-5 - "actor_rollout_ref.actor.optim.lr_warmup_steps", # None -> -1 - "critic.optim.lr_warmup_steps", # None -> -1 - "actor_rollout_ref.rollout.name", # vllm -> ??? - "actor_rollout_ref.actor.megatron.expert_tensor_parallel_size", - "actor_rollout_ref.ref.megatron.expert_tensor_parallel_size", - "critic.megatron.expert_tensor_parallel_size", - "reward_model.megatron.expert_tensor_parallel_size", -] - - -class TestConfigComparison(unittest.TestCase): - """Test that current configs match their legacy counterparts exactly.""" - - ignored_keys = [ - "enable_gradient_checkpointing", - "gradient_checkpointing_kwargs", - "activations_checkpoint_method", - "activations_checkpoint_granularity", - "activations_checkpoint_num_layers", - "discrete", - "profiler", - "profile", - "use_profile", - "npu_profile", - "profile_steps", - "worker_nsight_options", - "controller_nsight_options", - ] - ignored_paths = ["reward_model", "custom_reward_function"] - - def _compare_configs_recursively( - self, current_config, legacy_config, path="", legacy_allow_missing=True, current_allow_missing=False - ): - """Recursively compare two OmegaConf configs and assert they are identical. 
- - Args: - legacy_allow_missing (bool): sometimes the legacy megatron config contains fewer keys and - we allow that to happen - """ - if path in self.ignored_paths: - return - - if isinstance(current_config, dict) and isinstance(legacy_config, dict): - current_keys = set(current_config.keys()) - legacy_keys = set(legacy_config.keys()) - - missing_in_current = legacy_keys - current_keys - missing_in_legacy = current_keys - legacy_keys - - # Ignore specific keys that are allowed to be missing - for key in self.ignored_keys: - if key in missing_in_current: - missing_in_current.remove(key) - if key in missing_in_legacy: - missing_in_legacy.remove(key) - - if missing_in_current: - msg = f"Keys missing in current config at {path}: {missing_in_current}" - if current_allow_missing: - warnings.warn(msg, stacklevel=1) - else: - self.fail(f"Keys missing in current config at {path}: {missing_in_current}") - if missing_in_legacy: - # if the legacy - msg = f"Keys missing in legacy config at {path}: {missing_in_legacy}" - if legacy_allow_missing: - warnings.warn(msg, stacklevel=1) - else: - self.fail(msg) - - for key in current_keys: - current_path = f"{path}.{key}" if path else key - if key in legacy_config: - self._compare_configs_recursively(current_config[key], legacy_config[key], current_path) - elif isinstance(current_config, list) and isinstance(legacy_config, list): - self.assertEqual( - len(current_config), - len(legacy_config), - f"List lengths differ at {path}: current={len(current_config)}, legacy={len(legacy_config)}", - ) - for i, (current_item, legacy_item) in enumerate(zip(current_config, legacy_config, strict=True)): - self._compare_configs_recursively(current_item, legacy_item, f"{path}[{i}]") - elif path not in _BREAKING_CHANGES: - self.assertEqual( - current_config, - legacy_config, - f"Values differ at {path}: current={current_config}, legacy={legacy_config}", - ) - - def test_ppo_trainer_config_matches_legacy(self): - """Test that ppo_trainer.yaml matches 
legacy_ppo_trainer.yaml exactly.""" - import os - - from hydra import compose, initialize_config_dir - from hydra.core.global_hydra import GlobalHydra - - GlobalHydra.instance().clear() - - try: - with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")): - current_config = compose(config_name="ppo_trainer") - - legacy_config = OmegaConf.load("tests/trainer/config/legacy_ppo_trainer.yaml") - current_dict = OmegaConf.to_container(current_config, resolve=True) - legacy_dict = OmegaConf.to_container(legacy_config, resolve=True) - - if "defaults" in current_dict: - del current_dict["defaults"] - - self._compare_configs_recursively(current_dict, legacy_dict) - finally: - GlobalHydra.instance().clear() - - def test_ppo_megatron_trainer_config_matches_legacy(self): - """Test that ppo_megatron_trainer.yaml matches legacy_ppo_megatron_trainer.yaml exactly.""" - - GlobalHydra.instance().clear() - - try: - with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")): - current_config = compose(config_name="ppo_megatron_trainer") - - legacy_config = OmegaConf.load("tests/trainer/config/legacy_ppo_megatron_trainer.yaml") - current_dict = OmegaConf.to_container(current_config, resolve=True) - legacy_dict = OmegaConf.to_container(legacy_config, resolve=True) - - if "defaults" in current_dict: - del current_dict["defaults"] - - self._compare_configs_recursively( - current_dict, legacy_dict, legacy_allow_missing=True, current_allow_missing=False - ) - finally: - GlobalHydra.instance().clear() - - def test_load_component(self): - """Test that ppo_megatron_trainer.yaml matches legacy_ppo_megatron_trainer.yaml exactly.""" - - GlobalHydra.instance().clear() - configs_to_load = [ - ("verl/trainer/config/actor", "dp_actor"), - ("verl/trainer/config/actor", "megatron_actor"), - ("verl/trainer/config/ref", "dp_ref"), - ("verl/trainer/config/ref", "megatron_ref"), - ("verl/trainer/config/rollout", "rollout"), - ] - for config_dir, config_file in 
configs_to_load: - try: - with initialize_config_dir(config_dir=os.path.abspath(config_dir)): - compose(config_name=config_file) - finally: - GlobalHydra.instance().clear() - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/trainer/ppo/test_core_algos_on_cpu.py b/tests/trainer/ppo/test_core_algos_on_cpu.py index 288f28e6398..7f99475a738 100644 --- a/tests/trainer/ppo/test_core_algos_on_cpu.py +++ b/tests/trainer/ppo/test_core_algos_on_cpu.py @@ -313,5 +313,50 @@ def test_grpo_and_vectorized_equivalence(batch_size: int, seq_len: int, num_grou assert torch.allclose(ret1, ret2, rtol=1e-5, atol=1e-6) +def test_compute_policy_loss_flow_grpo() -> None: + """Test flow-GRPO policy loss computation.""" + + # prepare input + batch_size = 8 + steps = 10 + rollout_log_probs = torch.randn((batch_size, steps), dtype=torch.float32) + current_log_probs = torch.randn((batch_size, steps), dtype=torch.float32) + advantages = torch.randn((batch_size, steps), dtype=torch.float32) + response_mask = torch.ones((batch_size, steps), dtype=torch.int32) + import os + + from hydra import compose, initialize_config_dir + + from verl.trainer.ppo.diffusion_algos import compute_policy_loss_flow_grpo + from verl.utils.config import omega_conf_to_dataclass + from verl.workers.config.actor import FSDPActorConfig + + with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config/actor")): + cfg = compose( + config_name="dp_actor", + overrides=[ + "strategy=fsdp", + "clip_ratio=0.0001", + "clip_ratio_high=5.0", + "ppo_micro_batch_size_per_gpu=8", + ], + ) + actor_config: FSDPActorConfig = omega_conf_to_dataclass(cfg) + + for step in range(steps): + pg_loss, pg_metrics = compute_policy_loss_flow_grpo( + old_log_prob=rollout_log_probs[:, step], + log_prob=current_log_probs[:, step], + advantages=advantages[:, step], + response_mask=response_mask[:, step], + loss_agg_mode="token-mean", + config=actor_config, + ) + + assert pg_loss.shape == () + assert 
isinstance(pg_loss.item(), float) + assert "actor/ppo_kl" in pg_metrics.keys() + + if __name__ == "__main__": unittest.main() diff --git a/tests/trainer/ppo/test_rollout_corr.py b/tests/trainer/ppo/test_rollout_corr.py index aafbbf9b440..a081d8b73d3 100644 --- a/tests/trainer/ppo/test_rollout_corr.py +++ b/tests/trainer/ppo/test_rollout_corr.py @@ -365,6 +365,53 @@ def test_mask_mode(): print(" ✓ Mask mode correctly separates IS weights from rejection") +def test_exact_icepop_zeroes_weights_without_changing_mask(): + """IcePop should zero OOB IS weights while preserving response_mask.""" + device = "cuda" if torch.cuda.is_available() else "cpu" + + raw_is_weights = torch.tensor([[0.4, 0.8, 6.0]], device=device) + old_log_prob = torch.zeros_like(raw_is_weights) + rollout_log_prob = -torch.log(raw_is_weights) + response_mask = torch.ones_like(raw_is_weights) + + weights_proto, modified_response_mask, metrics = compute_rollout_correction_and_rejection_mask( + old_log_prob=old_log_prob, + rollout_log_prob=rollout_log_prob, + response_mask=response_mask, + rollout_is="token", + rollout_is_threshold="0.5_5.0", + rollout_rs=None, + ) + + weights = weights_proto.batch["rollout_is_weights"] + expected_weights = torch.tensor([[0.0, 0.8, 0.0]], device=device) + + torch.testing.assert_close(weights, expected_weights, atol=1e-6, rtol=1e-6) + assert torch.equal(modified_response_mask, response_mask) + assert metrics["rollout_corr/rollout_is_oob_ratio"] == pytest.approx(2.0 / 3.0, abs=1e-6) + assert metrics["rollout_corr/rollout_is_std"] == pytest.approx(0.3771236, abs=1e-6) + assert metrics["rollout_corr/rollout_is_eff_sample_size"] == pytest.approx(1.0 / 3.0, abs=1e-6) + + +def test_bool_rollout_is_threshold_is_rejected(): + """Boolean thresholds should not be silently accepted via bool <: int.""" + device = "cuda" if torch.cuda.is_available() else "cpu" + + old_log_prob = torch.zeros(1, 2, device=device) + rollout_log_prob = torch.zeros(1, 2, device=device) + response_mask = 
torch.ones(1, 2, device=device) + + with pytest.raises(TypeError, match="not a boolean"): + compute_rollout_correction_and_rejection_mask( + old_log_prob=old_log_prob, + rollout_log_prob=rollout_log_prob, + response_mask=response_mask, + rollout_is="token", + rollout_is_threshold=True, + rollout_rs=None, + ) + + if __name__ == "__main__": print("=" * 60) print("Rollout Correction Test Suite") diff --git a/tests/trainer/ppo/test_rollout_corr_integration.py b/tests/trainer/ppo/test_rollout_corr_integration.py index 1f05414d2e1..dfdc022b221 100644 --- a/tests/trainer/ppo/test_rollout_corr_integration.py +++ b/tests/trainer/ppo/test_rollout_corr_integration.py @@ -230,6 +230,44 @@ def test_metrics_only_mode(self, sample_data, config_with_rollout_is): # Losses should be different (weights have an effect) assert not torch.allclose(pg_loss_no_weights, pg_loss_with_weights) + def test_exact_icepop_matches_filtered_weighted_ppo_loss(self, config_with_rollout_is): + """IcePop should match the local RL zero-weight semantics.""" + device = "cuda" if torch.cuda.is_available() else "cpu" + + old_log_prob = torch.tensor([[-1.0, -1.0, -1.0]], device=device) + log_prob = old_log_prob.clone() + rollout_log_prob = torch.tensor([[-0.5, -3.5, -0.8]], device=device) + advantages = torch.tensor([[1.0, -1.0, 2.0]], device=device) + response_mask = torch.ones_like(old_log_prob) + + rollout_is_weights_proto, modified_response_mask, metrics = compute_rollout_correction_and_rejection_mask( + old_log_prob=old_log_prob, + rollout_log_prob=rollout_log_prob, + response_mask=response_mask, + rollout_is="token", + rollout_is_threshold="0.5_5.0", + rollout_rs=None, + ) + + rollout_is_weights = rollout_is_weights_proto.batch["rollout_is_weights"] + expected_weights = torch.tensor([[0.60653067, 0.0, 0.81873075]], device=device) + expected_loss = torch.mean(expected_weights * (-advantages)) + + pg_loss, _ = compute_policy_loss_vanilla( + old_log_prob=old_log_prob, + log_prob=log_prob, + 
advantages=advantages, + response_mask=response_mask, + loss_agg_mode="token-mean", + config=config_with_rollout_is, + rollout_is_weights=rollout_is_weights, + ) + + assert torch.equal(modified_response_mask, response_mask) + assert metrics["rollout_corr/rollout_is_oob_ratio"] == pytest.approx(1.0 / 3.0, abs=1e-6) + torch.testing.assert_close(rollout_is_weights, expected_weights, atol=1e-6, rtol=1e-6) + torch.testing.assert_close(pg_loss, expected_loss, atol=1e-6, rtol=1e-6) + class TestRolloutCorrectionConfigNormalization: """Unit tests for RolloutCorrectionConfig canonicalization logic.""" @@ -257,6 +295,11 @@ def test_float_threshold_conversion_in_factory(self): assert config.rollout_rs == "seq_mean_k1" assert config.rollout_rs_threshold == 1.001 + def test_icepop_factory(self): + config = RolloutCorrectionConfig.decoupled_token_icepop() + assert config.rollout_is == "token" + assert config.rollout_is_threshold == "0.5_5.0" + if __name__ == "__main__": pytest.main([__file__, "-v", "-s"]) diff --git a/tests/utils/dataset/test_multiturn_sft_dataset_on_cpu.py b/tests/utils/dataset/test_multiturn_sft_dataset_on_cpu.py index 6c86e049321..3f3360ca6de 100644 --- a/tests/utils/dataset/test_multiturn_sft_dataset_on_cpu.py +++ b/tests/utils/dataset/test_multiturn_sft_dataset_on_cpu.py @@ -405,7 +405,6 @@ def test_multiturn_sft_vlm_dataset_on_cpu(model_path, vlm_data_file): "model_path", [ f"{custom_model_prefix}/Qwen/Qwen3-VL-2B-Instruct", - f"{custom_model_prefix}/Qwen/Qwen3.5-0.8B", ], ) def test_multiturn_sft_vlm_dataloader_on_cpu(model_path, vlm_data_file): @@ -442,7 +441,6 @@ def test_multiturn_sft_vlm_dataloader_on_cpu(model_path, vlm_data_file): assert position_ids.is_nested, "position_ids should be a nested tensor" assert position_ids.dim() == 3, "position_ids must be 3-dimensional" assert position_ids.shape[0] == batch_size - assert position_ids.shape[1] == 4 values = position_ids.values() assert values.shape == (4, len(input_ids.values())) diff --git 
a/tests/utils/test_padding_on_cpu.py b/tests/utils/test_padding_on_cpu.py index 9ebf2992b3d..238522bc712 100644 --- a/tests/utils/test_padding_on_cpu.py +++ b/tests/utils/test_padding_on_cpu.py @@ -15,7 +15,7 @@ import torch from tensordict import TensorDict -from verl.workers.utils.padding import left_right_2_no_padding, no_padding_2_padding +from verl.workers.utils.padding import embeds_padding_2_no_padding, left_right_2_no_padding, no_padding_2_padding def test_padding_conversion_with_log_probs(): @@ -238,9 +238,47 @@ def test_no_padding_2_padding_varying_lengths(): print("All varied length tests passed") +def test_embeds_padding_2_no_padding_varying_lengths(): + """Test that padding tokens are stripped correctly when sequences have different valid lengths.""" + batch_size = 3 + max_seq_len = 20 + dim = 16 + num_steps = 8 + + # Simulate different valid lengths: 20, 15, 10 (rest are padding zeros) + valid_lens = [20, 15, 10] + prompt_embeds = torch.randn(batch_size, max_seq_len, dim) + prompt_embeds_mask = torch.zeros(batch_size, max_seq_len, dtype=torch.int32) + for i, vlen in enumerate(valid_lens): + prompt_embeds_mask[i, :vlen] = 1 + response_mask = torch.ones(batch_size, num_steps) + + data = TensorDict( + { + "prompt_embeds": prompt_embeds, + "prompt_embeds_mask": prompt_embeds_mask, + "response_mask": response_mask, + }, + batch_size=batch_size, + ) + + result = embeds_padding_2_no_padding(data) + + assert result["prompt_embeds"].is_nested + + # Each sample's nested embedding should have the correct stripped length + embeds_nested = result["prompt_embeds"] + for i, vlen in enumerate(valid_lens): + sample_embed = embeds_nested[i] + assert sample_embed.shape[0] == vlen, f"Sample {i}: expected {vlen} tokens, got {sample_embed.shape[0]}" + # Values should match the original (unpadded portion) + torch.testing.assert_close(sample_embed, prompt_embeds[i, :vlen, :]) + + if __name__ == "__main__": test_padding_conversion_with_log_probs() 
test_padding_conversion_without_log_probs() test_padding_roundtrip() test_no_padding_2_padding_varying_lengths() + test_embeds_padding_2_no_padding_varying_lengths() print("All padding conversion tests passed!") diff --git a/tests/utils/test_rollout_skip_on_cpu.py b/tests/utils/test_rollout_skip_on_cpu.py index 5b8b31e641d..84338fee9ba 100644 --- a/tests/utils/test_rollout_skip_on_cpu.py +++ b/tests/utils/test_rollout_skip_on_cpu.py @@ -16,86 +16,134 @@ from pathlib import Path from unittest.mock import MagicMock +import numpy as np import pytest import torch +from omegaconf import OmegaConf -from verl.utils.rollout_skip import DataProto, RolloutSkip +import verl +from verl.protocol import DataProto +from verl.utils.rollout_skip import RolloutSkip -len_prompt = 50 -len_response = 100 +def build_generate_fn(cfg): + torch.manual_seed(42) + len_tokenizer = 65536 -def temp_dir(): - # Create a temporary directory - temp_dir = Path(tempfile.mkdtemp()) - yield temp_dir - # Cleanup - shutil.rmtree(temp_dir) - + n = cfg.actor_rollout_ref.rollout.n + gen_bs = cfg.data.gen_batch_size + max_prompt_length = cfg.data.max_prompt_length + max_response_length = cfg.data.max_response_length -def build_generate_fn(gen_bs, n): - len_tokenizer = 1024 + def iterate_gen_batch(): + mark_i = 0 + while True: + mark_i += 1 + prompt = torch.randint(len_tokenizer, size=(gen_bs, max_prompt_length)).repeat_interleave(n, dim=0) + generate = torch.randint(len_tokenizer, size=(gen_bs * n, max_response_length)) + tmp_mark = torch.Tensor([mark_i]).repeat(gen_bs * n, 1) + data = DataProto.from_dict( + tensors={"prompt": prompt, "response": generate, "tmp_mark": tmp_mark}, + ) + yield data - def iterate(): + def iterate_new_batch(): + mark_i = 0 while True: - prompt = torch.randint(len_tokenizer, size=(gen_bs, len_prompt)).repeat_interleave(n, dim=0) - generate = torch.randint(len_tokenizer, size=(gen_bs * n, len_response)) - data = DataProto.from_dict(tensors={"prompt": prompt, "response": generate}) 
+ mark_i += 1 + data = DataProto.from_dict( + non_tensors={ + "data_source": ["math_dapo"] * (gen_bs * n), + "reward_model": np.array( + [{"ground_truth": mark_i, "style": "rule-lighteval/MATH_v2"}] * (gen_bs * n), dtype=object + ), + } + ) + yield data - mock_infer_engine = iterate() + mock_infer_engine_gen = iterate_gen_batch() + mock_infer_engine_new = iterate_new_batch() + + def fn_gen_batch(batch, **kwargs): + # Simulate the inference engine returning the next batch + return next(mock_infer_engine_gen) - def fn(batch, **kwargs): + def fn_new_batch(**kwargs): # Simulate the inference engine returning the next batch - return next(mock_infer_engine) + return next(mock_infer_engine_new) - return fn + return fn_gen_batch, fn_new_batch -@pytest.fixture(params=[(32, 4), (64, 4), (64, 8)]) -def mock_rollout_wg(request): - gen_bs, n = request.param +@pytest.fixture +def mock_rollout_wg(): + default_n = 16 + default_gen_batch_size = 8 + default_max_prompt_length = 1 * 1024 + default_max_response_length = 10 * 1024 + + config_path = Path(verl.version_folder).joinpath("trainer/config") + cfg = OmegaConf.load(str(config_path.joinpath("ppo_trainer.yaml"))) + cfg.data = OmegaConf.load(str(config_path.joinpath("data/legacy_data.yaml"))) + cfg.actor_rollout_ref.rollout = OmegaConf.load(config_path.joinpath("rollout/rollout.yaml")) + + temp_dir = Path(tempfile.mkdtemp()) + rollout_wg = MagicMock() - config = MagicMock() - config.actor_rollout_ref.rollout = { - "n": n, - "skip_dump_dir": next(temp_dir()), - } - config.data = {"gen_batch_size": gen_bs} + cfg.trainer.experiment_name = "skip" + cfg.trainer.project_name = "verl_feat" + + cfg.actor_rollout_ref.rollout.n = default_n + cfg.actor_rollout_ref.rollout.skip.dump_dir = str(temp_dir) + cfg.actor_rollout_ref.rollout.skip.max_dump_step = 1 + cfg.actor_rollout_ref.rollout.skip.enable = True + + cfg.data.gen_batch_size = default_gen_batch_size + cfg.data.max_prompt_length = default_max_prompt_length + 
cfg.data.max_response_length = default_max_response_length - rollout_wg.generate_sequences = build_generate_fn(gen_bs, n) + rollout_wg.generate_sequences, new_batch_generator = build_generate_fn(cfg) - yield config, rollout_wg - # Cleanup - shutil.rmtree(next(temp_dir())) + yield cfg, rollout_wg, new_batch_generator + + # 清理 + shutil.rmtree(temp_dir, ignore_errors=True) class TestRolloutSkip: - def test_initialization(self, capsys): + def test_initialization(self, mock_rollout_wg, capsys): """Test that RolloutSkip initializes correctly""" - config = MagicMock() - config.actor_rollout_ref.rollout = { - "n": 16, - "skip_dump_dir": "tmp/rollout_dump", - } - config.data = {"gen_batch_size": 128} - mock_rollout_wg = MagicMock() - skip = RolloutSkip(config, mock_rollout_wg) - - assert skip.n == 16 - assert skip.gbs == 128 - assert str(skip.dumped_dir) == "tmp/rollout_dump" - - assert skip._rollout_wg == mock_rollout_wg + config, rollout_wg, _ = mock_rollout_wg + + skip = RolloutSkip(config, rollout_wg) + + assert skip.n == config.actor_rollout_ref.rollout.n + assert skip.gbs == config.data.gen_batch_size + assert skip.prompt_length == config.data.max_prompt_length + assert skip.response_length == config.data.max_response_length + + assert skip.is_enable + assert str(skip.specify_dumped_dir).startswith(config.actor_rollout_ref.rollout.skip.dump_dir) + + # rollout_wg is passed in __init__, so is_active and is_dump_step are True after init + assert skip.is_active + assert skip.is_dump_step skip.wrap_generate_sequences() + + assert skip.is_dump_step + assert skip.is_active + + assert skip._rollout_wg == rollout_wg + captured = capsys.readouterr() assert "Successfully patched" in captured.out def test_generate_without_wrap(self, mock_rollout_wg): """Test that generate_sequences works without wrapping""" - config, rollout_wg = mock_rollout_wg + config, rollout_wg, _ = mock_rollout_wg _ = RolloutSkip(config, rollout_wg) _result = rollout_wg.generate_sequences(MagicMock()) @@ 
-103,40 +151,154 @@ def test_generate_without_wrap(self, mock_rollout_wg): result = rollout_wg.generate_sequences(MagicMock()) assert isinstance(result, DataProto) # * make sure the data is different - assert torch.abs(_result.batch["prompt"] - result.batch["prompt"]).sum() > 0 - assert torch.abs(_result.batch["response"] - result.batch["response"]).sum() > 0 + assert not torch.allclose(_result.batch["prompt"], result.batch["prompt"]) + assert not torch.allclose(_result.batch["response"], result.batch["response"]) _result = result - def test_dump(self, mock_rollout_wg, capsys): - config, rollout_wg = mock_rollout_wg + +class TestAction: + @pytest.mark.parametrize("step", [4]) + def test_rollout_with_REPEAT(self, mock_rollout_wg, step, capsys): + config, rollout_wg, new_batch_generator = mock_rollout_wg + config.actor_rollout_ref.rollout.skip.action = "repeat" + config.actor_rollout_ref.rollout.skip.max_dump_step = step skip = RolloutSkip(config, rollout_wg) skip.wrap_generate_sequences() - result = rollout_wg.generate_sequences(MagicMock()) - # * check if dump is OK - assert skip.curr_path_dump.exists() - captured = capsys.readouterr() - assert "Successfully dump data in" in captured.out - # * get file size, estimate file size - file_size = skip.curr_path_dump.stat().st_size - est_file_size = (len_prompt + len_response) * skip.gbs * skip.n * result.batch["prompt"].dtype.itemsize - assert file_size >= est_file_size, "Dumped file size is smaller than expected" + list_new_batch = [] + list_gen_batch = [] + for _ in range(step): + new_batch = new_batch_generator() + skip.record(new_batch) + list_new_batch.append(new_batch) + list_gen_batch.append(rollout_wg.generate_sequences(MagicMock())) - def test_generate_with_wrap(self, mock_rollout_wg, capsys): - """Test that generate_sequences works without wrapping""" + # Check repeat + for i in range(step * 3): + ori_step = i % step + compare_batch = list_gen_batch[ori_step] + + skip.record(new_batch_generator()) + gen_batch = 
rollout_wg.generate_sequences(MagicMock()) - config, rollout_wg = mock_rollout_wg + assert torch.allclose(compare_batch.batch["prompt"], gen_batch.batch["prompt"]) + assert torch.allclose(compare_batch.batch["response"], gen_batch.batch["response"]) + + @pytest.mark.parametrize("step", [4, 16]) + def test_rollout_with_REPEAT_LAST(self, mock_rollout_wg, step, capsys): + config, rollout_wg, new_batch_generator = mock_rollout_wg + config.actor_rollout_ref.rollout.skip.action = "repeat_last" + config.actor_rollout_ref.rollout.skip.max_dump_step = step skip = RolloutSkip(config, rollout_wg) skip.wrap_generate_sequences() - _result = rollout_wg.generate_sequences(MagicMock()) + list_new_batch = [] + list_gen_batch = [] + for _ in range(step): + new_batch = new_batch_generator() + skip.record(new_batch) + list_new_batch.append(new_batch) + list_gen_batch.append(rollout_wg.generate_sequences(MagicMock())) + # Check repeat_last + compare_batch = list_gen_batch[-1] for _ in range(10): - result = rollout_wg.generate_sequences(MagicMock()) - assert isinstance(result, DataProto) - # * make sure the data is different - assert torch.abs(_result.batch["prompt"] - result.batch["prompt"]).sum() == 0 - assert torch.abs(_result.batch["response"] - result.batch["response"]).sum() == 0 - captured = capsys.readouterr() - assert "Successfully load pre-generated data from" in captured.out - _result = result + skip.record(new_batch_generator()) + gen_batch = rollout_wg.generate_sequences(MagicMock()) + + assert torch.allclose(compare_batch.batch["prompt"], gen_batch.batch["prompt"]) + assert torch.allclose(compare_batch.batch["response"], gen_batch.batch["response"]) + + @pytest.mark.parametrize("step", [1, 16]) + def test_rollout_with_CACHE(self, mock_rollout_wg, step, capsys): + config, rollout_wg, new_batch_generator = mock_rollout_wg + config.actor_rollout_ref.rollout.skip.action = "cache" + config.actor_rollout_ref.rollout.skip.max_dump_step = step + skip = RolloutSkip(config, 
rollout_wg) + skip.wrap_generate_sequences() + + list_new_batch = [] + list_gen_batch = [] + for _ in range(step): + new_batch = new_batch_generator() + skip.record(new_batch) + list_new_batch.append(new_batch) + list_gen_batch.append(rollout_wg.generate_sequences(MagicMock())) + + skip.record(new_batch_generator()) + rollout_wg.generate_sequences(MagicMock()) + + +class TestActionWithResume: + @pytest.mark.parametrize("step", [16]) + def test_rollout_with_CACHE_with_RESUME(self, mock_rollout_wg, step, capsys): + resume_more_step = 4 + saved_step = max(step - 5, 1) + saved_gen_step = 0 + + config, rollout_wg, new_batch_generator = mock_rollout_wg + fixed_generate_sequences = rollout_wg.generate_sequences + config.actor_rollout_ref.rollout.skip.action = "cache" + config.actor_rollout_ref.rollout.skip.max_dump_step = step + resume_more_step + skip = RolloutSkip(config, rollout_wg) + skip.wrap_generate_sequences() + + list_new_batch = [] + list_gen_batch = [] + # * mock group filter by DAPO + count_gen_step = 0 + + for i in range(step): + num_rerollout = i % 3 # max rerollout 2 times + print("train_step:", i) + for ii in range(num_rerollout + 1): + count_gen_step += 1 + new_batch = new_batch_generator() + skip.record(new_batch, i + 1, count_gen_step) # train_step start from 1 + assert skip.record_global_steps == i + 1 + assert skip.record_gen_steps == count_gen_step + list_new_batch.append(new_batch) + list_gen_batch.append(rollout_wg.generate_sequences(MagicMock())) + if i + 1 == saved_step: + saved_gen_step = count_gen_step + + # * RESUME + skip = RolloutSkip(config, rollout_wg) + rollout_wg.generate_sequences = fixed_generate_sequences # restore + skip.wrap_generate_sequences() + + real_gen_step = saved_gen_step + count_gen_step = 0 + for i in range(saved_step, step): + num_rerollout = i % 3 # max rerollout 2 times + print("train_step:", i) + # After resume, DAPO may reset the local gen_steps counter; however, the + # per-train_step rerollout pattern should remain 
consistent. + for ii in range(num_rerollout + 1): + count_gen_step += 1 + real_gen_step += 1 + new_batch = new_batch_generator() + skip.record(new_batch, i + 1, count_gen_step) # train_step start from 1 + assert skip.record_global_steps == i + 1 + assert skip.record_gen_steps == real_gen_step + list_new_batch.append(new_batch) + list_gen_batch.append(rollout_wg.generate_sequences(MagicMock())) + + # * Resume cover dump + for i in range(step, step + 5): + num_rerollout = i % 3 # max rerollout 2 times + print("train_step:", i) + for ii in range(saved_step): # resume from step - 2 + count_gen_step += 1 + real_gen_step += 1 + new_batch = new_batch_generator() + skip.record(new_batch, i + 1, count_gen_step) # train_step start from 1 + assert skip.record_global_steps == i + 1 + assert skip.record_gen_steps >= count_gen_step + assert skip.record_gen_steps == real_gen_step + list_new_batch.append(new_batch) + list_gen_batch.append(rollout_wg.generate_sequences(MagicMock())) + + # * Final + skip.record(new_batch, step + resume_more_step + 1, None) # train_step start from 1 + rollout_wg.generate_sequences(MagicMock()) diff --git a/tests/utils/test_special_adapter_path_integration.py b/tests/utils/test_special_adapter_path_integration.py new file mode 100644 index 00000000000..d52ee022080 --- /dev/null +++ b/tests/utils/test_special_adapter_path_integration.py @@ -0,0 +1,224 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Integration test for sglang LoRA adapter path. + +Tests the two-phase weight sync (base weights then adapter deltas) that +engine_workers.update_weights() performs when lora.merge=False. + +Requires 1 GPU with sglang installed. +""" + +from dataclasses import asdict +from importlib.util import find_spec + +import pytest +import torch +from peft import LoraConfig, get_peft_model +from transformers import AutoModelForCausalLM, AutoTokenizer + +pytestmark = pytest.mark.skipif(find_spec("sglang") is None, reason="sglang not installed") + +MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct" +ADAPTER_NAME = "verl_lora_adapter" + + +@pytest.fixture(scope="module") +def tokenizer(): + return AutoTokenizer.from_pretrained(MODEL_ID) + + +@pytest.fixture(scope="module") +def lora_config(): + return LoraConfig( + r=8, + lora_alpha=16, + target_modules=["q_proj", "v_proj"], + lora_dropout=0.0, + bias="none", + task_type="CAUSAL_LM", + ) + + +@pytest.fixture(scope="module") +def peft_model(lora_config): + """Create a peft-wrapped model (CPU, for extracting params).""" + model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16) + return get_peft_model(model, lora_config) + + +@pytest.fixture(scope="module") +def engine(): + """Launch sglang engine with LoRA support.""" + import sglang as sgl + + eng = sgl.Engine( + model_path=MODEL_ID, + dtype="bfloat16", + mem_fraction_static=0.5, + tp_size=1, + enable_lora=True, + max_loras_per_batch=4, + max_lora_rank=16, + lora_target_modules=["q_proj", "v_proj"], + ) + yield eng + eng.shutdown() + + +def _make_prompt(tokenizer, text="What is 2+2?"): + return tokenizer.apply_chat_template( + [{"role": "user", "content": text}], + add_generation_prompt=True, + tokenize=False, + ) + + +def _extract_base_params(peft_model): + """Extract base model params with clean HF key names. + + Mimics the base_sync_done=False path in engine_workers — sends full + base model weights (no LoRA deltas) to the rollout engine. 
+ """ + params = [] + for name, param in peft_model.named_parameters(): + if "lora_" in name: + continue + clean = name.replace("base_model.model.", "").replace(".base_layer", "") + params.append((clean, param.detach().clone())) + return params + + +def _extract_lora_tensors(peft_model): + """Extract LoRA adapter tensors as a list of (name, tensor) tuples. + + Mimics the base_sync_done=True path — sends adapter deltas via + LoadLoRAAdapterFromTensorsReqInput. + """ + from peft import get_peft_model_state_dict + + state_dict = get_peft_model_state_dict(peft_model) + return [(k, v.detach().clone()) for k, v in state_dict.items()] + + +class TestBaseWeightSync: + """Phase 1: sync base weights (base_sync_done=False path).""" + + def test_update_weights_succeeds(self, engine, peft_model): + base_params = _extract_base_params(peft_model) + engine.update_weights_from_tensor(named_tensors=base_params) + + def test_generation_after_base_sync(self, engine, tokenizer): + prompt = _make_prompt(tokenizer) + output = engine.generate(prompt, {"max_new_tokens": 32, "temperature": 0.0}) + text = output["text"] + print(f"[base sync] Generated: {text!r}") + assert len(text) > 0, "Empty generation after base weight sync" + + +class TestAdapterLoading: + """Phase 2: load adapter deltas (base_sync_done=True path).""" + + def test_load_adapter_from_tensors(self, engine, peft_model, lora_config): + lora_tensors = _extract_lora_tensors(peft_model) + config_dict = {k: v for k, v in asdict(lora_config).items() if v is not None} + engine.load_lora_adapter_from_tensors( + lora_name=ADAPTER_NAME, + tensors=lora_tensors, + config_dict=config_dict, + ) + + @pytest.mark.xfail( + reason="sglang load_lora_adapter_from_tensors doesn't populate lora_ref_cache, " + "so _resolve_lora_path validation fails. Adapter IS loaded in TP workers. 
" + "This works in verl's actual flow because verl uses the HTTP server adapter " + "which bypasses this validation.", + raises=Exception, + ) + def test_generation_with_adapter(self, engine, tokenizer): + prompt = _make_prompt(tokenizer) + output = engine.generate( + prompt, + {"max_new_tokens": 32, "temperature": 0.0}, + lora_path=ADAPTER_NAME, + ) + text = output["text"] + print(f"[adapter gen] Generated: {text!r}") + assert len(text) > 0, "Empty generation with adapter" + + +class TestAdapterLifecycle: + """Test unload + reload cycle (simulates subsequent training iterations).""" + + def test_unload_adapter(self, engine): + engine.unload_lora_adapter(ADAPTER_NAME) + + def test_generation_without_adapter(self, engine, tokenizer): + """After unload, base model should still generate.""" + prompt = _make_prompt(tokenizer) + output = engine.generate(prompt, {"max_new_tokens": 32, "temperature": 0.0}) + text = output["text"] + print(f"[after unload] Generated: {text!r}") + assert len(text) > 0, "Empty generation after adapter unload" + + def test_reload_adapter(self, engine, peft_model, lora_config): + lora_tensors = _extract_lora_tensors(peft_model) + config_dict = {k: v for k, v in asdict(lora_config).items() if v is not None} + engine.load_lora_adapter_from_tensors( + lora_name=ADAPTER_NAME, + tensors=lora_tensors, + config_dict=config_dict, + ) + + @pytest.mark.xfail( + reason="sglang load_lora_adapter_from_tensors doesn't populate lora_ref_cache", + raises=Exception, + ) + def test_generation_after_reload(self, engine, tokenizer): + prompt = _make_prompt(tokenizer) + output = engine.generate( + prompt, + {"max_new_tokens": 32, "temperature": 0.0}, + lora_path=ADAPTER_NAME, + ) + text = output["text"] + print(f"[after reload] Generated: {text!r}") + assert len(text) > 0, "Empty generation after adapter reload" + + +class TestSleepWakeCycle: + """Test release/resume with adapter-aware tags.""" + + def test_release_kv_only_keeps_weights(self, engine, tokenizer): + 
"""Adapter mode: release only kv_cache, keep base weights.""" + engine.release_memory_occupation(tags=["kv_cache"]) + engine.resume_memory_occupation(tags=["kv_cache"]) + + # Should still generate after kv-only release/resume + prompt = _make_prompt(tokenizer) + output = engine.generate(prompt, {"max_new_tokens": 16, "temperature": 0.0}) + text = output["text"] + print(f"[kv-only cycle] Generated: {text!r}") + assert len(text) > 0, "Empty generation after kv-only release/resume" + + def test_full_release_and_resume(self, engine, tokenizer): + """Merge/no-LoRA mode: release everything, resume everything.""" + engine.release_memory_occupation(tags=["kv_cache", "weights"]) + engine.resume_memory_occupation(tags=["kv_cache", "weights"]) + + prompt = _make_prompt(tokenizer) + output = engine.generate(prompt, {"max_new_tokens": 16, "temperature": 0.0}) + text = output["text"] + print(f"[full cycle] Generated: {text!r}") + assert len(text) > 0, "Empty generation after full release/resume" diff --git a/tests/workers/rollout/rollout_trtllm/test_async_server.py b/tests/workers/rollout/rollout_trtllm/test_async_server.py index 3224a8ce13f..af4445d9777 100644 --- a/tests/workers/rollout/rollout_trtllm/test_async_server.py +++ b/tests/workers/rollout/rollout_trtllm/test_async_server.py @@ -172,7 +172,7 @@ def test_async_generate(self): os.environ.setdefault("TLLM_RAY_FORCE_LOCAL_CLUSTER", "1") ray.init(address="local", ignore_reinit_error=True, include_dashboard=False) - rollout_config, model_config = self._build_rollout_config(response_length=50) + rollout_config, model_config = self._build_rollout_config(response_length=16) server = self._create_server( rollout_config, diff --git a/tests/workers/rollout/rollout_trtllm/test_trtllm_rollout_utils.py b/tests/workers/rollout/rollout_trtllm/test_trtllm_rollout_utils.py index 21ab5689113..6874d31bf29 100644 --- a/tests/workers/rollout/rollout_trtllm/test_trtllm_rollout_utils.py +++ 
b/tests/workers/rollout/rollout_trtllm/test_trtllm_rollout_utils.py @@ -22,11 +22,11 @@ from PIL import Image from transformers import AutoTokenizer -UNIMODAL_MODEL_PATH = "Qwen/Qwen2.5-Math-7B" -MULTIMODAL_MODEL_PATH = "Qwen/Qwen2.5-VL-7B-Instruct" +UNIMODAL_MODEL_PATH = "Qwen/Qwen2.5-0.5B-Instruct" +MULTIMODAL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct" -MAX_MODEL_LEN = 4096 -RESPONSE_LENGTH = 256 +MAX_MODEL_LEN = 2048 +RESPONSE_LENGTH = 32 MAX_NUM_SEQS = 16 GPU_MEMORY_UTILIZATION = 0.8 TENSOR_PARALLEL_SIZE = 1 diff --git a/tests/workers/rollout/rollout_vllm/test_vllm_omni_generate.py b/tests/workers/rollout/rollout_vllm/test_vllm_omni_generate.py index 496f0bd3c83..08299aa60e3 100644 --- a/tests/workers/rollout/rollout_vllm/test_vllm_omni_generate.py +++ b/tests/workers/rollout/rollout_vllm/test_vllm_omni_generate.py @@ -26,6 +26,7 @@ import pytest import ray +import torch from omegaconf import OmegaConf from transformers import AutoTokenizer @@ -96,10 +97,11 @@ def init_server(): "height": 512, "width": 512, "num_inference_steps": 10, - "guidance_scale": 4.0, "engine_kwargs": { "vllm_omni": { - "custom_pipeline": "examples.vllm_omni.pipeline_qwenimage.QwenImagePipelineWithLogProb", + "custom_pipeline": ( + "examples.flowgrpo_trainer.vllm_omni.pipeline_qwenimage.QwenImagePipelineWithLogProb" + ), } }, } @@ -180,7 +182,7 @@ def test_generate(init_server): def test_generate_with_logprobs(init_server): - """generate() with logprobs=True returns log_probs list.""" + """generate() with logprobs=True returns non-empty log_probs (tensor or sequence).""" server = init_server prompt_ids = _tokenize_prompt( "a futuristic city at night with neon lights glowing on tall glass " @@ -205,11 +207,18 @@ def test_generate_with_logprobs(init_server): assert isinstance(output, DiffusionOutput) assert len(output.diffusion_output) == 3 - assert output.log_probs is not None, "log_probs should be present when logprobs=True" - assert isinstance(output.log_probs, list) - assert 
len(output.log_probs) > 0 - - print(f"log_probs: {len(output.log_probs)} values, sample: {output.log_probs[:3]}") + lp = output.log_probs + assert lp is not None, "log_probs should be present when logprobs=True" + if isinstance(lp, torch.Tensor): + assert lp.numel() > 0 + sample = lp.detach().cpu().flatten()[:3].tolist() + n = lp.numel() + else: + assert len(lp) > 0 + sample = lp[:3] + n = len(lp) + + print(f"log_probs: {n} values, sample: {sample}") def test_generate_concurrent(init_server): diff --git a/tests/workers/test_engine_workers_lora_sync.py b/tests/workers/test_engine_workers_lora_sync.py new file mode 100644 index 00000000000..11109717e76 --- /dev/null +++ b/tests/workers/test_engine_workers_lora_sync.py @@ -0,0 +1,465 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for LoRA weight sync ordering in engine_workers.update_weights(). + +Tests the branching logic that controls how weights are synced to the rollout: + - Adapter mode (peft_merge=False): base weights first, then adapter deltas + - Merge mode (peft_merge=True): single sync with merged weights + - Non-LoRA: single sync, standard weights + +These tests mock the actor engine and rollout to verify call ordering and +arguments without requiring GPU, ray, or sglang infrastructure. 
+""" + +from __future__ import annotations + +import asyncio +from unittest.mock import AsyncMock, MagicMock, call + +# --------------------------------------------------------------------------- +# Helper: simulate engine_workers.update_weights() logic +# --------------------------------------------------------------------------- + + +async def _update_weights( + *, + rollout, + actor_engine, + peft_merge: bool, + base_sync_done: bool, + free_cache_engine: bool, + layered_summon: bool = False, + global_steps: int = None, + checkpoint_backend: str = "naive", +): + """Reproduce the update_weights() logic from engine_workers.py. + + This mirrors the actual code so we can test the branching without + importing the real class (which requires ray, torch, etc.). + """ + # 0. early return for non-naive checkpoint backend + if checkpoint_backend != "naive": + per_tensor_param, _ = actor_engine.get_per_tensor_param() + return + + # 1. resume weights (conditional on sleep_level) + if free_cache_engine: + if getattr(rollout, "sleep_level", 2) != 1: + await rollout.resume(tags=["weights"]) + + # 2. probe adapter-mode params first so we can discover peft_config + per_tensor_param, peft_config = actor_engine.get_per_tensor_param( + layered_summon=layered_summon, base_sync_done=True + ) + + # 3. determine base sync need + do_lora_base_sync = False + if not peft_merge and peft_config is not None: + rollout.sleep_level = 1 + do_lora_base_sync = not base_sync_done + + # 4. sync weights + if do_lora_base_sync: + per_tensor_param_base, peft_config = actor_engine.get_per_tensor_param( + layered_summon=layered_summon, base_sync_done=False + ) + await rollout.update_weights( + per_tensor_param_base, peft_config=peft_config, base_sync_done=False, global_steps=global_steps + ) + + await rollout.update_weights( + per_tensor_param, peft_config=peft_config, base_sync_done=True, global_steps=global_steps + ) + + # 5. 
resume kv_cache + if free_cache_engine: + await rollout.resume(tags=["kv_cache"]) + + +def _make_mocks(peft_config=None, params_by_base_sync_done=None): + """Create mock rollout and actor engine. + + Args: + peft_config: If not None, get_per_tensor_param returns this as peft_config. + Use a truthy value (e.g. MagicMock()) for LoRA, None for non-LoRA/merge. + params_by_base_sync_done: Optional mapping used to return different params + for probe (`True`) and base sync (`False`) calls. + """ + rollout = AsyncMock() + # Don't pre-set sleep_level — let the code set it via getattr default + del rollout.sleep_level + + if params_by_base_sync_done is None: + params_by_base_sync_done = {False: "fake_params", True: "fake_params"} + + actor_engine = MagicMock() + + def _get_per_tensor_param(*args, **kwargs): + base_sync_done = kwargs.get("base_sync_done", True) + return params_by_base_sync_done[base_sync_done], peft_config + + actor_engine.get_per_tensor_param = MagicMock(side_effect=_get_per_tensor_param) + + return rollout, actor_engine + + +# --------------------------------------------------------------------------- +# Adapter mode tests (peft_merge=False, peft_config is not None) +# --------------------------------------------------------------------------- + + +class TestAdapterModeFirstIteration: + """First iteration in adapter mode: base_sync_done=False.""" + + def test_sends_base_before_adapter(self): + """Base weights must be sent before adapter deltas.""" + peft_cfg = MagicMock() + rollout, engine = _make_mocks( + peft_config=peft_cfg, + params_by_base_sync_done={False: "fake_base_params", True: "fake_adapter_params"}, + ) + + asyncio.run( + _update_weights( + rollout=rollout, + actor_engine=engine, + peft_merge=False, + base_sync_done=False, + free_cache_engine=True, + ) + ) + + # get_per_tensor_param called twice: first probe with base_sync_done=True, then fetch base weights + assert engine.get_per_tensor_param.call_count == 2 + calls = 
engine.get_per_tensor_param.call_args_list + assert calls[0] == call(layered_summon=False, base_sync_done=True) + assert calls[1] == call(layered_summon=False, base_sync_done=False) + + def test_update_weights_called_twice(self): + """Two update_weights calls: base (base_sync_done=False), then adapter (True).""" + peft_cfg = MagicMock() + rollout, engine = _make_mocks( + peft_config=peft_cfg, + params_by_base_sync_done={False: "fake_base_params", True: "fake_adapter_params"}, + ) + + asyncio.run( + _update_weights( + rollout=rollout, + actor_engine=engine, + peft_merge=False, + base_sync_done=False, + free_cache_engine=True, + ) + ) + + assert rollout.update_weights.call_count == 2 + first_call = rollout.update_weights.call_args_list[0] + second_call = rollout.update_weights.call_args_list[1] + assert first_call.args[0] == "fake_base_params" + assert second_call.args[0] == "fake_adapter_params" + assert first_call.kwargs["base_sync_done"] is False + assert second_call.kwargs["base_sync_done"] is True + + def test_sets_sleep_level_to_1(self): + """After first iteration, sleep_level should be set to 1.""" + peft_cfg = MagicMock() + rollout, engine = _make_mocks(peft_config=peft_cfg) + + asyncio.run( + _update_weights( + rollout=rollout, + actor_engine=engine, + peft_merge=False, + base_sync_done=False, + free_cache_engine=True, + ) + ) + + assert rollout.sleep_level == 1 + + def test_first_call_resumes_weights(self): + """First iteration: sleep_level not yet set, so weight resume fires.""" + peft_cfg = MagicMock() + rollout, engine = _make_mocks(peft_config=peft_cfg) + + asyncio.run( + _update_weights( + rollout=rollout, + actor_engine=engine, + peft_merge=False, + base_sync_done=False, + free_cache_engine=True, + ) + ) + + # resume called twice: weights then kv_cache + resume_calls = rollout.resume.call_args_list + assert call(tags=["weights"]) in resume_calls + assert call(tags=["kv_cache"]) in resume_calls + + +class TestAdapterModeSubsequentIterations: + 
"""Subsequent iterations in adapter mode: base_sync_done=True, sleep_level=1.""" + + def test_single_update_weights_call(self): + """Only adapter deltas sent, no base sync.""" + peft_cfg = MagicMock() + rollout, engine = _make_mocks(peft_config=peft_cfg) + rollout.sleep_level = 1 # Set from previous iteration + + asyncio.run( + _update_weights( + rollout=rollout, + actor_engine=engine, + peft_merge=False, + base_sync_done=True, + free_cache_engine=True, + ) + ) + + assert engine.get_per_tensor_param.call_count == 1 + assert rollout.update_weights.call_count == 1 + assert rollout.update_weights.call_args.kwargs["base_sync_done"] is True + + def test_skips_weight_resume(self): + """With sleep_level=1, weight resume is skipped.""" + peft_cfg = MagicMock() + rollout, engine = _make_mocks(peft_config=peft_cfg) + rollout.sleep_level = 1 + + asyncio.run( + _update_weights( + rollout=rollout, + actor_engine=engine, + peft_merge=False, + base_sync_done=True, + free_cache_engine=True, + ) + ) + + # Only kv_cache resume, no weight resume + resume_calls = rollout.resume.call_args_list + assert call(tags=["weights"]) not in resume_calls + assert call(tags=["kv_cache"]) in resume_calls + + +# --------------------------------------------------------------------------- +# Merge mode tests (peft_merge=True) +# --------------------------------------------------------------------------- + + +class TestMergeMode: + """Merge mode: peft_merge=True, peft_config=None (merged into base).""" + + def test_single_update_weights_call(self): + rollout, engine = _make_mocks(peft_config=None) + + asyncio.run( + _update_weights( + rollout=rollout, + actor_engine=engine, + peft_merge=True, + base_sync_done=True, + free_cache_engine=True, + ) + ) + + assert engine.get_per_tensor_param.call_count == 1 + assert rollout.update_weights.call_count == 1 + + def test_resumes_weights(self): + """Merge mode always resumes weights (sleep_level stays at default 2).""" + rollout, engine = 
_make_mocks(peft_config=None) + + asyncio.run( + _update_weights( + rollout=rollout, + actor_engine=engine, + peft_merge=True, + base_sync_done=True, + free_cache_engine=True, + ) + ) + + resume_calls = rollout.resume.call_args_list + assert call(tags=["weights"]) in resume_calls + + def test_does_not_set_sleep_level(self): + """Merge mode should not touch sleep_level.""" + rollout, engine = _make_mocks(peft_config=None) + + asyncio.run( + _update_weights( + rollout=rollout, + actor_engine=engine, + peft_merge=True, + base_sync_done=True, + free_cache_engine=True, + ) + ) + + assert not hasattr(rollout, "sleep_level") + + +# --------------------------------------------------------------------------- +# Non-LoRA tests +# --------------------------------------------------------------------------- + + +class TestNonLora: + """Non-LoRA model: peft_config=None, peft_merge=False.""" + + def test_single_update_weights_call(self): + rollout, engine = _make_mocks(peft_config=None) + + asyncio.run( + _update_weights( + rollout=rollout, + actor_engine=engine, + peft_merge=False, + base_sync_done=True, + free_cache_engine=True, + ) + ) + + assert engine.get_per_tensor_param.call_count == 1 + assert rollout.update_weights.call_count == 1 + + def test_peft_config_none_skips_adapter_path(self): + """Even with peft_merge=False, if peft_config is None, adapter path is not entered.""" + rollout, engine = _make_mocks(peft_config=None) + + asyncio.run( + _update_weights( + rollout=rollout, + actor_engine=engine, + peft_merge=False, + base_sync_done=False, + free_cache_engine=True, + ) + ) + + # Should NOT set sleep_level or do double sync + assert not hasattr(rollout, "sleep_level") + assert rollout.update_weights.call_count == 1 + + +# --------------------------------------------------------------------------- +# Edge cases +# --------------------------------------------------------------------------- + + +class TestEdgeCases: + def test_free_cache_engine_false_skips_resume(self): + 
"""When free_cache_engine=False, no resume calls should happen.""" + peft_cfg = MagicMock() + rollout, engine = _make_mocks(peft_config=peft_cfg) + + asyncio.run( + _update_weights( + rollout=rollout, + actor_engine=engine, + peft_merge=False, + base_sync_done=False, + free_cache_engine=False, + ) + ) + + rollout.resume.assert_not_called() + + def test_adapter_full_call_ordering(self): + """Verify the complete call sequence on first adapter iteration.""" + peft_cfg = MagicMock() + rollout, engine = _make_mocks( + peft_config=peft_cfg, + params_by_base_sync_done={False: "fake_base_params", True: "fake_adapter_params"}, + ) + + asyncio.run( + _update_weights( + rollout=rollout, + actor_engine=engine, + peft_merge=False, + base_sync_done=False, + free_cache_engine=True, + global_steps=42, + ) + ) + + # Full expected ordering: + # 1. resume(weights) 2. update_weights(base) 3. update_weights(adapter) 4. resume(kv_cache) + expected = [ + call.resume(tags=["weights"]), + call.update_weights("fake_base_params", peft_config=peft_cfg, base_sync_done=False, global_steps=42), + call.update_weights("fake_adapter_params", peft_config=peft_cfg, base_sync_done=True, global_steps=42), + call.resume(tags=["kv_cache"]), + ] + # Filter to only resume and update_weights calls + actual = [c for c in rollout.mock_calls if c[0] in ("resume", "update_weights")] + assert actual == expected + + def test_global_steps_forwarded(self): + """Verify global_steps is passed through to update_weights.""" + rollout, engine = _make_mocks(peft_config=None) + + asyncio.run( + _update_weights( + rollout=rollout, + actor_engine=engine, + peft_merge=True, + base_sync_done=True, + free_cache_engine=True, + global_steps=99, + ) + ) + + assert rollout.update_weights.call_args.kwargs["global_steps"] == 99 + + def test_non_naive_backend_early_return(self): + """Non-naive checkpoint backend returns early, skips all LoRA logic.""" + peft_cfg = MagicMock() + rollout, engine = _make_mocks(peft_config=peft_cfg) + + 
asyncio.run( + _update_weights( + rollout=rollout, + actor_engine=engine, + peft_merge=False, + base_sync_done=False, + free_cache_engine=True, + checkpoint_backend="disaggregated", + ) + ) + + rollout.update_weights.assert_not_called() + rollout.resume.assert_not_called() + + def test_non_lora_probe_still_uses_base_sync_done_true(self): + """Non-LoRA path still probes with base_sync_done=True and sends a standard update.""" + rollout, engine = _make_mocks(peft_config=None) + + asyncio.run( + _update_weights( + rollout=rollout, + actor_engine=engine, + peft_merge=False, + base_sync_done=False, + free_cache_engine=True, + ) + ) + + engine.get_per_tensor_param.assert_called_once_with(layered_summon=False, base_sync_done=True) + assert rollout.update_weights.call_args.kwargs["base_sync_done"] is True diff --git a/verl/checkpoint_engine/base.py b/verl/checkpoint_engine/base.py index 6b3a7cd2584..abdea1076d5 100644 --- a/verl/checkpoint_engine/base.py +++ b/verl/checkpoint_engine/base.py @@ -22,6 +22,7 @@ from verl.single_controller.base.decorator import Dispatch, register from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup from verl.utils.distributed import initialize_global_process_group_ray +from verl.utils.import_utils import import_external_libs from verl.utils.ray_utils import auto_await from verl.workers.config import CheckpointEngineConfig, HFModelConfig, RolloutConfig from verl.workers.rollout import BaseRollout, RolloutReplica, get_rollout_class @@ -266,6 +267,9 @@ def __init__( backend = self.rollout_config.checkpoint_engine.backend bucket_size = self.rollout_config.checkpoint_engine.update_weights_bucket_megabytes << 20 engine_kwargs = self.rollout_config.checkpoint_engine.engine_kwargs.get(backend, {}) + # If custom_backend_module is set, import it so plugins can register + # in CheckpointEngineRegistry before the backend is instantiated. 
+ import_external_libs(self.rollout_config.checkpoint_engine.custom_backend_module or None) self.checkpoint_engine: CheckpointEngine = CheckpointEngineRegistry.new( backend, bucket_size=bucket_size, **engine_kwargs ) @@ -342,6 +346,7 @@ def __init__( ) -> None: self.config = config self.backend = config.backend + import_external_libs(self.config.custom_backend_module or None) self.backend_cls = CheckpointEngineRegistry.get(config.backend) self.trainer = trainer self.replicas = replicas diff --git a/verl/experimental/agent_loop/__init__.py b/verl/experimental/agent_loop/__init__.py index d43683df3e4..7c875407d13 100644 --- a/verl/experimental/agent_loop/__init__.py +++ b/verl/experimental/agent_loop/__init__.py @@ -12,10 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .agent_loop import AgentLoopBase, AgentLoopManager, AgentLoopWorker, AsyncLLMServerManager +from .agent_loop import ( + AgentLoopBase, + AgentLoopManager, + AgentLoopWorker, + AsyncLLMServerManager, +) +from .diffusion_agent_loop import DiffusionAgentLoopWorker from .single_turn_agent_loop import SingleTurnAgentLoop from .tool_agent_loop import ToolAgentLoop _ = [SingleTurnAgentLoop, ToolAgentLoop] -__all__ = ["AgentLoopBase", "AgentLoopManager", "AsyncLLMServerManager", "AgentLoopWorker"] +__all__ = ["AgentLoopBase", "AgentLoopManager", "AsyncLLMServerManager", "AgentLoopWorker", "DiffusionAgentLoopWorker"] diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index edd8d4defda..7dd814e1ef6 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -23,7 +23,6 @@ import numpy as np import ray import torch -import torch.nn.functional as F from cachetools import LRUCache from omegaconf import DictConfig, OmegaConf from PIL import Image @@ -48,7 +47,7 @@ from verl.utils.rollout_trace import RolloutTraceConfig, rollout_trace_attr, 
rollout_trace_op from verl.utils.tokenizer import normalize_token_ids from verl.workers.config import DistillationConfig, DistillationLossConfig, HFModelConfig, RolloutConfig -from verl.workers.rollout.replica import TokenOutput, get_rollout_replica_class +from verl.workers.rollout.replica import DiffusionOutput, TokenOutput, get_rollout_replica_class logger = logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) @@ -143,7 +142,8 @@ async def generate( sampling_params: dict[str, Any], image_data: Optional[list[Any]] = None, video_data: Optional[list[Any]] = None, - ) -> TokenOutput: + **kwargs: Any, + ) -> TokenOutput | DiffusionOutput: """Generate tokens from prompt ids. Args: @@ -152,16 +152,17 @@ async def generate( sampling_params (Dict[str, Any]): Sampling parameters for the chat completion. Returns: - TokenOutput: token output + TokenOutput | DiffusionOutput: token or diffusion output """ server_id, server = await self._acquire_server(request_id) try: - output: TokenOutput = await server.generate.remote( + output: TokenOutput | DiffusionOutput = await server.generate.remote( request_id=uuid4().hex, # use new request_id for each turn prompt_ids=prompt_ids, sampling_params=sampling_params, image_data=image_data, video_data=video_data, + **kwargs, ) return output finally: @@ -227,7 +228,7 @@ class _InternalAgentLoopOutput(AgentLoopOutput): routed_experts: Optional[torch.Tensor] = None """Padded routed experts for the total tokens.""" multi_modal_inputs: Optional[dict[str, torch.Tensor]] = None - """Multi-modal inputs for processors (e.g., pixel_values, image_grid_thw).""" + """Multi-modal inputs for processors (e.g. 
pixel_values, image_grid_thw, video_grid_thw).""" extra_fields: dict[str, Any] = {} """Extra fields for dynamic addition.""" @@ -431,20 +432,29 @@ def __init__( self.distillation_config = config.get("distillation", None) self.distillation_enabled = is_distillation_enabled(self.distillation_config) if self.distillation_enabled: - if teacher_servers is None: - raise ValueError("Distillation is enabled but no teacher servers provided.") - if teacher_load_balancer_handle is None: - raise ValueError("Distillation is enabled but no teacher load balancer provided.") self.distillation_config: DistillationConfig = omega_conf_to_dataclass(self.distillation_config) self.distillation_loss_config: DistillationLossConfig = self.distillation_config.distillation_loss - - # for recipe to change - if not hasattr(self, "teacher_server_manager"): - self.teacher_server_manager = AsyncLLMServerManager( - config, - teacher_servers, - load_balancer_handle=teacher_load_balancer_handle, - ) + self.stream_teacher_with_rollout = self.distillation_config.teacher_model.enable_resource_pool + + if self.stream_teacher_with_rollout: + if teacher_servers is None: + raise ValueError("Distillation streaming is enabled but no teacher servers were provided.") + if teacher_load_balancer_handle is None: + raise ValueError("Distillation streaming is enabled but no teacher load balancer was provided.") + if not hasattr(self, "teacher_server_manager"): + from verl.experimental.teacher_loop.teacher_manager import AsyncTeacherLLMServerManager + + self.teacher_server_manager = AsyncTeacherLLMServerManager( + config, + teacher_servers, + load_balancer_handle=teacher_load_balancer_handle, + distillation_config=self.distillation_config, + pad_token_id=self.model_config.tokenizer.pad_token_id, + ) + else: + self.teacher_server_manager = None + else: + self.stream_teacher_with_rollout = False # for recipe to change if not hasattr(self, "server_manager"): @@ -713,11 +723,18 @@ async def _agent_loop_postprocess(self, 
output, validate, **kwargs) -> _Internal output.extra_fields.pop("teacher_logprobs", None), ) if teacher_ids is not None and teacher_logprobs is not None: - left_pad_size = prompt_output["input_ids"].shape[1] - len(output.prompt_ids) - right_pad_size = response_output["input_ids"].shape[1] - len(output.response_ids) - padding = (0, 0, left_pad_size, right_pad_size) # pad the sequence dimension - teacher_ids = F.pad(teacher_ids, padding, value=self.tokenizer.pad_token_id).unsqueeze(0) - teacher_logprobs = F.pad(teacher_logprobs, padding, value=0.0).unsqueeze(0) + # TODO(wuxibin): remove padding and use tensordict. + from verl.experimental.teacher_loop.teacher_manager import _pad_teacher_outputs + + teacher_ids, teacher_logprobs = _pad_teacher_outputs( + teacher_ids, + teacher_logprobs, + prompt_width=prompt_output["input_ids"].shape[1], + response_width=response_output["input_ids"].shape[1], + prompt_length=len(output.prompt_ids), + response_length=len(output.response_ids), + pad_token_id=self.tokenizer.pad_token_id, + ) return _InternalAgentLoopOutput( prompt_ids=prompt_output["input_ids"], @@ -836,40 +853,11 @@ async def _compute_score(self, output, prompts, responses, attention_mask, input async def _compute_teacher_logprobs(self, output: AgentLoopOutput, prompt_ids, response_ids, validate): """Compute teacher logprobs for single sample.""" - if self.distillation_enabled and not validate: - # This assumes that the teacher processes multi-modal data in the same way as the student - multi_modal_data = output.multi_modal_data - images = multi_modal_data.get("images") - videos = multi_modal_data.get("videos") - - if self.distillation_config.teacher_model.inference.temperature != 1.0: - raise NotImplementedError("vLLM does not support temperature for prompt_logprobs.") - - num_logprobs = ( - self.distillation_loss_config.topk if self.distillation_loss_config.loss_settings.use_topk else 0 - ) - sampling_params = { - "max_tokens": 1, - "temperature": 
self.distillation_config.teacher_model.inference.temperature, - "prompt_logprobs": num_logprobs, - } - teacher_output = await self.teacher_server_manager.generate( - request_id=uuid4().hex, - prompt_ids=prompt_ids + response_ids, - sampling_params=sampling_params, - image_data=images, - video_data=videos, - ) - response_ids_ls, response_logprobs_ls = ( - teacher_output.extra_fields["prompt_ids"], - teacher_output.extra_fields["prompt_logprobs"], + if self.stream_teacher_with_rollout and not validate: + teacher_ids, teacher_logprobs = await self.teacher_server_manager.compute_teacher_logprobs_single( + sequence_ids=prompt_ids + response_ids, + multi_modal_data=output.multi_modal_data, ) - # Shapes: # S, (1 or K), where S is the response length, K is either 1 or topk depending on - # the distillation loss settings. - teacher_ids = torch.tensor(response_ids_ls, dtype=torch.int32) - teacher_logprobs = torch.tensor(response_logprobs_ls) - assert teacher_ids.shape[0] == teacher_logprobs.shape[0] == len(prompt_ids + response_ids) - output.extra_fields["teacher_ids"] = teacher_ids output.extra_fields["teacher_logprobs"] = teacher_logprobs @@ -892,7 +880,7 @@ def _postprocess( optional_outputs["rollout_log_probs"] = torch.cat([input.response_logprobs for input in inputs], dim=0) if inputs[0].routed_experts is not None: optional_outputs["routed_experts"] = torch.cat([input.routed_experts for input in inputs], dim=0) - if self.distillation_enabled and not validate: + if inputs[0].teacher_logprobs is not None and inputs[0].teacher_ids is not None: optional_outputs["teacher_logprobs"] = torch.cat([input.teacher_logprobs for input in inputs], dim=0) optional_outputs["teacher_ids"] = torch.cat([input.teacher_ids for input in inputs], dim=0) batch = TensorDict( @@ -934,6 +922,12 @@ def _postprocess( if any(mmi is not None for mmi in multi_modal_inputs_list): non_tensor_batch["multi_modal_inputs"] = np.array(multi_modal_inputs_list, dtype=object) + # if distillation is enabled but 
not streaming teacher with rollout, store multi-modal data for + # batched teacher logprob computation. + if self.distillation_enabled and not self.stream_teacher_with_rollout: + teacher_multi_modal_data = [input.multi_modal_data for input in inputs] + non_tensor_batch["teacher_multi_modal_data"] = np.array(teacher_multi_modal_data, dtype=object) + metrics = [input.metrics.model_dump() for input in inputs] # Collect extra fields from all inputs and convert them to np.ndarray # Keep a stable set of keys so downstream batch concat stays consistent across agent loops. @@ -1019,6 +1013,9 @@ def __init__( self.teacher_model_manager = teacher_model_manager self.distillation_enabled = is_distillation_enabled(self.config.get("distillation", None)) + self.stream_teacher_with_rollout = ( + self.distillation_enabled and self.config.distillation.teacher_model.enable_resource_pool + ) assert worker_group is not None or self.rollout_config.nnodes > 0, "nnodes must be > 0 in standalone mode" @@ -1026,7 +1023,12 @@ def __init__( if not hasattr(self, "rollout_replica_class"): self.rollout_replica_class = get_rollout_replica_class(self.rollout_config.name) if not hasattr(self, "agent_loop_workers_class"): - self.agent_loop_workers_class = ray.remote(AgentLoopWorker) + if OmegaConf.select(self.config, "actor_rollout_ref.model.model_type", default=None) == "diffusion_model": + from verl.experimental.agent_loop.diffusion_agent_loop import DiffusionAgentLoopWorker + + self.agent_loop_workers_class = ray.remote(DiffusionAgentLoopWorker) + else: + self.agent_loop_workers_class = ray.remote(AgentLoopWorker) @classmethod @auto_await @@ -1098,11 +1100,11 @@ async def _init_agent_loop_workers(self): load_balancer_handle = self.global_load_balancer servers = list(zip(self.server_addresses, self.server_handles, strict=True)) - if self.distillation_enabled: + if self.stream_teacher_with_rollout: teacher_server_handles = self.teacher_model_manager.server_handles teacher_server_addresses = 
self.teacher_model_manager.server_addresses teacher_servers = list(zip(teacher_server_addresses, teacher_server_handles, strict=True)) - teacher_load_balancer_handle = self.teacher_global_load_balancer + teacher_load_balancer_handle = self.teacher_model_manager.load_balancer_handle else: teacher_servers = None teacher_load_balancer_handle = None @@ -1132,11 +1134,6 @@ async def _init_global_load_balancer(self) -> None: server_actor_ids=self.server_addresses, max_cache_size=DEFAULT_ROUTING_CACHE_SIZE, ) - if self.distillation_enabled: - self.teacher_global_load_balancer = GlobalRequestLoadBalancer.remote( - server_actor_ids=self.teacher_model_manager.server_addresses, - max_cache_size=DEFAULT_ROUTING_CACHE_SIZE, - ) @auto_await async def generate_sequences(self, prompts: DataProto) -> DataProto: @@ -1148,6 +1145,9 @@ async def generate_sequences(self, prompts: DataProto) -> DataProto: Returns: DataProto: Output batch. """ + if self.stream_teacher_with_rollout: + await self.teacher_model_manager.wake_up() + spec_before = None if self.rollout_config.name == "vllm" and self.rollout_config.speculative_decoding.enable: try: @@ -1155,8 +1155,6 @@ async def generate_sequences(self, prompts: DataProto) -> DataProto: except Exception as e: print(f"speculative decoding unavailable: {e}") - if self.distillation_enabled: - await self.teacher_model_manager.wake_up() chunkes = prompts.chunk(len(self.agent_loop_workers)) outputs = await asyncio.gather( *[ @@ -1164,7 +1162,7 @@ async def generate_sequences(self, prompts: DataProto) -> DataProto: for worker, chunk in zip(self.agent_loop_workers, chunkes, strict=True) ] ) - if self.distillation_enabled: + if self.stream_teacher_with_rollout: await self.teacher_model_manager.sleep() output = DataProto.concat(outputs) @@ -1219,14 +1217,16 @@ def _performance_metrics(self, metrics: list[list[dict[str, str]]], output: Data # batch sequence generation is bounded by the slowest sample slowest = np.argmax(t_generate_sequences + 
t_tool_calls) - attention_mask = output.batch["attention_mask"][slowest] prompt_length = output.batch["prompts"].shape[1] timing["agent_loop/slowest/generate_sequences"] = t_generate_sequences[slowest] timing["agent_loop/slowest/tool_calls"] = t_tool_calls[slowest] - timing["agent_loop/slowest/prompt_length"] = attention_mask[:prompt_length].sum().item() - timing["agent_loop/slowest/response_length"] = attention_mask[prompt_length:].sum().item() timing["agent_loop/slowest/num_preempted"] = num_preempted[slowest] + if "attention_mask" in output.batch: + attention_mask = output.batch["attention_mask"][slowest] + timing["agent_loop/slowest/prompt_length"] = attention_mask[:prompt_length].sum().item() + timing["agent_loop/slowest/response_length"] = attention_mask[prompt_length:].sum().item() + return timing @auto_await diff --git a/verl/experimental/agent_loop/diffusion_agent_loop.py b/verl/experimental/agent_loop/diffusion_agent_loop.py new file mode 100644 index 00000000000..1d91c6f338c --- /dev/null +++ b/verl/experimental/agent_loop/diffusion_agent_loop.py @@ -0,0 +1,382 @@ +# Copyright 2026 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import asyncio +import random +from typing import Any, Optional + +import hydra +import numpy as np +import ray +import torch +import torch.nn.functional as F +from omegaconf import DictConfig, OmegaConf +from pydantic import BaseModel, ConfigDict +from tensordict import TensorDict + +from verl.experimental.agent_loop.agent_loop import ( + AgentLoopMetrics, + AsyncLLMServerManager, + DictConfigWrap, + _agent_loop_registry, + _get_rollout_and_model_config, +) +from verl.experimental.agent_loop.utils import resolve_config_path +from verl.protocol import DataProto +from verl.utils.config import omega_conf_to_dataclass +from verl.utils.dataset.rl_dataset import get_dataset_class +from verl.workers.config import DiffusionModelConfig, DiffusionRolloutConfig + + +class DiffusionAgentLoopOutput(BaseModel): + """Agent loop output.""" + + model_config = ConfigDict(arbitrary_types_allowed=True) + + prompt_ids: list[int] + """Prompt token ids.""" + response_diffusion_output: Any + """Response diffusion output (torch.Tensor): image tensor (CHW) / video tensor (TCHW).""" + response_logprobs: Optional[Any] = None + """Log probabilities for the response tokens. 
(torch.Tensor)""" + multi_modal_data: Optional[dict[str, Any]] = None + """Multi-modal data for multi-modal tools.""" + reward_score: Optional[float] = None + """Reward score for the trajectory.""" + num_turns: int = 0 + """Number of chat turns, including user, assistant, tool.""" + metrics: AgentLoopMetrics + """Auxiliary performance metrics""" + extra_fields: dict[str, Any] = {} + """Extra fields for dynamic addition.""" + + +class _InternalDiffusionAgentLoopOutput(DiffusionAgentLoopOutput): + """Internal agent loop output with padded sequences.""" + + model_config = ConfigDict(arbitrary_types_allowed=True) + + prompt_ids: torch.Tensor + """Padded prompt token ids.""" + response_diffusion_output: torch.Tensor + """Response diffusion output: image (NCHW format) / video (NTCHW format).""" + input_ids: torch.Tensor + """Padded input ids(prompt_ids).""" + attention_mask: torch.Tensor + """Padded attention mask.""" + response_logprobs: Optional[torch.Tensor] = None + """Log probabilities for the response tokens.""" + multi_modal_inputs: Optional[dict[str, torch.Tensor]] = None + """Multi-modal inputs for processors (e.g. pixel_values, image_grid_thw, video_grid_thw).""" + extra_fields: dict[str, Any] = {} + """Extra fields for dynamic addition.""" + + +class DiffusionAgentLoopWorker: + """Diffusion Agent loop worker takes a batch of messages and run each message in an agent loop. + + Args: + config (DictConfig): whole config for main entrypoint. + servers (list[tuple[str, ray.actor.ActorHandle]]): (address, handle) pairs for each LLM server. + load_balancer_handle (ray.actor.ActorHandle): shared global load balancer actor. + teacher_servers (list[tuple[str, ray.actor.ActorHandle]]): Not used. + teacher_load_balancer_handle (ray.actor.ActorHandle): Not used. + reward_loop_worker_handles (List[ray.actor.ActorHandle]): Actor handles for streaming reward computation. 
+ """ + + def __init__( + self, + config: DictConfig, + servers: list[tuple[str, ray.actor.ActorHandle]], + load_balancer_handle: ray.actor.ActorHandle, + teacher_servers: list[tuple[str, ray.actor.ActorHandle]] = None, + teacher_load_balancer_handle: ray.actor.ActorHandle = None, + reward_loop_worker_handles: list[ray.actor.ActorHandle] = None, + ): + self.config = config + rollout_config, model_config = _get_rollout_and_model_config(config) + self.rollout_config: DiffusionRolloutConfig = omega_conf_to_dataclass(rollout_config) + self.model_config: DiffusionModelConfig = omega_conf_to_dataclass(model_config) + + if not hasattr(self, "server_manager"): + self.server_manager = AsyncLLMServerManager( + config, + servers, + load_balancer_handle=load_balancer_handle, + ) + + self.dataset_cls = get_dataset_class(config.data) + self.reward_loop_worker_handles = reward_loop_worker_handles + + self.tokenizer = self.model_config.tokenizer + self.processor = self.model_config.processor + + self.max_prompt_embed_length = self.model_config.extra_configs.get( + "max_sequence_length", self.rollout_config.prompt_length + ) + + agent_loop_config_path = self.rollout_config.agent.agent_loop_config_path + if agent_loop_config_path: + resolved_path = resolve_config_path(agent_loop_config_path) + agent_loop_configs = OmegaConf.load(resolved_path) + for agent_loop_config in agent_loop_configs: + _agent_loop_registry[agent_loop_config.name] = agent_loop_config + if self.model_config.get("custom_chat_template", None) is not None: + if self.model_config.processor is not None: + self.model_config.processor.chat_template = self.model_config.custom_chat_template + self.model_config.tokenizer.chat_template = self.model_config.custom_chat_template + + async def generate_sequences(self, batch: DataProto) -> DataProto: + """Generate sequences from agent loop. + + Args: + batch (DataProto): Input batch. + + Returns: + DataProto: Output batch. 
+ - prompts: [bsz, prompt_length], prompt token ids from dataset. + - responses: diffusion output, typically [bsz, C, H, W] (image) or [bsz, T, C, H, W] (video). + - rm_scores (optional): [bsz, 1], reward model scores. + - meta_info: + - metrics: List[dict], per-sample agent loop metrics. + - reward_extra_keys (optional): List[str], keys for reward extra info for logging/validation. + ... + """ + config = self.rollout_config + + sampling_params = dict(self.model_config.extra_configs) + sampling_params.update( + height=config.height, + width=config.width, + num_inference_steps=config.num_inference_steps, + logprobs=config.calculate_log_probs, + ) + + # override sampling params for validation + if batch.meta_info.get("validate", False): + sampling_params["num_inference_steps"] = config.val_kwargs.num_inference_steps + sampling_params["seed"] = config.val_kwargs.seed + sampling_params["noise_level"] = config.val_kwargs.noise_level + + # by default, we assume it's a single turn agent + if "agent_name" not in batch.non_tensor_batch: + default_agent_loop = config.agent.default_agent_loop + batch.non_tensor_batch["agent_name"] = np.array([default_agent_loop] * len(batch), dtype=object) + + tasks = [] + for i in range(len(batch)): + kwargs = {k: v[i] for k, v in batch.non_tensor_batch.items()} + tasks.append(asyncio.create_task(self._run_agent_loop(sampling_params, **kwargs))) + outputs = await asyncio.gather(*tasks) + + output = self._postprocess(outputs, input_non_tensor_batch=batch.non_tensor_batch) + + return output + + async def _run_agent_loop( + self, + sampling_params: dict[str, Any], + *, + agent_name: str, + **kwargs, + ) -> _InternalDiffusionAgentLoopOutput: + assert agent_name in _agent_loop_registry, ( + f"Agent loop {agent_name} not registered, registered agent loops: {_agent_loop_registry.keys()}" + ) + + agent_loop_config = _agent_loop_registry[agent_name] + agent_loop = hydra.utils.instantiate( + config=agent_loop_config, + 
trainer_config=DictConfigWrap(config=self.config),
+ server_manager=self.server_manager,
+ tokenizer=self.tokenizer,
+ processor=self.processor,
+ dataset_cls=self.dataset_cls,
+ data_config=DictConfigWrap(self.config.data),
+ )
+ output: DiffusionAgentLoopOutput = await agent_loop.run(sampling_params, **kwargs)
+ return await self._agent_loop_postprocess(output, **kwargs)
+
+ async def _agent_loop_postprocess(self, output, **kwargs) -> _InternalDiffusionAgentLoopOutput:
+ """Perform post-processing operations on the output of each individual agent loop."""
+ # handling extra tensor outputs from vllm-omni, like prompt embedding, etc.
+ extra_fields = {}
+ for k, v in output.extra_fields.items():
+ if isinstance(v, torch.Tensor):
+ # handle prompt embedding padding
+ # TODO (andy): reduce padding length for more efficiency
+ if k in ["prompt_embeds", "negative_prompt_embeds"]:
+ pad_tuple = (0, 0, 0, self.max_prompt_embed_length - v.shape[0])
+ v = F.pad(v, pad_tuple, value=0)
+ elif k in ["prompt_embeds_mask", "negative_prompt_embeds_mask"]:
+ pad_tuple = (0, self.max_prompt_embed_length - v.shape[0])
+ v = F.pad(v, pad_tuple, value=0)
+ extra_fields[k] = v.unsqueeze(0)
+ else:
+ extra_fields[k] = v
+
+ extra_fields["raw_prompt"] = kwargs["raw_prompt"]
+
+ self.tokenizer.padding_side = "left"
+ prompt_output = self.tokenizer.pad(
+ {"input_ids": output.prompt_ids},
+ padding="max_length",
+ max_length=self.rollout_config.prompt_length,
+ return_tensors="pt",
+ return_attention_mask=True,
+ )
+ if prompt_output["input_ids"].dim() == 1:
+ prompt_output["input_ids"] = prompt_output["input_ids"].unsqueeze(0)
+ prompt_output["attention_mask"] = prompt_output["attention_mask"].unsqueeze(0)
+
+ response_diffusion_output = output.response_diffusion_output.unsqueeze(0)
+
+ response_logprobs = None
+ if output.response_logprobs is not None:
+ response_logprobs = output.response_logprobs.unsqueeze(0)
+
+ attention_mask = prompt_output["attention_mask"]
+ input_ids = 
prompt_output["input_ids"] + + await self._compute_score( + output, + prompts=input_ids, + responses=response_diffusion_output, + attention_mask=attention_mask, + input_ids=input_ids, + kwargs=kwargs, + ) + + if "reward_extra_info" in output.extra_fields: + extra_fields["reward_extra_info"] = output.extra_fields["reward_extra_info"] + + return _InternalDiffusionAgentLoopOutput( + prompt_ids=input_ids, + response_diffusion_output=response_diffusion_output, + input_ids=input_ids, + attention_mask=attention_mask, + response_logprobs=response_logprobs, + multi_modal_data=output.multi_modal_data, + reward_score=output.reward_score, + num_turns=output.num_turns, + metrics=output.metrics, + extra_fields=extra_fields, + ) + + async def _compute_score(self, output, prompts, responses, attention_mask, input_ids, kwargs): + """Compute reward score for single sample.""" + enable_async_reward = self.reward_loop_worker_handles is not None + + if output.reward_score is None and enable_async_reward: + batch = TensorDict( + { + "prompts": prompts, # [1, prompt_length] + "responses": responses, # [1, C, H, W] or [1, T, C, H, W] + "attention_mask": attention_mask, # [1, prompt_length] + "input_ids": input_ids, # [1, prompt_length] + }, + batch_size=1, + ) + non_tensor_batch = { + **{k: np.array([v]) for k, v in kwargs.items()}, + "__num_turns__": np.array([output.num_turns]), + "tool_extra_fields": np.array([output.extra_fields], dtype=object), + } + + data = DataProto( + batch=batch, + non_tensor_batch=non_tensor_batch, + ) + selected_reward_loop_worker_handle = random.choice(self.reward_loop_worker_handles) + result = await selected_reward_loop_worker_handle.compute_score.remote(data) + output.reward_score = result["reward_score"] + output.extra_fields["reward_extra_info"] = result["reward_extra_info"] + + def _postprocess( + self, + inputs: list[_InternalDiffusionAgentLoopOutput], + input_non_tensor_batch: dict | None = None, + ) -> DataProto: + """Process the padded outputs from 
_run_agent_loop and combine them into a batch.""" + # Convert lists back to tensors and stack them to create a batch. + prompt_ids = torch.cat([input.prompt_ids for input in inputs], dim=0) + response_diffusion_output = torch.cat([input.response_diffusion_output for input in inputs], dim=0) + attention_mask = torch.cat([input.attention_mask for input in inputs], dim=0) + input_ids = torch.cat([input.input_ids for input in inputs], dim=0) + optional_outputs = {} + if inputs[0].response_logprobs is not None: + optional_outputs["rollout_log_probs"] = torch.cat([input.response_logprobs for input in inputs], dim=0) + + # Handle extra fields that are tensors + extra_keys = [k for k, v in inputs[0].extra_fields.items() if isinstance(v, torch.Tensor)] + for key in extra_keys: + optional_outputs[key] = torch.cat([input.extra_fields[key] for input in inputs], dim=0) + for input in inputs: + del input.extra_fields[key] + + batch = TensorDict( + { + "prompts": prompt_ids, # [bsz, prompt_length] + "responses": response_diffusion_output, # [bsz, C, H, W] or [bsz, T, C, H, W] + "input_ids": input_ids, # [bsz, prompt_length] + "attention_mask": attention_mask, # [bsz, prompt_length] + **optional_outputs, + }, + batch_size=len(inputs), + ) + + scores = [input.reward_score for input in inputs] + if all(score is not None for score in scores): + rm_scores = torch.tensor(scores, dtype=torch.float32).unsqueeze(-1) + batch["rm_scores"] = rm_scores + + non_tensor_batch = { + "__num_turns__": np.array([input.num_turns for input in inputs], dtype=np.int32), + } + if self.reward_loop_worker_handles is None and input_non_tensor_batch: + non_tensor_batch.update(input_non_tensor_batch) + + # add reward_extra_info to non_tensor_batch + reward_extra_infos = [input.extra_fields.get("reward_extra_info", {}) for input in inputs] + reward_extra_keys = list(reward_extra_infos[0].keys()) + for key in reward_extra_keys: + non_tensor_batch[key] = np.array([info[key] for info in reward_extra_infos]) + + # 
Add multi_modal_inputs to non_tensor_batch if any samples have them + multi_modal_inputs_list = [input.multi_modal_inputs for input in inputs] + if any(mmi is not None for mmi in multi_modal_inputs_list): + non_tensor_batch["multi_modal_inputs"] = np.array(multi_modal_inputs_list, dtype=object) + + metrics = [input.metrics.model_dump() for input in inputs] + # Collect extra fields from all inputs and convert them to np.ndarray + extra_fields = {} + all_keys = set(key for input_item in inputs for key in input_item.extra_fields) + for key in all_keys: + temp_arr = np.empty(len(inputs), dtype=object) + temp_arr[:] = [input.extra_fields.get(key) for input in inputs] + extra_fields[key] = temp_arr + + non_tensor_batch.update(extra_fields) + + # Only include reward_extra_keys in meta_info if rm_scores is in batch + # This avoids conflicts when reward_tensor is merged later in ray_trainer.py + if "rm_scores" in batch.keys(): + meta_info = {"metrics": metrics, "reward_extra_keys": reward_extra_keys} + else: + meta_info = {"metrics": metrics} + + return DataProto( + batch=batch, + non_tensor_batch=non_tensor_batch, + meta_info=meta_info, + ) diff --git a/verl/experimental/agent_loop/single_turn_agent_loop.py b/verl/experimental/agent_loop/single_turn_agent_loop.py index d45082f5fa1..27c0d46a43c 100644 --- a/verl/experimental/agent_loop/single_turn_agent_loop.py +++ b/verl/experimental/agent_loop/single_turn_agent_loop.py @@ -16,8 +16,14 @@ from typing import Any from uuid import uuid4 +import torch +from PIL import Image + from verl.experimental.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput, register +from verl.experimental.agent_loop.diffusion_agent_loop import DiffusionAgentLoopOutput +from verl.utils.chat_template import apply_chat_template from verl.utils.profiler import simple_timer +from verl.utils.tokenizer import normalize_token_ids from verl.workers.rollout.replica import TokenOutput logger = logging.getLogger(__file__) @@ -82,3 +88,110 @@ async def 
run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutpu output.extra_fields.update({"turn_scores": [], "tool_rewards": []}) return output + + +@register("diffusion_single_turn_agent") +class DiffusionSingleTurnAgentLoop(AgentLoopBase): + """Agent loop for diffusion model serving.""" + + # Keys from non_tensor_batch that are pipeline/dataset metadata and must + # NOT be forwarded to server_manager.generate() (which passes **kwargs + # down to the vllm-omni server that has a fixed signature). + _KEYS_EXCLUDED_FROM_GENERATE = frozenset( + { + "raw_prompt", + "raw_negative_prompt", + "data_source", + "reward_model", + "index", + } + ) + + async def apply_chat_template( + self, + messages: list[dict], + tools: list[dict] | None = None, + images: list[Image.Image] | None = None, + videos: list[tuple[torch.Tensor, dict]] | None = None, + remove_system_prompt: bool = False, + ) -> list[int]: + """Tokenize on the asyncio thread for fast tokenizers when no processor is used. + + Rust-backed fast tokenizers are not reliably safe across ``run_in_executor`` thread + boundaries with recent transformers (``RuntimeError: Already borrowed``). The diffusion + path is tokenizer-only for Qwen-Image-style models; keep tokenization on the event-loop + thread in that case. 
+ """ + if self.processor is not None: + return await super().apply_chat_template( + messages, + tools=tools, + images=images, + videos=videos, + remove_system_prompt=remove_system_prompt, + ) + if getattr(self.tokenizer, "is_fast", False): + tokenized_prompt = apply_chat_template( + self.tokenizer, + messages, + tools=tools, + add_generation_prompt=True, + tokenize=True, + **self.apply_chat_template_kwargs, + ) + prompt_ids = normalize_token_ids(tokenized_prompt) + if remove_system_prompt: + prompt_ids = prompt_ids[len(self.system_prompt) :] + return prompt_ids + return await super().apply_chat_template( + messages, + tools=tools, + images=images, + videos=videos, + remove_system_prompt=remove_system_prompt, + ) + + async def run(self, sampling_params: dict[str, Any], **kwargs) -> DiffusionAgentLoopOutput: + raw_prompt = kwargs.pop("raw_prompt") + raw_negative_prompt = kwargs.pop("raw_negative_prompt", None) + for key in self._KEYS_EXCLUDED_FROM_GENERATE: + kwargs.pop(key, None) + + # 1. extract images and videos from messages + multi_modal_data = await self.process_vision_info(raw_prompt) + images = multi_modal_data.get("images") + videos = multi_modal_data.get("videos") + + # 2. apply chat template and tokenize + prompt_ids = await self.apply_chat_template(raw_prompt, images=images, videos=videos) + + if raw_negative_prompt is not None: + negative_prompt_ids = await self.apply_chat_template(raw_negative_prompt, images=images, videos=videos) + else: + negative_prompt_ids = None + + # 3. 
generate sequences + metrics = {} + with simple_timer("generate_sequences", metrics): + output = await self.server_manager.generate( + request_id=uuid4().hex, + prompt_ids=prompt_ids, + sampling_params=sampling_params, + image_data=images, + video_data=videos, + negative_prompt_ids=negative_prompt_ids, + **kwargs, + ) + if metrics.get("num_preempted") is None: + metrics["num_preempted"] = output.num_preempted if output.num_preempted is not None else -1 + + output = DiffusionAgentLoopOutput( + prompt_ids=prompt_ids, + response_diffusion_output=output.diffusion_output, + response_logprobs=output.log_probs, + multi_modal_data=multi_modal_data, + num_turns=2, + metrics=metrics, + extra_fields=output.extra_fields, + ) + return output diff --git a/verl/experimental/fully_async_policy/agent_loop/agent_loop.py b/verl/experimental/fully_async_policy/agent_loop/agent_loop.py index f1df5cdf17a..2502de62fff 100644 --- a/verl/experimental/fully_async_policy/agent_loop/agent_loop.py +++ b/verl/experimental/fully_async_policy/agent_loop/agent_loop.py @@ -33,6 +33,7 @@ from verl.utils.rollout_trace import ( rollout_trace_op, ) +from verl.utils.tokenizer import normalize_token_ids logger = logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) @@ -65,6 +66,8 @@ async def generate( Returns: TokenOutput: token output """ + prompt_ids = normalize_token_ids(prompt_ids) + limit_key = None if "max_tokens" in sampling_params: limit_key = "max_tokens" diff --git a/verl/experimental/fully_async_policy/detach_utils.py b/verl/experimental/fully_async_policy/detach_utils.py index 47c9d4d05c1..8cd7eec91f8 100644 --- a/verl/experimental/fully_async_policy/detach_utils.py +++ b/verl/experimental/fully_async_policy/detach_utils.py @@ -71,9 +71,7 @@ def prepare_single_generation_data(batch_dict, config) -> DataProto: ) # Setting selected agent, that supports partial - if config.actor_rollout_ref.rollout.multi_turn.enable: - full_batch.non_tensor_batch["agent_name"] = 
np.array(["tool_agent"] * len(full_batch), dtype=object) - else: + if not config.actor_rollout_ref.rollout.multi_turn.enable: full_batch.non_tensor_batch["agent_name"] = np.array(["single_turn_agent"] * len(full_batch), dtype=object) # Add global step count to generated data diff --git a/verl/experimental/fully_async_policy/shell/dapo_30b_a3b_math_fsdp_npu.sh b/verl/experimental/fully_async_policy/shell/dapo_30b_a3b_math_fsdp_npu.sh new file mode 100644 index 00000000000..53cf0005256 --- /dev/null +++ b/verl/experimental/fully_async_policy/shell/dapo_30b_a3b_math_fsdp_npu.sh @@ -0,0 +1,168 @@ +#!/usr/bin/env bash +set -xeuo pipefail + +TRAIN_FILE=dapo-math-17k.parquet +TEST_FILE=aime-2024.parquet +CKPTS_DIR=./ckpt +MODEL_ID=${MODEL_ID:-Qwen/Qwen3-30B-A3B} +MODEL_PATH=Qwen3-30B-A3B + +rollout_mode="async" +rollout_name="vllm" +if [ "$rollout_mode" = "async" ]; then + export VLLM_USE_V1=1 + return_raw_chat="True" +fi + +# Fully async specific parameters +n_gpus_rollout=16 +n_gpus_training=16 +n_nodes_rollout=1 +n_nodes_train=1 + +# Algorithm parameters +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +# Response length parameters +max_prompt_length=$((1024 * 2)) +max_response_length=$((1024 * 6)) +enable_overlong_buffer=False +overlong_buffer_len=$((1024 * 6)) +overlong_penalty_factor=1.0 + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 +val_top_p=0.7 + +train_prompt_bsz=0 +gen_prompt_bsz=1 +n_resp_per_prompt=4 +train_prompt_mini_bsz=16 +total_rollout_steps=$(((32*100))) +test_freq=25 +staleness_threshold=0.75 + +trigger_parameter_sync_step=2 +partial_rollout=True +enforce_eager=False +nccl_timeout=7200 + +# Performance Related Parameter +sp_size=8 +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) / 2)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) / 2)) +ref_offload=True +actor_offload=False +recompute=True 
+max_num_seqs=128 +loss_agg_mode="token-mean" +gen_tp=2 +fsdp_size=-1 + + +python3 -m verl.experimental.fully_async_policy.fully_async_main \ + --config-path=config \ + --config-name='fully_async_ppo_trainer.yaml' \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.train_batch_size=${train_prompt_bsz} \ + data.gen_batch_size=${gen_prompt_bsz} \ + data.return_raw_chat=${return_raw_chat} \ + actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + actor_rollout_ref.rollout.free_cache_engine=True \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.fsdp_config.strategy=fsdp \ + actor_rollout_ref.nccl_timeout=${nccl_timeout} \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.hybrid_engine=False \ + +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + +actor_rollout_ref.model.override_config.attention_dropout=0. 
\ + +actor_rollout_ref.model.override_config.embd_pdrop=0. \ + +actor_rollout_ref.model.override_config.resid_pdrop=0. \ + actor_rollout_ref.model.enable_gradient_checkpointing=${recompute} \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=${actor_offload} \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=${actor_offload} \ + actor_rollout_ref.actor.fsdp_config.forward_prefetch=False \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.expert_parallel_size=${gen_tp} \ + actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.max_num_seqs=${max_num_seqs} \ + +actor_rollout_ref.rollout.enable_sleep_mode=False \ + actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.enforce_eager=${enforce_eager} \ + actor_rollout_ref.actor.use_torch_compile=False \ + actor_rollout_ref.ref.use_torch_compile=False \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.ref.fsdp_config.param_offload=${ref_offload} \ + 
actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + actor_rollout_ref.ref.fsdp_config.forward_prefetch=False \ + actor_rollout_ref.rollout.name=${rollout_name} \ + actor_rollout_ref.rollout.mode=${rollout_mode} \ + reward.reward_manager.name=dapo \ + +reward.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \ + +reward.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \ + +reward.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \ + +reward.reward_kwargs.overlong_buffer_cfg.log=False \ + +reward.reward_kwargs.max_resp_len=${max_response_length} \ + trainer.logger=['console'] \ + trainer.val_before_train=False \ + trainer.test_freq="${test_freq}" \ + trainer.save_freq=50 \ + trainer.resume_mode=auto \ + trainer.nnodes="${n_nodes_train}" \ + trainer.n_gpus_per_node="${n_gpus_training}" \ + rollout.nnodes="${n_nodes_rollout}" \ + rollout.n_gpus_per_node="${n_gpus_rollout}" \ + rollout.total_rollout_steps="${total_rollout_steps}" \ + trainer.total_epochs=10 \ + trainer.total_training_steps=100 \ + trainer.default_local_dir="${CKPTS_DIR}" \ + async_training.staleness_threshold="${staleness_threshold}" \ + async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \ + async_training.partial_rollout="${partial_rollout}" \ + trainer.device=npu diff --git a/verl/experimental/reward_loop/reward_loop.py b/verl/experimental/reward_loop/reward_loop.py index ceee8441fa2..bd8c82bcfc6 100644 --- a/verl/experimental/reward_loop/reward_loop.py +++ b/verl/experimental/reward_loop/reward_loop.py @@ -21,13 +21,16 @@ import ray import torch from omegaconf import DictConfig, open_dict +from PIL import Image from tensordict import TensorDict from verl.protocol import DataProto from verl.single_controller.ray.base import RayResourcePool from verl.trainer.ppo.reward import load_reward_manager from verl.utils import hf_tokenizer +from 
verl.utils.experimental.reward_utils import pil_image_to_base64, prepare_query_for_multi_modal from verl.utils.fs import copy_to_local +from verl.utils.ray_utils import get_event_loop from .reward_model import RewardModelManager @@ -114,9 +117,13 @@ def __init__(self, config: DictConfig, reward_router_address: str = None): self.config = config self.reward_router_address = reward_router_address self._init_reward_fn() + self.loop = get_event_loop() def _init_reward_fn(self): - input_tokenizer_local_path = copy_to_local(self.config.actor_rollout_ref.model.path) + input_tokenizer_path = self.config.actor_rollout_ref.model.tokenizer_path + if input_tokenizer_path is None: + input_tokenizer_path = self.config.actor_rollout_ref.model.path + input_tokenizer_local_path = copy_to_local(input_tokenizer_path) self.input_tokenizer = hf_tokenizer(input_tokenizer_local_path, trust_remote_code=True) self.reward_model_tokenizer = None if self.config.reward.reward_model.enable: @@ -199,17 +206,32 @@ async def _preprocess_reward_inputs(self, data: DataProto) -> str: chat: list = list(data_item.non_tensor_batch["raw_prompt"]) # extract response - response_ids = data_item.batch["responses"] - response_length = response_ids.shape[-1] - valid_response_length = data_item.batch["attention_mask"][-response_length:].sum() - valid_response_ids = response_ids[:valid_response_length] + response = data_item.batch["responses"] + if response.ndim == 3: + # handling multi-modal response + response_image = response + if isinstance(response_image, torch.Tensor): + response_image = response_image.float().permute(1, 2, 0).cpu().numpy() + assert response_image.shape[-1] == 3, "must be in HWC format" + response_image = (response_image * 255).round().clip(0, 255).astype(np.uint8) + response_image = Image.fromarray(response_image) + + image_base64 = await self.loop.run_in_executor(None, pil_image_to_base64, response_image) + query = prepare_query_for_multi_modal(image_base64) + + chat.append({"role": 
"assistant", "content": query}) + else: + response_ids = response + response_length = response_ids.shape[-1] + valid_response_length = data_item.batch["attention_mask"][-response_length:].sum() + valid_response_ids = response_ids[:valid_response_length] - # decode - rollout_response = self.input_tokenizer.decode(valid_response_ids) - # remove bos and eos - rollout_response = rollout_response.replace(self.input_tokenizer.eos_token, "") + # decode + rollout_response = self.input_tokenizer.decode(valid_response_ids) + # remove bos and eos + rollout_response = rollout_response.replace(self.input_tokenizer.eos_token, "") - chat.append({"role": "assistant", "content": rollout_response}) + chat.append({"role": "assistant", "content": rollout_response}) rm_prompt = self.reward_model_tokenizer.apply_chat_template( chat, @@ -320,12 +342,16 @@ def compute_rm_score(self, data: DataProto) -> DataProto: # compute rm score scores = [item["reward_score"] for item in outputs_flat] - prompt_length = data.batch["prompts"].size(1) - valid_response_length = data.batch["attention_mask"][:, prompt_length:].sum(dim=1) - rm_scores = torch.zeros_like(data.batch["responses"], dtype=torch.float32) - rm_scores[torch.arange(rm_scores.size(0)), valid_response_length - 1] = torch.tensor( - scores, dtype=torch.float32 - ) + if self.config.reward.reward_manager.name == "visual": + # visual reward only has one score for the whole response + rm_scores = torch.tensor(scores, dtype=torch.float32).unsqueeze(-1) + else: + prompt_length = data.batch["prompts"].size(1) + valid_response_length = data.batch["attention_mask"][:, prompt_length:].sum(dim=1) + rm_scores = torch.zeros_like(data.batch["responses"], dtype=torch.float32) + rm_scores[torch.arange(rm_scores.size(0)), valid_response_length - 1] = torch.tensor( + scores, dtype=torch.float32 + ) batch = TensorDict({"rm_scores": rm_scores}, batch_size=len(data)) reward_extra_infos = [output.get("reward_extra_info", {}) for output in outputs_flat] diff 
--git a/verl/experimental/reward_loop/reward_manager/__init__.py b/verl/experimental/reward_loop/reward_manager/__init__.py index f72b8c89ce6..236620fc843 100644 --- a/verl/experimental/reward_loop/reward_manager/__init__.py +++ b/verl/experimental/reward_loop/reward_manager/__init__.py @@ -18,6 +18,7 @@ from .naive import NaiveRewardManager from .limited import RateLimitedRewardManager from .remote import RemoteRewardManager +from .visual import VisualRewardManager __all__ = [ "DAPORewardManager", @@ -25,6 +26,7 @@ "NaiveRewardManager", "RateLimitedRewardManager", "RemoteRewardManager", + "VisualRewardManager", "register", "get_reward_manager_cls", ] diff --git a/verl/experimental/reward_loop/reward_manager/visual.py b/verl/experimental/reward_loop/reward_manager/visual.py new file mode 100644 index 00000000000..33ef6f86cbd --- /dev/null +++ b/verl/experimental/reward_loop/reward_manager/visual.py @@ -0,0 +1,92 @@ +# Copyright 2026 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import inspect + +from verl import DataProto +from verl.experimental.reward_loop.reward_manager import register +from verl.experimental.reward_loop.reward_manager.base import RewardManagerBase +from verl.utils.reward_score import default_compute_score_image + + +@register("visual") +class VisualRewardManager(RewardManagerBase): + """The reward manager for visual response.""" + + def __init__(self, config, tokenizer, compute_score, reward_router_address=None, reward_model_tokenizer=None): + super().__init__(config, tokenizer, compute_score) + self.compute_score = compute_score or default_compute_score_image + self.is_async_reward_score = inspect.iscoroutinefunction(self.compute_score) + self.reward_router_address = reward_router_address + self.reward_model_tokenizer = reward_model_tokenizer + + async def run_single(self, data: DataProto) -> dict: + assert len(data) == 1, "Only support single data item" + data_item = data[0] + response_visual = data_item.batch["responses"] + data_source = data_item.non_tensor_batch["data_source"] + ground_truth = data_item.non_tensor_batch["reward_model"]["ground_truth"] + extra_info = data_item.non_tensor_batch.get("extra_info", {}) + tool_extra_fields = data_item.non_tensor_batch.get("tool_extra_fields", None) + if tool_extra_fields is not None: + extra_info.update(tool_extra_fields.items()) + + num_turns = data_item.non_tensor_batch.get("__num_turns__", None) + rollout_reward_scores = data_item.non_tensor_batch.get("reward_scores", {}) + extra_info["num_turns"] = num_turns + extra_info["rollout_reward_scores"] = rollout_reward_scores + + extra_reward_kwargs = ( + { + "reward_router_address": self.reward_router_address, + "reward_model_tokenizer": self.reward_model_tokenizer, + "model_name": self.config.reward.reward_model.model_path, + } + if self.reward_router_address is not None + else {} + ) + if self.is_async_reward_score: + result = await self.compute_score( + data_source=data_source, + solution_image=response_visual, + 
ground_truth=ground_truth, + extra_info=extra_info, + **extra_reward_kwargs, + ) + else: + result = await self.loop.run_in_executor( + None, + lambda: self.compute_score( + data_source=data_source, + solution_image=response_visual, + ground_truth=ground_truth, + extra_info=extra_info, + **extra_reward_kwargs, + ), + ) + + reward_extra_info = {} + + score: float + if isinstance(result, dict): + score = result["score"] + for key, value in result.items(): + reward_extra_info[key] = value + else: + score = result + reward_extra_info["acc"] = score + + reward = score + + return {"reward_score": reward, "reward_extra_info": reward_extra_info} diff --git a/verl/experimental/teacher_loop/teacher_manager.py b/verl/experimental/teacher_loop/teacher_manager.py new file mode 100644 index 00000000000..28a89262e96 --- /dev/null +++ b/verl/experimental/teacher_loop/teacher_manager.py @@ -0,0 +1,175 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import asyncio +from typing import Any, Optional +from uuid import uuid4 + +import ray +import torch +from omegaconf import DictConfig +from tensordict import TensorDict +from torch.nn import functional as F + +from verl.experimental.agent_loop import AsyncLLMServerManager +from verl.protocol import DataProto +from verl.utils.config import omega_conf_to_dataclass +from verl.utils.tokenizer import normalize_token_ids +from verl.workers.config import DistillationConfig, DistillationLossConfig + + +def _get_teacher_sampling_params( + distillation_config: DistillationConfig, + distillation_loss_config: DistillationLossConfig, +) -> dict[str, Any]: + """Get sampling parameters for teacher model when computing log probabilities for distillation.""" + if distillation_config.teacher_model.inference.temperature != 1.0: + raise NotImplementedError("vLLM does not support temperature for prompt_logprobs.") + + num_logprobs = distillation_loss_config.topk if distillation_loss_config.loss_settings.use_topk else 0 + return { + "max_tokens": 1, + "temperature": distillation_config.teacher_model.inference.temperature, + "prompt_logprobs": num_logprobs, + } + + +def _pad_teacher_outputs( + teacher_ids: torch.Tensor, + teacher_logprobs: torch.Tensor, + prompt_width: int, + response_width: int, + prompt_length: int, + response_length: int, + pad_token_id: int, +) -> tuple[torch.Tensor, torch.Tensor]: + # TODO(wuxibin): remove padding and use tensordict. + left_pad_size = prompt_width - prompt_length + right_pad_size = response_width - response_length + padding = (0, 0, left_pad_size, right_pad_size) + return ( + F.pad(teacher_ids, padding, value=pad_token_id).unsqueeze(0), + F.pad(teacher_logprobs, padding, value=0.0).unsqueeze(0), + ) + + +def _unpad_teacher_inputs(data: DataProto) -> tuple[list[int], int, int]: + """Unpad valid sequence ids and prompt/response lengths from a single sample. + The sample is a left-padded prompt concatenated with a right-padded response. 
+ TODO(wuxibin): remove padding and use tensordict. + """ + assert len(data) == 1, "Teacher logprob computation expects a single sample" + + input_ids = data.batch["input_ids"][0] + attention_mask = data.batch["attention_mask"][0] + prompt_width = data.batch["prompts"][0].shape[0] + response_width = data.batch["responses"][0].shape[0] + assert attention_mask.shape[0] == prompt_width + response_width, ( + "attention_mask sequence length must match prompt and response widths" + ) + valid_prompt_length = int(attention_mask[:prompt_width].sum().item()) + valid_response_length = int(attention_mask[-response_width:].sum().item()) + prompt_num_padding = prompt_width - valid_prompt_length + sequence_ids = input_ids[prompt_num_padding : prompt_width + valid_response_length] + sequence_ids = normalize_token_ids(sequence_ids) + return sequence_ids, valid_prompt_length, valid_response_length + + +class AsyncTeacherLLMServerManager(AsyncLLMServerManager): + """Teacher-specific async client used for distillation logprob computation.""" + + def __init__( + self, + config: DictConfig, + servers: list[tuple[str, ray.actor.ActorHandle]], + load_balancer_handle: ray.actor.ActorHandle, + distillation_config: DictConfig | DistillationConfig, + pad_token_id: int, + ): + super().__init__(config=config, servers=servers, load_balancer_handle=load_balancer_handle) + if isinstance(distillation_config, DistillationConfig): + self.distillation_config = distillation_config + else: + self.distillation_config: DistillationConfig = omega_conf_to_dataclass(distillation_config) + self.distillation_loss_config: DistillationLossConfig = self.distillation_config.distillation_loss + self.pad_token_id = pad_token_id + + async def compute_teacher_logprobs_single( + self, + sequence_ids: list[int], + multi_modal_data: Optional[dict[str, Any]] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + """Compute teacher log probabilities for a single unpadded sequence.""" + multi_modal_data = multi_modal_data or 
{} + teacher_output = await self.generate( + request_id=uuid4().hex, + prompt_ids=sequence_ids, + sampling_params=_get_teacher_sampling_params(self.distillation_config, self.distillation_loss_config), + image_data=multi_modal_data.get("images"), + video_data=multi_modal_data.get("videos"), + ) + # Shapes: # S, (1 or K), where S is the response length, K is either 1 or topk depending on + # the distillation loss settings. + teacher_ids = torch.tensor(teacher_output.extra_fields["prompt_ids"], dtype=torch.int32) + teacher_logprobs = torch.tensor(teacher_output.extra_fields["prompt_logprobs"]) + assert teacher_ids.shape[0] == teacher_logprobs.shape[0] == len(sequence_ids) + return teacher_ids, teacher_logprobs + + async def compute_teacher_logprobs_batch(self, data: DataProto) -> DataProto: + """Compute teacher log probabilities for a batch of prompt-response pairs.""" + multi_modal_data_batch = data.non_tensor_batch.get("teacher_multi_modal_data") + tasks = [] + lengths = [] + prompt_width = data.batch["prompts"].shape[1] + response_width = data.batch["responses"].shape[1] + + # Compute logprobs for each sample in the batch + for i in range(len(data)): + item = data[i : i + 1] + sequence_ids, prompt_length, response_length = _unpad_teacher_inputs(item) + multi_modal_data = None if multi_modal_data_batch is None else multi_modal_data_batch[i] + lengths.append((prompt_length, response_length)) + tasks.append( + asyncio.create_task( + self.compute_teacher_logprobs_single( + sequence_ids=sequence_ids, + multi_modal_data=multi_modal_data, + ) + ) + ) + outputs = await asyncio.gather(*tasks) + + # Pad the teacher logprobs and ids + padded_teacher_ids = [] + padded_teacher_logprobs = [] + for (teacher_ids, teacher_logprobs), (prompt_length, response_length) in zip(outputs, lengths, strict=True): + padded_ids, padded_logprobs = _pad_teacher_outputs( + teacher_ids, + teacher_logprobs, + prompt_width=prompt_width, + response_width=response_width, + prompt_length=prompt_length, 
+ response_length=response_length, + pad_token_id=self.pad_token_id, + ) + padded_teacher_ids.append(padded_ids) + padded_teacher_logprobs.append(padded_logprobs) + + batch = TensorDict( + { + "teacher_ids": torch.cat(padded_teacher_ids), + "teacher_logprobs": torch.cat(padded_teacher_logprobs), + }, + batch_size=len(data), + ) + return DataProto(batch=batch) diff --git a/verl/experimental/teacher_loop/teacher_model.py b/verl/experimental/teacher_loop/teacher_model.py index 8ee1f1e539a..966efa969ab 100644 --- a/verl/experimental/teacher_loop/teacher_model.py +++ b/verl/experimental/teacher_loop/teacher_model.py @@ -48,6 +48,7 @@ def __init__( self.config: DistillationConfig = omega_conf_to_dataclass(config) self.resource_pool = resource_pool self._initialize_llm_servers() + self._initialize_async_server_manager() self._initialize_router() self.sleep() @@ -70,6 +71,10 @@ def _initialize_llm_servers(self): rollout_config = teacher_model_config.inference model_config = HFModelConfig(path=teacher_model_config.model_path) self.tokenizer = model_config.get_processor() + text_tokenizer = model_config.tokenizer + if model_config.tokenizer is None: + raise ValueError(f"Tokenizer is required for teacher model {teacher_model_config.model_path}") + self.pad_token_id = text_tokenizer.pad_token_id self.rollout_replicas = [ rollout_replica_class( replica_rank=replica_rank, @@ -94,6 +99,21 @@ def _initialize_llm_servers(self): self.server_handles = [server._server_handle for server in self.rollout_replicas] self.server_addresses = [server._server_address for server in self.rollout_replicas] + def _initialize_async_server_manager(self): + from verl.experimental.agent_loop.agent_loop import GlobalRequestLoadBalancer + from verl.experimental.teacher_loop.teacher_manager import AsyncTeacherLLMServerManager + + self.load_balancer_handle = GlobalRequestLoadBalancer.remote( + server_actor_ids=self.server_addresses, + ) + self.server_manager = AsyncTeacherLLMServerManager( + 
config=self.config, + servers=list(zip(self.server_addresses, self.server_handles, strict=True)), + load_balancer_handle=self.load_balancer_handle, + distillation_config=self.config, + pad_token_id=self.pad_token_id, + ) + def _initialize_router(self): worker_urls = [f"http://{server_address}" for server_address in self.server_addresses] @@ -104,6 +124,13 @@ def _initialize_router(self): def get_router_address(self): return self.router_address + def compute_logprobs(self, data): + self.wake_up() + try: + return self._run_single(self.server_manager.compute_teacher_logprobs_batch(data)) + finally: + self.sleep() + @auto_await async def wake_up(self): """Wake up all rollout replica instances.""" @@ -117,3 +144,9 @@ async def sleep(self): @auto_await async def _run_all(self, tasks: list[asyncio.Task]): await asyncio.gather(*tasks) + + def _run_single(self, task): + async def run(): + return await task + + return asyncio.run(run()) diff --git a/verl/model_merger/base_model_merger.py b/verl/model_merger/base_model_merger.py index 37cf926762c..8f65d5425cb 100644 --- a/verl/model_merger/base_model_merger.py +++ b/verl/model_merger/base_model_merger.py @@ -349,8 +349,16 @@ def save_lora_adapter(self, state_dict: dict[str, torch.Tensor]): if task_type is not None: peft_dict["task_type"] = task_type peft_config = peft.LoraConfig(**peft_dict).to_dict() - peft_config["task_type"] = peft_config["task_type"].value if peft_config["task_type"] else None - peft_config["peft_type"] = peft_config["peft_type"].value if peft_config["peft_type"] else None + peft_config["task_type"] = ( + peft_config["task_type"].value + if hasattr(peft_config["task_type"], "value") + else (peft_config["task_type"] or None) + ) + peft_config["peft_type"] = ( + peft_config["peft_type"].value + if hasattr(peft_config["peft_type"], "value") + else (peft_config["peft_type"] or None) + ) peft_config["target_modules"] = list(peft_config["target_modules"]) lora_path = os.path.join(self.config.target_dir, 
"lora_adapter") diff --git a/verl/models/diffusers_model/__init__.py b/verl/models/diffusers_model/__init__.py new file mode 100644 index 00000000000..70ab3a63470 --- /dev/null +++ b/verl/models/diffusers_model/__init__.py @@ -0,0 +1,24 @@ +# Copyright 2026 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .base import DiffusionModelBase +from .utils import build_scheduler, forward_and_sample_previous_step, prepare_model_inputs, set_timesteps + +__all__ = [ + "DiffusionModelBase", + "build_scheduler", + "set_timesteps", + "prepare_model_inputs", + "forward_and_sample_previous_step", +] diff --git a/verl/models/diffusers_model/base.py b/verl/models/diffusers_model/base.py new file mode 100644 index 00000000000..33a7bcfcc44 --- /dev/null +++ b/verl/models/diffusers_model/base.py @@ -0,0 +1,170 @@ +# Copyright 2026 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from abc import ABC, abstractmethod +from typing import Optional + +import torch +from diffusers import ModelMixin, SchedulerMixin +from tensordict import TensorDict + +from verl.workers.config import DiffusionModelConfig + + +class DiffusionModelBase(ABC): + """Abstract base class for diffusion model training helpers. + + Different diffusion models have very different forward / sampling logic. + Subclass this ABC and implement the three abstract methods to plug your + model into the verl training loop. + + Registration + ------------ + Decorate your subclass with ``@DiffusionModelBase.register("name")``. + The *name* must match the ``_class_name`` value in the pipeline's + ``model_index.json`` (which is auto-detected into + ``DiffusionModelConfig.architecture``). + + Example:: + + @DiffusionModelBase.register("QwenImagePipeline") + class QwenImage(DiffusionModelBase): + ... + + Loading external implementations + --------------------------------- + Implementations live outside the core verl package (e.g. under + ``examples/``). 
Set ``external_lib`` on ``DiffusionModelConfig`` + to the module that contains your subclass so it is imported (and + thus registered) before the registry is queried:: + + DiffusionModelConfig( + ..., + external_lib="examples.flowgrpo_trainer.diffusers.qwen_image", + ) + """ + + _registry: dict[str, type["DiffusionModelBase"]] = {} + + @classmethod + def register(cls, name: str): + """Class decorator that registers a subclass under *name*.""" + + def decorator(subclass: type["DiffusionModelBase"]) -> type["DiffusionModelBase"]: + cls._registry[name] = subclass + return subclass + + return decorator + + @classmethod + def get_class(cls, model_config: DiffusionModelConfig) -> type["DiffusionModelBase"]: + """Return the registered subclass for ``model_config.architecture``.""" + if model_config.architecture not in cls._registry and model_config.external_lib is not None: + from verl.utils.import_utils import import_external_libs + + import_external_libs(model_config.external_lib) + + try: + return cls._registry[model_config.architecture] + except KeyError: + registered = list(cls._registry) + raise NotImplementedError( + f"No diffusion model registered for architecture={model_config.architecture!r}. " + f"Registered: {registered}. " + f"Set ``external_lib`` in DiffusionModelConfig to load your implementation." + ) from None + + @classmethod + @abstractmethod + def build_scheduler(cls, model_config: DiffusionModelConfig) -> SchedulerMixin: + """Build and configure the diffusion scheduler for this model. + The returned scheduler should have timesteps and sigmas already set. + + Args: + model_config (DiffusionModelConfig): the configuration of the diffusion model. + """ + pass + + @classmethod + @abstractmethod + def set_timesteps(cls, scheduler: SchedulerMixin, model_config: DiffusionModelConfig, device: str): + """Set timesteps and sigmas on the scheduler and move them to *device*. + + Args: + scheduler (SchedulerMixin): the scheduler used for the diffusion process. 
+ model_config (DiffusionModelConfig): the configuration of the diffusion model. + device (str): the device to move the timesteps and sigmas to. + """ + pass + + @classmethod + @abstractmethod + def prepare_model_inputs( + cls, + module: ModelMixin, + model_config: DiffusionModelConfig, + latents: torch.Tensor, + timesteps: torch.Tensor, + prompt_embeds: torch.Tensor, + prompt_embeds_mask: torch.Tensor, + negative_prompt_embeds: torch.Tensor, + negative_prompt_embeds_mask: torch.Tensor, + micro_batch: TensorDict, + step: int, + ) -> tuple[dict, dict]: + """Build architecture-specific model inputs for the forward pass. + The caller is responsible for universal pre-processing (common tensor extraction + and nested-embed unpadding) before invoking this method. + + Args: + module (ModelMixin): the diffusion transformer module. + model_config (DiffusionModelConfig): the configuration of the diffusion model. + latents (torch.Tensor): full latent tensor from the micro-batch, shape (B, T, ...). + timesteps (torch.Tensor): full timestep tensor from the micro-batch, shape (B, T). + prompt_embeds (torch.Tensor): dense positive prompt embeddings, shape (B, L, D). + prompt_embeds_mask (torch.Tensor): attention mask for prompt_embeds, shape (B, L). + negative_prompt_embeds (torch.Tensor): dense negative prompt embeddings, shape (B, L, D). + negative_prompt_embeds_mask (torch.Tensor): attention mask for negative_prompt_embeds. + micro_batch (TensorDict): the full micro-batch, available for architecture-specific + metadata (e.g. height, width, vae_scale_factor). + step (int): the current denoising step index. 
+ """ + pass + + @classmethod + @abstractmethod + def forward_and_sample_previous_step( + cls, + module: ModelMixin, + scheduler: SchedulerMixin, + model_config: DiffusionModelConfig, + model_inputs: dict[str, torch.Tensor], + negative_model_inputs: Optional[dict[str, torch.Tensor]], + scheduler_inputs: Optional[TensorDict | dict[str, torch.Tensor]], + step: int, + ): + """Forward the model and sample the previous step. + Used for RL-algorithms based on reversed-sampling (FlowGRPO, DanceGRPO, etc.). + + Args: + module (ModelMixin): the diffusion model to be forwarded. + scheduler (SchedulerMixin): the scheduler used for the diffusion process. + model_config (DiffusionModelConfig): the configuration of the diffusion model. + model_inputs (dict[str, torch.Tensor]): the inputs to the diffusion model. + negative_model_inputs (Optional[dict[str, torch.Tensor]]): the negative inputs for guidance. + scheduler_inputs (Optional[TensorDict | dict[str, torch.Tensor]]): the extra inputs for the scheduler, + which may contain the latents and timesteps. + step (int): the current step in the diffusion process. + """ + pass diff --git a/verl/models/diffusers_model/utils.py b/verl/models/diffusers_model/utils.py new file mode 100644 index 00000000000..2a35b0abe29 --- /dev/null +++ b/verl/models/diffusers_model/utils.py @@ -0,0 +1,114 @@ +# Copyright 2026 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Optional + +import torch +from diffusers import ModelMixin, SchedulerMixin +from tensordict import TensorDict + +from verl.utils.device import get_device_name +from verl.workers.config import DiffusionModelConfig + +from .base import DiffusionModelBase + + +def prepare_model_inputs( + module: ModelMixin, + model_config: DiffusionModelConfig, + latents: torch.Tensor, + timesteps: torch.Tensor, + prompt_embeds: torch.Tensor, + prompt_embeds_mask: torch.Tensor, + negative_prompt_embeds: torch.Tensor, + negative_prompt_embeds_mask: torch.Tensor, + micro_batch: TensorDict, + step: int, +) -> tuple[dict, dict]: + """Build architecture-specific model inputs for the forward pass. + Dispatches to the registered DiffusionModelBase subclass for the current architecture. + + Args: + module (ModelMixin): the diffusion transformer module. + model_config (DiffusionModelConfig): the configuration of the diffusion model. + latents (torch.Tensor): full latent tensor from the micro-batch, shape (B, T, ...). + timesteps (torch.Tensor): full timestep tensor from the micro-batch, shape (B, T). + prompt_embeds (torch.Tensor): dense positive prompt embeddings, shape (B, L, D). + prompt_embeds_mask (torch.Tensor): attention mask for prompt_embeds, shape (B, L). + negative_prompt_embeds (torch.Tensor): dense negative prompt embeddings, shape (B, L, D). + negative_prompt_embeds_mask (torch.Tensor): attention mask for negative_prompt_embeds. + micro_batch (TensorDict): the full micro-batch, available for architecture-specific + metadata (e.g. height, width, vae_scale_factor). + step (int): the current denoising step index. 
+ """ + return DiffusionModelBase.get_class(model_config).prepare_model_inputs( + module, + model_config, + latents, + timesteps, + prompt_embeds, + prompt_embeds_mask, + negative_prompt_embeds, + negative_prompt_embeds_mask, + micro_batch, + step, + ) + + +def build_scheduler(model_config: DiffusionModelConfig) -> SchedulerMixin: + """Build and configure the scheduler for the diffusion model. + The returned scheduler has timesteps and sigmas already set. + + Args: + model_config (DiffusionModelConfig): the configuration of the diffusion model. + """ + return DiffusionModelBase.get_class(model_config).build_scheduler(model_config) + + +def set_timesteps(scheduler: SchedulerMixin, model_config: DiffusionModelConfig): + """Set correct timesteps and sigmas for diffusion model schedulers. + + Args: + scheduler (SchedulerMixin): the scheduler used for the diffusion process. + model_config (DiffusionModelConfig): the configuration of the diffusion model. + """ + DiffusionModelBase.get_class(model_config).set_timesteps(scheduler, model_config, get_device_name()) + + +def forward_and_sample_previous_step( + module: ModelMixin, + scheduler: SchedulerMixin, + model_config: DiffusionModelConfig, + model_inputs: dict, + negative_model_inputs: Optional[dict], + scheduler_inputs: Optional[TensorDict | dict[str, torch.Tensor]], + step: int, +): + """Forward the model and sample previous step. + This method is usually used for RL-algorithms based on reversed-sampling process. + Such as FlowGRPO, DanceGRPO, etc. + + Args: + module (ModelMixin): the diffusion model to be forwarded. + scheduler (SchedulerMixin): the scheduler used for the diffusion process. + model_config (DiffusionModelConfig): the configuration of the diffusion model. + model_inputs (dict[str, torch.Tensor]): the inputs to the diffusion model. + negative_model_inputs (Optional[dict[str, torch.Tensor]]): the negative inputs for guidance. 
+ scheduler_inputs (Optional[TensorDict | dict[str, torch.Tensor]]): the extra inputs for the scheduler, + which may contain the latents and timesteps. + step (int): the current step in the diffusion process. + """ + return DiffusionModelBase.get_class(model_config).forward_and_sample_previous_step( + module, scheduler, model_config, model_inputs, negative_model_inputs, scheduler_inputs, step + ) diff --git a/verl/models/mcore/__init__.py b/verl/models/mcore/__init__.py index 648f34ab2e1..ffebbc717f5 100644 --- a/verl/models/mcore/__init__.py +++ b/verl/models/mcore/__init__.py @@ -16,10 +16,10 @@ from verl.models.mcore.patch import apply_patch_megatron_v012_with_torch_v28 from .registry import ( + get_mcore_engine_forward_fn, get_mcore_forward_fn, get_mcore_forward_fused_fn, - get_mcore_forward_fused_no_padding_fn, - get_mcore_forward_no_padding_fn, + get_mcore_forward_fused_model_engine_fn, get_mcore_weight_converter, hf_to_mcore_config, init_mcore_model, @@ -31,8 +31,8 @@ "get_mcore_forward_fn", "get_mcore_weight_converter", "get_mcore_forward_fused_fn", - "get_mcore_forward_fused_no_padding_fn", - "get_mcore_forward_no_padding_fn", + "get_mcore_engine_forward_fn", + "get_mcore_forward_fused_model_engine_fn", ] apply_patch_megatron_v012_with_torch_v28() diff --git a/verl/models/mcore/config_converter.py b/verl/models/mcore/config_converter.py index c4df9382861..e1b264612da 100644 --- a/verl/models/mcore/config_converter.py +++ b/verl/models/mcore/config_converter.py @@ -26,6 +26,8 @@ from megatron.core.transformer import MLATransformerConfig, TransformerConfig from transformers import PretrainedConfig +from verl.utils.megatron_utils import get_hf_rope_theta + T = TypeVar("T", bound=TransformerConfig) @@ -120,7 +122,7 @@ def _get_mla_transformer_config( "qk_head_dim": hf_config.qk_nope_head_dim, "qk_pos_emb_head_dim": hf_config.qk_rope_head_dim, "v_head_dim": hf_config.v_head_dim, - "rotary_base": hf_config.rope_theta, + "rotary_base": 
get_hf_rope_theta(hf_config), "rotary_scaling_factor": mla_rope_config["factor"], "rope_type": mla_rope_config["type"], "max_position_embeddings": mla_rope_config["original_max_position_embeddings"], diff --git a/verl/models/mcore/model_forward.py b/verl/models/mcore/model_forward.py index d1ae01e3d13..cceec2f6333 100644 --- a/verl/models/mcore/model_forward.py +++ b/verl/models/mcore/model_forward.py @@ -13,8 +13,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import math +from typing import Optional import torch +from megatron.core import parallel_state as mpu from torch.nested._internal.nested_tensor import NestedTensor from verl.utils.megatron_utils import unwrap_model @@ -22,13 +25,13 @@ from .util import ( postprocess_bshd, - postprocess_bshd_no_padding, + postprocess_bshd_engine, postprocess_packed_seqs, - postprocess_thd_no_padding, + postprocess_thd_engine, preprocess_bshd, - preprocess_bshd_no_padding, + preprocess_bshd_engine, preprocess_packed_seqs, - preprocess_thd_no_padding, + preprocess_thd_engine, ) @@ -207,7 +210,7 @@ def _convert_to_nested_tensor(v, input_ids_lengths): return v -def gptmodel_forward_no_padding( +def gptmodel_forward_model_engine( model, input_ids, multi_modal_inputs: dict, @@ -218,6 +221,7 @@ def gptmodel_forward_no_padding( pad_token_id=None, data_format: str = "thd", mtp_enable_train: bool = False, + local_cp_size: Optional[int] = None, ): """Default forward pass for GPT models with optional sequence packing.""" @@ -240,8 +244,11 @@ def gptmodel_forward_no_padding( batch_size = input_ids.shape[0] if data_format == "thd": - input_ids_rmpad, packed_seq_params, position_ids_rmpad = preprocess_thd_no_padding( - input_ids, pre_process=pre_process or (post_process and mtp_enable_train), use_fp8_padding=use_fp8_padding + input_ids_rmpad, packed_seq_params, position_ids_rmpad = preprocess_thd_engine( 
+ input_ids, + pre_process=pre_process or (post_process and mtp_enable_train), + use_fp8_padding=use_fp8_padding, + local_cp_size=local_cp_size, ) input_ids_rmpad = input_ids_rmpad.contiguous() @@ -255,8 +262,12 @@ def gptmodel_forward_no_padding( v = logits_processor_args[k] v = _convert_to_nested_tensor(v, input_ids_lengths) logits_processor_args[k] = v - args[k] = preprocess_thd_no_padding( - v, pre_process=True, need_roll=True, use_fp8_padding=use_fp8_padding + args[k] = preprocess_thd_engine( + v, + pre_process=True, + need_roll=True, + use_fp8_padding=use_fp8_padding, + local_cp_size=local_cp_size, )[0] model_kwargs["labels"] = args["label"].contiguous() @@ -284,19 +295,30 @@ def gptmodel_forward_no_padding( if post_process and logits_processor is not None: args = { - k: preprocess_thd_no_padding( - v, pre_process=True, need_roll=(k == "label"), use_fp8_padding=use_fp8_padding + k: preprocess_thd_engine( + v, + pre_process=True, + need_roll=(k == "label"), + use_fp8_padding=use_fp8_padding, + local_cp_size=local_cp_size, )[0] for k, v in logits_processor_args.items() } output_dict = logits_processor(output_orig, **args) output = { - k: postprocess_thd_no_padding(v, packed_seq_params, input_ids, batch_size, post_process=post_process) + k: postprocess_thd_engine( + v, packed_seq_params, input_ids, batch_size, post_process=post_process, local_cp_size=local_cp_size + ) for k, v in output_dict.items() } else: - output = postprocess_thd_no_padding( - output_orig, packed_seq_params, input_ids, batch_size, post_process=post_process + output = postprocess_thd_engine( + output_orig, + packed_seq_params, + input_ids, + batch_size, + post_process=post_process, + local_cp_size=local_cp_size, ) else: """ @@ -306,8 +328,9 @@ def gptmodel_forward_no_padding( When using the bshd format, we have to add paddings to the input_ids to meet the longest sequence length, so it is recommended to disable dynamic batch size and set batch size to 1 """ + assert local_cp_size is None, 
"dynamic_CP is not supported for bshd format" - input_ids_bshd, attention_mask_bshd, position_ids_bshd = preprocess_bshd_no_padding( + input_ids_bshd, attention_mask_bshd, position_ids_bshd = preprocess_bshd_engine( input_ids, pre_process=pre_process or (post_process and mtp_enable_train), use_fp8_padding=use_fp8_padding ) @@ -321,35 +344,54 @@ def gptmodel_forward_no_padding( v = logits_processor_args[k] v = _convert_to_nested_tensor(v, input_ids_lengths) logits_processor_args[k] = v - args[k] = preprocess_bshd_no_padding( - v, pre_process=True, need_roll=True, use_fp8_padding=use_fp8_padding - )[0] + args[k] = preprocess_bshd_engine(v, pre_process=True, need_roll=True, use_fp8_padding=use_fp8_padding)[ + 0 + ] model_kwargs["labels"] = args["label"].contiguous() model_kwargs["loss_mask"] = args["loss_mask"].contiguous() if logits_processor_args and "loss_mask" in logits_processor_args: logits_processor_args.pop("loss_mask") + # For VLM model, need to pass bshd format `input_ids` and `attention_mask`. + attention_mask = attention_mask_bshd + if vision_model: + seqlens_in_batch = input_ids.offsets().diff() + max_seqlen = seqlens_in_batch.max().item() + + # For CP, sequence length must be divisible by (2 * cp_size), and for SP by tp_size. 
+ tp_size = mpu.get_tensor_model_parallel_world_size() + cp_size = mpu.get_context_parallel_world_size() + align_size = math.lcm(tp_size, 2 * cp_size) if cp_size > 1 else tp_size + if align_size > 1: + pad_size = (align_size - max_seqlen % align_size) % align_size + max_seqlen += pad_size + + input_ids_bshd = input_ids.to_padded_tensor(pad_token_id, output_size=(batch_size, max_seqlen)) + attention_mask = torch.zeros_like(input_ids_bshd, dtype=torch.bool) + for i, seqlen in enumerate(seqlens_in_batch): + attention_mask[i, :seqlen] = True + output_orig = model( input_ids=input_ids_bshd, - attention_mask=attention_mask_bshd, + attention_mask=attention_mask, position_ids=None if vision_model else position_ids_bshd, **model_kwargs, ) if post_process and logits_processor is not None: args = { - k: preprocess_bshd_no_padding( + k: preprocess_bshd_engine( v, pre_process=True, need_roll=(k == "label"), use_fp8_padding=use_fp8_padding )[0] for k, v in logits_processor_args.items() } output_dict = logits_processor(output_orig, **args) output = { - k: postprocess_bshd_no_padding(v, attention_mask_bshd, post_process=post_process) + k: postprocess_bshd_engine(v, attention_mask_bshd, post_process=post_process) for k, v in output_dict.items() } else: - output = postprocess_bshd_no_padding(output_orig, attention_mask_bshd, post_process=post_process) + output = postprocess_bshd_engine(output_orig, attention_mask_bshd, post_process=post_process) if value_model and post_process: # output = output[..., 0] diff --git a/verl/models/mcore/model_forward_fused.py b/verl/models/mcore/model_forward_fused.py index 273ade1f69e..f045b7738e2 100644 --- a/verl/models/mcore/model_forward_fused.py +++ b/verl/models/mcore/model_forward_fused.py @@ -29,12 +29,12 @@ from packaging import version from torch import Tensor -from verl.models.mcore.util import preprocess_packed_seqs, preprocess_thd_no_padding +from verl.models.mcore.util import preprocess_packed_seqs, preprocess_thd_engine from 
verl.utils.kernel.linear_cross_entropy import linear_cross_entropy from verl.utils.megatron_utils import unwrap_model from verl.utils.model import CausalLMOutputForPPO -from .util import postprocess_packed_seqs_for_dict_output, postprocess_thd_no_padding +from .util import postprocess_packed_seqs_for_dict_output, postprocess_thd_engine def _get_patching_model(model: torch.nn.Module): @@ -137,8 +137,8 @@ def fused_forward_model( return fused_forward_model -def fused_forward_no_padding_gen(vision_model: bool = False): - def fused_forward_no_padding( +def fused_forward_model_engine(vision_model: bool = False): + def fused_forward_model_engine_inner( model, input_ids: Tensor, labels: Tensor, @@ -153,7 +153,7 @@ def fused_forward_no_padding( fp8 = unwrap_model(model).config.fp8 use_fp8_padding = fp8 in ["e4m3", "hybrid"] - input_ids_rmpad, packed_seq_params, _ = preprocess_thd_no_padding( + input_ids_rmpad, packed_seq_params, _ = preprocess_thd_engine( input_ids, pre_process=pre_process, use_fp8_padding=use_fp8_padding ) input_ids_rmpad = input_ids_rmpad.contiguous() @@ -177,7 +177,7 @@ def fused_forward_no_padding( 0 ) < seqlens_in_batch.unsqueeze(1) - labels_rmpad, _, _ = preprocess_thd_no_padding( + labels_rmpad, _, _ = preprocess_thd_engine( labels, pre_process=True, need_roll=True, use_fp8_padding=use_fp8_padding ) labels_rmpad = labels_rmpad.contiguous() @@ -197,7 +197,7 @@ def fused_forward_no_padding( log_probs = output_orig.log_probs if log_probs.dim() == 1: log_probs = log_probs.unsqueeze(0) - log_probs = postprocess_thd_no_padding( + log_probs = postprocess_thd_engine( log_probs, packed_seq_params, input_ids, input_ids.shape[0], post_process=post_process ) @@ -207,14 +207,14 @@ def fused_forward_no_padding( entropy = output_orig.entropy if entropy.dim() == 1: entropy = entropy.unsqueeze(0) - entropy = postprocess_thd_no_padding( + entropy = postprocess_thd_engine( entropy, packed_seq_params, input_ids, input_ids.shape[0], post_process=post_process ) 
output["entropy"] = entropy return output - return fused_forward_no_padding + return fused_forward_model_engine_inner def _fused_GPTModel_forward( diff --git a/verl/models/mcore/model_initializer.py b/verl/models/mcore/model_initializer.py index 49a30bc9e2c..2b8bc83332a 100644 --- a/verl/models/mcore/model_initializer.py +++ b/verl/models/mcore/model_initializer.py @@ -21,7 +21,7 @@ from megatron.core.models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec, get_gpt_mtp_block_spec from megatron.core.models.gpt.gpt_model import GPTModel -from .config_converter import PretrainedConfig, TransformerConfig +from .config_converter import PretrainedConfig, TransformerConfig, get_hf_rope_theta class BaseModelInitializer(ABC): @@ -80,17 +80,17 @@ def initialize( post_process=post_process, share_embeddings_and_output_weights=share_embeddings_and_output_weights, position_embedding_type="rope", - rotary_base=self.hf_config.rope_theta, + rotary_base=get_hf_rope_theta(self.hf_config), **rope_scaling_args, mtp_block_spec=mtp_block_spec, **({} if not self.has_vp_stage else {"vp_stage": vp_stage}), ) if post_process and value: - from verl.models.llama.megatron.layers.parallel_linear import LinearForLastLayer + from verl.models.mcore.bridge import LinearForLastLayer model.output_layer = LinearForLastLayer( - input_size=self.tfconfig.hidden_size, output_size=1, config=self.tfconfig + input_size=self.tfconfig.hidden_size, output_size=1, sequence_parallel=self.tfconfig.sequence_parallel ) return model @@ -257,7 +257,7 @@ def initialize( vision_projection_config=vision_projection_config, vision_projection_layer_spec=vision_projection_layer_spec, vision_projection_type="mlp", - language_rotary_base=hf_config.rope_theta, + language_rotary_base=get_hf_rope_theta(hf_config), pre_process=pre_process, post_process=post_process, add_decoder=True, @@ -267,10 +267,10 @@ def initialize( ) if post_process and value: - from verl.models.llama.megatron.layers.parallel_linear import 
LinearForLastLayer + from verl.models.mcore.bridge import LinearForLastLayer qwen25_vl_model.language_model.output_layer = LinearForLastLayer( - input_size=tfconfig.hidden_size, output_size=1, config=tfconfig + input_size=tfconfig.hidden_size, output_size=1, sequence_parallel=tfconfig.sequence_parallel ) return qwen25_vl_model diff --git a/verl/models/mcore/registry.py b/verl/models/mcore/registry.py index 8f9e3447f9f..fe5910ab802 100644 --- a/verl/models/mcore/registry.py +++ b/verl/models/mcore/registry.py @@ -22,8 +22,8 @@ import torch import torch.nn as nn -from .model_forward import gptmodel_forward_no_padding, model_forward_gen -from .model_forward_fused import fused_forward_model_gen, fused_forward_no_padding_gen +from .model_forward import gptmodel_forward_model_engine, model_forward_gen +from .model_forward_fused import fused_forward_model_gen, fused_forward_model_engine class SupportedVLM(Enum): @@ -49,12 +49,12 @@ def get_mcore_forward_fn(hf_config) -> Callable: return model_forward_gen(False) -def get_mcore_forward_no_padding_fn(hf_config) -> Callable: +def get_mcore_engine_forward_fn(hf_config) -> Callable: """ Get the forward function for given model architecture. """ assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" - return gptmodel_forward_no_padding + return gptmodel_forward_model_engine def get_mcore_forward_fused_fn(hf_config) -> Callable: @@ -69,16 +69,16 @@ def get_mcore_forward_fused_fn(hf_config) -> Callable: return fused_forward_model_gen(False) -def get_mcore_forward_fused_no_padding_fn(hf_config) -> Callable: +def get_mcore_forward_fused_model_engine_fn(hf_config) -> Callable: """ Get the fused forward function for no-padding inputs. 
""" assert len(hf_config.architectures) == 1, "Only one architecture is supported for now" if hf_config.architectures[0] in supported_vlm: - return fused_forward_no_padding_gen(True) + return fused_forward_model_engine(True) else: # default to language model - return fused_forward_no_padding_gen(False) + return fused_forward_model_engine(False) # ruff: noqa @@ -186,26 +186,6 @@ class SupportedModel(Enum): SupportedModel.MIMO: model_forward_gen(), } -# Registry for model forward functions -MODEL_FORWARD_NOPAD_REGISTRY: dict[SupportedModel, Callable] = { - SupportedModel.LLAMA: gptmodel_forward_no_padding, - SupportedModel.QWEN2: gptmodel_forward_no_padding, - SupportedModel.QWEN2_MOE: gptmodel_forward_no_padding, - SupportedModel.MIXTRAL: gptmodel_forward_no_padding, - SupportedModel.DEEPSEEK_V3: gptmodel_forward_no_padding, - SupportedModel.QWEN2_5_VL: gptmodel_forward_no_padding, - SupportedModel.QWEN3_MOE_VL: gptmodel_forward_no_padding, - SupportedModel.QWEN3_VL: gptmodel_forward_no_padding, - SupportedModel.LLAMA4: gptmodel_forward_no_padding, - SupportedModel.QWEN3: gptmodel_forward_no_padding, - SupportedModel.QWEN3_MOE: gptmodel_forward_no_padding, - SupportedModel.GLM4_MOE: gptmodel_forward_no_padding, - SupportedModel.QWEN3_TOKEN_CLASSIFICATION: gptmodel_forward_no_padding, - SupportedModel.LLAMA_TOKEN_CLASSIFICATION: gptmodel_forward_no_padding, - SupportedModel.GPT_OSS: gptmodel_forward_no_padding, - SupportedModel.MIMO: gptmodel_forward_no_padding, -} - # Registry for model forward functions MODEL_FORWARD_FUSED_REGISTRY: dict[SupportedModel, Callable] = { SupportedModel.LLAMA: fused_forward_model_gen(), diff --git a/verl/models/mcore/util.py b/verl/models/mcore/util.py index ec9237298d5..c4f7b481e1d 100644 --- a/verl/models/mcore/util.py +++ b/verl/models/mcore/util.py @@ -22,6 +22,7 @@ from megatron.core import parallel_state as mpu from megatron.core.packed_seq_params import PackedSeqParams +from verl.utils.device import is_npu_available from 
verl.utils.model import CausalLMOutputForPPO logger = logging.getLogger(__file__) @@ -288,10 +289,35 @@ def postprocess_packed_seqs_for_dict_output( return ret +def preprocess_for_mindspeed(input_ids, cu_seqlens_padded, seqlens_in_batch_padded, batch_size): + if not is_npu_available: + return + try: + from mindspeed.core.context_parallel.get_batch_utils import set_actual_seq_len + from mindspeed.utils import set_position_ids + + set_actual_seq_len(cu_seqlens_padded) + # Generate position IDs within each padded segment + pack_length = int(seqlens_in_batch_padded.sum().item()) + position_ids_packed = torch.zeros(pack_length, dtype=torch.int32, device=input_ids.device) + for i in range(batch_size): + start = cu_seqlens_padded[i].item() + end = cu_seqlens_padded[i + 1].item() + position_ids_packed[start:end] = torch.arange(end - start, dtype=torch.int32, device=input_ids.device) + + set_position_ids(position_ids_packed.unsqueeze(0).transpose(0, 1).contiguous()) + except ImportError as e: + logger.warning(f"Could not import mindspeed modules, skipping position_id setting: {e}") + + ### No padding versions for model engine ### inputs are nested tensors -def preprocess_thd_no_padding( - input_ids: torch.Tensor, pre_process: bool = True, need_roll: bool = False, use_fp8_padding: bool = False +def preprocess_thd_engine( + input_ids: torch.Tensor, + pre_process: bool = True, + need_roll: bool = False, + use_fp8_padding: bool = False, + local_cp_size: Optional[int] = None, ) -> tuple[torch.Tensor, PackedSeqParams, Optional[torch.Tensor]]: """ Preprocess packed sequences @@ -302,8 +328,17 @@ def preprocess_thd_no_padding( batch_size = input_ids.shape[0] tp_size = mpu.get_tensor_model_parallel_world_size() - cp_size = mpu.get_context_parallel_world_size() - cp_rank = mpu.get_context_parallel_rank() + extra_packed_args = {} + if local_cp_size is not None: + # dynamic CP + cp_size = local_cp_size + cp_group = mpu.get_dynamic_data_context_parallel_groups(group_size=local_cp_size) 
+ cp_rank = torch.distributed.get_rank(group=cp_group) + extra_packed_args["local_cp_size"] = local_cp_size + extra_packed_args["cp_group"] = cp_group + else: + cp_size = mpu.get_context_parallel_world_size() + cp_rank = mpu.get_context_parallel_rank() align_size = tp_size * cp_size * 2 if cp_size > 1 else tp_size seqlens_in_batch = input_ids.offsets().diff() @@ -319,6 +354,8 @@ def preprocess_thd_no_padding( cu_seqlens_padded = torch.zeros(batch_size + 1, dtype=torch.int32, device=input_ids.device) cu_seqlens_padded[1:] = torch.cumsum(seqlens_in_batch_padded, dim=0) + preprocess_for_mindspeed(input_ids, cu_seqlens_padded, seqlens_in_batch_padded, batch_size) + if use_fp8_padding: # Pad the last sequence so total length is divisible by total_align for TE pad_size_last = (total_align - cu_seqlens_padded[-1] % total_align) % total_align @@ -428,6 +465,7 @@ def preprocess_thd_no_padding( max_seqlen_kv=max_seqlen_in_batch, cu_seqlens_q_padded=cu_seqlens_padded, cu_seqlens_kv_padded=cu_seqlens_padded, + **extra_packed_args, ) if pre_process: return input_ids_rmpad.unsqueeze(0), packed_seq_params, position_ids_rmpad.unsqueeze(0) @@ -435,12 +473,13 @@ def preprocess_thd_no_padding( return input_ids, packed_seq_params, None -def postprocess_thd_no_padding( +def postprocess_thd_engine( output: torch.Tensor, packed_seq_params: PackedSeqParams, input_ids: torch.Tensor, batch_size: int, post_process: bool = True, + local_cp_size: Optional[int] = None, ) -> torch.Tensor: """ Postprocess packed sequences @@ -460,14 +499,21 @@ def postprocess_thd_no_padding( output_new = [] - cp_size = mpu.get_context_parallel_world_size() + if local_cp_size is not None: + cp_size = local_cp_size + cp_group = packed_seq_params.cp_group + cp_rank = torch.distributed.get_rank(group=cp_group) + else: + cp_size = mpu.get_context_parallel_world_size() + cp_group = mpu.get_context_parallel_group() + cp_rank = mpu.get_context_parallel_rank() # all gather output across context parallel group if cp_size > 
1: # output shape: [1, packed_len, hidden_dim] # need to gather across cp group and concatenate in sequence dimension output_list = [torch.empty_like(output) for _ in range(cp_size)] - torch.distributed.all_gather(output_list, output.detach(), group=mpu.get_context_parallel_group()) - output_list[mpu.get_context_parallel_rank()] = output + torch.distributed.all_gather(output_list, output.detach(), group=cp_group) + output_list[cp_rank] = output else: output_list = [output] @@ -499,7 +545,16 @@ def postprocess_thd_no_padding( return output_new_tensor -def preprocess_bshd_no_padding( +def _build_npu_attn_mask(original_attention_mask: torch.Tensor) -> torch.Tensor: + """Build attn_mask for torch_npu.npu_fusion_attention (B1SS / [B, 1, Sq, Skv])""" + _, seq_len = original_attention_mask.shape + causal_mask = torch.tril(torch.ones(seq_len, seq_len, device=original_attention_mask.device)).to(torch.bool) + attn_mask = original_attention_mask.unsqueeze(-1) & original_attention_mask.unsqueeze(-2) + attn_mask = attn_mask & causal_mask + return (~attn_mask).unsqueeze(1).contiguous() + + +def preprocess_bshd_engine( input_ids: torch.Tensor, pre_process: bool = True, need_roll: bool = False, use_fp8_padding: bool = False ): """ @@ -507,41 +562,97 @@ def preprocess_bshd_no_padding( return "input_ids, attention_mask, position_ids" """ cp_size = mpu.get_context_parallel_world_size() - # TODO: support context parallel size > 1 - assert cp_size == 1, "Context parallel size without bshd is not supported yet" + cp_rank = mpu.get_context_parallel_rank() batch_size = input_ids.shape[0] seqlens_in_batch = input_ids.offsets().diff() max_seqlen = seqlens_in_batch.max().item() tp_size = mpu.get_tensor_model_parallel_world_size() - if tp_size > 1: - sp_world_size = tp_size - pad_size = (sp_world_size - max_seqlen % sp_world_size) % sp_world_size - max_seqlen = max_seqlen + pad_size + # For CP, sequence length must be divisible by (2 * cp_size), and for SP by tp_size. 
+ align_size = math.lcm(tp_size, 2 * cp_size) if cp_size > 1 else tp_size + if align_size > 1: + pad_size = (align_size - max_seqlen % align_size) % align_size + max_seqlen += pad_size if use_fp8_padding: # For FP8 block quantization, batch_size * max_seqlen / tp_size must be divisible by 128. - # We need: max_seqlen % tp_size == 0 (for SP) AND batch_size * max_seqlen % (128 * tp_size) == 0. + # With CP, local sequence length is max_seqlen / cp_size. + # We need: + # 1) max_seqlen aligned for SP/CP splitting. + # 2) batch_size * max_seqlen % (128 * tp_size * cp_size) == 0. # Compute the required alignment for max_seqlen: - fp8_total_align = 128 * tp_size + fp8_total_align = 128 * tp_size * cp_size fp8_seq_align = fp8_total_align // math.gcd(batch_size, fp8_total_align) - # Also ensure tp alignment for SP - fp8_seq_align = math.lcm(fp8_seq_align, tp_size) + # Also ensure SP and CP split alignment. + fp8_seq_align = math.lcm(fp8_seq_align, align_size) max_seqlen = ((max_seqlen + fp8_seq_align - 1) // fp8_seq_align) * fp8_seq_align - attention_mask = torch.zeros(batch_size, max_seqlen, dtype=torch.bool, device=input_ids.device) - input_ids_bshd = torch.zeros(batch_size, max_seqlen, dtype=input_ids.dtype, device=input_ids.device) + local_max_seqlen = max_seqlen // cp_size if cp_size > 1 else max_seqlen + attention_mask = torch.zeros(batch_size, local_max_seqlen, dtype=torch.bool, device=input_ids.device) + input_ids_bshd = torch.zeros(batch_size, local_max_seqlen, dtype=input_ids.dtype, device=input_ids.device) + seqlens_in_batch_cpu: list[int] = seqlens_in_batch.tolist() for i in range(batch_size): - attention_mask[i, : seqlens_in_batch[i]] = True - input_ids_bshd[i, : seqlens_in_batch[i]] = input_ids[i] - position_ids = torch.arange(max_seqlen, dtype=torch.long, device=input_ids.device) - position_ids = position_ids.unsqueeze(0).expand_as(input_ids_bshd) - if need_roll: + seqlen_i = int(seqlens_in_batch_cpu[i]) + if cp_size <= 1: + attention_mask[i, :seqlen_i] = True 
+ input_ids_bshd[i, :seqlen_i] = input_ids[i] + continue + + seq = input_ids[i] + if seqlen_i < max_seqlen: + seq_padded = torch.zeros(max_seqlen, dtype=seq.dtype, device=seq.device) + seq_padded[:seqlen_i] = seq + seq = seq_padded + + chunk_len = max_seqlen // (2 * cp_size) + first_start = cp_rank * chunk_len + second_start = (2 * cp_size - cp_rank - 1) * chunk_len + first_chunk = seq[first_start : first_start + chunk_len] + second_chunk = seq[second_start : second_start + chunk_len] + local_seq = torch.cat((first_chunk, second_chunk), dim=0) + if need_roll: + local_pos = torch.cat( + ( + torch.arange(first_start, first_start + chunk_len, dtype=torch.long, device=seq.device), + torch.arange(second_start, second_start + chunk_len, dtype=torch.long, device=seq.device), + ), + dim=0, + ) + local_seq = seq[(local_pos + 1) % max_seqlen] + input_ids_bshd[i] = local_seq + + valid_first = max(0, min(seqlen_i - first_start, chunk_len)) + valid_second = max(0, min(seqlen_i - second_start, chunk_len)) + if valid_first > 0: + attention_mask[i, :valid_first] = True + if valid_second > 0: + attention_mask[i, chunk_len : chunk_len + valid_second] = True + + if cp_size <= 1: + position_ids = torch.arange(local_max_seqlen, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids_bshd) + else: + chunk_len = max_seqlen // (2 * cp_size) + first_pos = torch.arange( + cp_rank * chunk_len, (cp_rank + 1) * chunk_len, dtype=torch.long, device=input_ids.device + ) + second_pos = torch.arange( + max_seqlen - (cp_rank + 1) * chunk_len, + max_seqlen - cp_rank * chunk_len, + dtype=torch.long, + device=input_ids.device, + ) + position_ids = torch.cat((first_pos, second_pos), dim=0).unsqueeze(0).expand_as(input_ids_bshd) + if need_roll and cp_size <= 1: input_ids_bshd = torch.roll(input_ids_bshd, shifts=-1, dims=1) + if is_npu_available: + # Ascend npu_fusion_attention's attn_mask must be BNSS / B1SS / 11SS / SS; [B, S] is invalid. 
+ attention_mask = _build_npu_attn_mask(attention_mask) + return input_ids_bshd, attention_mask, position_ids -def postprocess_bshd_no_padding( +def postprocess_bshd_engine( output: torch.Tensor, attention_mask: torch.Tensor, post_process: bool = True, @@ -552,12 +663,65 @@ def postprocess_bshd_no_padding( if not post_process: return output + if is_npu_available: + attention_mask = attention_mask.diagonal(dim1=-2, dim2=-1).squeeze(1) + attention_mask = ~attention_mask.bool() + + assert output.shape[:2] == attention_mask.shape, ( + f"output.shape: {output.shape}, attention_mask.shape: {attention_mask.shape}" + ) + + cp_size = mpu.get_context_parallel_world_size() + cp_rank = mpu.get_context_parallel_rank() + cp_group = mpu.get_context_parallel_group() + batch_size = output.shape[0] + + if cp_size > 1: + output_list = [torch.empty_like(output, dtype=output.dtype) for _ in range(cp_size)] + torch.distributed.all_gather(output_list, output.detach(), group=cp_group) + output_list[cp_rank] = output + + mask_list = [torch.empty_like(attention_mask, dtype=attention_mask.dtype) for _ in range(cp_size)] + torch.distributed.all_gather(mask_list, attention_mask, group=cp_group) + else: + output_list = [output] + mask_list = [attention_mask] + output_new = [] for i in range(batch_size): - mask = attention_mask[i].bool() - output_new.append(output[i][mask]) + if cp_size <= 1: + mask = attention_mask[i].bool() + output_new.append(output[i][mask]) + continue + + local_seqlen = output.shape[1] + assert local_seqlen % 2 == 0, "CP bshd expects local sequence length to be divisible by 2" + half_seqlen = local_seqlen // 2 + full_seqlen = local_seqlen * cp_size + + tmp = torch.empty(full_seqlen, *output.shape[2:], device=output.device, dtype=output.dtype) + full_mask = torch.zeros(full_seqlen, device=attention_mask.device, dtype=torch.bool) + + for j in range(cp_size): + o = output_list[j][i] + m = mask_list[j][i].bool() + + o0, o1 = o[:half_seqlen], o[half_seqlen:] + m0, m1 = 
m[:half_seqlen], m[half_seqlen:] + + front_start = j * half_seqlen + front_end = (j + 1) * half_seqlen + back_start = full_seqlen - (j + 1) * half_seqlen + back_end = full_seqlen - j * half_seqlen + + tmp[front_start:front_end] = o0 + tmp[back_start:back_end] = o1 + full_mask[front_start:front_end] = m0 + full_mask[back_start:back_end] = m1 + + output_new.append(tmp[full_mask]) output_new_tensor = torch.nested.as_nested_tensor(output_new, layout=torch.jagged) diff --git a/verl/models/transformers/glm4v.py b/verl/models/transformers/glm4v.py index b2efe369a26..d35075c315e 100644 --- a/verl/models/transformers/glm4v.py +++ b/verl/models/transformers/glm4v.py @@ -301,9 +301,16 @@ def glm4v_attn_forward( # Because the input can be padded, the absolute sequence length depends on the max position id. cos, sin = position_embeddings - query_states, key_states = apply_multimodal_rotary_pos_emb( - query_states, key_states, cos, sin, self.rope_scaling["mrope_section"] - ) + if getattr(self, "rope_scaling", None) is not None: + # for transformers < 5.0.0 + mrope_section = self.rope_scaling.get("mrope_section", None) + else: + # for transformers >= 5.0.0, only rope_parameters present in the config + assert getattr(self, "rope_parameter", None) is not None, ( + "Either rope_scaling or rope_parameter should be defined in the config for GLM4V." 
+ ) + mrope_section = self.rope_parameter.get("mrope_section", None) + query_states, key_states = apply_multimodal_rotary_pos_emb(query_states, key_states, cos, sin, mrope_section) key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) dropout_rate = 0.0 if not self.training else self.attention_dropout @@ -382,7 +389,7 @@ def _get_input_embeds( pixel_values = torch.zeros((16, 1176), dtype=inputs_embeds.dtype, device=inputs_embeds.device) image_grid_thw = torch.tensor([[1, 4, 4]], dtype=torch.long, device=inputs_embeds.device) image_embeds = model.visual(pixel_values, grid_thw=image_grid_thw) - inputs_embeds += 0.0 * image_embeds.mean() + inputs_embeds = inputs_embeds + 0.0 * image_embeds.mean() if attention_mask is not None: attention_mask = attention_mask.to(inputs_embeds.device) diff --git a/verl/models/transformers/monkey_patch.py b/verl/models/transformers/monkey_patch.py index bb26dac2da9..a5117d73af4 100644 --- a/verl/models/transformers/monkey_patch.py +++ b/verl/models/transformers/monkey_patch.py @@ -265,6 +265,11 @@ def patch_forward_with_backends( elif model.config.model_type == "glm4v": from verl.models.transformers.glm4v import forward_with_torch_backend, forward_with_triton_backend + forward_with_torch_backend_function = forward_with_torch_backend + forward_with_triton_backend_function = forward_with_triton_backend + elif model.config.model_type in ["qwen3_5", "qwen3_5_moe"]: + from verl.models.transformers.qwen3_5 import forward_with_torch_backend, forward_with_triton_backend + forward_with_torch_backend_function = forward_with_torch_backend forward_with_triton_backend_function = forward_with_triton_backend else: @@ -479,6 +484,34 @@ def state_dict(self, *args, **kwargs): print("Not support fused kernels for KimiVL") return + elif model.config.model_type in ["qwen3_5", "qwen3_5_moe"]: + # Step 1: patch model to support image-text mixed data + from 
transformers.models.qwen3_5.modeling_qwen3_5 import ( + Qwen3_5ForConditionalGeneration, + Qwen3_5Model, + Qwen3_5VisionModel, + ) + from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import ( + Qwen3_5MoeForConditionalGeneration, + Qwen3_5MoeModel, + Qwen3_5MoeVisionModel, + ) + + from verl.models.transformers.qwen3_5 import ( + fast_pos_embed_interpolate, + forward_with_normal_backend, + qwen3_5_base_forward, + ) + + Qwen3_5Model.forward = qwen3_5_base_forward + Qwen3_5MoeModel.forward = qwen3_5_base_forward + Qwen3_5ForConditionalGeneration.forward = forward_with_normal_backend + Qwen3_5MoeForConditionalGeneration.forward = forward_with_normal_backend + print(f"Monkey patch {model.__class__.__name__} model forward") + + # Step 2: patch vision model to fix fsdp2 cpu_offload bug. + Qwen3_5VisionModel.fast_pos_embed_interpolate = fast_pos_embed_interpolate + Qwen3_5MoeVisionModel.fast_pos_embed_interpolate = fast_pos_embed_interpolate if use_remove_padding or ulysses_sp_size > 1: if hasattr(module, "_flash_attention_forward"): # transformers <= 4.47.1 or legacy models diff --git a/verl/models/transformers/npu_patch.py b/verl/models/transformers/npu_patch.py index d224b884703..53bff39624f 100644 --- a/verl/models/transformers/npu_patch.py +++ b/verl/models/transformers/npu_patch.py @@ -23,6 +23,8 @@ from transformers.models.qwen2 import modeling_qwen2 from transformers.models.qwen2_5_vl import modeling_qwen2_5_vl from transformers.models.qwen3 import modeling_qwen3 +from transformers.models.qwen3_5 import modeling_qwen3_5 +from transformers.models.qwen3_5_moe import modeling_qwen3_5_moe from transformers.models.qwen3_moe import modeling_qwen3_moe from transformers.models.qwen3_next import modeling_qwen3_next from transformers.models.qwen3_vl import modeling_qwen3_vl @@ -285,6 +287,29 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: return routed_out +def qwen3_5_moe_experts_forward_npu( + self, + hidden_states: torch.Tensor, + top_k_index: 
torch.Tensor, + top_k_weights: torch.Tensor, +) -> torch.Tensor: + selected_experts = top_k_index + routing_weights = top_k_weights + gate_up_proj = self.gate_up_proj.permute(0, 2, 1).contiguous() + down_proj = self.down_proj.permute(0, 2, 1).contiguous() + permuted_hidden_states, row_ids_map = torch_npu.npu_moe_token_permute( + hidden_states, selected_experts.to(torch.int32) + ) + tokens_per_expert = torch.histc(selected_experts, bins=self.num_experts, min=0, max=self.num_experts) + intermediate_hidden_states = NPUGmmFunction.apply(permuted_hidden_states, gate_up_proj, tokens_per_expert) + intermediate_activations = torch_npu.npu_swiglu(intermediate_hidden_states, dim=-1) + output = NPUGmmFunction.apply(intermediate_activations, down_proj, tokens_per_expert) + final_hidden_states = torch_npu.npu_moe_token_unpermute( + output.to(routing_weights.dtype), row_ids_map, probs=routing_weights + ) + return final_hidden_states.to(hidden_states.dtype) + + # Patches for Qwen2 Model modeling_qwen2.Qwen2RMSNorm.forward = rms_norm_forward_npu modeling_qwen2.Qwen2MLP.forward = silu_forward_npu @@ -318,3 +343,14 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: modeling_qwen3_next.Qwen3NextRMSNormGated.forward = qwen3_next_rms_norm_forward_gated_npu modeling_qwen3_next.Qwen3NextRMSNorm.forward = qwen3_next_rms_norm_forward_npu modeling_qwen3_next.apply_rotary_pos_emb = qwen3_next_apply_rotary_pos_emb_npu + +# Patches for Qwen3.5 Model +modeling_qwen3_5.Qwen3_5RMSNormGated.forward = qwen3_next_rms_norm_forward_gated_npu +modeling_qwen3_5.Qwen3_5RMSNorm.forward = qwen3_next_rms_norm_forward_npu +modeling_qwen3_5.apply_rotary_pos_emb = qwen3_next_apply_rotary_pos_emb_npu + +# Patches for Qwen3.5 MoE Model +modeling_qwen3_5_moe.Qwen3_5MoeExperts.forward = qwen3_5_moe_experts_forward_npu +modeling_qwen3_5_moe.Qwen3_5MoeRMSNormGated.forward = qwen3_next_rms_norm_forward_gated_npu +modeling_qwen3_5_moe.Qwen3_5MoeRMSNorm.forward = qwen3_next_rms_norm_forward_npu 
+modeling_qwen3_5_moe.apply_rotary_pos_emb = qwen3_next_apply_rotary_pos_emb_npu diff --git a/verl/models/transformers/qwen2_vl.py b/verl/models/transformers/qwen2_vl.py index 5e82fdd4dd4..448d7938b4f 100644 --- a/verl/models/transformers/qwen2_vl.py +++ b/verl/models/transformers/qwen2_vl.py @@ -29,7 +29,7 @@ from transformers.utils import is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10 from verl.utils.device import is_npu_available -from verl.utils.transformers_compat import is_transformers_version_in_range +from verl.utils.transformers_compat import is_transformers_version_in_range, unpack_visual_output from verl.utils.ulysses import ( gather_heads_scatter_seq, gather_seq_scatter_heads, @@ -286,9 +286,16 @@ def qwen2_vl_attn_forward( # Because the input can be padded, the absolute sequence length depends on the max position id. cos, sin = position_embeddings - query_states, key_states = apply_multimodal_rotary_pos_emb( - query_states, key_states, cos, sin, self.rope_scaling["mrope_section"] - ) + if getattr(self, "rope_scaling", None) is not None: + # for transformers < 5.0.0 + mrope_section = self.rope_scaling.get("mrope_section", None) + else: + # for transformers >= 5.0.0, only rope_parameters present in the config + assert getattr(self, "rope_parameters", None) is not None, ( + "Either rope_scaling or rope_parameters should be defined in the config for Qwen2 VL." 
+ ) + mrope_section = self.rope_parameters.get("mrope_section", None) + query_states, key_states = apply_multimodal_rotary_pos_emb(query_states, key_states, cos, sin, mrope_section) key_states = repeat_kv(key_states, self.num_key_value_groups) value_states = repeat_kv(value_states, self.num_key_value_groups) dropout_rate = 0.0 if not self.training else self.attention_dropout @@ -344,7 +351,7 @@ def _get_input_embeds( inputs_embeds = model.get_input_embeddings()(input_ids) if pixel_values is not None: pixel_values = pixel_values.type(model.visual.dtype) - image_embeds = model.visual(pixel_values, grid_thw=image_grid_thw) + image_embeds, _ = unpack_visual_output(model.visual(pixel_values, grid_thw=image_grid_thw)) n_image_tokens = (input_ids == model.config.image_token_id).sum().item() n_image_features = image_embeds.shape[0] if n_image_tokens != n_image_features: @@ -362,7 +369,7 @@ def _get_input_embeds( if pixel_values_videos is not None: pixel_values_videos = pixel_values_videos.type(model.visual.dtype) - video_embeds = model.visual(pixel_values_videos, grid_thw=video_grid_thw) + video_embeds, _ = unpack_visual_output(model.visual(pixel_values_videos, grid_thw=video_grid_thw)) n_video_tokens = (input_ids == model.config.video_token_id).sum().item() n_video_features = video_embeds.shape[0] if n_video_tokens != n_video_features: @@ -384,7 +391,7 @@ def _get_input_embeds( pixel_values = torch.zeros((16, patch_dim), dtype=inputs_embeds.dtype, device=inputs_embeds.device) image_grid_thw = torch.tensor([[1, 4, 4]], dtype=torch.long, device=inputs_embeds.device) image_embeds = model.visual(pixel_values, grid_thw=image_grid_thw) - inputs_embeds += 0.0 * image_embeds.mean() + inputs_embeds = inputs_embeds + 0.0 * image_embeds.mean() if attention_mask is not None: attention_mask = attention_mask.to(inputs_embeds.device) diff --git a/verl/models/transformers/qwen3_5.py b/verl/models/transformers/qwen3_5.py new file mode 100644 index 00000000000..3d1f8153ea6 --- /dev/null 
+++ b/verl/models/transformers/qwen3_5.py @@ -0,0 +1,262 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +from dataclasses import dataclass +from typing import Optional + +import torch +from transformers.models.qwen3_5.modeling_qwen3_5 import ( + Qwen3_5CausalLMOutputWithPast, + Qwen3_5ForConditionalGeneration, +) + +logger = logging.getLogger(__file__) +logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) + + +def fast_pos_embed_interpolate(self, grid_thw): + grid_thw_list = grid_thw.tolist() + grid_ts = [row[0] for row in grid_thw_list] + grid_hs = [row[1] for row in grid_thw_list] + grid_ws = [row[2] for row in grid_thw_list] + # Modification: # Get device from grid_thw to avoid self.pos_embed being on CPU when FSDP2 enables cpu_offload + device = grid_thw.device + + idx_list = [[] for _ in range(4)] + weight_list = [[] for _ in range(4)] + + for t, h, w in grid_thw_list: + h_idxs = torch.linspace(0, self.num_grid_per_side - 1, h) + w_idxs = torch.linspace(0, self.num_grid_per_side - 1, w) + + h_idxs_floor = h_idxs.int() + w_idxs_floor = w_idxs.int() + h_idxs_ceil = (h_idxs.int() + 1).clip(max=self.num_grid_per_side - 1) + w_idxs_ceil = (w_idxs.int() + 1).clip(max=self.num_grid_per_side - 1) + + dh = h_idxs - h_idxs_floor + dw = w_idxs - w_idxs_floor + + base_h = h_idxs_floor * self.num_grid_per_side + base_h_ceil = h_idxs_ceil * self.num_grid_per_side + + indices = [ + 
(base_h[None].T + w_idxs_floor[None]).flatten(), + (base_h[None].T + w_idxs_ceil[None]).flatten(), + (base_h_ceil[None].T + w_idxs_floor[None]).flatten(), + (base_h_ceil[None].T + w_idxs_ceil[None]).flatten(), + ] + + weights = [ + ((1 - dh)[None].T * (1 - dw)[None]).flatten(), + ((1 - dh)[None].T * dw[None]).flatten(), + (dh[None].T * (1 - dw)[None]).flatten(), + (dh[None].T * dw[None]).flatten(), + ] + + for i in range(4): + idx_list[i].extend(indices[i].tolist()) + weight_list[i].extend(weights[i].tolist()) + + idx_tensor = torch.tensor(idx_list, dtype=torch.long, device=device) + weight_tensor = torch.tensor(weight_list, dtype=self.pos_embed.weight.dtype, device=device) + pos_embeds = self.pos_embed(idx_tensor).to(device) * weight_tensor[:, :, None] + patch_pos_embeds = pos_embeds[0] + pos_embeds[1] + pos_embeds[2] + pos_embeds[3] + + patch_pos_embeds = patch_pos_embeds.split([h * w for h, w in zip(grid_hs, grid_ws, strict=False)]) + + patch_pos_embeds_permute = [] + merge_size = self.config.spatial_merge_size + for pos_embed, t, h, w in zip(patch_pos_embeds, grid_ts, grid_hs, grid_ws, strict=False): + pos_embed = pos_embed.repeat(t, 1) + pos_embed = ( + pos_embed.view(t, h // merge_size, merge_size, w // merge_size, merge_size, -1) + .permute(0, 1, 3, 2, 4, 5) + .flatten(0, 4) + ) + patch_pos_embeds_permute.append(pos_embed) + patch_pos_embeds = torch.cat(patch_pos_embeds_permute) + return patch_pos_embeds + + +def _get_input_embeds( + model: "Qwen3_5ForConditionalGeneration", + input_ids: torch.LongTensor, + attention_mask: Optional[torch.Tensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, +): + inputs_embeds = model.get_input_embeddings()(input_ids) + if pixel_values is not None: + pixel_values = pixel_values.type(model.visual.dtype) + image_embeds = model.visual(pixel_values,
grid_thw=image_grid_thw).pooler_output + n_image_tokens = (input_ids == model.config.image_token_id).sum().item() + n_image_features = image_embeds.shape[0] + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + + mask = input_ids == model.config.image_token_id + mask_unsqueezed = mask.unsqueeze(-1) + mask_expanded = mask_unsqueezed.expand_as(inputs_embeds) + image_mask = mask_expanded.to(inputs_embeds.device) + + image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds) + + if pixel_values_videos is not None: + pixel_values_videos = pixel_values_videos.type(model.visual.dtype) + video_embeds = model.visual(pixel_values_videos, grid_thw=video_grid_thw).pooler_output + n_video_tokens = (input_ids == model.config.video_token_id).sum().item() + n_video_features = video_embeds.shape[0] + if n_video_tokens != n_video_features: + raise ValueError( + f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}" + ) + + mask = input_ids == model.config.video_token_id + mask_unsqueezed = mask.unsqueeze(-1) + mask_expanded = mask_unsqueezed.expand_as(inputs_embeds) + video_mask = mask_expanded.to(inputs_embeds.device) + + video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds) + + if pixel_values is None and pixel_values_videos is None: + config = model.config.vision_config + patch_dim = config.in_channels * config.temporal_patch_size * config.patch_size**2 + pixel_values = torch.zeros((16, patch_dim), dtype=inputs_embeds.dtype, device=inputs_embeds.device) + image_grid_thw = torch.tensor([[1, 4, 4]], dtype=torch.long, device=inputs_embeds.device) + image_embeds = model.visual(pixel_values, grid_thw=image_grid_thw).pooler_output + 
inputs_embeds = inputs_embeds + 0.0 * image_embeds.mean() + + if attention_mask is not None: + attention_mask = attention_mask.to(inputs_embeds.device) + + return {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask} + + +def qwen3_5_base_forward( + self: "Qwen3_5ForConditionalGeneration", + input_ids: torch.LongTensor, + attention_mask: Optional[torch.Tensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_values_videos: Optional[torch.FloatTensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + **kwargs, +): + input_kwargs = _get_input_embeds( + self, input_ids, attention_mask, pixel_values, pixel_values_videos, image_grid_thw, video_grid_thw + ) # avoid lora module having multiple keyword arguments + kwargs.update(input_kwargs) + return self.language_model( + input_ids=None, + **kwargs, + ) + + +@dataclass +class Qwen3_5CausalLMOutputForPPO(Qwen3_5CausalLMOutputWithPast): + log_probs: Optional[torch.FloatTensor] = None + entropy: Optional[torch.FloatTensor] = None + + +def forward_with_normal_backend( + self: "Qwen3_5ForConditionalGeneration", + input_ids: torch.LongTensor = None, + labels: Optional[torch.LongTensor] = None, + temperature: float = 1.0, + **kwargs, +) -> "Qwen3_5CausalLMOutputForPPO": + outputs = self.model(input_ids, **kwargs) + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + return Qwen3_5CausalLMOutputForPPO( + logits=logits, + hidden_states=outputs.hidden_states, + ) + + +def forward_with_torch_backend( + self: "Qwen3_5ForConditionalGeneration", + input_ids: torch.LongTensor = None, + labels: Optional[torch.LongTensor] = None, + temperature: float = 1.0, + **kwargs, +) -> "Qwen3_5CausalLMOutputForPPO": + from verl.utils.experimental.torch_functional import FusedLinearForPPO + + outputs = self.model(input_ids, **kwargs) + hidden_states = outputs[0] + + # Loss calculations + if labels is not None: + rolled_labels = 
torch.roll(labels, shifts=-1, dims=-1) + elif input_ids is not None: + rolled_labels = torch.roll(input_ids, shifts=-1, dims=-1) + else: + raise RuntimeError("To use forward_with_torch_backend, either labels or input_ids must be provided.") + + fused_linear_for_ppo = FusedLinearForPPO() + log_probs, entropy = fused_linear_for_ppo.forward( + hidden_states=hidden_states, + vocab_weights=self.lm_head.weight, + input_ids=rolled_labels, + temperature=temperature, + ) + return Qwen3_5CausalLMOutputForPPO( + log_probs=log_probs, + entropy=entropy, + hidden_states=outputs.hidden_states, + ) + + +def forward_with_triton_backend( + self: "Qwen3_5ForConditionalGeneration", + input_ids: torch.LongTensor = None, + labels: Optional[torch.LongTensor] = None, + temperature: float = 1.0, + **kwargs, +) -> "Qwen3_5CausalLMOutputForPPO": + from verl.utils.kernel.linear_cross_entropy import linear_cross_entropy + + outputs = self.model(input_ids, **kwargs) + hidden_states = outputs[0] + + # Loss calculations + if labels is not None: + rolled_labels = torch.roll(labels, shifts=-1, dims=-1) + elif input_ids is not None: + rolled_labels = torch.roll(input_ids, shifts=-1, dims=-1) + else: + raise RuntimeError("To use forward_with_triton_backend, either labels or input_ids must be provided.") + + log_probs, entropy = linear_cross_entropy( + hidden_states, + self.lm_head.weight, + rolled_labels, + temperature, + "none", + ) + return Qwen3_5CausalLMOutputForPPO( + log_probs=log_probs, + entropy=entropy, + hidden_states=outputs.hidden_states, + ) diff --git a/verl/models/transformers/qwen3_vl.py b/verl/models/transformers/qwen3_vl.py index 972848a1a08..f385d716d52 100644 --- a/verl/models/transformers/qwen3_vl.py +++ b/verl/models/transformers/qwen3_vl.py @@ -24,6 +24,8 @@ Qwen3VLForConditionalGeneration, ) +from verl.utils.transformers_compat import unpack_visual_output + logger = logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) @@ -147,7 +149,7 @@ def 
_get_input_embeds( image_mask, video_mask = None, None if pixel_values is not None: pixel_values = pixel_values.type(model.visual.dtype) - image_embeds, deepstack_image_embeds = model.visual(pixel_values, grid_thw=image_grid_thw) + image_embeds, deepstack_image_embeds = unpack_visual_output(model.visual(pixel_values, grid_thw=image_grid_thw)) n_image_tokens = (input_ids == model.config.image_token_id).sum().item() n_image_features = image_embeds.shape[0] if n_image_tokens != n_image_features: @@ -165,7 +167,9 @@ def _get_input_embeds( if pixel_values_videos is not None: pixel_values_videos = pixel_values_videos.type(model.visual.dtype) - video_embeds, deepstack_video_embeds = model.visual(pixel_values_videos, grid_thw=video_grid_thw) + video_embeds, deepstack_video_embeds = unpack_visual_output( + model.visual(pixel_values_videos, grid_thw=video_grid_thw) + ) n_video_tokens = (input_ids == model.config.video_token_id).sum().item() n_video_features = video_embeds.shape[0] if n_video_tokens != n_video_features: @@ -210,10 +214,12 @@ def _get_input_embeds( patch_dim = config.in_channels * config.temporal_patch_size * config.patch_size**2 pixel_values = torch.zeros((16, patch_dim), dtype=inputs_embeds.dtype, device=inputs_embeds.device) image_grid_thw = torch.tensor([[1, 4, 4]], dtype=torch.long, device=inputs_embeds.device) - image_embeds, dummy_deepstack_image_embeds = model.visual(pixel_values, grid_thw=image_grid_thw) - inputs_embeds += 0.0 * image_embeds.mean() + image_embeds, dummy_deepstack_image_embeds = unpack_visual_output( + model.visual(pixel_values, grid_thw=image_grid_thw) + ) + inputs_embeds = inputs_embeds + 0.0 * image_embeds.mean() for emb in dummy_deepstack_image_embeds or []: - inputs_embeds += 0.0 * emb.mean() + inputs_embeds = inputs_embeds + 0.0 * emb.mean() if attention_mask is not None: attention_mask = attention_mask.to(inputs_embeds.device) diff --git a/verl/single_controller/ray/base.py b/verl/single_controller/ray/base.py index 
2f6ee47064f..2ab1f406ad0 100644 --- a/verl/single_controller/ray/base.py +++ b/verl/single_controller/ray/base.py @@ -28,7 +28,7 @@ from verl.protocol import DataProto, _padding_size_key from verl.single_controller.base import ClassWithInitArgs, ResourcePool, Worker, WorkerGroup from verl.single_controller.base.decorator import MAGIC_ATTR, Dispatch -from verl.utils.device import get_device_name +from verl.utils.device import get_device_name, is_torch_npu_available from verl.utils.py_functional import temp_env_var __all__ = ["Worker"] @@ -294,7 +294,8 @@ def split_resource_pool( start_bundle_idx_list = np.cumsum([0] + split_size_list[:-1]) # ensure resource_pool.pgs has been initialized - placement_groups = resource_pool.get_placement_groups() + device = "npu" if is_torch_npu_available(check_device=False) else "cuda" + placement_groups = resource_pool.get_placement_groups(device_name=device) split_resource_pools = [ SubRayResourcePool( process_on_nodes=resource_pool.store, diff --git a/verl/trainer/config/_generated_diffusion_trainer.yaml b/verl/trainer/config/_generated_diffusion_trainer.yaml new file mode 100644 index 00000000000..10640606970 --- /dev/null +++ b/verl/trainer/config/_generated_diffusion_trainer.yaml @@ -0,0 +1,665 @@ +# This reference configration yaml is automatically generated via 'scripts/generate_trainer_config.sh' +# in which it invokes 'python3 scripts/print_cfg.py --cfg job --config-name=diffusion_trainer.yaml' to flatten the 'verl/trainer/config/diffusion_trainer.yaml' config fields into a single file. +# Do not modify this file directly. +# The file is usually only for reference and never used. 
+ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + zero_indexed_step: true + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + qat: + _target_: verl.workers.config.QATEngineConfig + enable: false + mode: w4a16 + group_size: 16 + ignore_patterns: + - lm_head + - embed_tokens + - re:.*mlp.gate$ + activation_observer: static_minmax + quantization_config_path: null + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: null + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.0001 + clip_ratio_low: 0.2 + clip_ratio_high: 5.0 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: false + use_prefix_grouper: false + use_torch_compile: true + 
kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + mbridge_config: {} + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + qat: + enable: false + mode: w4a16 + group_size: 16 + ignore_patterns: + - lm_head + - embed_tokens + - re:.*mlp.gate$ + activation_observer: static_minmax + quantization_config_path: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: 
${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: null + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + qat: + _target_: verl.workers.config.QATEngineConfig + enable: false + mode: w4a16 + group_size: 16 + ignore_patterns: + - lm_head + - 
embed_tokens + - re:.*mlp.gate$ + activation_observer: static_minmax + quantization_config_path: null + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.DiffusionRolloutConfig + name: ??? + mode: async + nnodes: 0 + n_gpus_per_node: ${oc.select:trainer.n_gpus_per_node,8} + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.5 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 2 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + layered_summon: false + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: null + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 1 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + vllm_omni: {} + val_kwargs: + _target_: verl.workers.config.DiffusionSamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + num_inference_steps: 40 + noise_level: 0.0 + seed: 42 + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 
256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + custom_backend_module: null + trace: + _target_: verl.workers.config.TraceConfig + project_name: ${oc.select:trainer.project_name,null} + experiment_name: ${oc.select:trainer.experiment_name,null} + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip: + _target_: verl.workers.config.SkipConfig + enable: false + dump_dir: ~/.verl/rollout_dump + max_dump_step: 1 + action: cache + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: ${oc.select:actor_rollout_ref.actor.profiler.tool_config.npu.contents,[]} + level: ${oc.select:actor_rollout_ref.actor.profiler.tool_config.npu.level,level0} + analysis: ${oc.select:actor_rollout_ref.actor.profiler.tool_config.npu.analysis,false} + discrete: ${oc.select:actor_rollout_ref.actor.profiler.tool_config.npu.discrete,false} + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: 
${oc.select:actor_rollout_ref.actor.profiler.tool_config.torch.contents,[]} + discrete: ${oc.select:actor_rollout_ref.actor.profiler.tool_config.torch.discrete,false} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + speculative_decoding: + _target_: verl.workers.config.rollout.SpeculativeDecodingConfig + enable: false + method: eagle3 + num_steps: 1 + num_draft_tokens: 4 + draft_model_path: null + draft_tensor_parallel_size: 1 + qat: ${oc.select:actor_rollout_ref.actor.fsdp_config.qat,${oc.select:actor_rollout_ref.actor.megatron.qat,null}} + height: 512 + width: 512 + num_inference_steps: 10 + model: + _target_: verl.workers.config.DiffusionModelConfig + path: ~/models/Qwen/Qwen-Image + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + enable_gradient_checkpointing: true + lora_rank: 32 + lora_alpha: 64 + lora_init_weights: gaussian + target_modules: all-linear + target_parameters: null + exclude_modules: null + lora_adapter_path: null + height: ${oc.select:actor_rollout_ref.rollout.height,512} + width: ${oc.select:actor_rollout_ref.rollout.width,512} + num_inference_steps: ${oc.select:actor_rollout_ref.rollout.num_inference_steps,10} + extra_configs: {} + model_type: diffusion_model + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: ~/data/rlhf/gsm8k/train.parquet + val_files: ~/data/rlhf/gsm8k/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 512 + max_response_length: 512 + train_batch_size: 1024 + val_batch_size: null + tool_config_path: 
${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: false + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} + data_source: prompt +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + zero_indexed_step: true + warmup_style: null + override_optimizer_config: null + fsdp: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + qat: + _target_: verl.workers.config.QATEngineConfig + enable: false + mode: w4a16 + group_size: 16 + ignore_patterns: + - lm_head + - embed_tokens + - re:.*mlp.gate$ + activation_observer: static_minmax + quantization_config_path: null + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: 
${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + mbridge_config: {} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +custom_reward_function: + path: null + name: null +reward_model: + num_workers: null + reward_manager: null + enable: null + 
enable_resource_pool: null + n_gpus_per_node: null + nnodes: null + reward_loop_source: null + reward_loop_module_path: null + reward_loop_class_name: null + model: + path: null + external_lib: null + trust_remote_code: null + rollout: + name: null + dtype: null + gpu_memory_utilization: null + enforce_eager: null + cudagraph_capture_sizes: null + free_cache_engine: null + data_parallel_size: null + expert_parallel_size: null + tensor_model_parallel_size: null + max_num_batched_tokens: null + max_model_len: null + max_num_seqs: null + load_format: null + engine_kwargs: null + limit_images: null + enable_chunked_prefill: null + enable_prefix_caching: null + disable_log_stats: null + skip_tokenizer_init: null + prompt_length: null + response_length: null +sandbox_fusion: + url: null + max_concurrent: null + memory_limit_mb: null +reward: + num_workers: 8 + custom_reward_function: + path: null + name: compute_score + reward_manager: + _target_: verl.workers.config.reward_model.RewardManagerConfig + source: register + name: naive + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager + reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + model_path: null + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? 
+ dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + norm_adv_by_std_in_grpo: true +trainer: + balance_batch: true + total_epochs: 30 + total_training_steps: null + project_name: verl_examples + experiment_name: ocr + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 8 + save_freq: -1 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: -1 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + max_actor_ckpt_to_keep: null + max_critic_ckpt_to_keep: null + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/verl/trainer/config/_generated_ppo_megatron_trainer.yaml b/verl/trainer/config/_generated_ppo_megatron_trainer.yaml index eb1459ef22a..aa74bb5cfd7 100644 --- a/verl/trainer/config/_generated_ppo_megatron_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_megatron_trainer.yaml @@ -1,5 +1,5 @@ # This reference 
configration yaml is automatically generated via 'scripts/generate_trainer_config.sh' -# in which it invokes 'python3 scripts/print_cfg.py --cfg job --config-name=ppo_megatron_trainer.yaml' to flatten the 'verl/trainer/config/ppo_megatron_trainer.yaml' config fields into a single file. +# in which it invokes 'python3 scripts/print_cfg.py --cfg job model_engine=megatron' to flatten the 'verl/trainer/config/ppo_trainer.yaml' config fields into a single file. # Do not modify this file directly. # The file is usually only for reference and never used. @@ -41,6 +41,8 @@ actor_rollout_ref: use_distributed_optimizer: true use_dist_checkpointing: false dist_checkpointing_path: null + dynamic_context_parallel: false + max_seqlen_per_dp_cp_rank: null dist_checkpointing_prefix: '' dist_ckpt_optim_fully_reshardable: false distrib_optim_fully_reshardable_mem_efficient: false @@ -213,6 +215,8 @@ actor_rollout_ref: use_distributed_optimizer: true use_dist_checkpointing: false dist_checkpointing_path: null + dynamic_context_parallel: false + max_seqlen_per_dp_cp_rank: null dist_checkpointing_prefix: '' dist_ckpt_optim_fully_reshardable: false distrib_optim_fully_reshardable_mem_efficient: false @@ -271,6 +275,7 @@ actor_rollout_ref: logprobs_mode: processed_logprobs scheduling_policy: fcfs load_format: dummy + layered_summon: false log_prob_micro_batch_size: null log_prob_micro_batch_size_per_gpu: null log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} @@ -320,6 +325,7 @@ actor_rollout_ref: backend: naive update_weights_bucket_megabytes: 2048 engine_kwargs: {} + custom_backend_module: null trace: _target_: verl.workers.config.TraceConfig project_name: ${oc.select:trainer.project_name,null} @@ -327,8 +333,12 @@ actor_rollout_ref: backend: null token2text: false max_samples_per_step_per_worker: null - skip_rollout: false - skip_dump_dir: /tmp/rollout_dump + skip: + _target_: verl.workers.config.SkipConfig + enable: false + dump_dir: 
~/.verl/rollout_dump + max_dump_step: 1 + action: cache skip_tokenizer_init: true enable_rollout_routing_replay: false profiler: @@ -367,9 +377,6 @@ actor_rollout_ref: draft_model_path: null draft_tensor_parallel_size: 1 qat: ${oc.select:actor_rollout_ref.actor.fsdp_config.qat,${oc.select:actor_rollout_ref.actor.megatron.qat,null}} - layer_name_map: - qkv_layer_name: qkv - gate_proj_layer_name: gate_up model: _target_: verl.workers.config.HFModelConfig path: ~/models/deepseek-llm-7b-chat @@ -379,13 +386,10 @@ actor_rollout_ref: trust_remote_code: false custom_chat_template: null external_lib: null - override_config: - model_config: {} - moe_config: - freeze_moe_router: false + override_config: {} enable_gradient_checkpointing: true enable_activation_offload: false - use_remove_padding: false + use_remove_padding: true lora_rank: 0 lora_alpha: 16 target_modules: all-linear @@ -511,6 +515,8 @@ critic: use_distributed_optimizer: true use_dist_checkpointing: false dist_checkpointing_path: null + dynamic_context_parallel: false + max_seqlen_per_dp_cp_rank: null dist_checkpointing_prefix: '' dist_ckpt_optim_fully_reshardable: false distrib_optim_fully_reshardable_mem_efficient: false @@ -548,36 +554,6 @@ critic: rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} strategy: megatron enable: null - model: - path: ~/models/deepseek-llm-7b-chat - tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} - override_config: - model_config: {} - moe_config: - freeze_moe_router: false - external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} - trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} - _target_: verl.trainer.config.BaseModelConfig - lora: - type: lora - rank: 0 - alpha: 32 - dropout: 0.0 - target_modules: - - linear_qkv - - linear_proj - - linear_fc1 - - linear_fc2 - exclude_modules: [] - dropout_position: pre - lora_A_init_method: xavier - lora_B_init_method: zero - a2a_experimental: false - 
dtype: null - adapter_path: null - freeze_vision_model: true - freeze_vision_projection: true - freeze_language_model: true ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} ppo_micro_batch_size: null ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} @@ -625,6 +601,65 @@ critic: stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} nccl_timeout: 600 load_weight: true + model: + _target_: verl.workers.config.HFModelConfig + path: ~/models/deepseek-llm-7b-chat + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + lora: + type: lora + merge: false + rank: 0 + alpha: 32 + dropout: 0.0 + target_modules: + - linear_qkv + - linear_proj + - linear_fc1 + - linear_fc2 + exclude_modules: [] + dropout_position: pre + lora_A_init_method: xavier + lora_B_init_method: zero + a2a_experimental: false + dtype: null + adapter_path: null + freeze_vision_model: true + freeze_vision_projection: true + freeze_language_model: true custom_reward_function: path: null name: null @@ -797,24 +832,26 @@ trainer: - console - wandb log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null nnodes: 1 n_gpus_per_node: 8 
save_freq: -1 esi_redundant_time: 0 resume_mode: auto resume_from_path: null - del_local_ckpt_after_load: false val_before_train: true + val_only: false test_freq: -1 critic_warmup: 0 default_hdfs_dir: null + del_local_ckpt_after_load: false default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} max_actor_ckpt_to_keep: null max_critic_ckpt_to_keep: null ray_wait_register_center_timeout: 300 device: cuda - rollout_data_dir: null - use_legacy_worker_impl: auto + use_legacy_worker_impl: disable global_profiler: _target_: verl.utils.profiler.ProfilerConfig tool: null @@ -823,6 +860,7 @@ global_profiler: save_path: outputs/profile global_tool_config: nsys: + _target_: verl.utils.profiler.config.NsightToolConfig discrete: false controller_nsight_options: trace: cuda,nvtx,cublas,ucx @@ -841,6 +879,14 @@ global_profiler: context: all stacks: all kw_args: {} + precision_debugger: + _target_: verl.utils.profiler.config.PrecisionDebuggerToolConfig + enable: false + config_path: null + data_dir: outputs/precision_debug + steps: null + stages: null + strict: false transfer_queue: enable: false ray_kwargs: diff --git a/verl/trainer/config/_generated_ppo_torchtitan_trainer.yaml b/verl/trainer/config/_generated_ppo_torchtitan_trainer.yaml index 818ae7ef40d..a6b8dd9ecf1 100644 --- a/verl/trainer/config/_generated_ppo_torchtitan_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_torchtitan_trainer.yaml @@ -242,6 +242,7 @@ actor_rollout_ref: logprobs_mode: processed_logprobs scheduling_policy: fcfs load_format: dummy + layered_summon: false log_prob_micro_batch_size: null log_prob_micro_batch_size_per_gpu: null log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} @@ -291,6 +292,7 @@ actor_rollout_ref: backend: naive update_weights_bucket_megabytes: 2048 engine_kwargs: {} + custom_backend_module: null trace: _target_: verl.workers.config.TraceConfig project_name: ${oc.select:trainer.project_name,null} @@ -298,8 +300,12 @@ 
actor_rollout_ref: backend: null token2text: false max_samples_per_step_per_worker: null - skip_rollout: false - skip_dump_dir: /tmp/rollout_dump + skip: + _target_: verl.workers.config.SkipConfig + enable: false + dump_dir: ~/.verl/rollout_dump + max_dump_step: 1 + action: cache skip_tokenizer_init: true enable_rollout_routing_replay: false profiler: @@ -338,7 +344,6 @@ actor_rollout_ref: draft_model_path: null draft_tensor_parallel_size: 1 qat: ${oc.select:actor_rollout_ref.actor.fsdp_config.qat,${oc.select:actor_rollout_ref.actor.megatron.qat,null}} - layered_summon: false model: _target_: verl.workers.config.HFModelConfig path: ~/models/deepseek-llm-7b-chat @@ -377,6 +382,27 @@ actor_rollout_ref: speculative_num_draft_tokens: 4 method: mtp num_speculative_tokens: 1 + lora: + type: lora + merge: false + rank: 0 + alpha: 32 + dropout: 0.0 + target_modules: + - linear_qkv + - linear_proj + - linear_fc1 + - linear_fc2 + exclude_modules: [] + dropout_position: pre + lora_A_init_method: xavier + lora_B_init_method: zero + a2a_experimental: false + dtype: null + adapter_path: null + freeze_vision_model: true + freeze_vision_projection: true + freeze_language_model: true hybrid_engine: true nccl_timeout: 600 data: @@ -466,13 +492,6 @@ critic: rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} strategy: torchtitan enable: null - model: - path: ~/models/deepseek-llm-7b-chat - tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} - override_config: {} - external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} - trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} - _target_: verl.trainer.config.BaseModelConfig ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} ppo_micro_batch_size: null ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} @@ -518,6 +537,65 @@ critic: _target_: verl.utils.profiler.config.TorchMemoryToolConfig 
trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + model: + _target_: verl.workers.config.HFModelConfig + path: ~/models/deepseek-llm-7b-chat + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + lora: + type: lora + merge: false + rank: 0 + alpha: 32 + dropout: 0.0 + target_modules: + - linear_qkv + - linear_proj + - linear_fc1 + - linear_fc2 + exclude_modules: [] + dropout_position: pre + lora_A_init_method: xavier + lora_B_init_method: zero + a2a_experimental: false + dtype: null + adapter_path: null + freeze_vision_model: true + freeze_vision_projection: true + freeze_language_model: true custom_reward_function: path: null name: null @@ -709,7 +787,7 @@ trainer: max_critic_ckpt_to_keep: null ray_wait_register_center_timeout: 300 device: cuda - use_legacy_worker_impl: auto + use_legacy_worker_impl: disable global_profiler: _target_: verl.utils.profiler.ProfilerConfig tool: null @@ -737,6 +815,14 @@ global_profiler: context: all stacks: all kw_args: {} + precision_debugger: + _target_: 
verl.utils.profiler.config.PrecisionDebuggerToolConfig + enable: false + config_path: null + data_dir: outputs/precision_debug + steps: null + stages: null + strict: false transfer_queue: enable: false ray_kwargs: diff --git a/verl/trainer/config/_generated_ppo_trainer.yaml b/verl/trainer/config/_generated_ppo_trainer.yaml index 53cc10e5967..9d2ed2858e7 100644 --- a/verl/trainer/config/_generated_ppo_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_trainer.yaml @@ -251,6 +251,7 @@ actor_rollout_ref: logprobs_mode: processed_logprobs scheduling_policy: fcfs load_format: dummy + layered_summon: false log_prob_micro_batch_size: null log_prob_micro_batch_size_per_gpu: null log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} @@ -300,6 +301,7 @@ actor_rollout_ref: backend: naive update_weights_bucket_megabytes: 2048 engine_kwargs: {} + custom_backend_module: null trace: _target_: verl.workers.config.TraceConfig project_name: ${oc.select:trainer.project_name,null} @@ -307,8 +309,12 @@ actor_rollout_ref: backend: null token2text: false max_samples_per_step_per_worker: null - skip_rollout: false - skip_dump_dir: /tmp/rollout_dump + skip: + _target_: verl.workers.config.SkipConfig + enable: false + dump_dir: ~/.verl/rollout_dump + max_dump_step: 1 + action: cache skip_tokenizer_init: true enable_rollout_routing_replay: false profiler: @@ -347,7 +353,6 @@ actor_rollout_ref: draft_model_path: null draft_tensor_parallel_size: 1 qat: ${oc.select:actor_rollout_ref.actor.fsdp_config.qat,${oc.select:actor_rollout_ref.actor.megatron.qat,null}} - layered_summon: false model: _target_: verl.workers.config.HFModelConfig path: ~/models/deepseek-llm-7b-chat @@ -386,6 +391,27 @@ actor_rollout_ref: speculative_num_draft_tokens: 4 method: mtp num_speculative_tokens: 1 + lora: + type: lora + merge: false + rank: 0 + alpha: 32 + dropout: 0.0 + target_modules: + - linear_qkv + - linear_proj + - linear_fc1 + - linear_fc2 + exclude_modules: [] + 
dropout_position: pre + lora_A_init_method: xavier + lora_B_init_method: zero + a2a_experimental: false + dtype: null + adapter_path: null + freeze_vision_model: true + freeze_vision_projection: true + freeze_language_model: true hybrid_engine: true nccl_timeout: 600 data: @@ -448,55 +474,38 @@ critic: zero_indexed_step: true warmup_style: null override_optimizer_config: null - model: - fsdp_config: - _target_: verl.workers.config.FSDPEngineConfig - wrap_policy: - min_num_params: 0 - param_offload: false - optimizer_offload: false - offload_policy: false - reshard_after_forward: true - fsdp_size: -1 - forward_prefetch: false - model_dtype: fp32 - use_orig_params: false - seed: 42 - full_determinism: false - ulysses_sequence_parallel_size: 1 - entropy_from_logits_with_chunking: false - use_torch_compile: true - entropy_checkpointing: false - forward_only: false - strategy: fsdp - dtype: bfloat16 - qat: - _target_: verl.workers.config.QATEngineConfig - enable: false - mode: w4a16 - group_size: 16 - ignore_patterns: - - lm_head - - embed_tokens - - re:.*mlp.gate$ - activation_observer: static_minmax - quantization_config_path: null - path: ~/models/deepseek-llm-7b-chat - tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} - override_config: {} - external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} - trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} - _target_: verl.workers.config.FSDPCriticModelCfg - use_shm: false - enable_gradient_checkpointing: true - enable_activation_offload: false - use_remove_padding: false - lora_rank: 0 - lora_alpha: 16 - target_modules: all-linear - tiled_mlp: - enabled: false - num_shards: 4 + fsdp: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + 
use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + qat: + _target_: verl.workers.config.QATEngineConfig + enable: false + mode: w4a16 + group_size: 16 + ignore_patterns: + - lm_head + - embed_tokens + - re:.*mlp.gate$ + activation_observer: static_minmax + quantization_config_path: null _target_: verl.workers.config.FSDPCriticConfig rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} strategy: fsdp @@ -550,6 +559,65 @@ critic: forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} ulysses_sequence_parallel_size: 1 grad_clip: 1.0 + model: + _target_: verl.workers.config.HFModelConfig + path: ~/models/deepseek-llm-7b-chat + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + lora: + type: lora + merge: false + rank: 0 + alpha: 32 + dropout: 0.0 + target_modules: + - linear_qkv + - linear_proj + - linear_fc1 + - linear_fc2 + exclude_modules: [] + dropout_position: pre + lora_A_init_method: xavier + lora_B_init_method: zero + a2a_experimental: false + dtype: null + 
adapter_path: null + freeze_vision_model: true + freeze_vision_projection: true + freeze_language_model: true custom_reward_function: path: null name: null @@ -741,7 +809,7 @@ trainer: max_critic_ckpt_to_keep: null ray_wait_register_center_timeout: 300 device: cuda - use_legacy_worker_impl: auto + use_legacy_worker_impl: disable global_profiler: _target_: verl.utils.profiler.ProfilerConfig tool: null @@ -769,6 +837,14 @@ global_profiler: context: all stacks: all kw_args: {} + precision_debugger: + _target_: verl.utils.profiler.config.PrecisionDebuggerToolConfig + enable: false + config_path: null + data_dir: outputs/precision_debug + steps: null + stages: null + strict: false transfer_queue: enable: false ray_kwargs: diff --git a/verl/trainer/config/_generated_ppo_veomni_trainer.yaml b/verl/trainer/config/_generated_ppo_veomni_trainer.yaml index bc919191c90..a30d1c012b2 100644 --- a/verl/trainer/config/_generated_ppo_veomni_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_veomni_trainer.yaml @@ -221,6 +221,7 @@ actor_rollout_ref: logprobs_mode: processed_logprobs scheduling_policy: fcfs load_format: dummy + layered_summon: false log_prob_micro_batch_size: null log_prob_micro_batch_size_per_gpu: null log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} @@ -270,6 +271,7 @@ actor_rollout_ref: backend: naive update_weights_bucket_megabytes: 2048 engine_kwargs: {} + custom_backend_module: null trace: _target_: verl.workers.config.TraceConfig project_name: ${oc.select:trainer.project_name,null} @@ -277,8 +279,12 @@ actor_rollout_ref: backend: null token2text: false max_samples_per_step_per_worker: null - skip_rollout: false - skip_dump_dir: /tmp/rollout_dump + skip: + _target_: verl.workers.config.SkipConfig + enable: false + dump_dir: ~/.verl/rollout_dump + max_dump_step: 1 + action: cache skip_tokenizer_init: true enable_rollout_routing_replay: false profiler: @@ -317,7 +323,6 @@ actor_rollout_ref: draft_model_path: null 
draft_tensor_parallel_size: 1 qat: ${oc.select:actor_rollout_ref.actor.fsdp_config.qat,${oc.select:actor_rollout_ref.actor.megatron.qat,null}} - layered_summon: false model: _target_: verl.workers.config.HFModelConfig path: ~/models/deepseek-llm-7b-chat @@ -356,6 +361,27 @@ actor_rollout_ref: speculative_num_draft_tokens: 4 method: mtp num_speculative_tokens: 1 + lora: + type: lora + merge: false + rank: 0 + alpha: 32 + dropout: 0.0 + target_modules: + - linear_qkv + - linear_proj + - linear_fc1 + - linear_fc2 + exclude_modules: [] + dropout_position: pre + lora_A_init_method: xavier + lora_B_init_method: zero + a2a_experimental: false + dtype: null + adapter_path: null + freeze_vision_model: true + freeze_vision_projection: true + freeze_language_model: true hybrid_engine: true nccl_timeout: 600 data: @@ -443,13 +469,6 @@ critic: rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} strategy: veomni enable: null - model: - path: ~/models/deepseek-llm-7b-chat - tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} - override_config: {} - external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} - trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} - _target_: verl.trainer.config.BaseModelConfig ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} ppo_micro_batch_size: null ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} @@ -495,6 +514,65 @@ critic: _target_: verl.utils.profiler.config.TorchMemoryToolConfig trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + model: + _target_: verl.workers.config.HFModelConfig + path: ~/models/deepseek-llm-7b-chat + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + 
override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + lora: + type: lora + merge: false + rank: 0 + alpha: 32 + dropout: 0.0 + target_modules: + - linear_qkv + - linear_proj + - linear_fc1 + - linear_fc2 + exclude_modules: [] + dropout_position: pre + lora_A_init_method: xavier + lora_B_init_method: zero + a2a_experimental: false + dtype: null + adapter_path: null + freeze_vision_model: true + freeze_vision_projection: true + freeze_language_model: true custom_reward_function: path: null name: null @@ -686,7 +764,7 @@ trainer: max_critic_ckpt_to_keep: null ray_wait_register_center_timeout: 300 device: cuda - use_legacy_worker_impl: auto + use_legacy_worker_impl: disable global_profiler: _target_: verl.utils.profiler.ProfilerConfig tool: null @@ -714,6 +792,14 @@ global_profiler: context: all stacks: all kw_args: {} + precision_debugger: + _target_: verl.utils.profiler.config.PrecisionDebuggerToolConfig + enable: false + config_path: null + data_dir: outputs/precision_debug + steps: null + stages: null + strict: false transfer_queue: enable: false ray_kwargs: diff --git a/verl/trainer/config/actor/mindspeed_actor.yaml b/verl/trainer/config/actor/mindspeed_actor.yaml new file mode 100644 index 00000000000..00e70f66314 --- /dev/null +++ b/verl/trainer/config/actor/mindspeed_actor.yaml @@ -0,0 +1,18 @@ +# mindspeed actor config, 
inheriting from trainer/config/actor/actor.yaml +defaults: + - ../optim@optim: megatron + + - ../engine@mindspeed: + - megatron + - mindspeed + + - actor + + # load the reference default config, then apply the fields in the current yaml + - _self_ + +_target_: verl.workers.config.MindSpeedActorConfig + +strategy: mindspeed + +load_weight: True diff --git a/verl/trainer/config/algorithm.py b/verl/trainer/config/algorithm.py index a53fdd28394..a150ee63394 100644 --- a/verl/trainer/config/algorithm.py +++ b/verl/trainer/config/algorithm.py @@ -84,8 +84,10 @@ class RolloutCorrectionConfig(BaseConfig): - "sequence": Per-sequence IS weights (unbiased, high variance) Default: "sequence" - rollout_is_threshold (float): Upper threshold for IS weight truncation/rejection. + rollout_is_threshold (str | float): Threshold specification for IS weighting. Typical range: 1.5-5.0 for token level, 2.0-10.0 for sequence level. + - Single float or float-like string (e.g. ``2.0``): TIS, clamp weights to the upper bound + - ``"lower_upper"`` string (e.g. ``"0.5_5.0"``): IcePop, zero weights outside [lower, upper] Default: 2.0 rollout_is_batch_normalize (bool): Apply batch normalization to IS weights. @@ -171,7 +173,7 @@ class RolloutCorrectionConfig(BaseConfig): """ rollout_is: Optional[str] = "sequence" - rollout_is_threshold: float = 2.0 + rollout_is_threshold: str | float = 2.0 rollout_is_batch_normalize: bool = False rollout_rs: Optional[str] = None rollout_rs_threshold: Optional[str | float] = None @@ -206,6 +208,27 @@ def decoupled_seq_is(cls, threshold: float = 2.0) -> "RolloutCorrectionConfig": """ return cls(rollout_is="sequence", rollout_is_threshold=threshold, rollout_rs=None) + @classmethod + def decoupled_token_icepop( + cls, + threshold: float = 5.0, + threshold_lower: float = 0.5, + ) -> "RolloutCorrectionConfig": + """Decoupled Mode with exact token-level IcePop. + + Keeps response_mask unchanged and + zeroes token IS weights outside + [threshold_lower, threshold]. 
+ + Args: + threshold (float): Upper IcePop bound. Default: 5.0 + threshold_lower (float): Lower IcePop bound. Default: 0.5 + + Returns: + RolloutCorrectionConfig configured for decoupled mode with token-level IcePop + """ + return cls(rollout_is="token", rollout_is_threshold=f"{threshold_lower}_{threshold}", rollout_rs=None) + @classmethod def decoupled_seq_is_rs( cls, @@ -343,6 +366,32 @@ def bypass_pg_is(cls, threshold: float = 2.0) -> "RolloutCorrectionConfig": loss_type="reinforce", ) + @classmethod + def bypass_pg_token_icepop( + cls, + threshold: float = 5.0, + threshold_lower: float = 0.5, + ) -> "RolloutCorrectionConfig": + """Bypass mode with REINFORCE loss and exact token-level IcePop. + + Uses explicit IS weights in bypass mode and zeroes out token weights + outside [threshold_lower, threshold] without modifying response_mask. + + Args: + threshold (float): Upper IcePop bound. Default: 5.0 + threshold_lower (float): Lower IcePop bound. Default: 0.5 + + Returns: + RolloutCorrectionConfig configured for bypass mode with REINFORCE + token-level IcePop + """ + return cls( + rollout_is="token", + rollout_is_threshold=f"{threshold_lower}_{threshold}", + rollout_rs=None, + bypass_mode=True, + loss_type="reinforce", + ) + @classmethod def bypass_pg_geo_rs( cls, diff --git a/verl/trainer/config/algorithm/rollout_correction.yaml b/verl/trainer/config/algorithm/rollout_correction.yaml index 2fd95318453..60cc00e9265 100644 --- a/verl/trainer/config/algorithm/rollout_correction.yaml +++ b/verl/trainer/config/algorithm/rollout_correction.yaml @@ -5,7 +5,9 @@ # IS aggregation level: null (disabled), "token" (per-token), "sequence" (per-sequence) rollout_is: null -# Upper threshold for IS weight truncation (typical: 2.0-5.0) +# Threshold for IS weighting: +# - float / float-like string: TIS upper bound +# - "lower_upper" string: IcePop bounds rollout_is_threshold: 2.0 # RS aggregation level: null (disabled), e.g. 
"token_k1", "seq_sum_k1", "seq_mean_k3" diff --git a/verl/trainer/config/critic/critic.yaml b/verl/trainer/config/critic/critic.yaml index 19b4c3d62ee..6cc0be3325a 100644 --- a/verl/trainer/config/critic/critic.yaml +++ b/verl/trainer/config/critic/critic.yaml @@ -30,25 +30,6 @@ optim: # Prioritized. None, 0 or Negative values mean delegating to lr_warmup_steps_ratio. lr_warmup_steps: -1 - -# model config for the critic -model: - - # Path to pretrained model weights - path: ~/models/deepseek-llm-7b-chat - - # Tokenizer path (defaults to actor's model path) - tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} - - # Hugging Face config override - override_config: {} - - # External model implementation (optional) - external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} - - # Whether to trust remote code from Hugging Face models - trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} - # PPO mini-batch size per update ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} diff --git a/verl/trainer/config/critic/dp_critic.yaml b/verl/trainer/config/critic/dp_critic.yaml index 1cbaf03444a..49337419c9e 100644 --- a/verl/trainer/config/critic/dp_critic.yaml +++ b/verl/trainer/config/critic/dp_critic.yaml @@ -11,7 +11,7 @@ defaults: - ../optim@optim: fsdp # fsdp engine config - - ../engine@model.fsdp_config: fsdp + - ../engine@fsdp: fsdp # dp actor config, inheriting from trainer/config/critic/critic.yaml - critic @@ -25,42 +25,6 @@ _target_: verl.workers.config.FSDPCriticConfig # distribution strategy. 
Options: fsdp (deprecating), fsdp2 strategy: fsdp -# model config for the critic -model: - - # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs - _target_: verl.workers.config.FSDPCriticModelCfg - - # Whether to use shared memory for loading the model - use_shm: False - - # Enable gradient checkpointing to save memory - enable_gradient_checkpointing: True - - # Offload activations to CPU to reduce GPU memory usage - enable_activation_offload: False - - # Use remove padding optimization (saves compute) - use_remove_padding: False - - # Set to positive value to enable LoRA (e.g., 32) - lora_rank: 0 - - # LoRA scaling factor - lora_alpha: 16 - - # LoRA target modules: "all-linear" or list of linear projection layers - target_modules: all-linear - - # TiledMLP configuration for memory-efficient MLP computation. - tiled_mlp: - - # whether to enable TiledMLP - enabled: False - - # number of shards to split the input - num_shards: 4 - # Forward-only batch size during inference (global) forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} diff --git a/verl/trainer/config/critic/megatron_critic.yaml b/verl/trainer/config/critic/megatron_critic.yaml index 3f170575cdc..9987e4c4b6b 100644 --- a/verl/trainer/config/critic/megatron_critic.yaml +++ b/verl/trainer/config/critic/megatron_critic.yaml @@ -21,84 +21,6 @@ strategy: megatron # seconds, default is 10 minutes for torch, you can set it to a larger value if you have long-running operations like 32B or 72B model using megatron nccl_timeout: 600 -# model config for the critic -model: - - # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs - _target_: verl.trainer.config.BaseModelConfig - - # override default empty mapping - override_config: - - model_config: {} - - moe_config: - - freeze_moe_router: False - - # LoRA (Low-Rank Adaptation) configuration for parameter-efficient fine-tuning - lora: - # LoRA type: "lora", "vlm_lora", 
"canonical_lora", or "dora" - type: lora - - # LoRA rank (Dimension of the low-rank projection space.). Set to 0 to disable LoRA - rank: 0 # typical values: 8, 16, 32, 64 - - # Weighting factor for the low-rank projection. Defaults to 32 - alpha: 32 - - # Dropout rate for the low-rank projection. Defaults to 0.0 - dropout: 0.0 - - # A list of module names to apply LoRA to. - # For fused LoRA, Defaults to all linear layers ['linear_qkv', 'linear_proj', 'linear_fc1', 'linear_fc2']. - # For canonical LoRA: ["linear_q", "linear_k", "linear_v", "linear_proj", "linear_fc1_up", "linear_fc1_gate", "linear_fc2"] - # - 'linear_qkv': Apply LoRA to the fused linear layer used for query, key, and value projections in self-attention - # - 'linear_proj': Apply LoRA to the linear layer used for projecting the output of self-attention - # - 'linear_fc1': Apply LoRA to the first fully-connected layer in MLP - # - 'linear_fc2': Apply LoRA to the second fully-connected layer in MLP - # Target modules can also contain wildcards. For example, you can specify - # target_modules=['*.layers.0.*.linear_qkv', '*.layers.1.*.linear_qkv'] to add LoRA to only linear_qkv on the first two layers - # - # Note: - # For MLA (e.g., DeepSeek), you should use ["linear_kv_down_proj","linear_kv_up_proj","linear_q_down_proj","linear_q_up_proj","linear_q_proj"] - # Instead of "linear_qkv" or ["linear_q","linear_k","linear_v"] - # By default, MoE routers are excluded from LoRA adaptation, and you will need to specify "router" in target_modules to include them. - target_modules: - - linear_qkv - - linear_proj - - linear_fc1 - - linear_fc2 - - # A list of module names not to apply LoRa to. It will match all nn.Linear & nn.Linear-adjacent modules whose name - # does not match any string in exclude_modules. If used, will require target_modules to be empty list or null - exclude_modules: [] - - # Position for applying dropout, can be 'pre' (before the low-rank projection) or 'post' (after). 
Defaults to 'pre' - dropout_position: pre - - # Initialization method for the low-rank matrix A. Defaults to "xavier". - lora_A_init_method: xavier - - # Initialization method for the low-rank matrix B. Defaults to "zero". - lora_B_init_method: zero - - # Enables the experimental All-to-All (A2A) communication strategy. Defaults to False - a2a_experimental: False - - # Parameter data type for LoRA weights. Default to null, which will use model's dtype. - dtype: null - - # Path to pre-trained LoRA adapter weights (null to train from scratch) - adapter_path: null - - # VLMLoRA additionally allows the user to specify whether the language or vision models should be frozen. - # For example, a common finetuning workload for multimodal models is to apply adapters to language model and fully - # finetune the vision model. - freeze_vision_model: True - freeze_vision_projection: True - freeze_language_model: True - # Whether to load initial weights load_weight: True diff --git a/verl/trainer/config/critic/mindspeed_critic.yaml b/verl/trainer/config/critic/mindspeed_critic.yaml new file mode 100644 index 00000000000..f91ee688f22 --- /dev/null +++ b/verl/trainer/config/critic/mindspeed_critic.yaml @@ -0,0 +1,30 @@ +# defaults specify the default config from each component +defaults: + + # mindspeed optimizer config + - ../optim@optim: megatron + + # mindspeed engine config + - ../engine@mindspeed: + - megatron + - mindspeed + + # dp actor config, inheriting from trainer/config/critic/critic.yaml + - critic + + # load the reference default config, then apply the fields in the current yaml + - _self_ + +# Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs +_target_: verl.workers.config.MindSpeedCriticConfig + +strategy: mindspeed + +# seconds, default is 10 minutes for torch, you can set it to a larger value if you have long-running operations like 32B or 72B model using megatron +nccl_timeout: 600 + +# Whether to load initial weights 
+load_weight: True + +# seed for data loader +data_loader_seed: ${oc.select:actor_rollout_ref.actor.data_loader_seed,null} diff --git a/verl/trainer/config/critic/torchtitan_critic.yaml b/verl/trainer/config/critic/torchtitan_critic.yaml index 4fafbd9d227..c0e5ce56cae 100644 --- a/verl/trainer/config/critic/torchtitan_critic.yaml +++ b/verl/trainer/config/critic/torchtitan_critic.yaml @@ -18,11 +18,5 @@ _target_: verl.workers.config.TorchTitanCriticConfig strategy: torchtitan -# model config for the critic -model: - - # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs - _target_: verl.trainer.config.BaseModelConfig - # seed for data loader data_loader_seed: ${oc.select:actor_rollout_ref.actor.data_loader_seed,null} diff --git a/verl/trainer/config/critic/veomni_critic.yaml b/verl/trainer/config/critic/veomni_critic.yaml index 6745d17d0f2..a7b511483ce 100644 --- a/verl/trainer/config/critic/veomni_critic.yaml +++ b/verl/trainer/config/critic/veomni_critic.yaml @@ -18,12 +18,6 @@ _target_: verl.workers.config.VeOmniCriticConfig strategy: veomni -# model config for the critic -model: - - # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs - _target_: verl.trainer.config.BaseModelConfig - # seed for data loader data_loader_seed: ${oc.select:actor_rollout_ref.actor.data_loader_seed,null} diff --git a/verl/trainer/config/diffusion_trainer.yaml b/verl/trainer/config/diffusion_trainer.yaml new file mode 100644 index 00000000000..78bd357f573 --- /dev/null +++ b/verl/trainer/config/diffusion_trainer.yaml @@ -0,0 +1,190 @@ +# Format checks enforced on CI: +# 1. Comments must appear above each field. +# 2. There must be a blank line between each field. +# 3. Inline comments (after a field on the same line) are not allowed. +# 4. Indentation level is respected for nested fields. 
+ +# Diffusion / FlowGRPO job: self-contained Hydra job (same component graph as the default RL trainer, with diffusion rollout and model defaults). +defaults: + + - model_engine: dp + + # actor_rollout_ref.actor: trainer/config/actor/dp_actor.yaml + - actor@actor_rollout_ref.actor: ${model_engine}_actor + + # data: trainer/config/data/legacy_data.yaml + - data@data: legacy_data + + # Reference model will be enabled when actor.use_kl_loss or/and algorithm.use_kl_in_reward is/are True. + - ref@actor_rollout_ref.ref: ${model_engine}_ref + + # Rollout model config (vLLM-Omni diffusion rollout). + - rollout@actor_rollout_ref.rollout: diffusion_rollout + + # Model config (diffusion / FlowGRPO). + - model@actor_rollout_ref.model: diffusion_model + + # Critic model config. + - critic@critic: ${model_engine}_critic + + # legacy reward impl config, for backward compatibility + - legacy_reward_impl + + # Reward config. + - reward@reward: reward + + # Rollout correction config. + - algorithm@algorithm.rollout_correction: rollout_correction + + # load the reference default config, then apply the fields in the current yaml + # self config override anything above + - _self_ + +# Dataset config (merges with legacy_data from defaults) +data: + + # get ground-truth based on data_source, now support ["ocr", "prompt"] + data_source: "prompt" + +# config for actor, rollout and reference model +actor_rollout_ref: + + # Whether it's a hybrid engine, currently only supports hybrid engine + hybrid_engine: true + + # Timeout for operations executed against the process group + nccl_timeout: 600 + + # Rollout model config. 
+ rollout: + + # for huge model, layered summon can save memory (prevent OOM) but make it slower + layered_summon: False + + # Model config + model: + + # Select diffusion agent loop and DiffusionModelConfig + model_type: "diffusion_model" + + # Actor config + actor: + + # PPO clip ratio (FlowGRPO-style; tighter than default 0.2) + clip_ratio: 0.0001 + + # Maximum absolute value for advantage clipping + clip_ratio_high: 5.0 + +# config for the algorithm +algorithm: + + # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs + _target_: verl.trainer.config.AlgoConfig + + # Whether to normalize advantages by std (specific to GRPO) + norm_adv_by_std_in_grpo: True + +# config for the trainer +trainer: + + # Whether to balance batch sizes across distributed workers + balance_batch: True + + # Number of epochs in training + total_epochs: 30 + + # Total training steps (can be set explicitly or derived from epochs) + total_training_steps: null + + # Project name for experiment tracking (e.g., wandb) + project_name: verl_examples + + # Experiment name for run identification in tracking tools (diffusion / FlowGRPO examples) + experiment_name: ocr + + # Logging backends to use: "console", "wandb", etc. + logger: ["console", "wandb"] + + # Number of generations to log during validation + log_val_generations: 0 + + # Directory for logging rollout data; no dump if null + rollout_data_dir: null + + # Directory for logging validation data; no dump if null + validation_data_dir: null + + # Number of nodes used in the training + nnodes: 1 + + # Number of GPUs per node + n_gpus_per_node: 8 + + # Save frequency (by iteration) for model checkpoints + save_freq: -1 + + # ESI refers to the elastic server instance used during training, similar to the training plan. For example, + # if you purchase 10 hours of computing power, the ESI will automatically shut down after 10 hours of training. 
+ # To ensure a checkpoint is saved before ESI shuts down, the system will start saving a checkpoint in advance. + # The advance time is calculated as: Advance Time = Longest historical step duration + Checkpoint save duration + esi_redundant_time. + # Here, esi_redundant_time is a user-defined value that further extends the advance time for added safety. + esi_redundant_time: 0 + + # Resume mode: "auto", "disable", or "resume_path" + # "auto": resume from last checkpoint if available + # "disable": start from scratch + # "resume_path": resume from a user-defined path + resume_mode: auto + + # Path to resume training from (only used when resume_mode is "resume_path") + resume_from_path: null + + # Whether to run validation before training begins + val_before_train: True + + # Whether to run validation only + val_only: False + + # Validation frequency (in training iterations) + test_freq: -1 + + # Number of iterations to warm up the critic before updating policy + critic_warmup: 0 + + # Default path to distributed filesystem for saving checkpoints + default_hdfs_dir: null + + # Whether to delete local checkpoints after loading + del_local_ckpt_after_load: False + + # Default local directory for saving checkpoints + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} + + # Maximum number of actor checkpoints to keep + max_actor_ckpt_to_keep: null + + # Maximum number of critic checkpoints to keep + max_critic_ckpt_to_keep: null + + # Timeout (in seconds) for Ray worker to wait for registration + ray_wait_register_center_timeout: 300 + + # Device to run training on (e.g., "cuda", "cpu") + device: cuda + + # whether to use legacy worker implementation + # mode: "auto", "enable", or "disable" + use_legacy_worker_impl: auto + +# configs related to ray +ray_kwargs: + + # configs related to ray initialization + ray_init: + + # Number of CPUs for Ray. Use a fixed number instead of null when using SLURM. 
+ num_cpus: null + + # Path to save Ray timeline JSON for performance profiling + timeline_json_file: null diff --git a/verl/trainer/config/engine/megatron.yaml b/verl/trainer/config/engine/megatron.yaml index e5261d5b052..ec4883ddc33 100644 --- a/verl/trainer/config/engine/megatron.yaml +++ b/verl/trainer/config/engine/megatron.yaml @@ -40,6 +40,12 @@ use_dist_checkpointing: False # distributed checkpointing path dist_checkpointing_path: null +# Whether to use hybrid context parallelism +dynamic_context_parallel: False + +# Maximum sequence length per DPxCP rank +max_seqlen_per_dp_cp_rank: null + # distributed checkpointing prefix, e.g. Nemo2 will append prefix 'module.' to the state dict keys dist_checkpointing_prefix: '' diff --git a/verl/trainer/config/engine/mindspeed.yaml b/verl/trainer/config/engine/mindspeed.yaml new file mode 100644 index 00000000000..af1e9cab24d --- /dev/null +++ b/verl/trainer/config/engine/mindspeed.yaml @@ -0,0 +1,45 @@ +# Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs +_target_: verl.workers.config.MindSpeedEngineConfig + +# mindspeed_llm or mindspeed_mm +strategy: mindspeed_llm + +llm_kwargs: + # mindspeed_llm model config + use_mcore_models: true + spec: [] + qk_layernorm: true + position_embedding_type: rope + normalization: RMSNorm + disable_bias_linear: true + swiglu: true + attention_softmax_in_fp32: true + no_gradient_accumulation_fusion: true + group_query_attention: true + + # only support transformer_engine for now + transformer_impl: transformer_engine + + no_pad_to_seq_lengths: true + reset_attention_mask: true + context_parallel_algo: ulysses_cp_algo + attention_mask_type: general + + # mindspeed_llm optimizer config + use_flash_attn: true + use_fused_rotary_pos_emb: true + sequence_parallel: true + use_rotary_position_embeddings: true + use_fused_swiglu: true + use_fused_rmsnorm: true + no_masked_softmax_fusion: true + use_distributed_optimizer: true + + # mindspeed_llm train 
config
+  seq_length: 10240
+  micro_batch_size: 1
+  initial_loss_scale: 4096
+  init_method_std: 0.01
+  hidden_dropout: 0.0
+
+mm_kwargs: {}
diff --git a/verl/trainer/config/model/diffusion_model.yaml b/verl/trainer/config/model/diffusion_model.yaml
new file mode 100644
index 00000000000..33ed0894ac3
--- /dev/null
+++ b/verl/trainer/config/model/diffusion_model.yaml
@@ -0,0 +1,65 @@
+# Format checks enforced on CI:
+# 1. Comments must appear above each field.
+# 2. There must be a blank line between each field.
+# 3. Inline comments (after a field on the same line) are not allowed.
+# 4. Indentation level is respected for nested fields.
+
+_target_: verl.workers.config.DiffusionModelConfig
+
+# path to the huggingface model
+path: ~/models/Qwen/Qwen-Image
+
+# path to the huggingface tokenizer. In case it is not the same as path
+tokenizer_path: null
+
+# whether to use shared memory for model loading
+use_shm: False
+
+# whether to trust remote code.
+trust_remote_code: False
+
+# custom chat template for the model
+custom_chat_template: null
+
+# whether to use external libs for the model
+external_lib: null
+
+# whether to enable gradient checkpointing.
+enable_gradient_checkpointing: True
+
+# Set to positive value to enable LoRA (e.g., 32)
+lora_rank: 32
+
+# LoRA scaling factor
+lora_alpha: 64
+
+# LoRA initialization method.
+lora_init_weights: gaussian
+
+# Target modules for LoRA adaptation
+target_modules: all-linear
+
+# Target parameters for LoRA adaptation
+target_parameters: null
+
+# Exclude modules from LoRA adaptation
+exclude_modules: null
+
+# Path to pre-trained LoRA adapter to load for continued training
+lora_adapter_path: null
+
+# image/video height
+height: ${oc.select:actor_rollout_ref.rollout.height,512}
+
+# image/video width
+width: ${oc.select:actor_rollout_ref.rollout.width,512}
+
+# inference steps
+num_inference_steps: ${oc.select:actor_rollout_ref.rollout.num_inference_steps,10}
+
+# extra configs for algorithm specific features.
+# Model-specific diffusion sampling params (e.g. true_cfg_scale, guidance_scale, +# max_sequence_length, noise_level) should be placed here so the agent loop stays +# backend-neutral. The rollout server's backend translation layer will promote +# matching keys to direct OmniDiffusionSamplingParams fields automatically. +extra_configs: {} diff --git a/verl/trainer/config/model/hf_model.yaml b/verl/trainer/config/model/hf_model.yaml index 4002a7f68c2..0ad45700afa 100644 --- a/verl/trainer/config/model/hf_model.yaml +++ b/verl/trainer/config/model/hf_model.yaml @@ -95,3 +95,69 @@ mtp: method: mtp num_speculative_tokens: 1 + +# LoRA (Low-Rank Adaptation) configuration for parameter-efficient fine-tuning +lora: + # LoRA type: "lora", "vlm_lora", "canonical_lora", or "dora" + type: lora + + # whether to sync weights / refit by either merging LoRA adapters into the base model weights before transferring to vLLM (for better inference speed but more refit time and potential precision loss). If this is False, it will load separate adapters. + merge: False + + # LoRA rank (Dimension of the low-rank projection space.). Set to 0 to disable LoRA + rank: 0 # typical values: 8, 16, 32, 64 + + # Weighting factor for the low-rank projection. Defaults to 32 + alpha: 32 + + # Dropout rate for the low-rank projection. Defaults to 0.0 + dropout: 0.0 + + # A list of module names to apply LoRA to. + # For fused LoRA, Defaults to all linear layers ['linear_qkv', 'linear_proj', 'linear_fc1', 'linear_fc2']. 
+  # For canonical LoRA: ["linear_q", "linear_k", "linear_v", "linear_proj", "linear_fc1_up", "linear_fc1_gate", "linear_fc2"]
+  # - 'linear_qkv': Apply LoRA to the fused linear layer used for query, key, and value projections in self-attention
+  # - 'linear_proj': Apply LoRA to the linear layer used for projecting the output of self-attention
+  # - 'linear_fc1': Apply LoRA to the first fully-connected layer in MLP
+  # - 'linear_fc2': Apply LoRA to the second fully-connected layer in MLP
+  # Target modules can also contain wildcards. For example, you can specify
+  # target_modules=['*.layers.0.*.linear_qkv', '*.layers.1.*.linear_qkv'] to add LoRA to only linear_qkv on the first two layers
+  #
+  # Note:
+  # For MLA (e.g., DeepSeek), you should use ["linear_kv_down_proj","linear_kv_up_proj","linear_q_down_proj","linear_q_up_proj","linear_q_proj"]
+  # Instead of "linear_qkv" or ["linear_q","linear_k","linear_v"]
+  # By default, MoE routers are excluded from LoRA adaptation, and you will need to specify "router" in target_modules to include them.
+  target_modules:
+    - linear_qkv
+    - linear_proj
+    - linear_fc1
+    - linear_fc2
+
+  # A list of module names not to apply LoRA to. It will match all nn.Linear & nn.Linear-adjacent modules whose name
+  # does not match any string in exclude_modules. If used, will require target_modules to be empty list or None
+  exclude_modules: []
+
+  # Position for applying dropout, can be 'pre' (before the low-rank projection) or 'post' (after). Defaults to 'pre'
+  dropout_position: pre
+
+  # Initialization method for the low-rank matrix A. Defaults to "xavier".
+  lora_A_init_method: xavier
+
+  # Initialization method for the low-rank matrix B. Defaults to "zero".
+  lora_B_init_method: zero
+
+  # Enables the experimental All-to-All (A2A) communication strategy. Defaults to False
+  a2a_experimental: False
+
+  # Parameter data type for LoRA weights. Default to null, which will use model's dtype.
+ dtype: null + + # Path to pre-trained LoRA adapter weights (null to train from scratch) + adapter_path: null + + # VLMLoRA additionally allows the user to specify whether the language or vision models should be frozen. + # For example, a common finetuning workload for multimodal models is to apply adapters to language model and fully + # finetune the vision model. + freeze_vision_model: True + freeze_vision_projection: True + freeze_language_model: True \ No newline at end of file diff --git a/verl/trainer/config/model_engine/megatron.yaml b/verl/trainer/config/model_engine/megatron.yaml new file mode 100644 index 00000000000..3dcd27b5079 --- /dev/null +++ b/verl/trainer/config/model_engine/megatron.yaml @@ -0,0 +1,2 @@ +# @package _global_ +model_engine: megatron \ No newline at end of file diff --git a/verl/trainer/config/model_engine/mindspeed.yaml b/verl/trainer/config/model_engine/mindspeed.yaml new file mode 100644 index 00000000000..87f8b31bdf0 --- /dev/null +++ b/verl/trainer/config/model_engine/mindspeed.yaml @@ -0,0 +1,2 @@ +# @package _global_ +model_engine: mindspeed \ No newline at end of file diff --git a/verl/trainer/config/ppo_megatron_trainer.yaml b/verl/trainer/config/ppo_megatron_trainer.yaml index 5060f1ae353..2ebe9c789d5 100644 --- a/verl/trainer/config/ppo_megatron_trainer.yaml +++ b/verl/trainer/config/ppo_megatron_trainer.yaml @@ -1,246 +1,6 @@ -# specify the default per-component configs +# [Deprecated] This file is for backward compatibility, please use ppo_trainer.yaml instead. +# python3 -m verl.trainer.main_ppo model_engine=megatron ... defaults: - # @.: - # actor_rollout_ref.actor: trainer/config/actor/megatron_actor.yaml - - actor@actor_rollout_ref.actor: megatron_actor - # data: trainer/config/data/legacy_data.yaml - - data@data: legacy_data - # load the reference default config, then apply the fields in the current yaml - # Reference model config. 
- # Reference model will be enabled when actor.use_kl_loss or/and algorithm.use_kl_in_reward is/are True. - - ref@actor_rollout_ref.ref: megatron_ref - # Rollout model config. - - rollout@actor_rollout_ref.rollout: rollout - # Model config. - - model@actor_rollout_ref.model: hf_model - # Critic model config. - - critic@critic: megatron_critic - # legacy reward impl config, for backward compatibility - - legacy_reward_impl - # Reward model config. - - reward@reward: reward - # Rollout correction config. - - algorithm@algorithm.rollout_correction: rollout_correction - # distillation config - - distillation@distillation: distillation + - ppo_trainer + - override model_engine: megatron - _self_ - -actor_rollout_ref: - hybrid_engine: True - - nccl_timeout: 600 # seconds, default is 10 minutes for torch, you can set it to a larger value if you have long-running operations like 32B or 72B model using megatron - - model: - override_config: - model_config: {} - moe_config: - freeze_moe_router: False - - use_fused_kernels: False # Whether to use custom fused kernels (PostProcessing, for memory efficiency) - - trust_remote_code: False - - # Whether to remove padding tokens in inputs during training - use_remove_padding: false - - # LoRA (Low-Rank Adaptation) configuration for parameter-efficient fine-tuning - lora: - # LoRA type: "lora", "vlm_lora", "canonical_lora", or "dora" - type: lora - - # whether to sync weights / refit by either merging LoRA adapters into the base model weights before transferring to vLLM (for better inference speed but more refit time and potential precision loss). If this is False, it will load separate adapters. - merge: False - - # LoRA rank (Dimension of the low-rank projection space.). Set to 0 to disable LoRA - rank: 0 # typical values: 8, 16, 32, 64 - - # Weighting factor for the low-rank projection. Defaults to 32 - alpha: 32 - - # Dropout rate for the low-rank projection. 
Defaults to 0.0 - dropout: 0.0 - - # A list of module names to apply LoRA to. - # For fused LoRA, Defaults to all linear layers ['linear_qkv', 'linear_proj', 'linear_fc1', 'linear_fc2']. - # For canonical LoRA: ["linear_q", "linear_k", "linear_v", "linear_proj", "linear_fc1_up", "linear_fc1_gate", "linear_fc2"] - # - 'linear_qkv': Apply LoRA to the fused linear layer used for query, key, and value projections in self-attention - # - 'linear_proj': Apply LoRA to the linear layer used for projecting the output of self-attention - # - 'linear_fc1': Apply LoRA to the first fully-connected layer in MLP - # - 'linear_fc2': Apply LoRA to the second fully-connected layer in MLP - # Target modules can also contain wildcards. For example, you can specify - # target_modules=['*.layers.0.*.linear_qkv', '*.layers.1.*.linear_qkv'] to add LoRA to only linear_qkv on the first two layers - # - # Note: - # For MLA (e.g., DeepSeek), you should use ["linear_kv_down_proj","linear_kv_up_proj","linear_q_down_proj","linear_q_up_proj","linear_q_proj"] - # Instead of "linear_qkv" or ["linear_q","linear_k","linear_v"] - # By default, MoE routers are excluded from LoRA adaptation, and you will need to specify "router" in target_modules to include them. - target_modules: - - linear_qkv - - linear_proj - - linear_fc1 - - linear_fc2 - - # A list of module names not to apply LoRa to. It will match all nn.Linear & nn.Linear-adjacent modules whose name - # does not match any string in exclude_modules. If used, will require target_modules to be empty list or None - exclude_modules: [] - - # Position for applying dropout, can be 'pre' (before the low-rank projection) or 'post' (after). Defaults to 'pre' - dropout_position: pre - - # Initialization method for the low-rank matrix A. Defaults to "xavier". - lora_A_init_method: xavier - - # Initialization method for the low-rank matrix B. Defaults to "zero". 
- lora_B_init_method: zero - - # Enables the experimental All-to-All (A2A) communication strategy. Defaults to False - a2a_experimental: False - - # Parameter data type for LoRA weights. Default to null, which will use model's dtype. - dtype: null - - # Path to pre-trained LoRA adapter weights (null to train from scratch) - adapter_path: null - - # VLMLoRA additionally allows the user to specify whether the language or vision models should be frozen. - # For example, a common finetuning workload for multimodal models is to apply adapters to language model and fully - # finetune the vision model. - freeze_vision_model: True - freeze_vision_projection: True - freeze_language_model: True - - rollout: - quantization: null - - layer_name_map: - qkv_layer_name: qkv - gate_proj_layer_name: gate_up - -algorithm: - # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs - _target_: verl.trainer.config.AlgoConfig - gamma: 1.0 - lam: 1.0 - adv_estimator: gae - norm_adv_by_std_in_grpo: True - use_kl_in_reward: False - kl_penalty: kl # how to estimate kl divergence - kl_ctrl: - # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs - _target_: verl.trainer.config.KLControlConfig - type: fixed - kl_coef: 0.001 - horizon: 10000 - target_kl: 0.1 - use_pf_ppo: False - pf_ppo: - reweight_method: pow # ["pow", "max_min", "max_random"] - weight_pow: 2.0 - -trainer: - balance_batch: True - total_epochs: 30 - total_training_steps: null - project_name: verl_examples - experiment_name: gsm8k - logger: ["console", "wandb"] - log_val_generations: 0 - nnodes: 1 - n_gpus_per_node: 8 - save_freq: -1 - esi_redundant_time: 0 - - # auto: find the last ckpt to resume. 
If can't find, start from scratch - resume_mode: auto # or disable or resume_path if resume_from_path is set - resume_from_path: null - del_local_ckpt_after_load: False - val_before_train: True - test_freq: -1 - critic_warmup: 0 - default_hdfs_dir: null - default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} - max_actor_ckpt_to_keep: null - max_critic_ckpt_to_keep: null - # The timeout for ray worker group to wait for the register center to be ready - ray_wait_register_center_timeout: 300 - device: cuda - # Directory for logging rollout data; no dump if null - rollout_data_dir: null - - # whether to use legacy worker implementation - # mode: "auto", "enable", or "disable" - use_legacy_worker_impl: auto - -global_profiler: - _target_: verl.utils.profiler.ProfilerConfig - tool: null # choose between nsys, npu, torch, torch_memory - steps: null # profile steps - profile_continuous_steps: False - save_path: "outputs/profile" # profiler saving path - # Specific tool configs, can use +profiler.tool_config.[tool].xxx to config - global_tool_config: - # nsys config - nsys: - # True for each task has its own database, False for all tasks in one training step share one database. - discrete: False - - # controller Nvidia Nsight Systems Options. Must set when profile_steps is not None. - ## reference https://docs.nvidia.com/nsight-systems/UserGuide/index.html - ## reference https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html - controller_nsight_options: - # Select the API(s) to be traced. - trace: "cuda,nvtx,cublas,ucx" - - # Track the GPU memory usage by CUDA kernels. Must be string type "true" or "false". - cuda-memory-usage: "true" - - # CUDA graphs will be traced as a whole - cuda-graph-trace: "graph" - - # worker Nvidia Nsight Systems Options. Must set when profile_steps is not None. - worker_nsight_options: - # Select the API(s) to be traced. 
- trace: "cuda,nvtx,cublas,ucx" - - # Track the GPU memory usage by CUDA kernels. Must be string type "true" or "false". - cuda-memory-usage: "true" - - # CUDA graphs will be traced as a whole - cuda-graph-trace: "graph" - - # Profiling only in a range of torch.cuda.profiler.start and stop. Do not change this config. - capture-range: "cudaProfilerApi" - - # Specify the desired behavior when a capture range ends. - # In verl we need the torch.cuda.profiler.start/stop pair to repeats n times. - # valid values are "repeat-shutdown:n" or null. - # For normal whole step profiling, n = len(profile_steps); - # but for discrete profiling, n = len(profile_steps) * Number(subtasks). - # Or you can just leave it null and the program will use n = len(profile_steps) * 6; - capture-range-end: null - - # Send signal to the target application's process group. We let the program to exit by itself. - kill: none - - # enable memory visualization for debugging memory usage - torch_memory: - # Maximum number of allocation entries to record - trace_alloc_max_entries: 100_000 - # The depth of the call stack to capture for each allocation - stack_depth: 32 - # 'alloc': records only allocation events || 'state': records memory state changes || 'all': records both. - context: "all" - # 'python': records Python stacks || 'cpp': records C++ stacks (available in some versions) || 'all': records both. - stacks: "all" - # devices, record_context etc. - kw_args: {} - -# configs for TransferQueue -transfer_queue: - # Whether to enable transfer queue - enable: False - -ray_kwargs: - ray_init: - num_cpus: null # `None` means using all CPUs, which might cause hang if limited in systems like SLURM. Please set to a number allowed then. 
- timeline_json_file: null diff --git a/verl/trainer/config/ppo_trainer.yaml b/verl/trainer/config/ppo_trainer.yaml index 3100f5e7588..591e5a7c7a1 100644 --- a/verl/trainer/config/ppo_trainer.yaml +++ b/verl/trainer/config/ppo_trainer.yaml @@ -28,6 +28,7 @@ defaults: # Critic model config. - critic@critic: ${model_engine}_critic + - model@critic.model: hf_model # legacy reward impl config, for backward compatibility - legacy_reward_impl @@ -54,12 +55,6 @@ actor_rollout_ref: # Timeout for operations executed against the process group nccl_timeout: 600 - # Rollout model config. - rollout: - - # for huge model, layered summon can save memory (prevent OOM) but make it slower - layered_summon: False - # config for the algorithm algorithm: @@ -204,7 +199,7 @@ trainer: # whether to use legacy worker implementation # mode: "auto", "enable", or "disable" - use_legacy_worker_impl: auto + use_legacy_worker_impl: disable # profiler configs global_profiler: @@ -212,7 +207,7 @@ global_profiler: # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs _target_: verl.utils.profiler.ProfilerConfig - # Profiling tool: choose between nsys, npu, torch, torch_memory + # Profiling tool: choose between nsys, npu, torch, torch_memory, precision_debugger tool: null # profile steps @@ -296,6 +291,32 @@ global_profiler: # devices, record_context etc. 
kw_args: {} + # msprobe precision debugger + precision_debugger: + + # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs + _target_: verl.utils.profiler.config.PrecisionDebuggerToolConfig + + # Enable msprobe precision debugger + enable: False + + # Path to msprobe config json + config_path: null + + # Data directory for msprobe dumps + data_dir: "outputs/precision_debug" + + # Profile steps + steps: null + + # Profile stages + # Supported stages: actor_update, actor_compute_log_prob, ref_compute_log_prob, + # compute_values, critic_update, compute_rm_score + stages: null + + # Whether to fail on unknown stage or missing msprobe + strict: False + # configs for TransferQueue transfer_queue: diff --git a/verl/trainer/config/profiler/profiler.yaml b/verl/trainer/config/profiler/profiler.yaml index 2004ba3f5f0..93cc2045921 100644 --- a/verl/trainer/config/profiler/profiler.yaml +++ b/verl/trainer/config/profiler/profiler.yaml @@ -70,4 +70,25 @@ tool_config: # Stack trace depth for memory allocations stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} - name: torch_memory \ No newline at end of file + name: torch_memory + + precision_debugger: + # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs + _target_: verl.utils.profiler.config.PrecisionDebuggerToolConfig + + # Enable msprobe precision debugger + enable: ${oc.select:global_profiler.global_tool_config.precision_debugger.enable,False} + + # Path to msprobe config json + config_path: ${oc.select:global_profiler.global_tool_config.precision_debugger.config_path,null} + + # Data directory for msprobe dumps + data_dir: ${oc.select:global_profiler.global_tool_config.precision_debugger.data_dir,"outputs/precision_debug"} + + # Profile steps + steps: ${oc.select:global_profiler.global_tool_config.precision_debugger.steps,null} + + # Profile stages + stages: 
${oc.select:global_profiler.global_tool_config.precision_debugger.stages,null} + + name: precision_debugger diff --git a/verl/trainer/config/ref/mindspeed_ref.yaml b/verl/trainer/config/ref/mindspeed_ref.yaml new file mode 100644 index 00000000000..8407963ff9f --- /dev/null +++ b/verl/trainer/config/ref/mindspeed_ref.yaml @@ -0,0 +1,36 @@ +# mindspeed ref config, inheriting from trainer/config/ref/ref.yaml +defaults: + - ref + + - ../optim@optim: megatron + + # mindspeed engine config + - ../engine@mindspeed: + - megatron + - mindspeed + + # load the reference default config, then apply the fields in the current yaml + - _self_ + +_target_: verl.workers.config.MindSpeedActorConfig + +strategy: mindspeed + +mindspeed: + seed: ${oc.select:actor_rollout_ref.actor.mindspeed.seed,42} + override_transformer_config: ${oc.select:actor_rollout_ref.actor.mindspeed.override_transformer_config,{}} + use_mbridge: ${oc.select:actor_rollout_ref.actor.mindspeed.use_mbridge,False} + vanilla_mbridge: ${oc.select:actor_rollout_ref.actor.mindspeed.vanilla_mbridge,True} + use_remove_padding: ${oc.select:actor_rollout_ref.actor.mindspeed.use_remove_padding,True} + tensor_model_parallel_size: ${oc.select:actor_rollout_ref.actor.mindspeed.tensor_model_parallel_size,1} + pipeline_model_parallel_size: ${oc.select:actor_rollout_ref.actor.mindspeed.pipeline_model_parallel_size,1} + virtual_pipeline_model_parallel_size: ${oc.select:actor_rollout_ref.actor.mindspeed.virtual_pipeline_model_parallel_size,null} + context_parallel_size: ${oc.select:actor_rollout_ref.actor.mindspeed.context_parallel_size,1} + expert_model_parallel_size: ${oc.select:actor_rollout_ref.actor.mindspeed.expert_model_parallel_size,1} + expert_tensor_parallel_size: ${oc.select:actor_rollout_ref.actor.mindspeed.expert_tensor_parallel_size,null} + param_offload: ${oc.select:actor_rollout_ref.actor.mindspeed.param_offload,False} + forward_only: True + llm_kwargs: ${oc.select:actor_rollout_ref.actor.mindspeed.llm_kwargs,{}} + 
mm_kwargs: ${oc.select:actor_rollout_ref.actor.mindspeed.mm_kwargs,{}} + +load_weight: True \ No newline at end of file diff --git a/verl/trainer/config/rollout/diffusion_rollout.yaml b/verl/trainer/config/rollout/diffusion_rollout.yaml new file mode 100644 index 00000000000..4e493abcaec --- /dev/null +++ b/verl/trainer/config/rollout/diffusion_rollout.yaml @@ -0,0 +1,51 @@ +# Format checks enforced on CI: +# 1. Comments must appear above each field. +# 2. There must be a blank line between each field. +# 3. Inline comments (after a field on the same line) are not allowed. +# 4. Indentation level is respected for nested fields. + +# Hydra defaults: compose base rollout then apply overrides from this file +defaults: + + - rollout + + - _self_ + +# Target class for this configuration +_target_: verl.workers.config.DiffusionRolloutConfig + +# image height for diffusion model rollout +height: 512 + +# image width for diffusion model rollout +width: 512 + +# number of inference steps for diffusion model rollout +num_inference_steps: 10 + +# Extra inference engine arguments: add vllm_omni for diffusion +engine_kwargs: + + # vLLM-Omni engine options for diffusion rollout (empty dict uses defaults) + vllm_omni: {} + +# Sampling parameters used during validation (diffusion-specific). +val_kwargs: + + # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs + _target_: verl.workers.config.DiffusionSamplingConfig + + # whether to repeat n times for validation + n: 1 + + # Whether to sample during training rollout. False uses greedy sampling. 
+ do_sample: False + + # number of inference steps for diffusion model rollout + num_inference_steps: 40 + + # noise level for diffusion model rollout + noise_level: 0.0 + + # random seed for validation + seed: 42 diff --git a/verl/trainer/config/rollout/rollout.yaml b/verl/trainer/config/rollout/rollout.yaml index 123b8b58883..e53d3cbb62c 100644 --- a/verl/trainer/config/rollout/rollout.yaml +++ b/verl/trainer/config/rollout/rollout.yaml @@ -91,6 +91,9 @@ scheduling_policy: fcfs # safetensors (for huge model, and set use_shm=True); dummy: randomly init model weight load_format: dummy +# for huge model, layered summon can save memory (prevent OOM) but make it slower +layered_summon: False + # [Will be deprecated, use log_prob_micro_batch_size_per_gpu] The batch size for one forward pass in the computation of log_prob. Global batch size. log_prob_micro_batch_size: null @@ -275,6 +278,11 @@ checkpoint_engine: # Additional keyword arguments to pass to the checkpoint engine constructor engine_kwargs: {} + # If set, this Python module is imported on every worker process before the + # backend is instantiated, allowing custom backends to register themselves + # in CheckpointEngineRegistry. + custom_backend_module: null + # trace rollout data trace: @@ -299,14 +307,29 @@ trace: # Total traces per step = max_samples_per_step_per_worker * num_workers * n_rollouts_per_sample max_samples_per_step_per_worker: null -# When enabled (True), the trainer will attempt to load previously generated rollout data from the specified directory instead of computing new rollouts. -# If no cached data is found or loading fails, new rollouts will be generated and automatically saved. -# This feature is useful for debugging or when you want to reuse computation results across multiple runs. 
-skip_rollout: False +# rollout skip config (load/dump rollout data) +skip: + + # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs + _target_: verl.workers.config.SkipConfig + + # When enabled (True), the trainer will attempt to load previously generated rollout data from the specified directory instead of computing new rollouts. + # If no cached data is found or loading fails, new rollouts will be generated and automatically saved. + # This feature is useful for debugging or when you want to reuse computation results across multiple runs. + enable: False + + # Default ~/.verl/rollout_dump (expanduser+resolve at runtime). Ray/multi-node: prefer absolute dump_dir (relative paths follow each process cwd). + # Avoid /tmp/ray/session* (ephemeral). + dump_dir: "~/.verl/rollout_dump" + + # Number of training steps (from start) for which dump/load is active. Steps with train_step <= max_dump_step are "dump steps" (try load first, then generate and dump if missing). + max_dump_step: 1 -# Specifies the filesystem path where rollout data should be cached when skip_rollout is enabled. -# Note: Giving path under /tmp/ray/session* is not recommended as these are temporary Ray cluster directories. -skip_dump_dir: /tmp/rollout_dump + # Action when beyond dump steps (gen_step > max_dump_step): + # - "cache": If dumped data exists for current step, use it; otherwise generate and dump. + # - "repeat": Reuse dumped data in a round-robin over the first max_dump_step steps. + # - "repeat_last": Reuse the last dumped step's data. 
+ action: "cache" # Whether to skip tokenizer initialization for rollout engine # When enabled (True), the rollout assume token in token out for generation diff --git a/verl/trainer/config/sft_trainer_engine.yaml b/verl/trainer/config/sft_trainer_engine.yaml index e385e88c46c..3d1180f4d05 100644 --- a/verl/trainer/config/sft_trainer_engine.yaml +++ b/verl/trainer/config/sft_trainer_engine.yaml @@ -70,6 +70,7 @@ trainer: save_freq: -1 test_freq: -1 max_ckpt_to_keep: null # Maximum number of checkpoints to keep, set to null to keep all + balance_batch: True # Resume mode: "auto", "disable", or "resume_path" # "auto": resume from last checkpoint if available diff --git a/verl/trainer/distillation/megatron/losses.py b/verl/trainer/distillation/megatron/losses.py index c9fabf897a7..81391493837 100644 --- a/verl/trainer/distillation/megatron/losses.py +++ b/verl/trainer/distillation/megatron/losses.py @@ -18,8 +18,8 @@ import torch from verl.models.mcore.util import ( - preprocess_bshd_no_padding, - preprocess_thd_no_padding, + preprocess_bshd_engine, + preprocess_thd_engine, ) from verl.workers.config import DistillationConfig, DistillationLossConfig @@ -242,11 +242,11 @@ def compute_forward_kl_topk( # 1. 
split across cp groups (bsz, seqlen, topk) => (bsz, seqlen/cp_size, topk) if data_format == "thd": - teacher_topk_log_probs_cp_split, *_ = preprocess_thd_no_padding(teacher_topk_log_probs, pre_process=True) - teacher_topk_ids_cp_split, *_ = preprocess_thd_no_padding(teacher_topk_ids, pre_process=True) + teacher_topk_log_probs_cp_split, *_ = preprocess_thd_engine(teacher_topk_log_probs, pre_process=True) + teacher_topk_ids_cp_split, *_ = preprocess_thd_engine(teacher_topk_ids, pre_process=True) else: - teacher_topk_log_probs_cp_split, *_ = preprocess_bshd_no_padding(teacher_topk_log_probs, pre_process=True) - teacher_topk_ids_cp_split, *_ = preprocess_bshd_no_padding(teacher_topk_ids, pre_process=True) + teacher_topk_log_probs_cp_split, *_ = preprocess_bshd_engine(teacher_topk_log_probs, pre_process=True) + teacher_topk_ids_cp_split, *_ = preprocess_bshd_engine(teacher_topk_ids, pre_process=True) assert teacher_topk_log_probs_cp_split.shape[:2] == teacher_topk_ids_cp_split.shape[:2] == student_logits.shape[:2] # 2. 
compute token-wise KL divergence across tp groups diff --git a/verl/trainer/main_ppo.py b/verl/trainer/main_ppo.py index d53e4011604..1527b705a01 100644 --- a/verl/trainer/main_ppo.py +++ b/verl/trainer/main_ppo.py @@ -163,10 +163,7 @@ def add_actor_rollout_worker(self, config): actor_rollout_cls = AsyncActorRolloutRefWorker ray_worker_group_cls = RayWorkerGroup - elif ( - config.actor_rollout_ref.actor.strategy == "veomni" - or config.actor_rollout_ref.actor.strategy == "torchtitan" - ): + elif config.actor_rollout_ref.actor.strategy in {"veomni", "torchtitan", "mindspeed"}: raise NotImplementedError( f"{config.actor_rollout_ref.actor.strategy} does not support legacy worker implementation" ) @@ -195,9 +192,14 @@ def add_critic_worker(self, config): elif config.critic.strategy == "megatron": # TODO: switch this to TrainingWorker as well - from verl.workers.megatron_workers import CriticWorker + if use_legacy_worker_impl in ["auto", "enable"]: + from verl.workers.megatron_workers import CriticWorker + elif use_legacy_worker_impl == "disable": + from verl.workers.engine_workers import TrainingWorker - elif config.critic.strategy == "veomni" or config.critic.strategy == "torchtitan": + CriticWorker = TrainingWorker + print("Using new worker implementation") + elif config.critic.strategy in {"veomni", "torchtitan", "mindspeed"}: if use_legacy_worker_impl == "disable": from verl.workers.engine_workers import TrainingWorker @@ -237,16 +239,20 @@ def init_resource_pool_mgr(self, config): config.reward.reward_model.n_gpus_per_node = config.trainer.n_gpus_per_node distillation_config = config.get("distillation") - if is_distillation_enabled(distillation_config) and distillation_config.teacher_model.enable_resource_pool: - if distillation_config.teacher_model.n_gpus_per_node <= 0: - raise ValueError("config.distillation.teacher_model.n_gpus_per_node must be greater than 0") - if distillation_config.teacher_model.nnodes <= 0: - raise 
ValueError("config.distillation.teacher_model.nnodes must be greater than 0") - - teacher_pool = [ - distillation_config.teacher_model.n_gpus_per_node - ] * distillation_config.teacher_model.nnodes - resource_pool_spec["teacher_pool"] = teacher_pool + if is_distillation_enabled(distillation_config): + if distillation_config.teacher_model.enable_resource_pool: + if distillation_config.teacher_model.n_gpus_per_node <= 0: + raise ValueError("config.distillation.teacher_model.n_gpus_per_node must be greater than 0") + if distillation_config.teacher_model.nnodes <= 0: + raise ValueError("config.distillation.teacher_model.nnodes must be greater than 0") + + teacher_pool = [ + distillation_config.teacher_model.n_gpus_per_node + ] * distillation_config.teacher_model.nnodes + resource_pool_spec["teacher_pool"] = teacher_pool + else: + distillation_config.teacher_model.nnodes = config.trainer.nnodes + distillation_config.teacher_model.n_gpus_per_node = config.trainer.n_gpus_per_node from verl.trainer.ppo.ray_trainer import ResourcePoolManager diff --git a/verl/trainer/ppo/diffusion_algos.py b/verl/trainer/ppo/diffusion_algos.py new file mode 100644 index 00000000000..5992a63d949 --- /dev/null +++ b/verl/trainer/ppo/diffusion_algos.py @@ -0,0 +1,97 @@ +# Copyright 2026 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Diffusion-specific policy loss functions and KL penalties.""" + +from typing import Any, Optional + +import torch +from omegaconf import DictConfig + +from verl.trainer.ppo.core_algos import register_policy_loss +from verl.workers.config import ActorConfig + + +@register_policy_loss("flow_grpo") +def compute_policy_loss_flow_grpo( + old_log_prob: torch.Tensor, + log_prob: torch.Tensor, + advantages: torch.Tensor, + response_mask: torch.Tensor, + loss_agg_mode: str = "token-mean", + config: Optional[DictConfig | ActorConfig] = None, + rollout_is_weights: torch.Tensor | None = None, +) -> tuple[torch.Tensor, dict[str, Any]]: + """ + Compute the clipped policy objective and related metrics for FlowGRPO. + Adapted from + https://github.com/yifan123/flow_grpo/blob/main/scripts/train_sd3_fast.py#L885 + Args: + old_log_prob (torch.Tensor): + Log-probabilities of actions under the old policy, shape (batch_size,). + log_prob (torch.Tensor): + Log-probabilities of actions under the current policy, shape (batch_size,). + response_mask (torch.Tensor): + Not used currently. + loss_agg_mode (str, optional): + Not used currently. + advantages (torch.Tensor): + Advantage estimates for each action, shape (batch_size,). + config: `(verl.trainer.config.ActorConfig)`: + config for the actor. + rollout_is_weights: `torch.Tensor, optional)`: + Not used currently. 
+ """ + assert config is not None + assert isinstance(config, ActorConfig) + advantages = torch.clamp( + advantages, + -config.clip_ratio_high, + config.clip_ratio_high, + ) + log_ratio = log_prob - old_log_prob + ratio = torch.exp(log_ratio) + unclipped_loss = -advantages * ratio + clipped_loss = -advantages * torch.clamp( + ratio, + 1.0 - config.clip_ratio, + 1.0 + config.clip_ratio, + ) + pg_loss = torch.mean(torch.maximum(unclipped_loss, clipped_loss)) + + with torch.no_grad(): + ppo_kl = torch.mean(-log_ratio) + pg_clipfrac = torch.mean((torch.abs(ratio - 1.0) > config.clip_ratio).float()) + pg_clipfrac_higher = torch.mean((ratio - 1.0 > config.clip_ratio).float()) + pg_clipfrac_lower = torch.mean((1.0 - ratio > config.clip_ratio).float()) + + pg_metrics = { + "actor/ppo_kl": ppo_kl.detach().item(), + "actor/pg_clipfrac": pg_clipfrac.detach().item(), + "actor/pg_clipfrac_higher": pg_clipfrac_higher.detach().item(), + "actor/pg_clipfrac_lower": pg_clipfrac_lower.detach().item(), + } + return pg_loss, pg_metrics + + +def kl_penalty_image( + prev_sample_mean: torch.Tensor, ref_prev_sample_mean: torch.Tensor, std_dev_t: torch.Tensor +) -> torch.Tensor: + """Compute KL divergence given previous sample mean and reference previous sample mean (for images or videos). 
+ Args: + prev_sample_mean: (torch.Tensor) shape is (bs, s, c) + ref_prev_sample_mean: (torch.Tensor) shape is (bs, s, c) + std_dev_t: (torch.Tensor) shape is (bs, 1, 1) + """ + kl_loss = ((prev_sample_mean - ref_prev_sample_mean) ** 2).mean(dim=(1, 2), keepdim=True) / (2 * std_dev_t**2) + return kl_loss.mean() diff --git a/verl/trainer/ppo/ray_trainer.py b/verl/trainer/ppo/ray_trainer.py index 067377bd10f..569073ef2ec 100644 --- a/verl/trainer/ppo/ray_trainer.py +++ b/verl/trainer/ppo/ray_trainer.py @@ -34,7 +34,6 @@ from tqdm import tqdm from verl import DataProto -from verl.checkpoint_engine import CheckpointEngineManager from verl.experimental.dataset.sampler import AbstractCurriculumSampler from verl.protocol import pad_dataproto_to_divisor, unpad_dataproto from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup, ResourcePoolManager @@ -70,7 +69,7 @@ from verl.utils.seqlen_balancing import calculate_workload, get_seqlen_balanced_partitions, log_seqlen_unbalance from verl.utils.torch_functional import masked_mean from verl.utils.tracking import ValidationGenerationsLogger -from verl.workers.config import DistillationConfig, FSDPEngineConfig +from verl.workers.config import DistillationConfig, EngineConfig from verl.workers.utils.padding import left_right_2_no_padding, no_padding_2_padding @@ -510,6 +509,17 @@ def _compute_reward_colocate(self, batch: DataProto) -> tuple[torch.Tensor, dict batch_reward = self.reward_loop_manager.compute_rm_score(batch) return batch_reward + def _should_compute_teacher_colocate(self, batch: DataProto) -> bool: + return self.use_teacher_policy and not self.distillation_config.teacher_model.enable_resource_pool + + def _compute_teacher_colocate(self, batch: DataProto) -> DataProto: + """Compute teacher logprobs after rollout when teacher and student are colocated.""" + assert self.teacher_model_manager is not None, "TeacherModelManager is None" + teacher_batch = self.teacher_model_manager.compute_logprobs(batch) 
+ if "teacher_multi_modal_data" in batch.non_tensor_batch: + batch.pop(non_tensor_batch_keys=["teacher_multi_modal_data"]) + return teacher_batch + def _validate(self, merged: bool = False): data_source_lst = [] reward_extra_infos_dict: dict[str, list] = defaultdict(list) @@ -724,16 +734,13 @@ def init_workers(self): from verl.workers.engine_workers import TrainingWorkerConfig orig_critic_cfg = critic_cfg - if orig_critic_cfg.strategy == "fsdp": - engine_config: FSDPEngineConfig = orig_critic_cfg.model.fsdp_config - engine_config.infer_max_token_len_per_gpu = critic_cfg.ppo_infer_max_token_len_per_gpu - engine_config.max_token_len_per_gpu = critic_cfg.ppo_max_token_len_per_gpu - else: - raise NotImplementedError(f"Unknown strategy {orig_critic_cfg.strategy=}") + engine_config: EngineConfig = orig_critic_cfg.engine + engine_config.infer_max_token_len_per_gpu = critic_cfg.ppo_infer_max_token_len_per_gpu + engine_config.max_token_len_per_gpu = critic_cfg.ppo_max_token_len_per_gpu critic_cfg = TrainingWorkerConfig( model_type="value_model", - model_config=orig_critic_cfg.model_config, + model_config=orig_critic_cfg.model, engine_config=engine_config, optimizer_config=orig_critic_cfg.optim, checkpoint_config=orig_critic_cfg.checkpoint, @@ -870,7 +877,14 @@ def init_workers(self): reward_loop_worker_handles=reward_loop_worker_handles, teacher_model_manager=self.teacher_model_manager, ) + checkpoint_engine_config = omega_conf_to_dataclass(self.config.actor_rollout_ref.rollout.checkpoint_engine) + # Support custom CheckpointEngineManager via config + checkpoint_manager_class_fqn = self.config.actor_rollout_ref.rollout.get("checkpoint_manager_class") + if checkpoint_manager_class_fqn: + CheckpointEngineManager = load_class_from_fqn(checkpoint_manager_class_fqn, "CheckpointEngineManager") + else: + from verl.checkpoint_engine import CheckpointEngineManager self.checkpoint_manager = CheckpointEngineManager( config=checkpoint_engine_config, trainer=self.actor_rollout_wg, @@ 
-1001,8 +1015,18 @@ def _load_checkpoint(self): # TODO: from remote not implemented yet dataloader_local_path = os.path.join(global_step_folder, "data.pt") if os.path.exists(dataloader_local_path): - dataloader_state_dict = torch.load(dataloader_local_path, weights_only=False) - self.train_dataloader.load_state_dict(dataloader_state_dict) + steps_per_epoch = len(self.train_dataloader) + at_epoch_boundary = steps_per_epoch > 0 and self.global_steps % steps_per_epoch == 0 + if at_epoch_boundary: + print( + f"Skipping dataloader state restore: global_steps={self.global_steps} " + f"is at an epoch boundary (steps_per_epoch={steps_per_epoch}). " + f"The saved state marks the dataloader as exhausted. " + f"Next epoch will iterate from scratch." + ) + else: + dataloader_state_dict = torch.load(dataloader_local_path, weights_only=False) + self.train_dataloader.load_state_dict(dataloader_state_dict) else: print(f"Warning: No dataloader state found at {dataloader_local_path}, will start from scratch") @@ -1299,7 +1323,7 @@ def fit(self): if self.config.trainer.get("val_only", False): return - if self.config.actor_rollout_ref.rollout.get("skip_rollout", False): + if self.config.actor_rollout_ref.rollout.skip.get("enable", False): rollout_skip = RolloutSkip(self.config, self.async_rollout_manager) rollout_skip.wrap_generate_sequences() @@ -1393,6 +1417,10 @@ def fit(self): # repeat to align with repeated responses in rollout batch = batch.repeat(repeat_times=self.config.actor_rollout_ref.rollout.n, interleave=True) batch = batch.union(gen_batch_output) + if self._should_compute_teacher_colocate(batch): + with marked_timer("teacher", timing_raw, color="cyan"): + batch_teacher = self._compute_teacher_colocate(batch) + batch = batch.union(batch_teacher) if "response_mask" not in batch.batch.keys(): batch.batch["response_mask"] = compute_response_mask(batch) @@ -1537,7 +1565,10 @@ def fit(self): metrics.update(critic_output_metrics) # implement critic warmup - if 
self.config.trainer.critic_warmup <= self.global_steps: + if self.config.trainer.critic_warmup > self.global_steps: + # Still in critic warmup, only update weights to wake up rollout replicas. + self.checkpoint_manager.update_weights(self.global_steps) + else: # update actor with marked_timer("update_actor", timing_raw, color="red"): actor_output = self._update_actor(batch) diff --git a/verl/trainer/ppo/reward.py b/verl/trainer/ppo/reward.py index f13a3abf976..d0ab36be296 100644 --- a/verl/trainer/ppo/reward.py +++ b/verl/trainer/ppo/reward.py @@ -19,7 +19,7 @@ from typing import TYPE_CHECKING, Any, Optional, cast from verl import DataProto -from verl.utils.reward_score import default_compute_score +from verl.utils.reward_score import get_default_compute_score if TYPE_CHECKING: from omegaconf import DictConfig @@ -123,6 +123,8 @@ def load_reward_manager(config: DictConfig, tokenizer: Any, **reward_kwargs: Any load_extern_object(module_path=module_cfg.path, object_name=reward_manager_cls_name), ) + default_compute_score_ = get_default_compute_score(reward_manager_cfg.name) + if compute_score is None: sandbox_config = config.reward.get("sandbox_fusion") sandbox_url = sandbox_config.get("url") if sandbox_config else None @@ -132,13 +134,13 @@ def load_reward_manager(config: DictConfig, tokenizer: Any, **reward_kwargs: Any # Create a semaphore to control concurrent access to the sandbox _concurrent_semaphore = sandbox_manager.Semaphore(sandbox_config.get("max_concurrent", 64)) final_compute_score = partial( - default_compute_score, + default_compute_score_, sandbox_fusion_url=sandbox_url, concurrent_semaphore=_concurrent_semaphore, memory_limit_mb=memory_limit_mb, ) else: - final_compute_score = default_compute_score + final_compute_score = default_compute_score_ # Instantiate and return the reward manager with the specified parameters return reward_manager_cls( diff --git a/verl/trainer/ppo/rollout_corr_helper.py b/verl/trainer/ppo/rollout_corr_helper.py index 
6f770b38274..0f4d558c0b1 100644 --- a/verl/trainer/ppo/rollout_corr_helper.py +++ b/verl/trainer/ppo/rollout_corr_helper.py @@ -90,6 +90,45 @@ TOKEN_LEVEL_ROLLOUT_RS_OPTIONS: set[str] = {"token_k1", "token_k2", "token_k3"} +def _parse_rollout_is_threshold(threshold_spec: str | float) -> tuple[float, Optional[float]]: + if isinstance(threshold_spec, bool): + raise TypeError( + "rollout_is_threshold must be specified as a float or a string threshold specification, not a boolean." + ) + if isinstance(threshold_spec, int | float): + upper = float(threshold_spec) + lower = None + elif isinstance(threshold_spec, str): + spec = threshold_spec.strip() + if not spec: + raise ValueError("rollout_is_threshold must not be an empty string.") + if "_" in spec: + lower_str, upper_str = spec.split("_", 1) + try: + lower = float(lower_str) + upper = float(upper_str) + except ValueError as exc: + raise ValueError(f"Invalid rollout_is_threshold '{threshold_spec}'.") from exc + else: + try: + upper = float(spec) + except ValueError as exc: + raise ValueError(f"Invalid rollout_is_threshold '{threshold_spec}'.") from exc + lower = None + else: + raise TypeError("rollout_is_threshold must be a float or a string threshold specification.") + + if upper <= 0: + raise ValueError(f"rollout_is_threshold upper bound must be positive, got {upper}.") + if lower is not None: + if lower <= 0: + raise ValueError(f"rollout_is_threshold lower bound must be positive, got {lower}.") + if lower > upper: + raise ValueError("rollout_is_threshold lower bound must be <= upper bound.") + + return upper, lower + + def _parse_rollout_rs_thresholds( options: list[str], threshold_spec: Optional[str | float] ) -> dict[str, dict[str, Optional[float]]]: @@ -482,7 +521,7 @@ def compute_rollout_correction_weights( log_ratio: torch.Tensor, response_mask: torch.Tensor, rollout_is: str = "token", - rollout_is_threshold: float = 2.0, + rollout_is_threshold: str | float = 2.0, rollout_is_batch_normalize: bool = False, ) -> 
tuple[torch.Tensor, dict[str, float]]: """Compute importance sampling weights to correct for off-policy distribution shifts. @@ -504,8 +543,9 @@ def compute_rollout_correction_weights( rollout_is: IS weight aggregation level, must be one of: - "token": Per-token weights (biased, low variance) - "sequence": Per-sequence weight (product of tokens; unbiased, high variance) - rollout_is_threshold: Upper threshold for truncating extreme weights (e.g., 2.0), - default 2.0. + rollout_is_threshold: Threshold specification for IS weights. + - Single float or float-like string: TIS, clamp weights to the upper bound + - "lower_upper" string: IcePop, zero weights outside [lower, upper] rollout_is_batch_normalize: Whether to normalize IS weights to have mean=1.0 per batch, default False. @@ -523,15 +563,15 @@ def compute_rollout_correction_weights( valid_is_levels = {"token", "sequence"} if rollout_is not in valid_is_levels: raise ValueError(f"Invalid rollout_is: {rollout_is}. Must be one of {valid_is_levels}.") - if rollout_is_threshold <= 0: - raise ValueError(f"rollout_is_threshold must be positive, got {rollout_is_threshold}.") + rollout_is_threshold_upper, rollout_is_threshold_lower = _parse_rollout_is_threshold(rollout_is_threshold) + use_icepop = rollout_is_threshold_lower is not None # Compute IS weights from log ratio (handles different aggregation levels) if rollout_is == "token": # Per-token IS weight: exp(log(π_train/π_rollout)) with safety clamp log_ratio_for_metrics: torch.Tensor = log_ratio log_ratio_safe: torch.Tensor = torch.clamp(log_ratio, min=-SAFETY_BOUND, max=SAFETY_BOUND) - rollout_is_weights: torch.Tensor = torch.exp(log_ratio_safe) + raw_rollout_is_weights: torch.Tensor = torch.exp(log_ratio_safe) elif rollout_is == "sequence": # Sequence-level IS weight: product of token ratios (exp(sum(log ratios))) @@ -541,25 +581,42 @@ def compute_rollout_correction_weights( log_ratio_for_metrics = log_ratio_sum log_ratio_sum_safe: torch.Tensor = 
torch.clamp(log_ratio_sum, min=-SAFETY_BOUND, max=SAFETY_BOUND) - rollout_is_weights = torch.exp(log_ratio_sum_safe).expand_as(log_ratio) # Broadcast to sequence length + raw_rollout_is_weights = torch.exp(log_ratio_sum_safe).expand_as(log_ratio) # Broadcast to sequence length else: raise ValueError(f"Unsupported rollout_is: {rollout_is}") # Zero out weights for padding tokens using response mask - rollout_is_weights = rollout_is_weights * response_mask + raw_rollout_is_weights = raw_rollout_is_weights * response_mask + + # Apply TIS for a single upper bound and IcePop for a lower_upper string. + if not use_icepop: + rollout_is_weights = raw_rollout_is_weights.clamp(max=rollout_is_threshold_upper) + else: + assert rollout_is_threshold_lower is not None + token_kept_mask = (raw_rollout_is_weights >= rollout_is_threshold_lower) & ( + raw_rollout_is_weights <= rollout_is_threshold_upper + ) + rollout_is_weights = torch.where( + token_kept_mask, raw_rollout_is_weights, torch.zeros_like(raw_rollout_is_weights) + ) - # Compute IS weight metrics (BEFORE truncation to get accurate fraction_high/low) + # Compute IS weight metrics. 
metrics: dict[str, float] = compute_is_metrics( rollout_is_weights=rollout_is_weights, + raw_rollout_is_weights=raw_rollout_is_weights, log_ratio_for_metrics=log_ratio_for_metrics, response_mask=response_mask, rollout_is=rollout_is, - rollout_is_threshold=rollout_is_threshold, + rollout_is_threshold=rollout_is_threshold_upper, + rollout_is_threshold_lower=rollout_is_threshold_lower, ) - - # Truncate extreme weights (TIS: Truncated Importance Sampling) - rollout_is_weights = rollout_is_weights.clamp(max=rollout_is_threshold) + if use_icepop: + assert rollout_is_threshold_lower is not None + oob_mask = (raw_rollout_is_weights < rollout_is_threshold_lower) | ( + raw_rollout_is_weights > rollout_is_threshold_upper + ) + metrics["rollout_is_oob_ratio"] = verl_F.masked_mean(oob_mask.float(), response_mask).item() # Detach weights to prevent gradient flow (mathematically required by IS theory) # IS weights change the measure, not the objective. See §3.2.2 in docs/algo/rollout_corr_math.md @@ -600,19 +657,23 @@ def compute_rollout_correction_weights( def compute_is_metrics( rollout_is_weights: torch.Tensor, + raw_rollout_is_weights: torch.Tensor, log_ratio_for_metrics: torch.Tensor, response_mask: torch.Tensor, rollout_is: str, rollout_is_threshold: float, + rollout_is_threshold_lower: Optional[float] = None, ) -> dict[str, float]: """Compute comprehensive metrics for truncated importance sampling weights. - This function calculates statistics for truncated IS weights (TIS), using log-space - for accurate threshold checks and clamped weights for stable mean/std calculations. + This function calculates statistics for the applied IS weights while using the + raw pre-processing weights to diagnose how often ratios exceed the configured bounds. Args: rollout_is_weights: Truncated IS weights (π_train / π_rollout), shape (batch_size, seq_length). + raw_rollout_is_weights: Raw masked IS weights before TIS / IcePop processing, + shape (batch_size, seq_length). 
log_ratio_for_metrics: Log ratio of training to rollout probabilities (unclamped), shape varies by aggregation level. response_mask: Binary mask for valid tokens (1=valid, 0=padding), @@ -629,7 +690,9 @@ def compute_is_metrics( metrics: dict[str, float] = {} device: torch.device = rollout_is_weights.device # Default lower threshold (reciprocal of upper threshold) - rollout_is_threshold_lower: float = 1.0 / rollout_is_threshold + rollout_is_threshold_lower = ( + 1.0 / rollout_is_threshold if rollout_is_threshold_lower is None else rollout_is_threshold_lower + ) # Precompute log thresholds for accurate checks log_threshold_upper: torch.Tensor = torch.log(torch.tensor(rollout_is_threshold, device=device)) @@ -653,12 +716,13 @@ def compute_is_metrics( metrics["rollout_is_ratio_fraction_low"] = below_lower.float().mean().item() else: # token-level - # Token-level aggregation: compute directly from truncated weights + # Token-level aggregation: the applied weights drive loss, std, and ESS, + # while high/low fractions are measured from the raw pre-processing weights. 
metrics["rollout_is_mean"] = verl_F.masked_mean(rollout_is_weights, response_mask).item() # Fraction of tokens exceeding thresholds - rollout_is_above_threshold: torch.Tensor = rollout_is_weights > rollout_is_threshold - rollout_is_below_threshold: torch.Tensor = rollout_is_weights < rollout_is_threshold_lower + rollout_is_above_threshold: torch.Tensor = raw_rollout_is_weights > rollout_is_threshold + rollout_is_below_threshold: torch.Tensor = raw_rollout_is_weights < rollout_is_threshold_lower metrics["rollout_is_ratio_fraction_high"] = verl_F.masked_mean( rollout_is_above_threshold.float(), response_mask ).item() @@ -671,12 +735,11 @@ def compute_is_metrics( metrics["rollout_is_max"] = rollout_is_weights.masked_fill(~mask_bool, float("-inf")).max().item() metrics["rollout_is_min"] = rollout_is_weights.masked_fill(~mask_bool, float("inf")).min().item() - # Compute standard deviation (using clamped weights for stability) + # Compute standard deviation / ESS from the actual applied weights so exact + # IcePop diagnostics preserve zeroed-out coefficients. 
mask_count: torch.Tensor = response_mask.sum() if mask_count > 1: - weights_for_std: torch.Tensor = rollout_is_weights.clamp( - min=rollout_is_threshold_lower, max=rollout_is_threshold - ) + weights_for_std: torch.Tensor = rollout_is_weights.clamp(min=0.0, max=rollout_is_threshold) mean_clamped: torch.Tensor = verl_F.masked_mean(weights_for_std, response_mask) rollout_is_var: torch.Tensor = ( verl_F.masked_mean(weights_for_std.square(), response_mask) - mean_clamped.square() @@ -686,7 +749,7 @@ def compute_is_metrics( metrics["rollout_is_std"] = 0.0 # Compute Effective Sample Size (ESS) for truncated weights - weights_for_ess: torch.Tensor = rollout_is_weights.clamp(min=rollout_is_threshold_lower, max=rollout_is_threshold) + weights_for_ess: torch.Tensor = rollout_is_weights.clamp(min=0.0, max=rollout_is_threshold) mean_for_ess: torch.Tensor = verl_F.masked_mean(weights_for_ess, response_mask) is_weights_normalized: torch.Tensor = weights_for_ess / (mean_for_ess + 1e-8) # Avoid division by zero metrics["rollout_is_eff_sample_size"] = ( @@ -718,7 +781,7 @@ def compute_rollout_correction_and_rejection_mask( rollout_log_prob: torch.Tensor, response_mask: torch.Tensor, rollout_is: Optional[str] = None, - rollout_is_threshold: Optional[float] = 2.0, + rollout_is_threshold: Optional[str | float] = 2.0, rollout_is_batch_normalize: bool = False, rollout_rs: Optional[str] = None, rollout_rs_threshold: Optional[str | float] = None, @@ -741,8 +804,8 @@ def compute_rollout_correction_and_rejection_mask( shape (batch_size, seq_length). rollout_is: IS weight aggregation level (see compute_rollout_correction_weights for options). Set to None to disable IS weight computation. - rollout_is_threshold: Upper threshold for truncated IS weights (used if rollout_is is set), - default 2.0. + rollout_is_threshold: Threshold specification for IS weights. + Single float implies TIS; "lower_upper" implies IcePop. 
rollout_rs: Rejection sampling aggregation modes as a comma separated string (see compute_rollout_rejection_mask for the full list). Set to None to disable rejection sampling. diff --git a/verl/trainer/sft_trainer.py b/verl/trainer/sft_trainer.py index 14924692f2f..81fc6307470 100644 --- a/verl/trainer/sft_trainer.py +++ b/verl/trainer/sft_trainer.py @@ -277,6 +277,12 @@ def _get_batch_seqlens(self, data): else: batch_seqlens: torch.Tensor = data["attention_mask"].sum(dim=-1) batch_seqlens = batch_seqlens.to(self.device_name) # (global_bsz // dp) + if self.engine.get_data_parallel_size() > 1: + output_tensor = torch.empty( + (batch_seqlens.shape[0] * self.engine.get_data_parallel_size(),), + dtype=batch_seqlens.dtype, + device=self.device_name, + ) # (global_bsz,) dp_group = self.engine.get_data_parallel_group() dp_size = self.engine.get_data_parallel_size() diff --git a/verl/trainer/sft_trainer_ray.py b/verl/trainer/sft_trainer_ray.py index ed536d3729f..5f2d68d0e54 100644 --- a/verl/trainer/sft_trainer_ray.py +++ b/verl/trainer/sft_trainer_ray.py @@ -38,6 +38,7 @@ from verl.utils.dataset.multiturn_sft_dataset import MultiTurnSFTDataset from verl.utils.device import auto_set_device, get_device_name from verl.utils.logger import log_with_rank +from verl.utils.seqlen_balancing import calculate_workload, get_seqlen_balanced_partitions from verl.utils.tracking import Tracking from verl.workers.engine_workers import TrainingWorker @@ -307,6 +308,24 @@ def fit(self): tu.assign_non_tensor(data, update_lr_scheduler=True, global_token_num=batch_seqlens_ntd) + if self.config.trainer.balance_batch: + global_seqlen_lst = torch.Tensor([item.size()[0] for item in data["input_ids"]]) + global_seqlen_lst = calculate_workload(global_seqlen_lst) + dp_size = max(self.training_client._query_dispatch_info("train")) + 1 + + global_partition_lst = get_seqlen_balanced_partitions( + global_seqlen_lst, k_partitions=dp_size, equal_size=True + ) + # Place smaller micro-batches at both ends to 
reduce the bubbles in pipeline parallel. + for idx, partition in enumerate(global_partition_lst): + partition.sort(key=lambda x: (global_seqlen_lst[x], x)) + ordered_partition = partition[::2] + partition[1::2][::-1] + global_partition_lst[idx] = ordered_partition + + global_idx = torch.tensor([j for partition in global_partition_lst for j in partition]) + + data = tu.index_select_tensor_dict(data, global_idx) + # start profile in SPMD mode if global_step == self.start_profile_step: self.training_client.start_profile() diff --git a/verl/utils/checkpoint/megatron_checkpoint_manager.py b/verl/utils/checkpoint/megatron_checkpoint_manager.py index d2dd29f8219..6e643bb9a7d 100644 --- a/verl/utils/checkpoint/megatron_checkpoint_manager.py +++ b/verl/utils/checkpoint/megatron_checkpoint_manager.py @@ -259,23 +259,25 @@ def generate_state_dict( state_dict = {} base_metadata = metadata or self._build_sharded_state_dict_metadata() - # Should always generate model state dict - # All ranks Save Model to reduce memory pressure - # Get sharded state dict, notice that state_dict will collect among dp groups, causing memory pressure - for vpp_rank, model in enumerate(self.model): - if len(self.model) > 1: - mpu.set_virtual_pipeline_model_parallel_rank(vpp_rank) - key = f"model{vpp_rank}" if len(self.model) > 1 else "model" - else: - key = "model" - if hasattr(model, "module"): - model = model.module + should_generate_model_sections = generate_model or generate_optimizer - # GPTModel's sharded_state_dict function when having mtp requires metadata['dp_cp_group'] - model_metadata = dict(base_metadata) - model_metadata["dp_cp_group"] = mpu.get_data_parallel_group(with_context_parallel=True) - kwargs = {"metadata": model_metadata} - state_dict[key] = model.sharded_state_dict(**kwargs) + # All ranks save model state dict when it is needed for either model checkpointing + # or optimizer sharded_state_dict generation. 
+ if should_generate_model_sections: + for vpp_rank, model in enumerate(self.model): + if len(self.model) > 1: + mpu.set_virtual_pipeline_model_parallel_rank(vpp_rank) + key = f"model{vpp_rank}" if len(self.model) > 1 else "model" + else: + key = "model" + if hasattr(model, "module"): + model = model.module + + # GPTModel's sharded_state_dict function when having mtp requires metadata['dp_cp_group'] + model_metadata = dict(base_metadata) + model_metadata["dp_cp_group"] = mpu.get_data_parallel_group(with_context_parallel=True) + kwargs = {"metadata": model_metadata} + state_dict[key] = model.sharded_state_dict(**kwargs) # Optimizer State Dict if generate_optimizer: @@ -293,7 +295,9 @@ def generate_state_dict( state_dict["lr_scheduler"] = lr_state_dict if not generate_model: - state_dict.pop("model", None) + for key in list(state_dict.keys()): + if self._is_model_state_key(key): + state_dict.pop(key) # RNG States State Dict if generate_extra: @@ -341,6 +345,43 @@ def _build_sharded_state_dict_metadata(self) -> dict: metadata["chained_optim_avoid_prefix"] = True return metadata + @staticmethod + def _is_model_state_key(key: str) -> bool: + return key == "model" or (key.startswith("model") and key[5:].isdigit()) + + @staticmethod + def _has_checkpoint_files(path: str) -> bool: + return os.path.isdir(path) and any(os.scandir(path)) + + def _raise_for_unsupported_peft_checkpoint_layout(self, local_path: str, dist_checkpoint_path: str): + if self.peft_cls is None or not self.should_load_model or self._has_checkpoint_files(dist_checkpoint_path): + return + + legacy_adapter_ckpt_path = os.path.join(local_path, "adapter_checkpoint") + hf_adapter_ckpt_path = os.path.join(local_path, "huggingface", "adapter") + + if os.path.isdir(legacy_adapter_ckpt_path): + raise RuntimeError( + f"Found legacy PEFT checkpoint at {legacy_adapter_ckpt_path}, but checkpoint resume now expects " + f"adapter weights in {dist_checkpoint_path}. 
Resave/convert the checkpoint or load the adapter via " + "`lora.adapter_path`." + ) + + if os.path.isfile(os.path.join(hf_adapter_ckpt_path, "adapter_config.json")): + raise RuntimeError( + f"Found exported HF PEFT adapter at {hf_adapter_ckpt_path}, but `load_checkpoint()` resumes from " + f"{dist_checkpoint_path}. HF adapter exports are not used for trainer resume; keep the distributed " + "checkpoint or load the adapter separately via `lora.adapter_path`." + ) + + def _maybe_filter_peft_state_dict(self, state_dict: dict): + if self.peft_cls is None: + return state_dict + + from megatron.bridge.training.checkpointing import apply_peft_adapter_filter_to_state_dict + + return apply_peft_adapter_filter_to_state_dict(state_dict, self.peft_cls) + def load_rng_states(self, rng_states, data_parallel_random_init=False, use_dist_ckpt=True): # access rng_state for data parallel rank if data_parallel_random_init: @@ -373,6 +414,7 @@ def load_checkpoint(self, local_path: str, hdfs_path: str = None, del_local_afte pass dist_checkpoint_path = get_dist_checkpoint_path(local_path) + self._raise_for_unsupported_peft_checkpoint_layout(local_path, dist_checkpoint_path) load_content_metadata = getattr(dist_checkpointing, "load_content_metadata", None) if load_content_metadata is None: @@ -392,13 +434,15 @@ def load_checkpoint(self, local_path: str, hdfs_path: str = None, del_local_afte sharded_sd_metadata = self._build_sharded_state_dict_metadata() # Get State Dict for loading + should_load_dist_model = self.should_load_model and (self.use_dist_checkpointing or self.peft_cls is not None) sharded_state_dict = self.generate_state_dict( - self.should_load_model and self.use_dist_checkpointing, + should_load_dist_model, self.should_load_optimizer, self.should_load_extra, is_loading=True, metadata=sharded_sd_metadata, ) + sharded_state_dict = self._maybe_filter_peft_state_dict(sharded_state_dict) log_with_rank(f"Generated state dict for loading: {sharded_state_dict.keys()}", 
rank=self.rank, logger=logger) # Load Dist Checkpointing @@ -407,7 +451,7 @@ def load_checkpoint(self, local_path: str, hdfs_path: str = None, del_local_afte ckpt_dir=dist_checkpoint_path, ) - if self.should_load_model and self.use_dist_checkpointing: + if should_load_dist_model: assert "model" in state_dict or any( f"model{vpp_rank}" in state_dict for vpp_rank in range(len(self.model)) ), f"Model state dict not found in {state_dict.keys()}. Please check the checkpoint file {local_path}." @@ -418,8 +462,13 @@ def load_checkpoint(self, local_path: str, hdfs_path: str = None, del_local_afte assert f"model{vpp_rank}" in state_dict, f"model{vpp_rank} not found in state_dict" model_state_dict = state_dict[f"model{vpp_rank}"] mpu.set_virtual_pipeline_model_parallel_rank(vpp_rank) - self.model[vpp_rank].load_state_dict(model_state_dict) - log_with_rank(f"Loaded sharded model checkpoint from {local_path}", rank=self.rank, logger=logger) + self.model[vpp_rank].load_state_dict(model_state_dict, strict=self.peft_cls is None) + if self.peft_cls is not None: + log_with_rank( + f"Loaded PEFT adapter checkpoint from {dist_checkpoint_path}", rank=self.rank, logger=logger + ) + else: + log_with_rank(f"Loaded sharded model checkpoint from {local_path}", rank=self.rank, logger=logger) # Skip HF checkpoint loading if PEFT is used elif self.should_load_model and self.use_hf_checkpoint and self.peft_cls is None: @@ -429,29 +478,6 @@ def load_checkpoint(self, local_path: str, hdfs_path: str = None, del_local_afte else: self.bridge.load_hf_weights(self.model, hf_model_path) log_with_rank(f"Loaded HF model checkpoint from {hf_model_path} with bridge", rank=self.rank, logger=logger) - # Load PEFT adapter checkpoint if available - if self.should_load_model and self.peft_cls is not None: - adapter_ckpt_path = os.path.join(local_path, "adapter_checkpoint") - if os.path.exists(adapter_ckpt_path): - from verl.utils.megatron_peft_utils import load_adapter_checkpoint - - # TODO: a better format 
for adapter checkpoint, waiting megatron-bridge support - - load_adapter_checkpoint( - self.model, - adapter_ckpt_path, - ) - log_with_rank( - f"Loaded adapter checkpoint from {adapter_ckpt_path}", - rank=self.rank, - logger=logger, - ) - else: - log_with_rank( - f"PEFT config is set but no adapter checkpoint found at {adapter_ckpt_path}", - rank=self.rank, - logger=logger, - ) if self.should_load_optimizer: assert "optimizer" in state_dict, ( @@ -509,6 +535,7 @@ def save_checkpoint(self, local_path: str, hdfs_path: str = None, global_step: i self.should_save_extra, metadata=sharded_sd_metadata, ) + state_dict = self._maybe_filter_peft_state_dict(state_dict) log_with_rank(f"Generated state dict for saving: {state_dict.keys()}", rank=self.rank, logger=logger) for vpp_rank, model in enumerate(self.model): if len(self.model) > 1: @@ -535,11 +562,12 @@ def save_checkpoint(self, local_path: str, hdfs_path: str = None, global_step: i # Generate optimizer and exra state dicts sharded_sd_metadata = self._build_sharded_state_dict_metadata() state_dict = self.generate_state_dict( - generate_model=False, + generate_model=self.should_save_model and self.peft_cls is not None, generate_optimizer=self.should_save_optimizer, generate_extra=self.should_save_extra, metadata=sharded_sd_metadata, ) + state_dict = self._maybe_filter_peft_state_dict(state_dict) # Save optimizer and extra states to local path # Start Async save if enabled async_save_request = save_dist_checkpointing( @@ -555,26 +583,7 @@ def save_checkpoint(self, local_path: str, hdfs_path: str = None, global_step: i torch.distributed.barrier() if self.should_save_model: - # Save adapter-only checkpoint if PEFT is enabled - if self.peft_cls is not None: - from verl.utils.megatron_peft_utils import save_adapter_checkpoint - - adapter_ckpt_path = os.path.join(local_path, "adapter_checkpoint") - - # Save adapter weights only (much smaller than full model) - save_adapter_checkpoint( - self.model, - adapter_ckpt_path, - 
self.rank, - ) - - log_with_rank( - f"Saved adapter-only checkpoint to {adapter_ckpt_path}", - rank=self.rank, - logger=logger, - log_only_rank_0=True, - ) - elif self.use_hf_checkpoint: + if self.use_hf_checkpoint: # Use mbridge to save HF model checkpoint log_with_rank(f"Saving HF model checkpoint to {local_path} with bridge", rank=self.rank, logger=logger) hf_ckpt_path = get_hf_model_checkpoint_path(local_path) @@ -588,7 +597,17 @@ def save_checkpoint(self, local_path: str, hdfs_path: str = None, global_step: i extended_args[sig] = mbridge_config[sig] self.bridge.save_weights(self.model, hf_ckpt_path, **extended_args) else: - self.bridge.save_hf_weights(self.model, hf_ckpt_path) + if self.peft_cls is not None: + hf_adapter_ckpt_path = os.path.join(hf_ckpt_path, "adapter") + self.bridge.save_hf_adapter(self.model, hf_adapter_ckpt_path, self.peft_cls) + log_with_rank( + f"Saved HF PEFT adapter checkpoint to {hf_adapter_ckpt_path}", + rank=self.rank, + logger=logger, + log_only_rank_0=True, + ) + else: + self.bridge.save_hf_weights(self.model, hf_ckpt_path) log_with_rank(f"Saved bridge checkpoint to {hf_ckpt_path}", rank=self.rank, logger=logger) diff --git a/verl/utils/dataset/dataset_utils.py b/verl/utils/dataset/dataset_utils.py index 638940a68f8..a0ef903a805 100644 --- a/verl/utils/dataset/dataset_utils.py +++ b/verl/utils/dataset/dataset_utils.py @@ -76,6 +76,7 @@ def collate_variable_batch(self, batch: list[dict[str, any]]) -> dict[str, any]: offsets = torch.zeros(len(tensors) + 1, dtype=torch.long) torch.cumsum(lengths, dim=0, out=offsets[1:]) final_batch[key] = torch.nested.nested_tensor_from_jagged(values, offsets=offsets) + final_batch[key]._ragged_idx = 2 else: final_batch[key] = torch.nested.as_nested_tensor(tensors, layout=torch.jagged) else: diff --git a/verl/utils/dataset/multiturn_sft_dataset.py b/verl/utils/dataset/multiturn_sft_dataset.py index 57152e7257c..9775860a5b0 100644 --- a/verl/utils/dataset/multiturn_sft_dataset.py +++ 
b/verl/utils/dataset/multiturn_sft_dataset.py @@ -250,7 +250,7 @@ def _build_messages(self, example: dict): Returns: messages: List of messages with replaced placeholder. """ - messages: list = example[self.messages_key] + messages: list = convert_nested_value_to_list_recursive(example[self.messages_key]) images = example[self.image_key] if self.image_key in example else [] videos = example[self.video_key] if self.video_key in example else [] @@ -293,6 +293,8 @@ def __getitem__(self, item): enable_thinking = ( self.enable_thinking[item] if self.enable_thinking is not None else self.enable_thinking_default ) + if enable_thinking is not None: + enable_thinking = bool(enable_thinking) # 1. tokenize each message input_ids, loss_mask, attention_mask, multi_modal_inputs = [], [], [], {} diff --git a/verl/utils/debug/metrics.py b/verl/utils/debug/metrics.py index e7d57a2fec3..706a7f97d55 100644 --- a/verl/utils/debug/metrics.py +++ b/verl/utils/debug/metrics.py @@ -98,6 +98,18 @@ def calculate_debug_metrics(data: DataProto) -> dict: actor_probs = torch.exp(actor_old_log_probs) rollout_probs = torch.exp(rollout_old_log_probs) response_mask_bool = response_mask.bool() + + # check if there are any valid tokens before computing metrics + if not response_mask_bool.any(): + logger.warning("response_mask is all False, returning default metrics") + return { + "training/rollout_probs_diff_valid": 0, + "training/rollout_probs_diff_max": float("nan"), + "training/rollout_probs_diff_mean": float("nan"), + "training/rollout_probs_diff_std": float("nan"), + "training/rollout_actor_probs_pearson_corr": float("nan"), + } + pearson_corrcoef = pearson_correlation_coefficient(actor_probs, rollout_probs, response_mask_bool) rollout_probs_diff = calculate_log_prob_diff(actor_probs, rollout_probs, response_mask_bool) return { diff --git a/verl/utils/experimental/reward_utils.py b/verl/utils/experimental/reward_utils.py new file mode 100644 index 00000000000..975f01d85ba --- /dev/null +++ 
b/verl/utils/experimental/reward_utils.py @@ -0,0 +1,36 @@ +# Copyright 2026 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import base64 +from io import BytesIO + +from PIL import Image + + +def pil_image_to_base64(image: Image.Image) -> str: + buffered = BytesIO() + image.save(buffered, format="PNG") + encoded_image_text = base64.b64encode(buffered.getvalue()).decode("utf-8") + base64_image = f"data:image/png;base64,{encoded_image_text}" + return base64_image + + +def prepare_query_for_multi_modal(image_base64: str) -> list: + query = [ + { + "type": "image_url", + "image_url": {"url": image_base64}, + }, + ] + return query diff --git a/verl/utils/flops_counter.py b/verl/utils/flops_counter.py index a0a490c11f3..3fae58adcfb 100644 --- a/verl/utils/flops_counter.py +++ b/verl/utils/flops_counter.py @@ -240,7 +240,10 @@ def _estimate_qwen3_vit_flop(images_seqlens, config): merger_N = (out_hidden_size + (dim * (spatial_merge_size**2))) * (dim * (spatial_merge_size**2)) # Qwen3 VL uses deep stack, one merger for every deepstack layer - deepstack_merger_N = merger_N * len(config.deepstack_visual_indexes) + if getattr(config, "deepstack_visual_indexes", None) is not None: + deepstack_merger_N = merger_N * len(config.deepstack_visual_indexes) + else: + deepstack_merger_N = 0 # non-attn all_layer parm dense_N = patch_embed_N + (mlp_N + attn_linear_N) * depth + deepstack_merger_N + merger_N @@ -539,8 +542,8 @@ def 
_estimate_unknown_flops(config, tokens_sum, batch_seqlens, delta_time): "qwen2": _estimate_qwen2_flops, "llama": _estimate_qwen2_flops, "qwen2_moe": _estimate_qwen2_moe_flops, - "qwen2_vl": _estimate_qwen2_flops, - "qwen2_5_vl": _estimate_qwen2_flops, + "qwen2_vl": _estimate_qwen3_vl_flops, + "qwen2_5_vl": _estimate_qwen3_vl_flops, "qwen3": _estimate_qwen2_flops, "qwen3_moe": _estimate_qwen2_moe_flops, "qwen3_vl": _estimate_qwen3_vl_flops, diff --git a/verl/utils/fp8_utils.py b/verl/utils/fp8_utils.py index a6a8efb0370..cfe7aa656d9 100644 --- a/verl/utils/fp8_utils.py +++ b/verl/utils/fp8_utils.py @@ -19,6 +19,7 @@ import torch from verl.utils.kernel.fp8_kernel import scaled_fp8_blockwise +from verl.workers.rollout.utils import ensure_async_iterator logger = logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "INFO")) @@ -82,12 +83,12 @@ def should_quantize_param(self, param_name): logger.debug(f"Skip quantization: {param_name}") return False - def quant_weights_by_name(self, weights, dtype=torch.bfloat16): + async def quant_weights_by_name(self, weights, dtype=torch.bfloat16): """FP8 quantization based on parameter name using a memory-efficient generator. 
Args: - weights: Generator or iterable of (name, tensor) pairs + weights: Generator, AsyncGenerator, or iterable of (name, tensor) pairs dtype: Data type for intermediate computation Yields: @@ -101,7 +102,7 @@ def quant_weights_by_name(self, weights, dtype=torch.bfloat16): if weight_block_size is None: raise ValueError("weight_block_size not found in quant_config") - for k, v in weights: + async for k, v in ensure_async_iterator(weights): # Check if quantization is needed if not self.should_quantize_param(k): yield (k, v) diff --git a/verl/utils/fsdp_utils.py b/verl/utils/fsdp_utils.py index 8ec9c6c0b2b..2409378166a 100644 --- a/verl/utils/fsdp_utils.py +++ b/verl/utils/fsdp_utils.py @@ -590,7 +590,7 @@ def fsdp2_clip_grad_norm_(parameters, max_norm, norm_type=2.0, error_if_nonfinit return total_norm -def layered_summon_lora_params(fsdp_module) -> OrderedDict: +def layered_summon_lora_params(fsdp_module, is_diffusers=False) -> OrderedDict: from peft.utils.save_and_load import get_peft_model_state_dict def __prefix_submodules(module, prefix): @@ -599,22 +599,33 @@ def __prefix_submodules(module, prefix): yield name, submodule lora_params = OrderedDict() - prefix_list = [ - # fsdp - "_fsdp_wrapped_module.base_model.model.", - "_fsdp_wrapped_module.base_model.model.model.", - "_fsdp_wrapped_module.base_model.model.model.layers.", - "_fsdp_wrapped_module.base_model.model.model.language_model.layers.", - # fsdp2 - "base_model.model.", - "base_model.model.model.", - "base_model.model.model.layers.", - "base_model.model.model.language_model.layers.", - ] + if is_diffusers: + prefix_list = [ + # fsdp + "_fsdp_wrapped_module.transformer_blocks.", + # fsdp2 + "transformer_blocks.", + ] + else: + prefix_list = [ + # fsdp + "_fsdp_wrapped_module.base_model.model.", + "_fsdp_wrapped_module.base_model.model.model.", + "_fsdp_wrapped_module.base_model.model.model.layers.", + "_fsdp_wrapped_module.base_model.model.model.language_model.layers.", + # fsdp2 + "base_model.model.", + 
"base_model.model.model.", + "base_model.model.model.layers.", + "base_model.model.model.language_model.layers.", + ] peft_model = getattr(fsdp_module, "_fsdp_wrapped_module", fsdp_module) for prefix in prefix_list: for name, submodule in __prefix_submodules(fsdp_module, prefix): - prefix = name.replace("_fsdp_wrapped_module.base_model.model.", "base_model.model.") + if is_diffusers: + prefix = name.replace("_fsdp_wrapped_module.", "") + else: + prefix = name.replace("_fsdp_wrapped_module.base_model.model.", "base_model.model.") if name.endswith(".model") or name.endswith(".layers"): continue if fsdp_version(submodule) > 0: @@ -632,7 +643,9 @@ def __prefix_submodules(module, prefix): return lora_params -def collect_lora_params(module: FSDP, layered_summon: bool, base_sync_done: bool) -> OrderedDict: +def collect_lora_params( + module: FSDP, layered_summon: bool, base_sync_done: bool, is_diffusers: bool = False +) -> OrderedDict: """ collect lora params or full params if base model is not ready in vllm work with if isinstance(self.module._fsdp_wrapped_module, PeftModel) @@ -648,7 +661,7 @@ def collect_lora_params(module: FSDP, layered_summon: bool, base_sync_done: bool "To use layered_summon, you must make sure base-model is preloaded in vllm, e.g. 
let " "rollout.load_format=safetensors" ) - lora_params = layered_summon_lora_params(module) + lora_params = layered_summon_lora_params(module, is_diffusers=is_diffusers) else: with FSDP.summon_full_params(module, writeback=False): if base_sync_done: diff --git a/verl/utils/import_utils.py b/verl/utils/import_utils.py index ee78b580675..ab4273156e8 100644 --- a/verl/utils/import_utils.py +++ b/verl/utils/import_utils.py @@ -69,12 +69,20 @@ def is_trl_available(): return trl_spec is not None +@cache +def is_msprobe_available(): + try: + msprobe_spec = importlib.util.find_spec("msprobe") + except ModuleNotFoundError: + msprobe_spec = None + return msprobe_spec is not None + + def import_external_libs(external_libs=None): if external_libs is None: return if not isinstance(external_libs, list): external_libs = [external_libs] - import importlib for external_lib in external_libs: importlib.import_module(external_lib) diff --git a/verl/utils/megatron/router_replay_utils.py b/verl/utils/megatron/router_replay_utils.py index 463eb12ca4e..9984af102cd 100644 --- a/verl/utils/megatron/router_replay_utils.py +++ b/verl/utils/megatron/router_replay_utils.py @@ -37,9 +37,9 @@ from verl.models.mcore.util import ( postprocess_packed_seqs, - postprocess_thd_no_padding, + postprocess_thd_engine, preprocess_packed_seqs, - preprocess_thd_no_padding, + preprocess_thd_engine, ) from verl.utils.device import get_device_name from verl.utils.megatron.router_replay_patch import RouterReplay, RouterReplayAction @@ -253,8 +253,8 @@ def merge_router_topk_indices(attention_mask, input_ids, mini_layer_topk_idx_lis if input_ids.is_nested: batch_size = input_ids.shape[0] - _, packed_seq_params = preprocess_thd_no_padding(input_ids, pre_process=True) - layers_topk_idx = postprocess_thd_no_padding( + _, packed_seq_params, _ = preprocess_thd_engine(input_ids, pre_process=True) + layers_topk_idx = postprocess_thd_engine( layers_topk_idx, packed_seq_params, input_ids, batch_size, post_process=True ) 
else: @@ -287,7 +287,7 @@ def set_router_replay_data(layers_topk_idx, attention_mask, tf_config, vp_rank=N """ with torch.no_grad(): if layers_topk_idx.is_nested: - layers_topk_idx_rmpad, _, _ = preprocess_thd_no_padding(layers_topk_idx, pre_process=True) + layers_topk_idx_rmpad, _, _ = preprocess_thd_engine(layers_topk_idx, pre_process=True) else: layers_topk_idx_rmpad, _ = preprocess_packed_seqs(layers_topk_idx, attention_mask, pre_process=True) layers_topk_idx_rmpad = layers_topk_idx_rmpad.contiguous() # 1, dynamic_bs_all, layer_num, topk diff --git a/verl/utils/megatron_peft_utils.py b/verl/utils/megatron_peft_utils.py index ae80069287b..e81b1bce595 100644 --- a/verl/utils/megatron_peft_utils.py +++ b/verl/utils/megatron_peft_utils.py @@ -13,8 +13,6 @@ # limitations under the License. """Utilities for PEFT (Parameter-Efficient Fine-Tuning) of Megatron in VERL.""" -import os -from pathlib import Path from typing import Iterator import torch @@ -70,170 +68,6 @@ ] -def _get_rank_checkpoint_path(base_path: str) -> str: - """Get rank-specific checkpoint path following Megatron's convention. - - Returns path like: base_path/mp_rank_{tp:02d}_{pp:03d}_{ep:03d}/ - - Args: - base_path: Base checkpoint directory - - Returns: - Rank-specific subdirectory path - """ - from megatron.core import mpu - - tensor_rank = mpu.get_tensor_model_parallel_rank() - pipeline_rank = mpu.get_pipeline_model_parallel_rank() - expert_rank = mpu.get_expert_model_parallel_rank() - - pipeline_parallel = mpu.get_pipeline_model_parallel_world_size() > 1 - expert_parallel = mpu.get_expert_model_parallel_world_size() > 1 - - if not pipeline_parallel: - rank_path = os.path.join(base_path, f"mp_rank_{tensor_rank:02d}") - else: - rank_path = os.path.join(base_path, f"mp_rank_{tensor_rank:02d}_{pipeline_rank:03d}") - - if expert_parallel: - rank_path = rank_path + f"_{expert_rank:03d}" - - return rank_path - - -def get_adapter_state_dict(model): - """Extract only adapter parameters from a model. 
- - Args: - model: PyTorch model (possibly wrapped in DDP/Float16Module) - - Returns: - Dict of adapter parameter names to tensors - """ - from verl.utils.megatron_utils import unwrap_model - - # Unwrap model from DDP/Float16Module - unwrapped = unwrap_model(model) - if isinstance(unwrapped, list): - unwrapped = unwrapped[0] - - adapter_state = {} - for name, param in unwrapped.named_parameters(): - if ".adapter." in name.lower(): - adapter_state[name] = param.data.clone() - - return adapter_state - - -def save_adapter_checkpoint( - model: torch.nn.Module | list[torch.nn.Module], - checkpoint_path: str, - rank: int = 0, -): - """Save only adapter parameters to checkpoint. - - This is much more efficient than saving the full model when using PEFT, - as adapters typically represent <1% of total parameters. - - Uses Megatron's distributed checkpoint structure: each rank saves to - checkpoint_path/mp_rank_{tp:02d}_{pp:03d}/adapter.pt - - Args: - model: Model or list of models - checkpoint_path: Base path to save checkpoint (rank-specific subdirs created) - rank: Process rank (used for logging only) - """ - - if isinstance(model, list): - models = model - else: - models = [model] - - # Get adapter state from first model - adapter_state = get_adapter_state_dict(models[0]) - - if not adapter_state: - if rank == 0: - print("Warning: No adapter parameters found to save") - return - - # Get rank-specific directory path - Path(checkpoint_path).mkdir(parents=True, exist_ok=True) - rank_path = _get_rank_checkpoint_path(checkpoint_path) - adapter_file = rank_path + "_adapter.pt" - - torch.save( - { - "adapter_state_dict": adapter_state, - }, - adapter_file, - ) - - if rank == 0: - print(f"Saved {len(adapter_state)} adapter parameters to {checkpoint_path} (distributed)") - - -def load_adapter_checkpoint( - model: torch.nn.Module | list[torch.nn.Module], - checkpoint_path: str, - strict: bool = True, -): - """Load adapter parameters from checkpoint. 
- - Loads from Megatron's distributed checkpoint structure: reads from - checkpoint_path/mp_rank_{tp:02d}_{pp:03d}/adapter.pt for each rank. - - Args: - model: Model or list of models - checkpoint_path: Base path to checkpoint directory - strict: Whether to strictly enforce parameter name matching - """ - from megatron.core import mpu - - from verl.utils.megatron_utils import unwrap_model - - # Get rank-specific path - rank_path = _get_rank_checkpoint_path(checkpoint_path) - adapter_file = rank_path + "_adapter.pt" - - if not os.path.isfile(adapter_file): - raise FileNotFoundError(f"Adapter checkpoint not found: {adapter_file}") - - checkpoint = torch.load(adapter_file, map_location="cpu") - adapter_state = checkpoint.get("adapter_state_dict", {}) - - if not adapter_state: - print("Warning: No adapter parameters found in checkpoint") - return - - if isinstance(model, list): - models = model - else: - models = [model] - - # Load adapter parameters into each model (for VPP, models may have multiple chunks) - loaded_count = 0 - for m in models: - unwrapped = unwrap_model(m) - if isinstance(unwrapped, list): - unwrapped = unwrapped[0] - - # Load parameters - _, unexpected = unwrapped.load_state_dict(adapter_state, strict=False) - - if strict and unexpected: - raise RuntimeError(f"Error loading adapter checkpoint:\nUnexpected keys: {unexpected}") - - loaded_count += len(adapter_state) - - if ( - mpu.get_data_parallel_rank() == 0 - and mpu.get_tensor_model_parallel_rank() == 0 - and mpu.get_pipeline_model_parallel_rank() == 0 - ): - print(f"Loaded {len(adapter_state)} adapter parameters from {checkpoint_path}") - - def count_adapter_parameters(model): """Count the number of trainable adapter parameters. 
@@ -349,9 +183,6 @@ def add_base_layer_suffix( __all__ = [ - "get_adapter_state_dict", - "save_adapter_checkpoint", - "load_adapter_checkpoint", "count_adapter_parameters", "print_adapter_info", "convert_megatron_to_hf_target_modules", diff --git a/verl/utils/megatron_utils.py b/verl/utils/megatron_utils.py index dba32d02539..81dcf80e5f0 100644 --- a/verl/utils/megatron_utils.py +++ b/verl/utils/megatron_utils.py @@ -36,11 +36,14 @@ from megatron.core.transformer.module import Float16Module from megatron.core.transformer.multi_token_prediction import MTPLossLoggingHelper from megatron.core.utils import get_attr_wrapped_model +from tensordict import TensorDict from transformers import PretrainedConfig import verl.utils.megatron.tensor_parallel as tp_utils +from verl.utils import tensordict_utils as tu from verl.utils.device import get_device_id, get_device_name, get_torch_device from verl.utils.fs import local_mkdir_safe +from verl.utils.megatron.dist_checkpointing import load_dist_checkpointing from verl.utils.model import normalize_model_name from verl.utils.torch_dtypes import PrecisionType from verl.workers.config import HFModelConfig, McoreEngineConfig @@ -167,6 +170,37 @@ def get_model( return model +def get_hf_rope_theta(hf_config: PretrainedConfig) -> float: + """Return RoPE base frequency theta. + + Most configs expose ``rope_theta`` on the root. Newer models (e.g. Qwen3 in transformers>=5) store it under + ``rope_parameters["rope_theta"]``, optionally nested per attention pattern when ``rope_parameters`` maps names + to parameter dicts. 
+ """ + # For transformers <= 4.57.6 + if hasattr(hf_config, "rope_theta"): + return hf_config.rope_theta + if hasattr(hf_config, "text_config") and hasattr(hf_config.text_config, "rope_theta"): + return hf_config.text_config.rope_theta + + # For transformers >= 5.0.0, check rope_parameters dict (optionally nested) for rope_theta + rp = None + if hasattr(hf_config, "rope_parameters"): + rp = hf_config.rope_parameters + elif hasattr(hf_config, "text_config") and hasattr(hf_config.text_config, "rope_parameters"): + rp = hf_config.text_config.rope_parameters + if isinstance(rp, dict): + if "rope_theta" in rp: + return rp["rope_theta"] + for v in rp.values(): + if isinstance(v, dict) and "rope_theta" in v: + return v["rope_theta"] + raise AttributeError( + f"{type(hf_config).__name__} has no rope_theta and no rope_parameters['rope_theta'] — " + "cannot determine RoPE base." + ) + + @dataclass class McoreModuleWrapperConfig: """Configuration for Mcore module wrapper.""" @@ -188,6 +222,10 @@ def make_megatron_module( peft_cls: Any = None, peft_config: Any = None, ): + from verl.models.mcore.config_converter import get_hf_rope_theta + + hf_config.rope_theta = get_hf_rope_theta(hf_config) + if override_model_config is None: override_model_config = {} @@ -220,7 +258,12 @@ def make_megatron_module( # Register PEFT transformation as pre-wrap hook if peft_cls is specified # This must happen BEFORE DDP wrapping to avoid KeyError with frozen parameters if peft_cls is not None: - from verl.utils.megatron_peft_utils import load_adapter_checkpoint, print_adapter_info + from megatron.bridge.training.checkpointing import ( + _generate_model_state_dict, + apply_peft_adapter_filter_to_state_dict, + ) + + from verl.utils.megatron_peft_utils import print_adapter_info def peft_pre_wrap_hook(model): """Pre-wrap hook that applies PEFT transformation.""" @@ -235,7 +278,13 @@ def peft_pre_wrap_hook(model): adapter_path = getattr(peft_config, "adapter_path", None) if adapter_path is not None 
and adapter_path: print(f"Loading adapter weights from: {adapter_path}") - load_adapter_checkpoint(transformed_model, adapter_path) + model_chunks = transformed_model if isinstance(transformed_model, list) else [transformed_model] + sharded_state_dict = _generate_model_state_dict(model_chunks, {}) + sharded_state_dict = apply_peft_adapter_filter_to_state_dict(sharded_state_dict, peft_cls) + loaded_state_dict = load_dist_checkpointing(sharded_state_dict, str(adapter_path)) + for vpp_rank, model_chunk in enumerate(model_chunks): + model_key = "model" if len(model_chunks) == 1 else f"model{vpp_rank}" + model_chunk.load_state_dict(loaded_state_dict[model_key], strict=False) # Print PEFT statistics if torch.distributed.get_rank() == 0: @@ -276,12 +325,22 @@ def peft_pre_wrap_hook(model): # Extract TransformerConfig from the created model tf_config = get_model_config(model[0] if isinstance(model, list) else model) else: + # Build ddp_config dict with use_distributed_optimizer, same as provider path + ddp_config = None + if wrap_config.wrap_with_ddp: + ddp_config_dict = { + "use_distributed_optimizer": wrap_config.use_distributed_optimizer, + } + if override_ddp_config is not None: + ddp_config_dict.update(override_ddp_config) + ddp_config = ddp_config_dict + model = bridge.get_model( post_model_creation_callbacks=post_model_creation_callbacks, wrap_with_ddp=wrap_config.wrap_with_ddp, fp16=tf_config.fp16, bf16=tf_config.bf16, - ddp_config=override_ddp_config, + ddp_config=ddp_config, ) if isinstance(tf_config, MLATransformerConfig): @@ -1337,6 +1396,84 @@ def get_megatron_module_device(models: list[Any]) -> str: return get_device_name() +def dynamic_cp_split_batch( + batch: TensorDict, engine_config: McoreEngineConfig, dp_size: int, dp_rank: int +) -> TensorDict: + """ + Split the batch into sub-batches for dynamic context parallel. 
+ + we can spilt a microbatch into several sub-batches with different local_cp_size, but for simplicity now, + we only split the batch into a fixed local_cp_size. + + """ + input_ids = batch["input_ids"] + assert input_ids.is_nested, "input_ids must be a nested tensor" + seq_len_effective: torch.Tensor = input_ids.offsets().diff() + max_seq_len = max(seq_len_effective) + # if num of sequences is less than dp_size, we don't need to split the batch + local_cp_size = None + if len(seq_len_effective) < dp_size: + local_cp_size = dp_size + return batch + else: + # decide the local_cp_size based on the max_seq_len and dp_size + max_seqlen_per_dp_cp_rank = engine_config.max_seqlen_per_dp_cp_rank + import math + + local_cp_size = math.ceil(max_seq_len / max_seqlen_per_dp_cp_rank) + # round up to the nearest power of 2, for [1,2,3,4,5,6,7,8] -> [1,2,4,4,8,8,8,8] + local_cp_size = 1 << (local_cp_size - 1).bit_length() + + assert local_cp_size <= dp_size, ( + "local_cp_size must be less than or equal to dp_size, try to increase max_seqlen_per_dp_cp_rank" + ) + if local_cp_size < dp_size: + # split the batch into local_cp_size sub-batches + local_dp_rank = dp_rank // local_cp_size + local_dp_size = dp_size // local_cp_size + indices = list(range(len(seq_len_effective))) + num_seq_per_local_cp = math.ceil(len(seq_len_effective) / local_dp_size) + start_idx = local_dp_rank * num_seq_per_local_cp + end_idx = min(start_idx + num_seq_per_local_cp, len(seq_len_effective)) + selected_indices = indices[start_idx:end_idx] + batch = tu.index_select_tensor_dict(batch, selected_indices) + + # print(f"rank={torch.distributed.get_rank()}, local_cp_size={local_cp_size} max_seq_len={max_seq_len}") + tu.assign_non_tensor_data(batch, "local_cp_size", local_cp_size) + return batch + + +def dynamic_cp_merge_output( + outputs: dict[str, torch.Tensor], + dp_size: int, + dp_rank: int, + local_cp_size: int, +) -> TensorDict: + """ + Merge the outputs from different sub-batches for dynamic context 
parallel. + """ + if local_cp_size == dp_size: + return outputs + + merged_output = {} + for k in outputs: + data_local = outputs[k] + object_list = [None for _ in range(dp_size)] + torch.distributed.all_gather_object( + object_list=object_list, obj=data_local, group=mpu.get_data_parallel_group() + ) + + to_merge = object_list[(dp_rank % local_cp_size) :: local_cp_size] + merged = torch.nested.nested_tensor( + sum([list(x.to(data_local.device).unbind()) for x in to_merge], []), layout=torch.jagged + ) + merged_output[k] = merged + # print(f'local_cp_size={local_cp_size}, dp_rank={dp_rank}, key={k}, + # data_local shape={data_local.shape}, merged shape={merged_output[k].shape} ') + + return merged_output + + def check_mtp_config(model_config: HFModelConfig, engine_config: McoreEngineConfig): """ Check and configure MTP (Multi-Token Prediction) settings. diff --git a/verl/utils/model.py b/verl/utils/model.py index 1ef7275355c..d4f1939a8e4 100644 --- a/verl/utils/model.py +++ b/verl/utils/model.py @@ -570,6 +570,8 @@ def get_parallel_gptmodel_from_config( from megatron.core.models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec from megatron.core.models.gpt.gpt_model import GPTModel + from verl.models.mcore.config_converter import get_hf_rope_theta + use_te = True assert tfconfig.normalization == "RMSNorm", "only RMSNorm is supported for now" transformer_layer_spec = get_gpt_decoder_block_spec(tfconfig, use_transformer_engine=use_te) @@ -586,16 +588,16 @@ def get_parallel_gptmodel_from_config( post_process=post_process, share_embeddings_and_output_weights=share_embeddings_and_output_weights, position_embedding_type="rope", - rotary_base=hf_config.rope_theta, + rotary_base=get_hf_rope_theta(hf_config), **rope_scaling_args, ) # # for layer in parallel_model.decoder.layers: # layer.self_attention.core_attention.flash_attention.softmax_scale = None if post_process and value: - from verl.models.llama.megatron.layers.parallel_linear import LinearForLastLayer + from 
verl.models.mcore.bridge import LinearForLastLayer parallel_model.output_layer = LinearForLastLayer( - input_size=tfconfig.hidden_size, output_size=1, config=tfconfig + input_size=tfconfig.hidden_size, output_size=1, sequence_parallel=tfconfig.sequence_parallel ) return parallel_model @@ -663,6 +665,9 @@ def load_valuehead_model(local_path, torch_dtype, model_config, trust_remote_cod attn_implementation="flash_attention_2", trust_remote_code=trust_remote_code, ) + # vlm models + if hasattr(model_config, "text_config"): + ori_model.config.hidden_size = model_config.text_config.hidden_size model = AutoModelForCausalLMWithValueHead.from_pretrained(ori_model) patch_valuehead_model(model) return model diff --git a/verl/utils/profiler/config.py b/verl/utils/profiler/config.py index e31cf4c8929..ec26b50089f 100644 --- a/verl/utils/profiler/config.py +++ b/verl/utils/profiler/config.py @@ -79,6 +79,33 @@ def __post_init__(self) -> None: assert self.stack_depth > 0, f"stack_depth must be positive, got {self.stack_depth}" +@dataclass +class PrecisionDebuggerToolConfig(BaseConfig): + """Precision debugger tool config (msprobe).""" + + name: str = "precision_debugger" + enable: bool = False + config_path: Optional[str] = None + data_dir: str = "outputs/precision_debug" + steps: Optional[list[int]] = None + # Supported stages: + # actor_update, actor_compute_log_prob, ref_compute_log_prob, + # compute_values, critic_update, compute_rm_score + stages: Optional[list[str]] = None + strict: bool = False + + def __post_init__(self) -> None: + assert isinstance(self.enable, bool), f"enable must be bool, got {type(self.enable)}" + if self.config_path is not None: + assert isinstance(self.config_path, str), f"config_path must be str, got {type(self.config_path)}" + assert isinstance(self.data_dir, str), f"data_dir must be str, got {type(self.data_dir)}" + if self.steps is not None: + assert isinstance(self.steps, list), f"steps must be list[int], got {type(self.steps)}" + if 
self.stages is not None: + assert isinstance(self.stages, list), f"stages must be list[str], got {type(self.stages)}" + assert isinstance(self.strict, bool), f"strict must be bool, got {type(self.strict)}" + + @dataclass class NPUToolConfig(NsightToolConfig): """NPU profiler too; config.""" diff --git a/verl/utils/profiler/precision_debugger_profile.py b/verl/utils/profiler/precision_debugger_profile.py new file mode 100644 index 00000000000..10351c6b75d --- /dev/null +++ b/verl/utils/profiler/precision_debugger_profile.py @@ -0,0 +1,265 @@ +# Copyright 2026 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import functools +import logging +import os +from dataclasses import asdict +from typing import Optional + +from verl.utils.import_utils import is_msprobe_available +from verl.utils.profiler.config import PrecisionDebuggerToolConfig + +logger = logging.getLogger(__name__) + +_STAGE_TO_ROLE = { + "actor_update": "actor", + "actor_compute_log_prob": "actor", + "ref_compute_log_prob": "ref", + "compute_values": "critic", + "critic_update": "critic", + "compute_rm_score": "reward_model", +} + +_MODEL_ATTRS_BY_ROLE = { + "actor": ( + "actor.engine.module", + "actor.actor_module", + "actor.actor_module_fsdp", + "actor_module_fsdp", + "actor_module", + ), + "ref": ( + "ref.engine.module", + "ref.actor_module", + "ref_policy.actor_module", + "ref_module_fsdp", + "ref_module", + "ref_policy.ref_module", + ), + "critic": ( + "critic.engine.module", + "critic.critic_module", + "critic_module_fsdp", + "critic_module", + ), + "reward_model": ( + "reward_model.engine.module", + "reward_model_module_fsdp", + "reward_model_module", + "rm.reward_model_module", + ), +} + +_SKIP_STAGES = {"rollout_generate"} + + +class PrecisionDebuggerProfiler: + """Minimal msprobe PrecisionDebugger integration.""" + + def __init__(self, precision_cfg, rank: Optional[int] = None): + self.rank = rank + self.precision_cfg = self._normalize_config(precision_cfg) + self._enabled = bool(self.precision_cfg.enable) + self._available = is_msprobe_available() + self._debugger = None + self._stages = self._normalize_stages(self.precision_cfg.stages) + self._current_global_step = None + + @staticmethod + def _normalize_config(precision_cfg) -> PrecisionDebuggerToolConfig: + if precision_cfg is None: + return PrecisionDebuggerToolConfig() + if isinstance(precision_cfg, PrecisionDebuggerToolConfig): + return precision_cfg + if hasattr(precision_cfg, "to_container"): + precision_cfg = precision_cfg.to_container(resolve=True) + if isinstance(precision_cfg, dict): + return 
PrecisionDebuggerToolConfig(**precision_cfg) + return PrecisionDebuggerToolConfig(**asdict(precision_cfg)) + + @staticmethod + def _normalize_stage(stage: Optional[str]) -> Optional[str]: + return stage + + def _normalize_stages(self, stages: Optional[list[str]]) -> Optional[set[str]]: + if stages is None: + return None + normalized = {self._normalize_stage(stage) for stage in stages} + if _SKIP_STAGES & normalized: + logger.warning("Ignoring precision_debugger stages: %s", sorted(_SKIP_STAGES & normalized)) + normalized = normalized - _SKIP_STAGES + unknown = normalized - set(_STAGE_TO_ROLE.keys()) + if unknown: + msg = f"Unknown precision_debugger stages: {sorted(unknown)}" + if self.precision_cfg.strict: + raise ValueError(msg) + logger.warning(msg) + return normalized & set(_STAGE_TO_ROLE.keys()) + + @staticmethod + def _resolve_attr(obj, attr_path: str): + current = obj + for part in attr_path.split("."): + current = getattr(current, part, None) + if current is None: + return None + return current + + @staticmethod + def _is_valid_model(model) -> bool: + return model is not None and callable(getattr(model, "forward", None)) + + def _get_candidate_attrs(self, stage: str) -> tuple[str, ...]: + role = _STAGE_TO_ROLE.get(stage) + if role is None: + return () + return _MODEL_ATTRS_BY_ROLE.get(role, ()) + + def _resolve_model(self, self_instance, stage: str): + for attr in self._get_candidate_attrs(stage): + value = self._resolve_attr(self_instance, attr) + if self._is_valid_model(value): + return value + fallback = getattr(self_instance, "module", None) + return fallback if self._is_valid_model(fallback) else None + + def _resolve_global_step(self, self_instance, args, kwargs): + for val in list(args) + list(kwargs.values()): + if hasattr(val, "meta_info"): + meta = val.meta_info + if isinstance(meta, dict) and "global_steps" in meta: + return meta.get("global_steps") + if isinstance(val, dict) and "global_steps" in val: + return val.get("global_steps") + for attr 
in ("global_step", "_global_step"): + if hasattr(self_instance, attr): + return getattr(self_instance, attr) + return self._current_global_step + + def _should_collect(self, stage: str, global_step: Optional[int]) -> bool: + if not self._enabled: + return False + if stage in _SKIP_STAGES: + return False + if stage not in _STAGE_TO_ROLE: + msg = f"Unknown precision_debugger stage: {stage}" + if self.precision_cfg.strict: + raise ValueError(msg) + logger.warning(msg) + return False + if self._stages is not None and stage not in self._stages: + return False + if self.precision_cfg.steps is not None and global_step is not None: + if int(global_step) not in set(self.precision_cfg.steps): + return False + return True + + def start(self, stage: Optional[str] = None, global_step: Optional[int] = None, model=None, **kwargs) -> bool: + profile_step = kwargs.get("global_step", kwargs.get("profile_step")) + if profile_step is not None: + self._current_global_step = profile_step + stage = self._normalize_stage(stage) + if stage is None: + return False + if global_step is None: + global_step = self._current_global_step + if not self._should_collect(stage, global_step): + return False + if not self._available: + if self.precision_cfg.strict: + raise ImportError("msprobe is not available but precision_debugger.strict is True") + return False + if not self.precision_cfg.config_path or not self.precision_cfg.data_dir: + return False + if not self._is_valid_model(model): + msg = f"PrecisionDebugger model not resolved for stage '{stage}'" + if self.precision_cfg.strict: + raise ValueError(msg) + logger.warning(msg) + return False + + try: + from msprobe.pytorch import PrecisionDebugger + + step_tag = f"step_{global_step}" if global_step is not None else "step_unknown" + dump_path = os.path.join(self.precision_cfg.data_dir, step_tag, stage) + os.makedirs(dump_path, exist_ok=True) + + if self._debugger is None: + self._debugger = 
PrecisionDebugger(config_path=self.precision_cfg.config_path, dump_path=dump_path) + if self._debugger is None: + if self.precision_cfg.strict: + raise RuntimeError("Failed to create PrecisionDebugger instance") + return False + if hasattr(self._debugger, "service") and hasattr(self._debugger.service, "config"): + self._debugger.service.config.dump_path = dump_path + self._debugger.start(model) + return True + except Exception: + if self.precision_cfg.strict: + raise + return False + + def stop(self, started: bool = False) -> None: + if not started: + return + if not self._available: + return + if self._debugger is None: + return + self._debugger.stop() + self._reset_debugger_status() + + def annotate( + self, + message: Optional[str] = None, + color: Optional[str] = None, + domain: Optional[str] = None, + category: Optional[str] = None, + **kwargs_outer, + ): + _ = (message, color, domain, category) + stage = self._normalize_stage(kwargs_outer.get("role")) + if stage is None: + return lambda func: func + + def decorator(func): + @functools.wraps(func) + def wrapper(self_instance, *args, **kwargs_inner): + global_step = self._resolve_global_step(self_instance, args, kwargs_inner) + model = self._resolve_model(self_instance, stage) + started = self.start(stage=stage, global_step=global_step, model=model) + try: + return func(self_instance, *args, **kwargs_inner) + finally: + self.stop(started=started) + + return wrapper + + return decorator + + def _reset_debugger_status(self) -> None: + service = getattr(self._debugger, "service", None) + if service is None: + return + + reset_status = getattr(service, "reset_status", None) + if callable(reset_status): + reset_status() + return + + reset_status = getattr(service, "_reset_status", None) + if callable(reset_status): + reset_status() diff --git a/verl/utils/profiler/profile.py b/verl/utils/profiler/profile.py index 8e3145a66bb..93082643bc2 100644 --- a/verl/utils/profiler/profile.py +++ 
b/verl/utils/profiler/profile.py @@ -77,6 +77,7 @@ class DistProfiler: - npu: NPUProfiler (Ascend) - torch: PyTorch torch.profiler wrapper - torch_memory: Torch CUDA memory snapshot dump + - precision_debugger: msprobe precision debugger """ def __init__( @@ -125,6 +126,10 @@ def __init__( self._impl = _Torch(rank=rank, config=config, tool_config=tool_config) elif self._tool == "torch_memory": self._impl = TorchMemoryProfiler(rank=rank, config=config, tool_config=tool_config) + elif self._tool == "precision_debugger": + from .precision_debugger_profile import PrecisionDebuggerProfiler as _Precision + + self._impl = _Precision(precision_cfg=tool_config, rank=rank) else: # Fallback to a no-op impl self._impl = _NoOpProfiler() diff --git a/verl/utils/reward_score/__init__.py b/verl/utils/reward_score/__init__.py index b65d94ec14d..c4026139de8 100644 --- a/verl/utils/reward_score/__init__.py +++ b/verl/utils/reward_score/__init__.py @@ -114,6 +114,47 @@ def default_compute_score( return float(res[0]) +def default_compute_score_image( + data_source, + solution_image, + ground_truth, + extra_info=None, + sandbox_fusion_url=None, + concurrent_semaphore=None, + memory_limit_mb=None, + **kwargs, +): + """Compute the score for a given solution based on the data source. + + Args: + data_source (str): The source dataset identifier which determines the scoring method. + solution_image (Image.Image or torch.Tensor): The solution image to be evaluated. + ground_truth (str): The ground truth answer for comparison. + extra_info (dict, optional): Additional information that might be needed for scoring. Defaults to None. + + Returns: + float: The computed score as a floating point number. If the result is a dictionary, + it returns the dictionary instead. + + Raises: + NotImplementedError: If the reward function is not implemented for the given data source. + """ + if data_source == "jpeg_compressibility": + from . 
import jpeg_compressibility + + res = jpeg_compressibility.compute_score(solution_image) + + else: + raise NotImplementedError(f"Reward function is not implemented for {data_source=}") + + if isinstance(res, dict): + return res + elif isinstance(res, int | float | bool): + return float(res) + else: + return float(res[0]) + + @deprecated("verl.utils.reward_score.default_compute_score") def _default_compute_score( data_source, @@ -132,4 +173,12 @@ def _default_compute_score( ) +def get_default_compute_score(reward_name: str | None): + """Get the default compute_score function based on the reward manager type.""" + if reward_name == "visual": + return default_compute_score_image + else: + return default_compute_score + + __all__ = ["default_compute_score"] diff --git a/verl/utils/reward_score/jpeg_compressibility.py b/verl/utils/reward_score/jpeg_compressibility.py new file mode 100644 index 00000000000..8523040345d --- /dev/null +++ b/verl/utils/reward_score/jpeg_compressibility.py @@ -0,0 +1,61 @@ +# Copyright 2026 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +The reward function for JPEG compressibility. +It is adapted from https://github.com/kvablack/ddpo-pytorch. 
+""" + +import io + +import numpy as np +import torch +from PIL import Image + + +def jpeg_incompressibility(): + def _fn(images, prompts): + if isinstance(images, torch.Tensor): + images = (images * 255).round().clamp(0, 255).to(torch.uint8).cpu().numpy() + images = images.transpose(0, 2, 3, 1) # NCHW -> NHWC + images = [Image.fromarray(image) for image in images] + buffers = [io.BytesIO() for _ in images] + for image, buffer in zip(images, buffers, strict=False): + image.save(buffer, format="JPEG", quality=95) + sizes = [buffer.tell() / 1000 for buffer in buffers] + return np.array(sizes), {} + + return _fn + + +def jpeg_compressibility(): + jpeg_fn = jpeg_incompressibility() + + def _fn(images, prompts): + rew, meta = jpeg_fn(images, prompts) + return -rew / 500, meta + + return _fn + + +def compute_score(solution_image): + """The scoring function for JPEG compressibility. + + Args: + solution_image: the solution image or video, in shape (C, H, W) or (N, C, H, W). + """ + if isinstance(solution_image, torch.Tensor) and solution_image.ndim == 3: + solution_image = solution_image.unsqueeze(0) + score = jpeg_compressibility()(solution_image, None)[0] + return score diff --git a/verl/utils/reward_score/math_verify.py b/verl/utils/reward_score/math_verify.py index c1ce7c1a483..7071dacc02b 100644 --- a/verl/utils/reward_score/math_verify.py +++ b/verl/utils/reward_score/math_verify.py @@ -12,28 +12,56 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import multiprocessing +import threading +from concurrent.futures import ProcessPoolExecutor +from concurrent.futures import TimeoutError as FuturesTimeoutError + try: from math_verify.errors import TimeoutException - from math_verify.metric import math_metric - from math_verify.parser import ExprExtractionConfig, LatexExtractionConfig except ImportError: + + class TimeoutException(Exception): + pass + print("To use Math-Verify, please install it first by running `pip install math-verify`.") +_pool = None +_pool_lock = threading.Lock() + + +def _get_pool(): + global _pool + if _pool is None: + with _pool_lock: + if _pool is None: + _pool = ProcessPoolExecutor(max_workers=4, mp_context=multiprocessing.get_context("spawn")) + return _pool -def compute_score(model_output: str, ground_truth: str, timeout_score: float = 0) -> bool: - verify_func = math_metric( - gold_extraction_target=(LatexExtractionConfig(),), - pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig()), - ) - ret_score = 0.0 - # Wrap the ground truth in \boxed{} format for verification +def _verify_in_subprocess(ground_truth_boxed: str, model_output: str) -> float: + """Run math_verify in a subprocess where signal.alarm() works.""" + from math_verify.grader import verify + from math_verify.parser import ExprExtractionConfig, LatexExtractionConfig, parse + + gold_targets = (LatexExtractionConfig(),) + pred_targets = (ExprExtractionConfig(), LatexExtractionConfig()) + + extracted_gold = parse(ground_truth_boxed, gold_targets) + extracted_pred = parse(model_output, pred_targets) + if extracted_gold and extracted_pred: + return max(1.0 if any(verify(g, p) for g in extracted_gold) else 0.0 for p in extracted_pred) + return 0.0 + + +def compute_score(model_output: str, ground_truth: str, timeout_score: float = 0, timeout: float = 30.0) -> float: + ret_score = 0.0 ground_truth_boxed = "\\boxed{" + ground_truth + "}" try: - ret_score, _ = verify_func([ground_truth_boxed], [model_output]) - 
except Exception: - pass - except TimeoutException: + future = _get_pool().submit(_verify_in_subprocess, ground_truth_boxed, model_output) + ret_score = future.result(timeout=timeout) + except (FuturesTimeoutError, TimeoutException): ret_score = timeout_score - + except Exception as e: + print(f"Error in math_verify compute_score: {e}") return ret_score diff --git a/verl/utils/rollout_skip.py b/verl/utils/rollout_skip.py index 3909d48b6f0..04414265ea7 100644 --- a/verl/utils/rollout_skip.py +++ b/verl/utils/rollout_skip.py @@ -11,9 +11,64 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import json +from enum import Enum from pathlib import Path +from typing import Any, Callable from verl.protocol import DataProto +from verl.workers.config.rollout import RolloutConfig + + +def _get_skip_attr(skip_config, key: str, default): + """Get attribute from skip config, supporting both dict and SkipConfig dataclass.""" + if isinstance(skip_config, dict): + return skip_config.get(key, default) + return getattr(skip_config, key, default) + + +def _find_last_gen_step_for_train_step(step_file: Path, target_train_step: int) -> tuple[int, int] | None: + """ + Find the last `(train_step, gen_step)` pair for a given train_step without loading the + entire file into memory. + + This scans the file line-by-line (O(n) time, O(1) memory) and keeps the last match. + It also stops early once `train_step` exceeds `target_train_step` (assuming chronological logs). 
+ """ + step_file = Path(step_file) + if not step_file.is_file(): + return None + + last_match: tuple[int, int] | None = None + with step_file.open("r", encoding="utf-8", errors="ignore") as f: + for raw in f: + line = raw.strip() + if not line: + continue + parts = line.split() + if len(parts) < 2: + continue + try: + train_step = int(parts[0]) + gen_step = int(parts[1]) + except Exception: + continue + + if train_step < target_train_step: + continue + if train_step == target_train_step: + last_match = (train_step, gen_step) + continue + # train_step > target_train_step: no more matches expected + break + + return last_match + + +class SkipAction(Enum): + CACHE = "cache" # cache the sample. If dump_date is found, use it. If not found, dump it. + REPEAT = "repeat" # Repeat the sample when gen_step reach skip.max_dump_step + REPEAT_LAST = "repeat_last" # Repeat the last sample when gen_step reach skip.max_dump_step class RolloutSkip: @@ -26,107 +81,373 @@ class RolloutSkip: rollout_wg: The worker group that handles the rollout process. Note: - When rollout.n or rollout.gen_batch_size differ from previous runs, - new sequences will be generated and saved with different filenames. + Whenever any of the following parameters differ from previous runs—trainer.experiment_name, + trainer.project_name, rollout.n, or rollout.gen_batch_size—new sequences will be generated + and saved under different filenames. 
+ + """ - print_mark = "[RolloutSkip()]" + print_mark = "[RolloutSkip()] " - def __init__(self, config, rollout_wg): - self.rollout_config = config.actor_rollout_ref.rollout - self.exp_name = config.data.get("experiment_name", "") - self.project_name = config.data.get("project_name", "") + def __init__(self, config, rollout_wg) -> None: + self.rollout_config: RolloutConfig = config.actor_rollout_ref.rollout + self.skip_config = self.rollout_config.skip + self.is_enable = _get_skip_attr(self.skip_config, "enable", False) + self._rollout_wg = rollout_wg - self.n = int(self.rollout_config.get("n", 0)) + if not self.is_enable: + return + + self.exp_name = config.trainer.get("experiment_name", "") + self.project_name = config.trainer.get("project_name", "") + self.n = int(getattr(self.rollout_config, "n", 0)) self.gbs = int(config.data.get("gen_batch_size", config.data.get("train_batch_size", 0))) + self.response_length = config.data.get("max_response_length", 0) + self.prompt_length = config.data.get("max_prompt_length", 0) + + self._new_batch = None + self.curr_gen_step: int = 0 # mark the index of rollout result, start from 1 + self.curr_train_step: int = 0 + + self.record_global_steps = None # Given from xxx_ray_tainer.py, start from 1 + self.record_gen_steps = None # Given from xxx_ray_tainer.py, start from 1 + self.__gen_offset_step = 0 + + self.max_dump_step = max(0, _get_skip_attr(self.skip_config, "max_dump_step", 1)) # at least dump once + self.action = _get_skip_attr(self.skip_config, "action", SkipAction.REPEAT) + self.action = SkipAction(self.action) - self.dumped_dir = Path(self.rollout_config.get("skip_dump_dir", "/tmp/verl/rollout_dump")) - self.dumped_dir.mkdir(parents=True, exist_ok=True) + if self.max_dump_step <= 0: + assert self.action in [SkipAction.CACHE] + + self._create_dump_path() + self._flag_record = False + self.list_dumped_steps = [] + + @property + def is_active(self) -> bool: + """Whether RolloutSkip is enabled and has a rollout worker 
group.""" + return self.is_enable and self._rollout_wg is not None + + @property + def is_dump_step(self) -> bool: + """ + Determine if the current step is a dump step based on the configured dump interval. + If train_step is given, it follows the train_step, otherwise it follows the gen_step. + """ + return self.is_active and self.curr_train_step <= self.max_dump_step + + @property + def num_dumped_step(self) -> int: + return len(self.list_dumped_steps) + + def _get_path_dump(self, gen_step: int | None = None) -> Path: + """Return the directory path for a given gen_step (one dir per step, no .pkl).""" + if gen_step is None: + gen_step = self.curr_gen_step + return self.specify_dumped_dir.joinpath(f"genstep_{gen_step:06d}").absolute() + + def _get_path_step_record(self) -> Path: + return self.specify_dumped_dir.joinpath("train_step__gen_step.txt").absolute() + + def step(self) -> None: + if self.record_global_steps is None: + self.curr_train_step += 1 + else: + self.curr_train_step = self.record_global_steps + + if self.record_gen_steps is None: + self.curr_gen_step = self.curr_train_step + else: + self.curr_gen_step = self.record_gen_steps + + def _create_dump_path(self) -> None: + """ + Create the directory for dumping rollout data if it doesn't exist. + Warn if the directory is within Ray's temporary session directory. + Relative dump_dir is resolved against cwd; use an absolute path under Ray/multi-process. 
+ """ + + raw = _get_skip_attr(self.skip_config, "dump_dir", "~/.verl/rollout_dump") + dumped_dir = Path(raw).expanduser().resolve() + sub_dir = ( + f"{self.exp_name}_{self.project_name}" + + f"/GBS{self.gbs}_N{self.n}_in{self.prompt_length}_out{self.response_length}" + ) + + self.specify_dumped_dir = dumped_dir.joinpath(sub_dir) + self.specify_dumped_dir.mkdir(parents=True, exist_ok=True) + + tmp_ray = "/tmp/ray/session" # Check if path is in Ray temporary directory - if str(self.dumped_dir.absolute()).startswith("/tmp/ray/session"): + if str(self.specify_dumped_dir.absolute()).startswith(tmp_ray): print( - f"\033[33m{self.print_mark} Warning: \nUsing dump path ", - f"'{self.dumped_dir.absolute()}' is not recommended ", - "as it's located in /tmp/ray/session*\033[0m", + f"{self.print_mark}\033[33mWarning: \nUsing dump path ", + f"'{self.specify_dumped_dir.absolute()}' is not recommended ", + f"as it's located in {tmp_ray}*\033[0m", flush=True, ) - print( - f"{self.print_mark} Rollout skip dump path set to: ", - f"{self.dumped_dir.absolute()}", + f"{self.print_mark}Rollout skip dump path set to: ", + str(self.specify_dumped_dir.absolute()), flush=True, ) - self._rollout_wg = rollout_wg + def record( + self, + new_batch: DataProto, + global_steps: int | None = None, + gen_steps: int | None = None, + *args: Any, + **kwargs: Any, + ) -> None: + """Record the current training step based on the new batch. - @property - def curr_path_dump(self): - return self.dumped_dir.joinpath(f"{self.exp_name}_{self.project_name}_GBS{self.gbs}__N{self.n}").absolute() + Args: + new_batch (DataProto): The new batch of data being processed. 
+ """ + if self._rollout_wg is None: + return + if self._flag_record is False: + # make sure one record only corresponds to one skip + self._flag_record = True + self._new_batch = new_batch + else: + print( + f"{self.print_mark}Warning, duplicate record new_batch, " + "it was not a problem if acc/reward is not cared.", + flush=True, + ) + + if gen_steps is None: + gen_steps = global_steps + + # Check if train_step not start from 1 + if global_steps is not None: + if self.record_global_steps is None and global_steps > 1: + print(f"{self.print_mark}\033[32mResume Mode.\033[0m", flush=True) + last_train_step = global_steps - 1 # default when step file missing + last_gen_step = 0 + try: + found = _find_last_gen_step_for_train_step( + self._get_path_step_record(), + target_train_step=global_steps - 1, + ) + if found is not None: + last_train_step, last_gen_step = found + if last_train_step + 1 != global_steps: + print(f"{self.print_mark}\033[31mWarning: Train step not continues.\033[0m") + self.__gen_offset_step = last_gen_step + except Exception as e: + print( + f"{self.print_mark}\033[31mFailed to read step describe file. {e.__repr__()}\033[0m", + flush=True, + ) + print( + f"{self.print_mark}\033[32mResume from train_step: {last_train_step}, " + f"gen_step: {last_gen_step}.\033[0m", + flush=True, + ) + + if global_steps is not None: + self.record_global_steps = global_steps + if gen_steps is not None: + #! 
it is not right since dapo_trainer reset `gen_steps` when resume + self.record_gen_steps = gen_steps + self.__gen_offset_step + + def wrap_generate_sequences(self) -> None: + # if self.is_enable: + # self._rollout_wg = rollout_wg - def wrap_generate_sequences(self): try: self._rollout_wg.generate_sequences = wrap_generate_sequences(self, self._rollout_wg) print( - f"{self.print_mark} Successfully patched `actor_rollout_wg.generate_sequences()`", + f"{self.print_mark}\033[32mSuccessfully patched `actor_rollout_wg.generate_sequences()`.\033[0m", flush=True, ) except Exception as e: raise RuntimeError( - "{self.print_mark} Failed to patch `actor_rollout_wg.generate_sequences()`", + f"{self.print_mark}\033[31mFailed to patch `actor_rollout_wg.generate_sequences()`.\033[0m", flush=True, ) from e - def try_load(self): - if not self.curr_path_dump.exists(): + def try_load(self, step: int | None = None) -> tuple[DataProto | None, DataProto | None]: + dumped_gen_batch = None + dumped_new_batch = None + if step is None: + step = self.curr_gen_step + + step_dir = self._get_path_dump(step) + if not step_dir.exists() or not step_dir.is_dir(): print( - f"{self.print_mark} No data dump found at {self.curr_path_dump}.", - "The trainer will generate and automatically dump the data for this first run.", + f"{self.print_mark}\033[33mNo dumped data found at gen_step {step} " + f"from {step_dir}. 
The trainer will generate and dump the data for this gen_step.\033[0m", flush=True, ) - return None + return dumped_new_batch, dumped_gen_batch + + new_batch_path = step_dir / "new_batch.dp" + gen_batch_path = step_dir / "gen_batch.dp" + if not (new_batch_path.is_file() and gen_batch_path.is_file()): + print( + f"{self.print_mark}\033[33mNo dumped data found at gen_step {step} " + f"(missing new_batch.dp or gen_batch.dp in {step_dir}).\033[0m", + flush=True, + ) + return dumped_new_batch, dumped_gen_batch try: - # * Load - ret_batch = DataProto.load_from_disk(self.curr_path_dump) + dumped_new_batch = DataProto.load_from_disk(new_batch_path) + dumped_gen_batch = DataProto.load_from_disk(gen_batch_path) print( - f"\033[32m{self.print_mark} Successfully load pre-generated data from {self.curr_path_dump}\033[0m", + f"{self.print_mark}\033[32mSuccessfully load pre-generated data from {step_dir}.\033[0m", flush=True, ) - return ret_batch + if step not in self.list_dumped_steps: + self.list_dumped_steps.append(step) except Exception as e: print( - f"\033[31m{self.print_mark} Failed to load pre-generated data from {self.curr_path_dump}", - f"Error: {str(e)}\033[0m", + f"{self.print_mark}\033[31mFailed to load pre-generated data from {step_dir}: {e}\033[0m", flush=True, ) - return None - def dump(self, outputs: DataProto): + return dumped_new_batch, dumped_gen_batch + + def dump(self, outputs: DataProto) -> None: + if self._flag_record is False or self._new_batch is None: + raise AssertionError( + f"{self.print_mark}\033[33mError: \n" + + "The new_batch record is required." 
+ + "Please record the new_batch using `RolloutSkip.record(new_batch)` in trainer.fit().\033[0m" + ) + self._flag_record = False + + train_step = self.record_global_steps if self.record_global_steps is not None else self.curr_train_step + gen_step = self.record_gen_steps if self.record_gen_steps is not None else self.curr_gen_step + step_dir = self._get_path_dump(gen_step) + step_dir.mkdir(parents=True, exist_ok=True) + try: - outputs.save_to_disk(self.curr_path_dump) + self._new_batch.save_to_disk(step_dir / "new_batch.dp") + outputs.save_to_disk(step_dir / "gen_batch.dp") + meta_path = step_dir / "meta.json" + meta_path.write_text(json.dumps({"global_steps": train_step, "gen_steps": gen_step})) + + with open(str(self._get_path_step_record()), "a") as f: + f.write(f"{train_step} {gen_step}\n") + print( - f"\033[32m{self.print_mark} Successfully dump data in {self.curr_path_dump}\033[0m", + f"{self.print_mark}\033[32mSuccessfully dump data in {step_dir}\033[0m", flush=True, ) + if self.curr_gen_step not in self.list_dumped_steps: + self.list_dumped_steps.append(self.curr_gen_step) + except Exception as e: print( - f"\033[31m{self.print_mark} Failed to dump data in {self.curr_path_dump}: {e}\033[0m", + f"{self.print_mark}\033[31mFailed to dump data in {step_dir}: {e}\033[0m", flush=True, ) + def replace_curr_new_batch(self, dumped_new_batch: DataProto) -> None: + """Replace the current new_batch's content with that from the dumped_new_batch. + In case of [Answer] mismatch. + """ -def wrap_generate_sequences(rolloutskip: RolloutSkip, rollout_wg): + if self._flag_record is False: + raise AssertionError( + f"{self.print_mark}\033[33mError: \n" + + "The new_batch is not recorded. Please record the new_batch" + + "using `RolloutSkip.record(new_batch)`. 
\033[0m" + ) + self._flag_record = False + + self._new_batch.batch = dumped_new_batch.batch + self._new_batch.non_tensor_batch = dumped_new_batch.non_tensor_batch + self._new_batch.meta_info = dumped_new_batch.meta_info + + +def wrap_generate_sequences(rolloutskip: RolloutSkip, rollout_wg: Any) -> Callable[..., DataProto]: generate_sequences = rollout_wg.generate_sequences - def warp_fn(batch, **kwargs): - gen_batch_output = rolloutskip.try_load() + def rollout_skip_wrap_fn(batch: DataProto, **kwargs: Any) -> DataProto: + rolloutskip.step() + # Record input batch as new_batch so dump() / replace_curr_new_batch() have it + rolloutskip.record(batch) + return_batch = None + + if rolloutskip.is_dump_step: + # * try load + dumped_new_batch, return_batch = rolloutskip.try_load() + + if return_batch is None: + # 1. Generation + return_batch = generate_sequences(batch, **kwargs) + # 2. Dump + rolloutskip.dump(return_batch) + else: + rolloutskip.replace_curr_new_batch(dumped_new_batch) + + elif rolloutskip.action == SkipAction.CACHE: + return_batch = generate_sequences(batch, **kwargs) + + elif rolloutskip.action == SkipAction.REPEAT: + if rolloutskip.num_dumped_step == 0: + return_batch = generate_sequences(batch, **kwargs) + rolloutskip.dump(return_batch) + else: + target_step = rolloutskip.list_dumped_steps[ + (rolloutskip.curr_gen_step - 1) % rolloutskip.num_dumped_step + ] + dumped_new_batch, return_batch = rolloutskip.try_load(step=target_step) + if return_batch is None: + return_batch = generate_sequences(batch, **kwargs) + rolloutskip.dump(return_batch) + else: + rolloutskip.replace_curr_new_batch(dumped_new_batch) + + elif rolloutskip.action == SkipAction.REPEAT_LAST: + target_step = rolloutskip.list_dumped_steps[-1] + dumped_new_batch, return_batch = rolloutskip.try_load(step=target_step) + if return_batch is None: + return_batch = generate_sequences(batch, **kwargs) + rolloutskip.dump(return_batch) + else: + rolloutskip.replace_curr_new_batch(dumped_new_batch) + 
+ # clean + return return_batch + + return rollout_skip_wrap_fn + + +def read_dumped_data(path_dump: Path | str) -> dict[str, DataProto]: + """ + Read dumped rollout data from a step directory (DataProto.save_to_disk format). + + path_dump should point to a step directory containing new_batch.dp and gen_batch.dp, + e.g. .../GBS8_N16_in1024_out10240/genstep_000001/ + + ``` + from verl.utils.rollout_skip import read_dumped_data + + dumped_data = read_dumped_data("path/to/rollout_dump/.../genstep_000001") + print(dumped_data["new_batch"]) + print(dumped_data["gen_batch"]) + ``` + """ + path_dump = Path(path_dump) + if not path_dump.is_dir(): + raise FileNotFoundError(f"Directory {path_dump} does not exist.") - if gen_batch_output is None: - # * 1. Generation - gen_batch_output = generate_sequences(batch, **kwargs) - # * 2. Dump - rolloutskip.dump(gen_batch_output) - return gen_batch_output + new_batch_path = path_dump / "new_batch.dp" + gen_batch_path = path_dump / "gen_batch.dp" + if not (new_batch_path.is_file() and gen_batch_path.is_file()): + raise FileNotFoundError(f"Missing new_batch.dp or gen_batch.dp under {path_dump}.") - return warp_fn + return { + "new_batch": DataProto.load_from_disk(new_batch_path), + "gen_batch": DataProto.load_from_disk(gen_batch_path), + } diff --git a/verl/utils/tensordict_utils.py b/verl/utils/tensordict_utils.py index 9e6afb78f67..426d1e51552 100644 --- a/verl/utils/tensordict_utils.py +++ b/verl/utils/tensordict_utils.py @@ -401,7 +401,7 @@ def get_tensordict(tensor_dict: dict[str, torch.Tensor | list], non_tensor_dict: # Convert to NonTensorStack to handle nested structures tensor_dict[key] = NonTensorStack.from_list([NonTensorData(item) for item in val]) - assert isinstance(val, torch.Tensor | list) + assert isinstance(val, torch.Tensor | list), f"{key} -> {type(val)} isn't of 'torch.Tensor | list' type" if batch_size is None: batch_size = val.size(0) if isinstance(val, torch.Tensor) else len(val) diff --git 
a/verl/utils/tracking.py b/verl/utils/tracking.py index f4c1e4a7f8d..263e2a15fa2 100644 --- a/verl/utils/tracking.py +++ b/verl/utils/tracking.py @@ -331,7 +331,18 @@ def log(self, data, step): import mlflow results = {self._sanitize_key(k): v for k, v in data.items()} - mlflow.log_metrics(metrics=results, step=step) + for _attempt in range(MLFLOW_MAX_ATTEMPTS): + try: + mlflow.log_metrics(metrics=results, step=step) + return + except Exception as error: + # No sleep between retries — this runs per training step, so we avoid blocking. + msg = "mlflow.log_metrics failed (attempt %d/%d): %s" + args = (_attempt + 1, MLFLOW_MAX_ATTEMPTS, error) + if _attempt < MLFLOW_MAX_ATTEMPTS - 1: + self.logger.info(msg, *args) + else: + self.logger.warning(msg, *args) def _compute_mlflow_params_from_objects(params) -> dict[str, Any]: diff --git a/verl/utils/transformers_compat.py b/verl/utils/transformers_compat.py index 9a03c658512..cebb1562b44 100644 --- a/verl/utils/transformers_compat.py +++ b/verl/utils/transformers_compat.py @@ -72,3 +72,21 @@ def get_auto_model_for_vision2seq(): return AutoModelForVision2Seq return AutoModelForImageTextToText + + +def unpack_visual_output(visual_output): + """Unpack the output from the visual encoder, handling both tuple and object return types. + + Newer versions of transformers return an object with `pooler_output` and `deepstack_features` + attributes instead of a plain tuple. 
+ """ + if hasattr(visual_output, "pooler_output"): + # For newer versions(>=5.0.0) of transformers, return the pooler_output and deepstack_features + if hasattr(visual_output, "deepstack_features"): + return visual_output.pooler_output, visual_output.deepstack_features + else: + return visual_output.pooler_output, None + if isinstance(visual_output, tuple): + return visual_output + else: + return visual_output, None diff --git a/verl/utils/vllm/npu_vllm_patch.py b/verl/utils/vllm/npu_vllm_patch.py index 727afc8a62a..c22647ceac6 100644 --- a/verl/utils/vllm/npu_vllm_patch.py +++ b/verl/utils/vllm/npu_vllm_patch.py @@ -160,6 +160,18 @@ def wrapper(self, *args, **kwargs): return wrapper +def vllm_v013_weight_loader_method_wrapper(fn): + @wraps(fn) + def wrapper(self, param, loaded_weight, weight_name, shard_id, expert_id, return_success=False): + if (shard_id in ("w1", "w3") and param.shape[1] == self.hidden_size) or ( + shard_id == "w2" and param.shape[2] == self.hidden_size + ): + param.data = param.data.transpose(1, 2) + return fn(self, param, loaded_weight, weight_name, shard_id, expert_id, return_success) + + return wrapper + + def patch_vllm013_rotary_emb(): from vllm.model_executor.layers.rotary_embedding.common import ApplyRotaryEmb @@ -169,7 +181,7 @@ def vllm013_npu_rotary_embedding_init_impl( is_neox_style: bool = True, enable_fp32_compute: bool = False, ) -> None: - super(ApplyRotaryEmb, self).__init__(enforce_enable) + super(ApplyRotaryEmb, self).__init__() self.is_neox_style = is_neox_style self.enable_fp32_compute = enable_fp32_compute self.apply_rotary_emb_flash_attn = None @@ -182,14 +194,17 @@ def vllm013_npu_rotary_embedding_init_impl( from packaging import version _VLLM_VERSION = version.parse(vllm.__version__) - if _VLLM_VERSION >= version.parse("0.13.0"): + if _VLLM_VERSION >= version.parse("0.13.0") and _VLLM_VERSION <= version.parse("0.14.0"): # Disable flash_attn in RotaryEmbedding (NPU) when VLLM >= 0.13 + from 
vllm.model_executor.layers.fused_moe import FusedMoE + patch_vllm013_rotary_emb() + FusedMoE.weight_loader = vllm_v013_weight_loader_method_wrapper(FusedMoE.weight_loader) VERL_NPU_ENABLE_A2_PATCH_VLLM_ASCEND_MC2 = bool(int(os.getenv("VERL_NPU_ENABLE_A2_PATCH_VLLM_ASCEND_MC2", "1"))) if VERL_NPU_ENABLE_A2_PATCH_VLLM_ASCEND_MC2: # only support vllm 0.13 and 0.11 now. - if _VLLM_VERSION >= version.parse("0.13.0"): + if _VLLM_VERSION >= version.parse("0.13.0") and _VLLM_VERSION <= version.parse("0.14.0"): from vllm_ascend import ascend_forward_context from vllm_ascend.ops.linear_op import SequenceRowParallelOp @@ -199,7 +214,8 @@ def vllm013_npu_rotary_embedding_init_impl( SequenceRowParallelOp.matmul_and_reduce = vllm_ascend_v013_matmul_and_reduce_wrapper( SequenceRowParallelOp.matmul_and_reduce ) - elif _VLLM_VERSION >= version.parse("0.11.0"): + + elif _VLLM_VERSION >= version.parse("0.11.0") and _VLLM_VERSION < version.parse("0.13.0"): from vllm_ascend.ops.linear_op import SequenceRowParallelOp from vllm_ascend.worker.model_runner_v1 import NPUModelRunner diff --git a/verl/utils/vllm/patch.py b/verl/utils/vllm/patch.py index 7a52a3c97ae..951c5cad3d6 100644 --- a/verl/utils/vllm/patch.py +++ b/verl/utils/vllm/patch.py @@ -66,6 +66,13 @@ except ImportError: pass +try: + from vllm.model_executor.models.qwen3_5 import Qwen3_5MoeForCausalLM + + SUPPORTED_MOE_MODELS.append(Qwen3_5MoeForCausalLM) +except ImportError: + pass + def patch_vllm_moe_model_weight_loader(model): # this is a work around to load the weight of vllm fused moe model @@ -115,7 +122,7 @@ def patch_vllm_moe_model_weight_loader(model): # TODO(@leisuzz): class Qwen3MoeLLMForCausalLM is not available if VLLM version < 0.11.0, # will update the 'if statement' with 'isinstance' when verl commonly use VLLM version >= 0.11.0 - if type(inner_model).__name__ == "Qwen3MoeLLMForCausalLM": + if type(inner_model).__name__ in ("Qwen3MoeLLMForCausalLM", "Qwen3_5MoeForCausalLM"): inner_model = inner_model.model # 
Reassign inner_model in Qwen3-vl for layer_idx, layer in enumerate(inner_model.layers): diff --git a/verl/utils/vllm/vllm_fp8_utils.py b/verl/utils/vllm/vllm_fp8_utils.py index 159e66f79e6..54e7b44d18c 100644 --- a/verl/utils/vllm/vllm_fp8_utils.py +++ b/verl/utils/vllm/vllm_fp8_utils.py @@ -32,6 +32,21 @@ logger = logging.getLogger(__name__) +MXFP8_BLOCK_QUANT_KWARGS = { + "activation_scheme": "dynamic", + "fmt": "e4m3", + "quant_method": "ascend", + "weight_block_size": [1, 32], + # Enable dynamic inference mode, no need to pre-generate quant_model_description.json + # for each model. Just specify default_quant_type, vllm-ascend will automatically + # infer quant type based on layer type: + # - lm_head, embedding, norm layers -> FLOAT + # - Other Linear, MoE layers -> default_quant_type (W8A8_MXFP8) + "default_quant_type": "W8A8_MXFP8", + # group_size for MXFP8 quantization + "group_size": 32, +} + # Ref: https://github.com/NVIDIA-NeMo/RL/commit/bc24887c72a6e1b2699a228bc87c588546dfe6b7 @dataclass() @@ -49,8 +64,11 @@ class FP8State: def is_fp8_model(vllm_config): from vllm.model_executor.layers.quantization.fp8 import Fp8Config - if hasattr(vllm_config, "quant_config") and isinstance(vllm_config.quant_config, Fp8Config): - return True + if hasattr(vllm_config, "quant_config"): + if isinstance(vllm_config.quant_config, Fp8Config): + return True + elif is_mxfp8_vllm_ascend(vllm_config.quant_config): + return True return False @@ -102,6 +120,118 @@ def is_fp8_weight(name, model): return name in fp8_state.fp8_param_names +def is_mxfp8_vllm_ascend(quant_config): + try: + from vllm_ascend.quantization.modelslim_config import AscendModelSlimConfig + + if isinstance(quant_config, AscendModelSlimConfig): + quant_method = quant_config.quant_description.get("quant_method") + return quant_method in ["ascend"] + return False + except ImportError: + # vllm_ascend not installed, so this can't be an Ascend MXFP8 config + return False + + +def 
restore_mxfp8_weights_for_loading(model): + for name, module in model.named_modules(): + if ( + hasattr(module, "_mxfp8_transformed") + and hasattr(module, "quant_method") + and hasattr(module.quant_method, "quant_method") + and hasattr(module.quant_method.quant_method, "restore_weights_for_rl_loading") + ): + module.quant_method.quant_method.restore_weights_for_rl_loading(module) + + +def apply_mxfp8_transformation_after_loading(model): + """Re-apply MXFP8 transformations after weight loading. + + This function iterates through all linear modules in the model and applies + the MXFP8 transformations (transpose, reshape) that are required for NPU + inference. + + Must be called AFTER model.load_weights() in RL training loops. + """ + try: + from vllm.model_executor.layers.linear import LinearBase + except ImportError: + logger.warning("Could not import LinearBase, skipping MXFP8 transformation") + return + + for name, module in model.named_modules(): + if (isinstance(module, LinearBase) or isinstance(module, FusedMoE)) and hasattr( + module, "_mxfp8_original_shapes" + ): + if hasattr(module, "quant_method") and hasattr(module.quant_method, "process_weights_after_loading"): + logger.debug(f"Applying MXFP8 transformation for module: {name}") + module.quant_method.process_weights_after_loading(module) + + +def npu_scaled_mxfp8_blockwise( + data_hp, + weight_block_size, +): + assert data_hp.dim() == 2, "Only 2D tensors supported (M, N)" + + block_size = weight_block_size[1] + + # Constants for MXFP8 / NPU + FP32_MIN_NORMAL = torch.finfo(torch.float32).tiny + MAX_NORM = torch.finfo(torch.float8_e4m3fn).max # 2 ** 8 * 1.75 + EMAX = 8 # 2 ** (4 - 1) + SCALE_EMAX = 127 # 2 ** (8 - 1) - 1 + + data_hp = data_hp.float() + original_shape = data_hp.shape + M, N = original_shape + assert N % block_size == 0, f"Last dimension {N} must be divisible by block_size {block_size}" + + # Reshape to (M, N // block_size, block_size) + num_blocks_n = N // block_size + data_blocked = 
data_hp.reshape(M, num_blocks_n, block_size) + + # Calculate max absolute value per block + max_val = torch.amax(torch.abs(data_blocked), dim=-1) + + # Shared exponent calculation + # Handle zero/tiny values to avoid log2(0) -> -inf + max_val_safe = torch.where(max_val == 0, FP32_MIN_NORMAL, max_val) + shared_exp = torch.floor(torch.log2(max_val_safe)) - EMAX + + shared_exp[shared_exp > SCALE_EMAX] = float("NaN") + + shared_exp_expanded = shared_exp.unsqueeze(-1) + scale_factor = torch.pow(2.0, shared_exp_expanded) + data_normalized = data_blocked / scale_factor + abs_norm = torch.abs(data_normalized) + private_exp = torch.floor(torch.log2(abs_norm + (abs_norm == 0).float())) + min_exp = -6 + private_exp = private_exp.clamp(min=min_exp) + + mantissa_scale = 8.0 # 2 ** (5 - 2) + + scale_private = torch.pow(2.0, private_exp) + scaled = data_normalized / scale_private * mantissa_scale + + # Round half away from zero: sign * floor(abs + 0.5) + data_quant = torch.sign(scaled) * torch.floor(torch.abs(scaled) + 0.5) + data_quant = data_quant / mantissa_scale * scale_private + data_quant = torch.clamp(data_quant, min=-MAX_NORM, max=MAX_NORM) + + # Restore Inf/NaN + data_quant = torch.where(torch.isinf(data_normalized), data_normalized, data_quant) + data_quant = torch.where(torch.isnan(data_normalized), data_normalized, data_quant) + + fp_data = data_quant.reshape(original_shape).to(torch.float8_e4m3fn) + + # Encode scale/exponent for NPU (uint8) + shared_exp_fixed = torch.nan_to_num(shared_exp, nan=-127.0) + descale_fp = torch.clamp(shared_exp_fixed + 127, 0, 255).round().to(torch.uint8) + + return fp_data, descale_fp + + def quant_weights(weights, model, quant_config, dtype=torch.bfloat16): """Quantize weights to FP8 format using a memory-efficient generator. 
@@ -115,8 +245,18 @@ def quant_weights(weights, model, quant_config, dtype=torch.bfloat16): Yields: Tuples of (name, tensor) for each weight and its scale """ - if quant_config.weight_block_size is None: - raise ValueError("Currently only support blockwise quantization, please set weight_block_size in quant_config") + + is_mxfp8_npu = is_mxfp8_vllm_ascend(quant_config) + + weight_block_size = None + if is_mxfp8_npu: + weight_block_size = MXFP8_BLOCK_QUANT_KWARGS["weight_block_size"] + else: + if quant_config.weight_block_size is None: + raise ValueError( + "Currently only support blockwise quantization, please set weight_block_size in quant_config" + ) + weight_block_size = quant_config.weight_block_size # vLLM v0.11-v0.12 renamed weight_scale_inv → weight_scale in process_weights_after_loading, # so load_weights expects "_scale" suffix. v0.14+ keeps weight_scale_inv, so expects "_scale_inv". @@ -130,11 +270,16 @@ def quant_weights(weights, model, quant_config, dtype=torch.bfloat16): # Cast the weight into fp8 and its scale factor if torch.distributed.get_rank() == 0: logger.debug(f"Quantizing to FP8 blockwise: {k}") - - param_lp, param_scale = scaled_fp8_blockwise( - v.to(dtype), - weight_block_size=quant_config.weight_block_size, - ) + if is_mxfp8_npu: + param_lp, param_scale = npu_scaled_mxfp8_blockwise( + v.to(dtype), + weight_block_size=weight_block_size, + ) + else: + param_lp, param_scale = scaled_fp8_blockwise( + v.to(dtype), + weight_block_size=quant_config.weight_block_size, + ) param_scale = param_scale.squeeze(-1) # Yield the quantized weight @@ -143,7 +288,7 @@ def quant_weights(weights, model, quant_config, dtype=torch.bfloat16): # Yield the scale with appropriate naming based on vLLM version if _use_scale_not_scale_inv and "expert" not in k: yield (k + "_scale", param_scale) - else: + elif not is_mxfp8_npu: yield (k + "_scale_inv", param_scale) # Explicitly delete original tensor reference to help GC @@ -155,6 +300,15 @@ def 
load_quanted_weights(weights, model_runner): quant_config = model_runner.vllm_config.quant_config vllm_dtype = model_runner.vllm_config.model_config.dtype + is_mxfp8_npu = is_mxfp8_vllm_ascend(quant_config) + + if is_mxfp8_npu: + # For MXFP8 on NPU, we need to restore weights to original shapes + # before loading, then re-apply transformation after loading. + # This is because process_weights_after_loading transposes the weights, + # but the weight_loader expects original shapes. + restore_mxfp8_weights_for_loading(model) + weights_quantized = quant_weights(weights, model, quant_config, dtype=vllm_dtype) # Monkey patch the param class to their subclass, as certain models @@ -169,6 +323,11 @@ def load_quanted_weights(weights, model_runner): for name, param in model.named_parameters(): if hasattr(param, "subclass_type"): param.__class__ = param.orig_type + + if is_mxfp8_npu: + # Re-apply MXFP8 transformations after weight loading + apply_mxfp8_transformation_after_loading(model) + return loaded_params diff --git a/verl/workers/actor/dp_actor.py b/verl/workers/actor/dp_actor.py index c13ffa09c2b..23b3c6d4785 100644 --- a/verl/workers/actor/dp_actor.py +++ b/verl/workers/actor/dp_actor.py @@ -31,6 +31,7 @@ from verl.utils.attention_utils import index_first_axis, pad_input, rearrange, unpad_input from verl.utils.device import get_device_id, get_device_name from verl.utils.fsdp_utils import FSDPModule, fsdp2_clip_grad_norm_ +from verl.utils.import_utils import deprecated from verl.utils.profiler import GPUMemoryLogger from verl.utils.py_functional import append_to_dict from verl.utils.seqlen_balancing import prepare_dynamic_batch, restore_dynamic_batch @@ -46,6 +47,7 @@ logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) +@deprecated("legacy worker implementation is deprecated and will be removed in v0.8.0") class DataParallelPPOActor(BasePPOActor): """FSDP DataParallel PPO Actor or Ref worker @@ -465,7 +467,9 @@ def compute_log_prob(self, data: DataProto, 
calculate_entropy: bool = False) -> if use_dynamic_bsz: max_token_len = data.meta_info["max_token_len"] * self.ulysses_sequence_parallel_size - micro_batches, batch_idx_list = prepare_dynamic_batch(data, max_token_len=max_token_len) + micro_batches, batch_idx_list = prepare_dynamic_batch( + data, max_token_len=max_token_len, dp_group=torch.distributed.group.WORLD + ) else: micro_batches = data.split(micro_batch_size) @@ -557,7 +561,9 @@ def update_policy(self, data: DataProto): for batch_idx, mini_batch in enumerate(mini_batches): if self.config.use_dynamic_bsz: max_token_len = self.config.ppo_max_token_len_per_gpu * self.ulysses_sequence_parallel_size - micro_batches, _ = prepare_dynamic_batch(mini_batch, max_token_len=max_token_len) + micro_batches, _ = prepare_dynamic_batch( + mini_batch, max_token_len=max_token_len, dp_group=torch.distributed.group.WORLD + ) else: self.gradient_accumulation = ( self.config.ppo_mini_batch_size // self.config.ppo_micro_batch_size_per_gpu diff --git a/verl/workers/actor/megatron_actor.py b/verl/workers/actor/megatron_actor.py index f4a697866ab..9764178134f 100644 --- a/verl/workers/actor/megatron_actor.py +++ b/verl/workers/actor/megatron_actor.py @@ -39,6 +39,7 @@ from verl import DataProto from verl.trainer.ppo.core_algos import agg_loss, get_policy_loss_fn, kl_penalty from verl.utils.device import get_device_id, get_torch_device +from verl.utils.import_utils import deprecated from verl.utils.megatron.pipeline_parallel import make_batch_generator from verl.utils.megatron.router_replay_patch import RouterReplay, RouterReplayAction from verl.utils.megatron.router_replay_utils import ( @@ -64,6 +65,7 @@ logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) +@deprecated("legacy worker implementation is deprecated and will be removed in v0.8.0") class MegatronPPOActor(BasePPOActor): def __init__( self, diff --git a/verl/workers/config/actor.py b/verl/workers/config/actor.py index f33bb9269cc..962b478ae2e 100644 --- 
a/verl/workers/config/actor.py +++ b/verl/workers/config/actor.py @@ -22,7 +22,13 @@ from verl.utils.profiler.config import ProfilerConfig from verl.utils.qat import QATConfig -from .engine import FSDPEngineConfig, McoreEngineConfig, TorchtitanEngineConfig, VeOmniEngineConfig +from .engine import ( + FSDPEngineConfig, + McoreEngineConfig, + MindSpeedEngineConfig, + TorchtitanEngineConfig, + VeOmniEngineConfig, +) from .model import HFModelConfig from .optimizer import OptimizerConfig @@ -35,6 +41,7 @@ "VeOmniActorConfig", "QATConfig", "TorchTitanActorConfig", + "MindSpeedActorConfig", ] @@ -305,6 +312,10 @@ def __post_init__(self): """Validate FSDP actor configuration parameters.""" super().__post_init__() self.engine = self.fsdp_config + # Sync strategy to engine config so engine_workers can pick the right FSDP version. + # EngineConfig.strategy defaults to None, so without this, engine_workers.py always + # falls back to FSDP1 even when actor.strategy="fsdp2". + object.__setattr__(self.engine, "strategy", self.strategy) # backward compatibility if self.ulysses_sequence_parallel_size > 1: @@ -366,3 +377,29 @@ def __post_init__(self): """Validate TorchTitan actor configuration parameters.""" super().__post_init__() self.engine = self.torchtitan + + +@dataclass +class MindSpeedActorConfig(ActorConfig): + """Configuration for mindspeed actor models. + + The inheritance from BaseConfig provides omegaconf.DictConfig-like interface for a dataclass config. + + Args: + strategy (str): Training strategy set to 'mindspeed' for mindspeed parallelism. + load_weight (bool): Whether to load model weights from checkpoint. + mindspeed (dict[str, Any]): Configuration for mindspeed parallelism settings. + profile (dict[str, Any]): Configuration for profiling settings. + use_rollout_log_probs (bool): Whether to use log probabilities from rollout engine. 
+ """ + + strategy: str = "mindspeed" + load_weight: bool = True + mindspeed: MindSpeedEngineConfig = field(default_factory=MindSpeedEngineConfig) + profile: dict[str, Any] = field(default_factory=dict) + use_rollout_log_probs: bool = False + + def __post_init__(self): + """Validate MindSpeed actor configuration parameters.""" + super().__post_init__() + self.engine = self.mindspeed diff --git a/verl/workers/config/critic.py b/verl/workers/config/critic.py index caca5bac6ac..3531ed63c92 100644 --- a/verl/workers/config/critic.py +++ b/verl/workers/config/critic.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import warnings from dataclasses import dataclass, field from typing import Optional @@ -22,11 +21,18 @@ from verl.trainer.config import BaseModelConfig, CheckpointConfig from verl.utils.profiler import ProfilerConfig -from .engine import FSDPEngineConfig, McoreEngineConfig, TorchtitanEngineConfig +from .engine import FSDPEngineConfig, McoreEngineConfig, MindSpeedEngineConfig, TorchtitanEngineConfig from .model import HFModelConfig from .optimizer import OptimizerConfig -__all__ = ["CriticConfig", "FSDPCriticConfig", "McoreCriticConfig", "TorchTitanCriticConfig", "FSDPCriticModelCfg"] +__all__ = [ + "CriticConfig", + "FSDPCriticConfig", + "McoreCriticConfig", + "TorchTitanCriticConfig", + "FSDPCriticModelCfg", + "MindSpeedCriticConfig", +] @dataclass @@ -59,6 +65,7 @@ class CriticConfig(BaseConfig): "ppo_micro_batch_size_per_gpu", "ppo_mini_batch_size", "ppo_micro_batch_size", + "engine", "model_config", } @@ -81,9 +88,7 @@ class CriticConfig(BaseConfig): ppo_micro_batch_size: Optional[int] = None engine: BaseConfig = field(default_factory=BaseConfig) optim: OptimizerConfig = field(default_factory=OptimizerConfig) - # deprecate model to favor model_config - model: BaseModelConfig = field(default_factory=BaseModelConfig) - model_config: HFModelConfig = None + model: HFModelConfig = None 
checkpoint: CheckpointConfig = field(default_factory=CheckpointConfig) profiler: ProfilerConfig = field(default_factory=ProfilerConfig) @@ -91,16 +96,6 @@ def __post_init__(self): """Validate critic configuration parameters.""" assert self.strategy != MISSING - if self.model_config is None: - warnings.warn("using model in Critic Config is deprecated, please use model_config instead", stacklevel=2) - self.model_config = HFModelConfig( - path=self.model.path, - tokenizer_path=self.model.tokenizer_path, - override_config=self.model.override_config, - external_lib=self.model.external_lib, - trust_remote_code=self.model.trust_remote_code, - ) - if not self.use_dynamic_bsz: self._check_mutually_exclusive(self.ppo_micro_batch_size, self.ppo_micro_batch_size_per_gpu, "critic") @@ -174,6 +169,11 @@ def validate(self, n_gpus: int, train_batch_size: int): """Validate Megatron critic configuration with runtime parameters.""" super().validate(n_gpus, train_batch_size) + def __post_init__(self): + """Validate Megatron critic configuration parameters.""" + super().__post_init__() + self.engine = self.megatron + @dataclass class FSDPCriticConfig(CriticConfig): @@ -194,6 +194,7 @@ class FSDPCriticConfig(CriticConfig): } strategy: str = "fsdp" + fsdp: FSDPEngineConfig = field(default_factory=FSDPEngineConfig) forward_micro_batch_size: int = 1 forward_micro_batch_size_per_gpu: int = 1 ulysses_sequence_parallel_size: int = 1 @@ -202,6 +203,11 @@ class FSDPCriticConfig(CriticConfig): def __post_init__(self): """Validate FSDP critic configuration parameters.""" super().__post_init__() + self.engine = self.fsdp + # Sync strategy to engine config so engine_workers can pick the right FSDP version. + # EngineConfig.strategy defaults to None, so without this, engine_workers.py always + # falls back to FSDP1 even when critic.strategy="fsdp2". 
+ object.__setattr__(self.engine, "strategy", self.strategy) if self.strategy in {"fsdp", "fsdp2"}: if self.ulysses_sequence_parallel_size > 1: @@ -270,3 +276,25 @@ class FSDPCriticModelCfg(BaseModelConfig): target_modules: str | list[str] = "all-linear" # TiledMLP configuration for memory-efficient MLP computation tiled_mlp: dict = field(default_factory=lambda: {"enabled": False, "num_shards": 4}) + + +@dataclass +class MindSpeedCriticConfig(CriticConfig): + """Configuration for mindspeed-based critic model training. + + The inheritance from CriticConfig provides all base critic configuration plus mindspeed-specific settings. + + Args: + nccl_timeout (int): NCCL timeout in seconds for distributed operations. + mindspeed (Dict[str, Any]): mindspeed-specific parallelism settings. + load_weight (bool): Whether to load initial weights. + """ + + strategy: str = "mindspeed" + nccl_timeout: int = 600 + mindspeed: MindSpeedEngineConfig = field(default_factory=MindSpeedEngineConfig) + load_weight: bool = True + + def validate(self, n_gpus: int, train_batch_size: int): + """Validate mindspeed critic configuration with runtime parameters.""" + super().validate(n_gpus, train_batch_size) diff --git a/verl/workers/config/distillation.py b/verl/workers/config/distillation.py index c78cf4aad76..f513e21e6f0 100644 --- a/verl/workers/config/distillation.py +++ b/verl/workers/config/distillation.py @@ -160,7 +160,6 @@ class DistillationConfig(BaseConfig): def __post_init__(self): # Prompt + Response from student are fed into teacher as context max_model_len = self.teacher_model.inference.max_model_len - max_num_batched_tokens = self.teacher_model.inference.max_num_batched_tokens student_prompt_length = self.teacher_model.inference.prompt_length student_response_length = self.teacher_model.inference.response_length if self.enabled: @@ -171,13 +170,6 @@ def __post_init__(self): f"response, and one generated token, but got {student_prompt_length=}, " f"{student_response_length=}, 
{required_context_len=}, {max_model_len=}." ) - if max_num_batched_tokens is not None and required_context_len > max_num_batched_tokens: - raise ValueError( - "Distillation teacher inference requires room for the student prompt, the full student " - f"response, and one generated token within the engine batching budget, but got " - f"{student_prompt_length=}, {student_response_length=}, {required_context_len=}, " - f"{max_num_batched_tokens=}." - ) self.teacher_model.inference.prompt_length = ( self.teacher_model.inference.prompt_length + self.teacher_model.inference.response_length diff --git a/verl/workers/config/engine.py b/verl/workers/config/engine.py index c334f7112eb..87880a7de42 100644 --- a/verl/workers/config/engine.py +++ b/verl/workers/config/engine.py @@ -20,7 +20,7 @@ from verl.trainer.config import CheckpointConfig from ...utils.profiler import ProfilerConfig -from .model import HFModelConfig +from .model import DiffusionModelConfig, HFModelConfig from .optimizer import OptimizerConfig __all__ = [ @@ -33,6 +33,7 @@ "EngineConfig", "EngineRouterReplayConfig", "QATEngineConfig", + "MindSpeedEngineConfig", ] @@ -156,6 +157,8 @@ class McoreEngineConfig(EngineConfig): virtual_pipeline_model_parallel_size (Optional[int]): Virtual pipeline model parallel size for interleaved scheduling. context_parallel_size (int): Context parallel size for long sequences. + dynamic_context_parallel (bool): Whether to enable hybrid context parallelism. + max_seqlen_per_dp_cp_rank (Optional[int]): Maximum sequence length per DPxCP rank. sequence_parallel (bool): Whether to enable sequence parallelism. use_distributed_optimizer (bool): Whether to use distributed optimizer. use_dist_checkpointing (bool): Whether to use distributed checkpointing. 
@@ -178,6 +181,8 @@ class McoreEngineConfig(EngineConfig):
     pipeline_model_parallel_size: int = 1
     virtual_pipeline_model_parallel_size: Optional[int] = None
     context_parallel_size: int = 1
+    dynamic_context_parallel: bool = False
+    max_seqlen_per_dp_cp_rank: Optional[int] = None
     sequence_parallel: bool = True
     use_distributed_optimizer: bool = True
     use_dist_checkpointing: bool = False
@@ -519,10 +524,34 @@ def __post_init__(self):
         assert self.pp_size == 1, "Pipeline parallelism (pp_size > 1) is not yet supported for automodel backend"
 
 
+@dataclass
+class MindSpeedEngineConfig(McoreEngineConfig):
+    """Configuration for mindspeed parallelism.
+
+    The inheritance from BaseConfig provides omegaconf.DictConfig-like interface for a dataclass config.
+
+    Args:
+        llm_kwargs (dict[str, Any]): mindspeed_llm engine kwargs.
+        mm_kwargs (dict[str, Any]): mindspeed_mm engine kwargs.
+    """
+
+    strategy: str = "mindspeed_llm"
+    llm_kwargs: dict[str, Any] = field(default_factory=dict)
+    mm_kwargs: dict[str, Any] = field(default_factory=dict)
+
+    def __post_init__(self) -> None:
+        """config validation logics go here"""
+        assert self.strategy in ["mindspeed_llm", "mindspeed_mm"], f"strategy {self.strategy} not supported"
+        assert self.dtype in ["bfloat16", "float16"], f"dtype {self.dtype} not supported"
+        if self.tensor_model_parallel_size == 1:
+            warnings.warn("set sequence parallel to false as TP size is 1", stacklevel=2)
+            self.sequence_parallel = False
+
+
 @dataclass
 class TrainingWorkerConfig(BaseConfig):
     model_type: str = None  # model type (language_model/value_model)
-    model_config: HFModelConfig = None
+    model_config: HFModelConfig | DiffusionModelConfig = None
     engine_config: EngineConfig = None
     optimizer_config: OptimizerConfig = None
     checkpoint_config: CheckpointConfig = None
diff --git a/verl/workers/config/model.py b/verl/workers/config/model.py
index 54ba682deaa..f7ab687599d 100644
--- a/verl/workers/config/model.py
+++ b/verl/workers/config/model.py
@@ -72,6 +72,7 @@ class MtpConfig(BaseConfig):
class HFModelConfig(BaseConfig): # note that we separate model_path, model_config_path and tokenizer_path in case they are different _mutable_fields = { + "model_type", "hf_config_path", "tokenizer_path", "hf_config", @@ -92,6 +93,9 @@ class HFModelConfig(BaseConfig): tokenizer_path: Optional[str] = None local_tokenizer_path: Optional[str] = None + # model type, e.g., "language_model", "value_model", "diffusion_model" + model_type: str = "language_model" + # whether to load tokenizer. This is useful when we only want to load model config load_tokenizer: bool = True @@ -234,7 +238,15 @@ def get_processor(self): @dataclass class DiffusionModelConfig(BaseConfig): - _mutable_fields = {"tokenizer_path", "tokenizer", "processor", "local_path", "local_tokenizer_path", "architecture"} + _mutable_fields = { + "model_type", + "tokenizer_path", + "tokenizer", + "processor", + "local_path", + "local_tokenizer_path", + "architecture", + } path: str = MISSING # Handler key matched against @DiffusionModelBase.register(name). @@ -278,7 +290,6 @@ class DiffusionModelConfig(BaseConfig): height: int = 512 width: int = 512 num_inference_steps: int = 10 - guidance_scale: float = 4.5 # extra configs for algorithm specific features. extra_configs: dict[str, Any] = field(default_factory=dict) @@ -299,9 +310,11 @@ def __post_init__(self): # construct tokenizer if self.load_tokenizer: self.local_tokenizer_path = copy_to_local(self.tokenizer_path, use_shm=self.use_shm) - # see issue https://github.com/huggingface/tokenizers/issues/537, we use a non-fast tokenizer here + # Fast tokenizer for diffusion: DiffusionSingleTurnAgentLoop applies chat template on the asyncio + # thread (not run_in_executor) so Rust-backed tokenizers avoid RuntimeError: Already borrowed + # with recent transformers when combined with thread-pool tokenization. 
self.tokenizer = hf_tokenizer( - self.local_tokenizer_path, trust_remote_code=self.trust_remote_code, use_fast=False + self.local_tokenizer_path, trust_remote_code=self.trust_remote_code, use_fast=True ) if os.path.exists(os.path.join(self.local_path, "processor")): self.processor = hf_processor( diff --git a/verl/workers/config/rollout.py b/verl/workers/config/rollout.py index a38500c0229..d055c6e07fb 100644 --- a/verl/workers/config/rollout.py +++ b/verl/workers/config/rollout.py @@ -34,9 +34,27 @@ "DiffusionRolloutConfig", "CheckpointEngineConfig", "SpeculativeDecodingConfig", + "SkipConfig", ] +@dataclass +class SkipConfig(BaseConfig): + """ + Configuration for rollout skip: load/dump previously generated rollout data + instead of computing new rollouts (e.g. for debugging or reuse). + """ + + enable: bool = False + dump_dir: str = "~/.verl/rollout_dump" + max_dump_step: int = 1 + action: str = "cache" # cache | repeat | repeat_last + + def get(self, key: str, default=None): + """Dict-like get for compatibility with code that uses skip.get('enable', False).""" + return getattr(self, key, default) + + @dataclass class SamplingConfig(BaseConfig): temperature: float = 1.0 @@ -47,9 +65,7 @@ class SamplingConfig(BaseConfig): @dataclass -class DiffusionSamplingConfig(BaseConfig): - do_sample: bool = True - n: int = 1 +class DiffusionSamplingConfig(SamplingConfig): noise_level: float = 0.0 num_inference_steps: int = 40 seed: int = 42 @@ -144,6 +160,10 @@ class CheckpointEngineConfig(BaseConfig): update_weights_bucket_megabytes: int = 2048 # Additional keyword arguments for checkpoint engine engine_kwargs: dict = field(default_factory=dict) + # If set, this Python module is imported on every worker process before the + # backend is instantiated, allowing custom backends to register themselves + # in CheckpointEngineRegistry. 
+ custom_backend_module: Optional[str] = None @dataclass @@ -240,12 +260,15 @@ class RolloutConfig(BaseConfig): # Extension point for custom configurations custom: Optional[dict] = None + # Fully qualified class name for a custom CheckpointEngineManager. When set, the trainer + # loads this class instead of the built-in CheckpointEngineManager. + checkpoint_manager_class: Optional[str] = None + # Checkpoint Engine config for update weights from trainer to rollout checkpoint_engine: CheckpointEngineConfig = field(default_factory=CheckpointEngineConfig) - skip_rollout: bool = False - - skip_dump_dir: str = "/tmp/rollout_dump" + # Rollout skip config (load/dump rollout data) + skip: SkipConfig = field(default_factory=SkipConfig) profiler: Optional[ProfilerConfig] = None @@ -366,8 +389,6 @@ class DiffusionRolloutConfig(RolloutConfig): num_inference_steps: int = 10 - guidance_scale: float = 4.5 - def __post_init__(self): """Validate diffusion rollout config""" super().__post_init__() diff --git a/verl/workers/engine/__init__.py b/verl/workers/engine/__init__.py index 009f0a8fc8b..7e59fbee5f7 100644 --- a/verl/workers/engine/__init__.py +++ b/verl/workers/engine/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from .base import BaseEngine, EngineRegistry -from .fsdp import FSDPEngine, FSDPEngineWithLMHead +from .fsdp import DiffusersFSDPEngine, FSDPEngine, FSDPEngineWithLMHead __all__ = [ "BaseEngine", @@ -21,6 +21,9 @@ "FSDPEngineWithLMHead", ] +if DiffusersFSDPEngine is not None: + __all__.append("DiffusersFSDPEngine") + try: from .torchtitan import TorchTitanEngine, TorchTitanEngineWithLMHead @@ -47,11 +50,12 @@ # Mindspeed must be imported before Megatron to ensure the related monkey patches take effect as expected try: - from .mindspeed import MindspeedEngineWithLMHead + from .mindspeed import MindspeedEngineWithLMHead, MindSpeedLLMEngineWithLMHead - __all__ += ["MindspeedEngineWithLMHead"] + __all__ += ["MindspeedEngineWithLMHead", "MindSpeedLLMEngineWithLMHead"] except ImportError: MindspeedEngineWithLMHead = None + MindSpeedLLMEngineWithLMHead = None try: from .megatron import MegatronEngine, MegatronEngineWithLMHead diff --git a/verl/workers/engine/fsdp/__init__.py b/verl/workers/engine/fsdp/__init__.py index a1bdb16b47c..98acc3fb0c7 100644 --- a/verl/workers/engine/fsdp/__init__.py +++ b/verl/workers/engine/fsdp/__init__.py @@ -14,3 +14,11 @@ from .transformer_impl import FSDPEngine, FSDPEngineWithLMHead __all__ = ["FSDPEngine", "FSDPEngineWithLMHead"] + + +try: + from .diffusers_impl import DiffusersFSDPEngine + + __all__ += ["DiffusersFSDPEngine"] +except ImportError: + DiffusersFSDPEngine = None diff --git a/verl/workers/engine/fsdp/diffusers_impl.py b/verl/workers/engine/fsdp/diffusers_impl.py new file mode 100644 index 00000000000..79920547d79 --- /dev/null +++ b/verl/workers/engine/fsdp/diffusers_impl.py @@ -0,0 +1,827 @@ +# Copyright 2026 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +The concrete Engine implementation using PyTorch FullyShardedDataParallel (FSDP) +""" + +import gc +import json +import logging +import os +import warnings +from contextlib import contextmanager, nullcontext +from typing import Callable, Optional + +import torch +import torch.distributed +from peft import LoraConfig +from tensordict import TensorDict +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp.api import FullStateDictConfig, ShardedStateDictConfig, StateDictType +from torch.distributed.tensor import DTensor + +from verl.models.diffusers_model import build_scheduler, forward_and_sample_previous_step, prepare_model_inputs +from verl.trainer.config import CheckpointConfig +from verl.utils import tensordict_utils as tu +from verl.utils.checkpoint.fsdp_checkpoint_manager import FSDPCheckpointManager +from verl.utils.debug import log_gpu_memory_usage +from verl.utils.device import get_device_id, get_device_name +from verl.utils.fsdp_utils import ( + CPUOffloadPolicy, + FSDPModule, + MixedPrecisionPolicy, + apply_fsdp2, + collect_lora_params, + fsdp2_clip_grad_norm_, + fsdp2_load_full_state_dict, + fsdp_version, + get_fsdp_wrap_policy, + get_init_weight_context_manager, + init_fn, + load_fsdp_model_to_gpu, + load_fsdp_optimizer, + offload_fsdp_model_to_cpu, + offload_fsdp_optimizer, + replace_lora_wrapper, +) +from verl.utils.model import convert_weight_keys +from verl.utils.py_functional import append_to_dict, convert_to_regular_types +from verl.workers.config import DiffusionModelConfig, 
FSDPEngineConfig, FSDPOptimizerConfig
+
+from ..base import BaseEngine, BaseEngineCtx, EngineRegistry
+from ..utils import enable_full_determinism, prepare_micro_batches
+from .utils import create_device_mesh, get_sharding_strategy
+
+logger = logging.getLogger(__file__)
+logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
+
+device_name = get_device_name()
+
+
+@EngineRegistry.register(model_type="diffusion_model", backend=["fsdp", "fsdp2"], device=["cuda"])
+class DiffusersFSDPEngine(BaseEngine):
+    """
+    Concrete Diffusers Engine implementation using PyTorch FullyShardedDataParallel (FSDP).
+
+    Supports model sharding, activation/optimizer offloading, LoRA, and sequence parallelism.
+    """
+
+    def __init__(
+        self,
+        model_config: DiffusionModelConfig,
+        engine_config: FSDPEngineConfig,
+        optimizer_config: FSDPOptimizerConfig,
+        checkpoint_config: CheckpointConfig,
+    ):
+        """
+        Initialize the DiffusersFSDPEngine.
+
+        Sets up distributed device meshes, LoRA, and offload policies based on config.
+
+        Args:
+            model_config/engine_config/optimizer_config/checkpoint_config: FSDP engine component configs.
+ """ + super().__init__() + + self.model_config = model_config + self.engine_config = engine_config + self.optimizer_config = optimizer_config + self.checkpoint_config = checkpoint_config + + self.mode = None + + self.rank = torch.distributed.get_rank() + + self._init_device_mesh() + + if self.engine_config.full_determinism: + enable_full_determinism(seed=self.engine_config.seed) + + # set FSDP offload params + self._is_offload_param = self.engine_config.param_offload + self._is_offload_optimizer = self.engine_config.optimizer_offload + self._is_lora = self.model_config.lora_rank > 0 + + @property + def is_param_offload_enabled(self) -> bool: + return self._is_offload_param + + @property + def is_optimizer_offload_enabled(self) -> bool: + return self._is_offload_optimizer + + def is_mp_src_rank_with_outputs(self): + return True + + def initialize(self): + """ + Build the model, optimizer, and learning rate scheduler under FSDP. + + Applies device, dtype, and precision configurations, including mixed precision. + Sets up checkpoint manager and FLOPs counter. 
+ """ + # This is used to import external_lib into the huggingface systems + self._build_model_optimizer() + + self.checkpoint_manager = FSDPCheckpointManager( + model=self.module, + optimizer=self.optimizer, + lr_scheduler=self.lr_scheduler, + processing_class=self.model_config.get_processor(), + checkpoint_config=self.checkpoint_config, + trust_remote_code=self.model_config.trust_remote_code, + ) + + self.to( + device="cpu", + model=self._is_offload_param, + optimizer=self._is_offload_optimizer, + grad=self._is_offload_param, + ) + + log_gpu_memory_usage("After offload model/optimizer/grad during init", logger=logger) + + def _init_device_mesh(self): + world_size = torch.distributed.get_world_size() + + fsdp_size = self.engine_config.fsdp_size + + self.device_mesh = create_device_mesh(world_size=world_size, fsdp_size=fsdp_size) + self.ulysses_sequence_parallel_size = self.engine_config.ulysses_sequence_parallel_size + if self.ulysses_sequence_parallel_size > 1: + raise NotImplementedError("Ulysses sequence parallel for Diffusers backend is not supported currently.") + + def _build_module(self): + from diffusers import AutoModel + + from verl.utils.torch_dtypes import PrecisionType + + torch_dtype = self.engine_config.model_dtype + + if torch_dtype is None: + # if it is training, we force torch_dtype to fp32 + torch_dtype = torch.float32 if not self.engine_config.forward_only else torch.bfloat16 + + torch_dtype = PrecisionType.to_dtype(torch_dtype) + + init_context = get_init_weight_context_manager(use_meta_tensor=True, mesh=self.device_mesh) + + with init_context(), warnings.catch_warnings(): + warnings.simplefilter("ignore") + + module = AutoModel.from_pretrained( + self.model_config.local_path, + torch_dtype=torch_dtype, + trust_remote_code=self.model_config.trust_remote_code, + subfolder="transformer", # currently we support DiT with transformer backbone only. 
+ ) + + # some parameters may not in torch_dtype + module.to(torch_dtype) + + if self.model_config.enable_gradient_checkpointing: + module.enable_gradient_checkpointing() + + # patch for checkpoint saving + def save_config(self, save_directory: str | os.PathLike): + output_config_file = os.path.join(save_directory, "config.json") + with open(output_config_file, "w", encoding="utf-8") as f: + json.dump(self, f, indent=4, sort_keys=True) + + module.can_generate = lambda: False + module.config.save_pretrained = save_config.__get__(module.config) + + return module + + def _build_lora_module(self, module): + lora_adapter_path = getattr(self.model_config, "lora_adapter_path", None) + if lora_adapter_path is not None: + from verl.utils.fs import copy_to_local + + print(f"Loading pre-trained LoRA adapter to from: {lora_adapter_path}") + # Copy adapter to local if needed + local_adapter_path = copy_to_local(lora_adapter_path, use_shm=self.model_config.use_shm) + + module.load_lora_adapter(local_adapter_path) + else: + # Convert config to regular Python types before creating PEFT model + lora_config = { + "r": self.model_config.lora_rank, + "lora_alpha": self.model_config.lora_alpha, + "init_lora_weights": self.model_config.lora_init_weights, + "target_modules": convert_to_regular_types(self.model_config.target_modules), + "target_parameters": convert_to_regular_types(self.model_config.target_parameters), + "exclude_modules": convert_to_regular_types(self.model_config.exclude_modules), + "bias": "none", + } + module.add_adapter(LoraConfig(**lora_config)) + + return module + + def _build_fsdp_module(self, module): + # TODO(ziheng): need to improve + from torch.distributed.fsdp import CPUOffload, MixedPrecision + + from verl.utils.torch_dtypes import PrecisionType + + mixed_precision_config = self.engine_config.mixed_precision + if mixed_precision_config is not None: + param_dtype = PrecisionType.to_dtype(mixed_precision_config.get("param_dtype", "bf16")) + reduce_dtype = 
PrecisionType.to_dtype(mixed_precision_config.get("reduce_dtype", "fp32")) + buffer_dtype = PrecisionType.to_dtype(mixed_precision_config.get("buffer_dtype", "fp32")) + else: + param_dtype = torch.bfloat16 + reduce_dtype = torch.float32 + buffer_dtype = torch.float32 + + mixed_precision = MixedPrecision(param_dtype=param_dtype, reduce_dtype=reduce_dtype, buffer_dtype=buffer_dtype) + + auto_wrap_policy = get_fsdp_wrap_policy( + module=module, + config=self.engine_config.wrap_policy, + is_lora=self.model_config.lora_rank > 0, + ) + + fsdp_mesh = self.device_mesh + sharding_strategy = get_sharding_strategy(fsdp_mesh) + + # Note: We force turn off CPUOffload because it causes incorrect results when using grad accumulation + if self.engine_config.strategy == "fsdp": + # cpu_offload: + # - actor: None + # - critic: None + # - ref: CPUOffload(offload_params=True) + + # We force reference policy to use CPUOffload to save memory. + # We force turn off CPUOffload for actor because it causes incorrect results when using grad accumulation + cpu_offload = None + if self.engine_config.forward_only: + cpu_offload = CPUOffload(offload_params=True) + self._is_offload_param = False + self._is_offload_optimizer = False + + module = FSDP( + module, + param_init_fn=init_fn, + auto_wrap_policy=auto_wrap_policy, + device_id=get_device_id(), + sharding_strategy=sharding_strategy, + mixed_precision=mixed_precision, + sync_module_states=True, + device_mesh=self.device_mesh, + forward_prefetch=self.engine_config.forward_prefetch, + use_orig_params=self.engine_config.use_orig_params, + cpu_offload=cpu_offload, + ) + elif self.engine_config.strategy == "fsdp2": + # - actor: offload_policy + # - critic: offload_policy + # - ref: CPUOffloadPolicy(pin_memory=True) + assert CPUOffloadPolicy is not None, "PyTorch version >= 2.4 is required for using fully_shard API (FSDP2)" + mp_policy = MixedPrecisionPolicy( + param_dtype=param_dtype, reduce_dtype=reduce_dtype, cast_forward_inputs=True + ) + 
offload_policy = None + if self.engine_config.offload_policy or self.engine_config.forward_only: + self._is_offload_param = False + self._is_offload_optimizer = False + offload_policy = CPUOffloadPolicy(pin_memory=True) + + fsdp_kwargs = { + "mesh": fsdp_mesh, + "mp_policy": mp_policy, + "offload_policy": offload_policy, + "reshard_after_forward": self.engine_config.reshard_after_forward, + } + full_state = module.state_dict() + apply_fsdp2(module, fsdp_kwargs, self.engine_config) + fsdp2_load_full_state_dict(module, full_state, fsdp_mesh, offload_policy) + else: + raise NotImplementedError(f"Unknown strategy {self.engine_config.strategy}") + + if torch.distributed.get_world_size() == 1 and fsdp_version(module) == 1: + FSDP.set_state_dict_type( + module, + state_dict_type=StateDictType.FULL_STATE_DICT, + state_dict_config=FullStateDictConfig(), + ) + elif fsdp_version(module) == 1: + FSDP.set_state_dict_type( + module, + state_dict_type=StateDictType.SHARDED_STATE_DICT, + state_dict_config=ShardedStateDictConfig(), + ) + + return module + + def _build_scheduler(self): + return build_scheduler(self.model_config) + + def _build_optimizer(self, module): + from verl.workers.config.optimizer import build_optimizer + + optimizer = build_optimizer(module.parameters(), self.optimizer_config) + + return optimizer + + def _build_lr_scheduler(self, optimizer): + from verl.utils.torch_functional import get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup + + optim_config = self.optimizer_config + + total_steps = optim_config.total_training_steps + num_warmup_steps = optim_config.lr_warmup_steps + lr_scheduler_type = optim_config.lr_scheduler_type + min_lr_ratio = optim_config.min_lr_ratio + num_cycles = optim_config.num_cycles + zero_indexed_step = optim_config.zero_indexed_step + if num_warmup_steps <= 0: + num_warmup_steps_ratio = optim_config.lr_warmup_steps_ratio + num_warmup_steps = int(num_warmup_steps_ratio * total_steps) + + if self.rank == 0: + 
print(f"Total steps: {total_steps}, num_warmup_steps: {num_warmup_steps}") + + if lr_scheduler_type == "constant": + lr_scheduler = get_constant_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=num_warmup_steps) + elif lr_scheduler_type == "cosine": + lr_scheduler = get_cosine_schedule_with_warmup( + optimizer=optimizer, + num_warmup_steps=num_warmup_steps, + num_training_steps=total_steps, + min_lr_ratio=min_lr_ratio, + num_cycles=num_cycles, + zero_indexed_step=zero_indexed_step, + ) + else: + raise NotImplementedError(f"LR scheduler type {lr_scheduler_type} is not supported") + return lr_scheduler + + def _build_model_optimizer(self): + from verl.utils.model import print_model_size + + # Load base model with specified configuration and dtype + module = self._build_module() + # Apply LoRA adapters if low-rank adaptation is enabled + if self._is_lora: + module = self._build_lora_module(module) + + # Load diffusion scheduler + scheduler = self._build_scheduler() + + # Synchronize all distributed processes before proceeding + torch.distributed.barrier() + if self.rank == 0: + print_model_size(module) + log_gpu_memory_usage("After init model from Diffusers AutoModel", logger=logger) + + # Wrap model with FSDP for distributed training (sharding, mixed precision, etc.) 
+ log_gpu_memory_usage("Before FSDP", logger=None) + module = self._build_fsdp_module(module) + log_gpu_memory_usage("After FSDP", logger=None) + + if not self.engine_config.forward_only: + # Initialize optimizer with model parameters and config settings + optimizer = self._build_optimizer(module) + # Create learning rate scheduler with warmup and decay settings + lr_scheduler = self._build_lr_scheduler(optimizer) + else: + optimizer = None + lr_scheduler = None + + self.module = module + self.scheduler = scheduler + self.optimizer = optimizer + self.lr_scheduler = lr_scheduler + + def train_mode(self, **kwargs): + """ + Return a context manager that switches to training mode with FSDP-specific handling. + + Includes parameter and optimizer offload entry/exit. + """ + return EngineTrainModeCtx(self, **kwargs) + + def eval_mode(self, **kwargs): + """ + Return a context manager that switches to evaluation mode with FSDP-specific handling. + + Includes activation offload entry/exit. + """ + return EngineEvalModeCtx(self, **kwargs) + + def get_data_parallel_rank(self): + return torch.distributed.get_rank() + + def get_data_parallel_size(self): + return torch.distributed.get_world_size() + + def get_data_parallel_group(self): + return torch.distributed.group.WORLD + + def get_model_parallel_group(self): + raise NotImplementedError + + def get_context_parallel_group(self): + raise NotImplementedError + + def forward_backward_batch( + self, data: TensorDict, loss_function: Callable, forward_only: bool = False + ) -> list[TensorDict]: + num_timesteps = data["all_timesteps"].shape[1] + tu.assign_non_tensor(data, use_dynamic_bsz=False) + + micro_batches, indices = prepare_micro_batches( + data=data, dp_group=self.get_data_parallel_group(), same_micro_num_in_dp=True + ) + + gradient_accumulation_steps = len(micro_batches) * num_timesteps + tu.assign_non_tensor(data, gradient_accumulation_steps=gradient_accumulation_steps) + + output_lst = [] + + ctx = torch.no_grad() if 
forward_only else nullcontext() + + for micro_batch in micro_batches: + micro_batch = micro_batch.to(get_device_id()) + meta_info_lst = {"model_output": [], "loss": [], "metrics": []} + # Forward and backward for each timestep + with ctx: + for step in range(num_timesteps): + loss, meta_info = self.forward_step( + micro_batch, loss_function=loss_function, forward_only=forward_only, step=step + ) + + if not forward_only: + loss.backward() + + for key, val in meta_info.items(): + meta_info_lst[key].append(val) + + output_lst.append(meta_info_lst) + + # postprocess and return + return self.postprocess_batch_func(output_lst=output_lst, indices=indices, data=data) + + def postprocess_batch_func(self, output_lst, indices, data: TensorDict): + model_output = {} + losses = [] + aggregated_metrics = {} + + for output in output_lst: + # model output list + model_output_lst = {} + if "model_output" in output: + for model_output_dict in output["model_output"]: + for key, val in model_output_dict.items(): + model_output_lst.setdefault(key, []).append(val) + for key, val in model_output_lst.items(): + model_output.setdefault(key, []).append(torch.stack(val, dim=1)) # (bsz, steps, ...) + # loss + if "loss" in output: + losses.append(output["loss"]) + + # metrics + if "metrics" in output: + for metrics in output["metrics"]: + append_to_dict(aggregated_metrics, metrics) + + # concat results from micro batches + for key, val in model_output.items(): + model_output[key] = torch.concat(val, dim=0) # (global_bsz, steps, ...) + + output = { + "model_output": model_output, # a dict of tensors in shape (global_bsz, steps, ...) 
+ "loss": losses, # micro-batch step-wise losses + "metrics": aggregated_metrics, + } + + return output + + @staticmethod + def _unpad_nested_embeds(embeds: torch.Tensor, mask: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + """Convert a jagged nested tensor pair (embeds, mask) to dense padded tensors.""" + batch_size = embeds.size(0) + max_seq_len = max(embeds.offsets().diff()) + embed_dim = embeds.size(-1) + embeds = torch.nested.to_padded_tensor(embeds, padding=0, output_size=(batch_size, max_seq_len, embed_dim)) + mask = torch.nested.to_padded_tensor(mask, padding=0, output_size=(batch_size, max_seq_len)) + return embeds, mask + + def prepare_model_inputs(self, micro_batch: TensorDict, step: int): + """ + Extract and pre-process universal tensors, then delegate architecture-specific + input construction to the registered DiffusionModelBase subclass. + + Handles common tensor extraction and nested-embed unpadding here. + Architecture-specific input dict construction is delegated to the model registry. 
+ """ + latents = micro_batch["all_latents"] + timesteps = micro_batch["all_timesteps"] + prompt_embeds = micro_batch["prompt_embeds"] + prompt_embeds_mask = micro_batch["prompt_embeds_mask"] + negative_prompt_embeds = micro_batch["negative_prompt_embeds"] + negative_prompt_embeds_mask = micro_batch["negative_prompt_embeds_mask"] + + if prompt_embeds.is_nested: + prompt_embeds, prompt_embeds_mask = self._unpad_nested_embeds(prompt_embeds, prompt_embeds_mask) + + if isinstance(negative_prompt_embeds, torch.Tensor) and negative_prompt_embeds.is_nested: + negative_prompt_embeds, negative_prompt_embeds_mask = self._unpad_nested_embeds( + negative_prompt_embeds, negative_prompt_embeds_mask + ) + + return prepare_model_inputs( + module=self.module, + model_config=self.model_config, + latents=latents, + timesteps=timesteps, + prompt_embeds=prompt_embeds, + prompt_embeds_mask=prompt_embeds_mask, + negative_prompt_embeds=negative_prompt_embeds, + negative_prompt_embeds_mask=negative_prompt_embeds_mask, + micro_batch=micro_batch, + step=step, + ) + + def prepare_model_outputs(self, output, micro_batch: TensorDict): + log_prob, prev_sample_mean, std_dev_t = output + return { + "log_probs": log_prob, + "prev_sample_mean": prev_sample_mean, + "std_dev_t": std_dev_t, + } + + def forward_step(self, micro_batch: TensorDict, loss_function, forward_only, step): + model_inputs, negative_model_inputs = self.prepare_model_inputs(micro_batch=micro_batch, step=step) + raw_output = forward_and_sample_previous_step( + module=self.module, + scheduler=self.scheduler, + model_config=self.model_config, + model_inputs=model_inputs, + negative_model_inputs=negative_model_inputs, + scheduler_inputs=micro_batch, + step=step, + ) + model_output = self.prepare_model_outputs(output=raw_output, micro_batch=micro_batch) + + if loss_function is not None: + data = tu.get_tensordict( + { + "old_log_probs": micro_batch["old_log_probs"][:, step], + "advantages": micro_batch["advantages"][:, step], + 
"response_mask": micro_batch["response_mask"][:, step], + }, + ) + tu.assign_non_tensor( + data, + gradient_accumulation_steps=tu.get_non_tensor_data( + micro_batch, "gradient_accumulation_steps", default=None + ), + ) + + if micro_batch.get("ref_log_prob", None) is not None: + data["ref_log_prob"] = micro_batch["ref_log_prob"][:, step] + + if micro_batch.get("ref_prev_sample_mean", None) is not None: + data["ref_prev_sample_mean"] = micro_batch["ref_prev_sample_mean"][:, step] + + loss, metrics = loss_function(model_output=model_output, data=data, dp_group=self.get_data_parallel_group()) + else: + assert forward_only, "forward_only must be True when loss_function is None" + loss = torch.tensor(1.0, device=device_name) + metrics = {} + + output = { + "model_output": model_output, + "loss": loss.detach().item(), + "metrics": metrics, + } + + return loss, output + + def optimizer_zero_grad(self): + """ + Zero gradients and enforce FSDP grad-clipping logic. + """ + self.optimizer.zero_grad() + + def optimizer_step(self): + """ + Clip gradients, skip update if non-finite, and step optimizer. + + Returns: + grad_norm (float): Norm of gradients before clipping. 
+ """ + assert self.optimizer_config.clip_grad is not None + + if isinstance(self.module, FSDP): + grad_norm = self.module.clip_grad_norm_(self.optimizer_config.clip_grad) + elif isinstance(self.module, FSDPModule): + grad_norm = fsdp2_clip_grad_norm_(self.module.parameters(), max_norm=self.optimizer_config.clip_grad) + else: + grad_norm = torch.nn.utils.clip_grad_norm_( + self.module.parameters(), max_norm=self.optimizer_config.clip_grad + ) + + if isinstance(grad_norm, DTensor): + grad_norm = grad_norm.full_tensor() + + # if grad_norm is not finite, skip the update + if not torch.isfinite(grad_norm): + print(f"WARN: grad_norm is not finite: {grad_norm}") + self.optimizer.zero_grad() + else: + self.optimizer.step() + return grad_norm.item() + + def lr_scheduler_step(self): + """ + Advance FSDP scheduler and return updated learning rate. + """ + self.lr_scheduler.step() + lr = self.lr_scheduler.get_last_lr()[0] # only return the first group + return lr + + def to(self, device: str, model: bool = True, optimizer: bool = True, grad: bool = True): + """ + Move FSDP model and/or optimizer to CPU or GPU with offload support. + Note that this function executes irrespective of offload config. 
It serves as manual control + """ + super().to(device=device, model=model, optimizer=optimizer, grad=grad) + + if self.engine_config.forward_only: + # force cpu_offload + return + + device_name = get_device_name() + + assert device in (device_name, "cpu") + if device == device_name: + if model: + load_fsdp_model_to_gpu(self.module) + if optimizer and self.optimizer is not None: + load_fsdp_optimizer(self.optimizer, device) + gc.collect() + elif device == "cpu": + if model: + offload_fsdp_model_to_cpu(self.module) + if optimizer and self.optimizer is not None: + offload_fsdp_optimizer(self.optimizer) + else: + raise ValueError(f"Invalid device type: {device}") + + def save_checkpoint( + self, + local_path: str, + hdfs_path: Optional[str] = None, + global_step: int = 0, + max_ckpt_to_keep: Optional[int] = None, + **kwargs, + ) -> None: + """ + Save FSDP checkpoint, handling parameter offload as needed. + """ + origin_module_device = next(self.module.parameters()).device.type + if self._is_offload_param or origin_module_device == "cpu": + load_fsdp_model_to_gpu(self.module) + + self.checkpoint_manager.save_checkpoint( + local_path=local_path, hdfs_path=hdfs_path, global_step=global_step, max_ckpt_to_keep=max_ckpt_to_keep + ) + + torch.distributed.barrier() + if self._is_offload_param: + offload_fsdp_model_to_cpu(self.module) + + def load_checkpoint( + self, local_path: str, hdfs_path: Optional[str] = None, del_local_after_load: int = True, **kwargs + ) -> None: + """ + Load FSDP checkpoint, restoring parameters and optimizer state. 
+ """ + import torch + + if self._is_offload_param: + load_fsdp_model_to_gpu(self.module) + + self.checkpoint_manager.load_checkpoint( + local_path=local_path, hdfs_path=hdfs_path, del_local_after_load=del_local_after_load + ) + + torch.distributed.barrier() + if self._is_offload_param: + offload_fsdp_model_to_cpu(self.module) + + if self._is_offload_optimizer: + offload_fsdp_optimizer(self.optimizer) + + def get_per_tensor_param(self, layered_summon=False, base_sync_done=False, **kwargs): + log_gpu_memory_usage("Before load_fsdp_model_to_gpu", logger=logger) + + load_fsdp_model_to_gpu(self.module) + + log_gpu_memory_usage("After load_fsdp_model_to_gpu", logger=logger) + + peft_config = None + + peft_model = getattr(self.module, "_fsdp_wrapped_module", self.module) + if hasattr(peft_model, "peft_config"): # LoRA + peft_config = peft_model.peft_config.get("default", None) + params = collect_lora_params( + module=self.module, + layered_summon=layered_summon, + base_sync_done=base_sync_done, + is_diffusers=True, + ) + if not base_sync_done: + params = {replace_lora_wrapper(k, peft_config): v for k, v in params.items()} + else: + params = self.module.state_dict() + + params = convert_weight_keys(params, getattr(self.module, "_fsdp_wrapped_module", self.module)) + + log_gpu_memory_usage("Before offload_fsdp_model_to_cpu", logger=logger) + if self._is_offload_param: + offload_fsdp_model_to_cpu(self.module) + log_gpu_memory_usage("After offload_fsdp_model_to_cpu", logger=logger) + + if peft_config is not None and base_sync_done: + per_tensor_param = params.items() + else: + device = get_device_id() # used when fsdp2 set cpu_offload_policy + # TODO: cast fp32 to bf16 to reduce weight sync overhead, need more fine-grained control, e.g MoE gate + per_tensor_param = ( + ( + name, + param.to(device, non_blocking=True).full_tensor().to(torch.bfloat16, non_blocking=True) + if isinstance(param, DTensor) + else param, + ) + for name, param in params.items() + ) + + # we need to 
add the prefix to make it compatible with rollout engine + per_tensor_param = ((f"transformer.{name}", tensor) for name, tensor in per_tensor_param) + peft_config_dict = peft_config.to_dict() if peft_config is not None else None + return per_tensor_param, peft_config_dict + + @contextmanager + def disable_adapter(self): + from diffusers.loaders import PeftAdapterMixin + + assert isinstance(self.module, PeftAdapterMixin) + try: + self.module.disable_adapters() + yield + finally: + self.module.enable_adapters() + + +class EngineEvalModeCtx(BaseEngineCtx): + def __init__(self, engine: DiffusersFSDPEngine, **kwargs): + super().__init__(engine=engine, mode="eval", **kwargs) + + def __enter__(self): + assert isinstance(self.engine, DiffusersFSDPEngine) + super().__enter__() + self.engine.module.eval() + + def __exit__(self, exc_type, exc_value, traceback): + assert isinstance(self.engine, DiffusersFSDPEngine) + + # https://pytorch.org/docs/stable/notes/fsdp.html#fsdp-notes + # unshard the root FSDP module + if self.engine.engine_config.fsdp_size > 1: + if fsdp_version(self.engine.module) == 1: + self.engine.module._handle.reshard(True) + elif fsdp_version(self.engine.module) == 2: + self.engine.module.reshard() + + super().__exit__(exc_type, exc_value, traceback) + + +class EngineTrainModeCtx(BaseEngineCtx): + def __init__(self, engine: DiffusersFSDPEngine, **kwargs): + super().__init__(engine=engine, mode="train", **kwargs) + + def __enter__(self): + assert isinstance(self.engine, DiffusersFSDPEngine) + super().__enter__() + self.engine.module.train() + + def __exit__(self, exc_type, exc_value, traceback): + assert isinstance(self.engine, DiffusersFSDPEngine) + self.engine.optimizer_zero_grad() + super().__exit__(exc_type, exc_value, traceback) diff --git a/verl/workers/engine/fsdp/transformer_impl.py b/verl/workers/engine/fsdp/transformer_impl.py index 312e59850f0..f8506206c7c 100644 --- a/verl/workers/engine/fsdp/transformer_impl.py +++ 
b/verl/workers/engine/fsdp/transformer_impl.py @@ -231,14 +231,31 @@ def _build_module(self): with init_context(), warnings.catch_warnings(): warnings.simplefilter("ignore") - auto_class = get_hf_auto_model_class(hf_config=self.model_config.hf_config) + if self.model_config.model_type == "language_model": + auto_class = get_hf_auto_model_class(hf_config=self.model_config.hf_config) + + module = auto_class.from_pretrained( + pretrained_model_name_or_path=self.model_config.local_path, + torch_dtype=torch_dtype, + config=self.model_config.hf_config, + trust_remote_code=self.model_config.trust_remote_code, + ) + else: + from verl.utils.model import load_valuehead_model - module = auto_class.from_pretrained( - pretrained_model_name_or_path=self.model_config.local_path, - torch_dtype=torch_dtype, - config=self.model_config.hf_config, - trust_remote_code=self.model_config.trust_remote_code, - ) + assert self.model_config.model_type == "value_model", ( + f"Unsupported model type: {self.model_config.model_type}" + ) + self.model_config.hf_config.num_labels = 1 + self.model_config.hf_config.classifier_dropout = 0.0 + self.model_config.hf_config.hidden_dropout = "0" + self.model_config.hf_config.summary_dropout_prob = 0.0 + module = load_valuehead_model( + local_path=self.model_config.local_path, + torch_dtype=torch_dtype, + model_config=self.model_config.hf_config, + trust_remote_code=self.model_config.trust_remote_code, + ) use_liger = self.model_config.use_liger # Apply Liger kernel; disable fused_linear_cross_entropy (conflicts with verl's forward patching) @@ -1164,7 +1181,7 @@ def prepare_model_outputs(self, output, output_args, micro_batch: TensorDict, lo if use_remove_padding: if hasattr(self.module, "v_head"): # For trl.AutoModelForCausalLMWithValueHead - values_rmpad = output[2].squeeze(0).unsqueeze(-1) + values_rmpad = output[2].squeeze(0) else: values_rmpad = output.logits values_rmpad = values_rmpad.squeeze(0) # (total_nnz, 1) @@ -1189,7 +1206,7 @@ def 
prepare_model_outputs(self, output, output_args, micro_batch: TensorDict, lo # For trl.AutoModelForCausalLMWithValueHead values = output[2] else: - values = output.logits + values = output.logits.squeeze(-1) if pad_mode == DatasetPadMode.NO_PADDING: cu_seqlens = input_ids.offsets() diff --git a/verl/workers/engine/megatron/transformer_impl.py b/verl/workers/engine/megatron/transformer_impl.py index ae8ed7ddee6..31c4c314b6d 100644 --- a/verl/workers/engine/megatron/transformer_impl.py +++ b/verl/workers/engine/megatron/transformer_impl.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +import inspect import logging import os from functools import partial @@ -24,7 +26,7 @@ from tensordict import TensorDict import verl.utils.torch_functional as verl_F -from verl.models.mcore import get_mcore_forward_fused_no_padding_fn, get_mcore_weight_converter +from verl.models.mcore import get_mcore_weight_converter from verl.trainer.config import CheckpointConfig from verl.utils import tensordict_utils as tu from verl.utils.checkpoint.megatron_checkpoint_manager import MegatronCheckpointManager @@ -127,6 +129,18 @@ def _init_device_mesh(self): if mpu.is_initialized(): return + extra_args = dict() + + if self.engine_config.dynamic_context_parallel: + assert "dynamic_context_parallel" in inspect.signature(mpu.initialize_model_parallel).parameters, ( + "dynamic_context_parallel is not supported in your megatron version, " + + "please update your megatron version to the latest version" + ) + assert self.engine_config.max_seqlen_per_dp_cp_rank is not None, ( + "max_seqlen_per_dp_cp_rank is required when dynamic_context_parallel is enabled" + ) + extra_args["dynamic_context_parallel"] = self.engine_config.dynamic_context_parallel + mpu.initialize_model_parallel( tensor_model_parallel_size=self.engine_config.tensor_model_parallel_size, 
pipeline_model_parallel_size=self.engine_config.pipeline_model_parallel_size, @@ -136,6 +150,7 @@ def _init_device_mesh(self): expert_model_parallel_size=self.engine_config.expert_model_parallel_size, expert_tensor_parallel_size=self.engine_config.expert_tensor_parallel_size, nccl_communicator_config_path=None, + **extra_args, ) def _build_tf_config(self): @@ -148,8 +163,6 @@ def _build_tf_config(self): self.dtype = PrecisionType.to_dtype(self.param_dtype) override_transformer_config = mapping_string_to_attn_backend({**self.engine_config.override_transformer_config}) - if self.enable_routing_replay: - override_transformer_config["enable_routing_replay"] = True self.provider = None self.vanilla_bridge = self.engine_config.vanilla_mbridge @@ -158,6 +171,14 @@ def _build_tf_config(self): from verl.models.mcore.mbridge import AutoBridge bridge = AutoBridge.from_config(self.model_config.hf_config, dtype=self.param_dtype) + if self.engine_config.dynamic_context_parallel: + override_transformer_config["max_seqlen_per_dp_cp_rank"] = self.engine_config.max_seqlen_per_dp_cp_rank + # note(baiyan): we must set the transformer_config.dynamic_context_parallel to False + # because of the bad coupling design in Megatron-LM + # https://github.com/xiaoyao0115/Megatron-LM/blob/88733ab6614e3e91b9d095172f41e7d8b5d8e9d4/megatron/core/pipeline_parallel/dynamic_cp_schedule.py#L552-L553 + # but it does not affect the functionality of dynamic CP, so we can use it to avoid the coupling. 
+ override_transformer_config["dynamic_context_parallel"] = False + override_transformer_config["context_parallel_size"] = mpu.get_data_parallel_world_size() bridge.set_extra_args(**override_transformer_config) tf_config = bridge.config tf_config.fp16 = self.param_dtype == torch.float16 @@ -206,6 +227,9 @@ def _build_tf_config(self): for key, value in override_transformer_config.items(): setattr(provider, key, value) + if self.enable_routing_replay: + provider.enable_routing_replay = True + provider.finalize() self.provider = provider tf_config = None # Will be set after model creation @@ -214,6 +238,13 @@ def _build_tf_config(self): if not self.bridge: self.weight_converter = get_mcore_weight_converter(self.model_config.hf_config, self.dtype) + # Set enable_routing_replay directly on tf_config instead of passing through + # override_transformer_config, because dataclass subclasses like MLATransformerConfig + # generate their own __init__ and don't inherit the patched TransformerConfig.__init__ + # that accepts this kwarg. 
+ if self.enable_routing_replay and tf_config is not None: + tf_config.enable_routing_replay = True + if torch.distributed.get_rank() == 0: if tf_config is not None: print(f"TF config: {tf_config}") @@ -229,25 +260,20 @@ def _build_megatron_module(self): from verl.utils.megatron_utils import McoreModuleWrapperConfig, make_megatron_module from verl.utils.model import print_model_size - # TODO: add more cases - is_value_model = ( - "ForTokenClassification" in self.model_config.architectures[0] - or "ForSequenceClassification" in self.model_config.architectures[0] - ) - - self.is_value_model = is_value_model - + self.is_value_model = self.model_config.model_type == "value_model" if self.engine_config.forward_only: wrap_with_ddp = False else: wrap_with_ddp = True wrap_config = McoreModuleWrapperConfig( - is_value_model=is_value_model, # actor is not value model - share_embeddings_and_output_weights=self.model_config.share_embeddings_and_output_weights, + is_value_model=self.is_value_model, wrap_with_ddp=wrap_with_ddp, use_distributed_optimizer=self.engine_config.use_distributed_optimizer, ) + if self.is_value_model: + self.model_config.hf_config.tie_word_embeddings = False + module, updated_tf_config = make_megatron_module( wrap_config=wrap_config, tf_config=self.tf_config, @@ -263,7 +289,9 @@ def _build_megatron_module(self): print(f"module: {len(module)}") if self.engine_config.use_dist_checkpointing: - load_mcore_dist_weights(module, self.engine_config.dist_checkpointing_path, is_value_model=is_value_model) + load_mcore_dist_weights( + module, self.engine_config.dist_checkpointing_path, is_value_model=self.is_value_model + ) else: if self.vanilla_bridge: self.bridge.load_weights(module, self.model_config.local_path) @@ -494,9 +522,15 @@ def to(self, device: str, model: bool = True, optimizer: bool = True, grad: bool raise ValueError(f"Invalid device type: {device}") def get_data_parallel_rank(self): + if self.engine_config.dynamic_context_parallel: + # in order to 
let every dp-cp group have full data to split, we set dp=1
+        return 0
         return mpu.get_data_parallel_rank()
 
     def get_data_parallel_size(self):
+        if self.engine_config.dynamic_context_parallel:
+            # in order to let every dp-cp group have full data to split, we set dp=1
+            return 1
         return mpu.get_data_parallel_world_size()
 
     def get_data_parallel_group(self):
@@ -762,6 +796,18 @@ def forward_step(
         self, batch_iter: Iterator[TensorDict], model, logits_processor_func, postprocess_micro_batch_func
     ):
         batch: TensorDict = next(batch_iter)
+
+        if self.engine_config.dynamic_context_parallel:
+            # split the batch and give the sub-batches to each dp-cp group
+            from verl.utils.megatron_utils import dynamic_cp_split_batch
+
+            batch = dynamic_cp_split_batch(
+                batch=batch,
+                engine_config=self.engine_config,
+                dp_size=mpu.get_data_parallel_world_size(),
+                dp_rank=mpu.get_data_parallel_rank(),
+            )
+
         batch = batch.to(get_device_id())
         use_fused_kernels = tu.get_non_tensor_data(batch, key="use_fused_kernels", default=False)
         calculate_entropy = tu.get_non_tensor_data(batch, key="calculate_entropy", default=False)
@@ -771,6 +817,7 @@ def forward_step(
         model_inputs = self.prepare_model_inputs(batch)
         input_ids = model_inputs["input_ids"]
         multi_modal_inputs = model_inputs["multi_modal_inputs"]
+        local_cp_size = tu.get_non_tensor_data(data=batch, key="local_cp_size", default=None)
         loss_mask = model_inputs["loss_mask"]
 
         unwrapped_model = unwrap_model(model)
@@ -793,8 +840,6 @@ def forward_step(
             else:
                 raise NotImplementedError(f"Pad mode {pad_mode} is not supported for megatron engine")
 
-        from verl.models.mcore import get_mcore_forward_no_padding_fn
-
        if use_fused_kernels:
            if not self.engine_config.use_remove_padding:
                logger.warning_once(
@@ -813,7 +858,9 @@ def forward_step(
             temperature_value = float(temperature)
 
         if use_fused_kernels:
-            fused_forward_fn = get_mcore_forward_fused_no_padding_fn(self.model_config.hf_config)
+            from verl.models.mcore import get_mcore_forward_fused_model_engine_fn
+
+            
fused_forward_fn = get_mcore_forward_fused_model_engine_fn(self.model_config.hf_config) output = fused_forward_fn( model=model, input_ids=input_ids, @@ -830,8 +877,9 @@ def forward_step( temperature = temperature.to(torch.float32) assert temperature.shape[0] == input_ids.shape[0] temperature = verl_F.expand_as_nested(temperature, input_ids) # (bsz, j1) + from verl.models.mcore import get_mcore_engine_forward_fn - forward_fn = get_mcore_forward_no_padding_fn(self.model_config.hf_config) + forward_fn = get_mcore_engine_forward_fn(self.model_config.hf_config) data_format = "thd" if self.engine_config.use_remove_padding else "bshd" def logits_processor(logits, label, temperature): @@ -874,6 +922,7 @@ def logits_processor(logits, label, temperature): pad_token_id=self.model_config.tokenizer.pad_token_id, data_format=data_format, mtp_enable_train=self.model_config.mtp.enable and self.model_config.mtp.enable_train, + local_cp_size=local_cp_size, ) # Router replay: record routing decisions for R2 mode @@ -886,15 +935,19 @@ def logits_processor(logits, label, temperature): for router in router_instance_list: router.set_router_replay_action(RouterReplayAction.REPLAY_BACKWARD) - return output, partial(postprocess_micro_batch_func, data=batch) + return output, partial(postprocess_micro_batch_func, data=batch, local_cp_size=local_cp_size) - def postprocess_micro_batch_func(self, output, data: TensorDict, forward_only: bool, loss_function): + def postprocess_micro_batch_func( + self, output, data: TensorDict, forward_only: bool, loss_function, local_cp_size=None + ): # For memory efficiency # We move calculation of entropy to compute_log_probs, forward_only == True device = data["input_ids"].device model_output = self.prepare_model_outputs(output, data) if loss_function is not None: + # TODO(baiyan): How to support hybrid context parallel with dp_group, + # now the dp_group is not used, so just leave it as is, but what if we need to use it? 
loss, metrics = loss_function(model_output=model_output, data=data, dp_group=self.get_data_parallel_group()) # scale loss by num_micro_batch because megatron will scale loss # by n_micro_batch inside pp schedule @@ -904,6 +957,16 @@ def postprocess_micro_batch_func(self, output, data: TensorDict, forward_only: b loss = torch.tensor(1.0, device=device) scaled_loss = loss metrics = {} + if local_cp_size is not None: + # aggregate model_output by DP-CP groups + from verl.utils.megatron_utils import dynamic_cp_merge_output + + model_output = dynamic_cp_merge_output( + model_output, + dp_size=mpu.get_data_parallel_world_size(), + dp_rank=mpu.get_data_parallel_rank(), + local_cp_size=local_cp_size, + ) output = { "model_output": model_output, @@ -925,9 +988,9 @@ def forward_step(self, batch_iter, model, logits_processor_func, postprocess_mic input_ids = model_inputs["input_ids"] multi_modal_inputs = model_inputs["multi_modal_inputs"] - from verl.models.mcore import get_mcore_forward_no_padding_fn + from verl.models.mcore import get_mcore_engine_forward_fn - forward_fn = get_mcore_forward_no_padding_fn(self.model_config.hf_config) + forward_fn = get_mcore_engine_forward_fn(self.model_config.hf_config) output = forward_fn( model, @@ -936,7 +999,7 @@ def forward_step(self, batch_iter, model, logits_processor_func, postprocess_mic value_model=True, vision_model=hasattr(self.model_config.hf_config, "vision_config"), pad_token_id=self.model_config.tokenizer.pad_token_id, - enable_mtp=self.model_config.mtp.enable_train, + data_format="thd" if self.engine_config.use_remove_padding else "bshd", ) return output, partial(postprocess_micro_batch_func, data=batch) diff --git a/verl/workers/engine/mindspeed/__init__.py b/verl/workers/engine/mindspeed/__init__.py index 63a83da7872..503de19deb9 100644 --- a/verl/workers/engine/mindspeed/__init__.py +++ b/verl/workers/engine/mindspeed/__init__.py @@ -12,6 +12,6 @@ # See the License for the specific language governing permissions and # 
limitations under the License. -from .transformer_impl import MindspeedEngineWithLMHead +from .transformer_impl import MindspeedEngineWithLMHead, MindSpeedLLMEngineWithLMHead -__all__ = ["MindspeedEngineWithLMHead"] +__all__ = ["MindspeedEngineWithLMHead", "MindSpeedLLMEngineWithLMHead"] diff --git a/verl/workers/engine/mindspeed/transformer_impl.py b/verl/workers/engine/mindspeed/transformer_impl.py index d369e60c319..acbdcf63a66 100644 --- a/verl/workers/engine/mindspeed/transformer_impl.py +++ b/verl/workers/engine/mindspeed/transformer_impl.py @@ -21,10 +21,21 @@ repatch = None from verl.trainer.config import CheckpointConfig -from verl.workers.config import HFModelConfig, McoreEngineConfig, McoreOptimizerConfig +from verl.utils.megatron.router_replay_patch import RouterReplay +from verl.utils.model import print_model_size +from verl.workers.config import ( + HFModelConfig, + McoreEngineConfig, + McoreOptimizerConfig, + MindSpeedEngineConfig, +) from ..base import EngineRegistry from ..megatron import MegatronEngineWithLMHead +from .utils import ( + apply_patch, + gpt_model_provider, +) logger = logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) @@ -54,3 +65,50 @@ def _init_device_mesh(self): repatch_config["context_parallel_size"] = self.engine_config.context_parallel_size repatch(repatch_config) super()._init_device_mesh() + + +@EngineRegistry.register(model_type="language_model", backend="mindspeed_llm", device="npu") +class MindSpeedLLMEngineWithLMHead(MegatronEngineWithLMHead): + def __init__( + self, + model_config: HFModelConfig, + engine_config: MindSpeedEngineConfig, + optimizer_config: McoreOptimizerConfig, + checkpoint_config: CheckpointConfig, + ): + super().__init__(model_config, engine_config, optimizer_config, checkpoint_config) + + def _init_device_mesh(self): + apply_patch(self.model_config, self.engine_config, self.optimizer_config) + super()._init_device_mesh() + + def _build_megatron_module(self): + 
is_value_model = (
+            "ForTokenClassification" in self.model_config.architectures[0]
+            or "ForSequenceClassification" in self.model_config.architectures[0]
+        )
+
+        self.is_value_model = is_value_model
+
+        import torch.distributed
+        from megatron.core.enums import ModelType
+        from megatron.training.training import get_model
+
+        # For forward_only, we don't need optimizer, lr_scheduler, checkpoint_manager
+        if self.engine_config.forward_only:
+            module = get_model(gpt_model_provider, ModelType.encoder_or_decoder, wrap_with_ddp=False)
+            return module
+
+        module = get_model(gpt_model_provider, ModelType.encoder_or_decoder, wrap_with_ddp=True)
+        if self.vanilla_bridge:
+            self.bridge.load_weights(module, self.model_config.local_path)
+        else:
+            raise ValueError(f"vanilla_bridge should be true now, but got {self.vanilla_bridge}")
+
+        if torch.distributed.get_rank() == 0:
+            print_model_size(module[0])
+
+        if self.enable_routing_replay:
+            print(f"routing replay layers: {len(RouterReplay.router_instances)}")
+
+        return module
diff --git a/verl/workers/engine/mindspeed/utils.py b/verl/workers/engine/mindspeed/utils.py
new file mode 100644
index 00000000000..9b600e30aba
--- /dev/null
+++ b/verl/workers/engine/mindspeed/utils.py
@@ -0,0 +1,223 @@
+# Copyright (c) 2026 Huawei Technologies Co., Ltd. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ + +import argparse + +import torch + +from verl.workers.config import HFModelConfig, McoreOptimizerConfig, MindSpeedEngineConfig + + +def get_base_mcore_config_from_model_config(model_config: HFModelConfig) -> dict: + """ + Create a base TransformerConfig with common parameters across different model architectures. + + Args: + model_config: HuggingFace model configuration + + Returns: + TransformerConfig with common parameters + """ + + hf_config = model_config.hf_config + base_config = { + "num_layers": hf_config.num_hidden_layers, + "hidden_size": hf_config.hidden_size, + "num_attention_heads": hf_config.num_attention_heads, + "num_query_groups": hf_config.num_key_value_heads, + "ffn_hidden_size": hf_config.intermediate_size, + "attention_dropout": hf_config.attention_dropout, + "hidden_dropout": getattr(hf_config, "hidden_dropout", 0.0), + "kv_channels": getattr(hf_config, "head_dim", None), + "norm_topk_prob": getattr(hf_config, "norm_topk_prob", False), + "layernorm_epsilon": hf_config.rms_norm_eps, + "max_position_embeddings": hf_config.max_position_embeddings, + "tie_word_embeddings": hf_config.tie_word_embeddings, + "torch_dtype": hf_config.torch_dtype, + "bf16": hf_config.dtype is torch.bfloat16, + "rotary_base": int(hf_config.rope_theta), + "num_experts": getattr(hf_config, "num_experts", None), + "moe_router_topk": getattr(hf_config, "num_experts_per_tok", None), + "moe_ffn_hidden_size": getattr(hf_config, "moe_intermediate_size", None), + "padded_vocab_size": hf_config.vocab_size, + "make_vocab_size_divisible_by": 1, + "untie_embeddings_and_output_weights": True, + } + + tokenizer_config = { + "tokenizer_name_or_path": model_config.tokenizer_path, + "tokenizer_type": "PretrainedFromHF", + } + base_config.update(tokenizer_config) + return base_config + + +def get_base_mcore_config_from_engine_config(engine_config: MindSpeedEngineConfig) -> dict: + """ + Create a base TransformerConfig with common parameters across different model architectures. 
+
+    Args:
+        engine_config: mindspeed engine configuration
+
+    Returns:
+        dict of config overrides with common engine parameters
+    """
+
+    base_config = {
+        "tensor_model_parallel_size": engine_config.tensor_model_parallel_size,
+        "expert_model_parallel_size": engine_config.expert_model_parallel_size,
+        "expert_tensor_parallel_size": engine_config.expert_tensor_parallel_size,
+        "pipeline_model_parallel_size": engine_config.pipeline_model_parallel_size,
+        "virtual_pipeline_model_parallel_size": engine_config.virtual_pipeline_model_parallel_size,
+        "context_parallel_size": engine_config.context_parallel_size,
+        "sequence_parallel": engine_config.sequence_parallel,
+        "use_distributed_optimizer": engine_config.use_distributed_optimizer,
+        "seed": engine_config.seed,
+    }
+    if engine_config.strategy == "mindspeed_llm":
+        base_config.update(engine_config.llm_kwargs)
+    elif engine_config.strategy == "mindspeed_mm":
+        base_config.update(engine_config.mm_kwargs)
+    return base_config
+
+
+def get_base_mcore_config_from_optim_config(optim_config: McoreOptimizerConfig) -> dict:
+    """
+    Create a base config dict with common optimizer parameters across different model architectures.
+ + Args: + optim_config: megatron optimizer configuration + + Returns: + TransformerConfig with common parameters + """ + + base_config = { + "lr": optim_config.lr, + "lr_decay_style": optim_config.lr_decay_style, + "min_lr": optim_config.min_lr, + "weight_decay": optim_config.weight_decay, + "lr_warmup_fraction": optim_config.lr_warmup_steps_ratio, + "clip_grad": optim_config.clip_grad, + "adam_beta1": optim_config.betas[0], + "adam_beta2": optim_config.betas[1], + } + + base_config.update(optim_config.override_optimizer_config) + return base_config + + +def set_global_config(config): + from megatron.training.arguments import parse_args, validate_args + from megatron.training.global_vars import set_global_variables + + args = parse_args(ignore_unknown_args=True) + for key, value in config.items(): + setattr(args, key, value) + + validate_args(args) + try: + set_global_variables(args) + except AssertionError: + print("megatron args already set") + + +def add_mcore_arguments(all_config: dict) -> dict: + mcore_config_dict = {} + mcore_config_list = [] + for key, value in all_config.items(): + if value is None: + continue + mcore_config_dict[key] = value + if isinstance(value, bool): + mcore_config_list.append(f"--{key.replace('_', '-')}") + + from megatron.training.arguments import add_megatron_arguments + + parser = argparse.ArgumentParser(description="Megatron-LM Arguments", allow_abbrev=False) + parser = add_megatron_arguments(parser) + args, _ = parser.parse_known_args(mcore_config_list) + return {**vars(args), **mcore_config_dict} + + +def apply_patch(model_config, engine_config, optimizer_config): + model_config = get_base_mcore_config_from_model_config(model_config) + optimizer_config = get_base_mcore_config_from_optim_config(optimizer_config) + engine_config = get_base_mcore_config_from_engine_config(engine_config) + all_config = {**model_config, **optimizer_config, **engine_config} + mcore_config = add_mcore_arguments(all_config) + from 
mindspeed_llm.tasks.megatron_adaptor_v2 import repatch
+
+    repatch(mcore_config)
+    set_global_config(mcore_config)
+
+
+def gpt_model_provider(pre_process=True, post_process=True):
+    """
+    Builds the model.
+
+    If you set the use_mcore_models to True, it will return the mcore GPT model and if not the legacy GPT model.
+
+    Args:
+        pre_process (bool, optional): Set to true if you need to compute embeddings. Defaults to True.
+        post_process (bool, optional): Set to true if you want to compute output logits/loss.
+        Defaults to True.
+
+
+    Returns:
+        Union[GPTModel, megatron.legacy.model.GPTModel]: The returned model
+    """
+    from megatron.core.models.gpt import GPTModel
+    from megatron.core.models.gpt.gpt_layer_specs import (
+        get_gpt_layer_local_spec,
+        get_gpt_layer_with_transformer_engine_spec,
+    )
+    from megatron.core.transformer.spec_utils import import_module
+    from megatron.training import get_args
+    from megatron.training.arguments import core_transformer_config_from_args
+
+    args = get_args()
+    use_te = args.transformer_impl == "transformer_engine"
+    # Experimental loading arguments from configs
+    config = core_transformer_config_from_args(args)
+
+    if args.spec is not None:
+        transformer_layer_spec = import_module(args.spec)
+    else:
+        if use_te:
+            transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(
+                args.num_experts, args.moe_grouped_gemm, qk_layernorm=args.qk_layernorm
+            )
+        else:
+            transformer_layer_spec = get_gpt_layer_local_spec(
+                args.num_experts, args.moe_grouped_gemm, qk_layernorm=args.qk_layernorm
+            )
+
+    model = GPTModel(
+        config=config,
+        transformer_layer_spec=transformer_layer_spec,
+        vocab_size=args.padded_vocab_size,
+        max_sequence_length=args.max_position_embeddings,
+        pre_process=pre_process,
+        post_process=post_process,
+        fp16_lm_cross_entropy=args.fp16_lm_cross_entropy,
+        parallel_output=True,
+        share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights,
+        
position_embedding_type=args.position_embedding_type, + rotary_percent=args.rotary_percent, + seq_len_interpolation_factor=args.rotary_seq_len_interpolation_factor, + ) + + return model diff --git a/verl/workers/engine_workers.py b/verl/workers/engine_workers.py index 32456ec8f7e..e68bd84e6cf 100644 --- a/verl/workers/engine_workers.py +++ b/verl/workers/engine_workers.py @@ -32,9 +32,10 @@ from verl.trainer.distillation import distillation_ppo_loss, is_distillation_enabled from verl.utils import tensordict_utils as tu from verl.utils.config import omega_conf_to_dataclass -from verl.utils.device import get_device_name, set_expandable_segments +from verl.utils.device import get_device_name, is_npu_available, set_expandable_segments from verl.utils.distributed import initialize_global_process_group_ray, set_numa_affinity from verl.utils.flops_counter import FlopsCounter +from verl.utils.import_utils import import_external_libs from verl.utils.memory_utils import aggressive_empty_cache from verl.utils.metric.utils import Metric from verl.utils.profiler import DistProfiler, DistProfilerExtension, ProfilerConfig, log_gpu_memory_usage @@ -83,6 +84,11 @@ def __init__(self, config: TrainingWorkerConfig): from verl.workers.engine import BaseEngine, EngineRegistry + # TODO(jhz): Switch to `set_expandable_segments` when the torch_npu library + # supports `torch.npu.memory._set_allocator_settings` + if is_npu_available: + os.environ["PYTORCH_NPU_ALLOC_CONF"] = "expandable_segments:True" + initialize_global_process_group_ray(timeout_second=None) set_numa_affinity() @@ -108,8 +114,8 @@ def __init__(self, config: TrainingWorkerConfig): # we use the one defined in model # TODO: this is not elegant and should refactor later - self.engine_config.use_remove_padding = self.model_config.use_remove_padding - self.engine_config.use_fused_kernels = self.model_config.use_fused_kernels + self.engine_config.use_remove_padding = self.model_config.get("use_remove_padding", False) + 
self.engine_config.use_fused_kernels = self.model_config.get("use_fused_kernels", False) # TODO: add DistProfilerExtension self.profiler_config = self.config.profiler_config @@ -122,6 +128,7 @@ def __init__(self, config: TrainingWorkerConfig): self, DistProfiler(rank=self.rank, config=self.profiler_config, tool_config=self.profiler_tool_config) ) + self.model_config.model_type = self.config.model_type self.engine: BaseEngine = EngineRegistry.new( model_type=self.config.model_type, backend=self.engine_config.strategy, @@ -138,7 +145,11 @@ def __init__(self, config: TrainingWorkerConfig): is_collect=self.engine.is_mp_src_rank_with_outputs(), ) - self.flops_counter = FlopsCounter(self.model_config.hf_config) + if hasattr(self.model_config, "hf_config"): + self.flops_counter = FlopsCounter(self.model_config.hf_config) + else: + # for Diffusion models, FlopsCounter is not supported yet. + self.flops_counter = None self.loss_fn = None @@ -209,7 +220,7 @@ def _postprocess_output(self, output, *, global_token_num, delta_time, forward_o flatten_v = [sublist[0] for sublist in v] # sublist should be single element final_metrics[k] = sum(flatten_v) / len(flatten_v) # compute mfu - if global_token_num is not None: + if global_token_num is not None and self.flops_counter is not None: estimated_flops, promised_flops = self.flops_counter.estimate_flops( global_token_num, delta_time, images_seqlens=images_seqlens ) @@ -271,13 +282,19 @@ def train_mini_batch(self, data: TensorDict) -> TensorDict: for batch_idx, mini_batch_td in enumerate(dataloader): # add global token num - global_token_num = mini_batch_td["input_ids"].offsets().diff().tolist() # (total_nnz,) - # allgather from dp rank - global_token_num_output = [None] * self.engine.get_data_parallel_size() - torch.distributed.all_gather_object( - global_token_num_output, global_token_num, self.engine.get_data_parallel_group() - ) - global_token_num = [x for xs in global_token_num_output for x in xs] + if "input_ids" in 
mini_batch_td: + global_token_num = mini_batch_td["input_ids"].offsets().diff().tolist() # (total_nnz,) + # allgather from dp rank + global_token_num_output = [None] * torch.distributed.get_world_size( + self.engine.get_data_parallel_group() + ) + torch.distributed.all_gather_object( + global_token_num_output, global_token_num, self.engine.get_data_parallel_group() + ) + global_token_num = [x for xs in global_token_num_output for x in xs] + else: + global_token_num = None + tu.assign_non_tensor( mini_batch_td, global_token_num=NonTensorData(global_token_num), @@ -317,7 +334,7 @@ def train_batch(self, data: TensorDict) -> TensorDict: # inject engineering parameters if not specified default_keys = dict( - use_remove_padding=self.model_config.use_remove_padding, + use_remove_padding=self.model_config.get("use_remove_padding", False), use_dynamic_bsz=self.engine_config.use_dynamic_bsz, max_token_len_per_gpu=self.engine_config.max_token_len_per_gpu, micro_batch_size_per_gpu=self.engine_config.micro_batch_size_per_gpu, @@ -371,7 +388,7 @@ def infer_batch(self, data: TensorDict) -> TensorDict: images_seqlens = tu.get(data, key="images_seqlens", default=None) default_keys = dict( - use_remove_padding=self.model_config.use_remove_padding, + use_remove_padding=self.model_config.get("use_remove_padding", False), use_dynamic_bsz=self.engine_config.use_dynamic_bsz, max_token_len_per_gpu=self.engine_config.infer_max_token_len_per_gpu, micro_batch_size_per_gpu=self.engine_config.infer_micro_batch_size_per_gpu, @@ -449,7 +466,7 @@ def __init__( omega_profiler_config = config.ref.get("profiler", {}) profiler_config = omega_conf_to_dataclass(omega_profiler_config, dataclass_type=ProfilerConfig) - if omega_profiler_config.get("tool", None) in ["npu", "nsys", "torch", "torch_memory"]: + if omega_profiler_config.get("tool", None) in ["npu", "nsys", "torch", "torch_memory", "precision_debugger"]: tool_config = omega_conf_to_dataclass( omega_profiler_config.get("tool_config", 
{}).get(omega_profiler_config.get("tool")) ) @@ -598,6 +615,9 @@ def init_model(self): backend = checkpoint_engine_config.backend bucket_size = checkpoint_engine_config.update_weights_bucket_megabytes << 20 engine_kwargs = checkpoint_engine_config.engine_kwargs.get(backend, {}) + # If custom_backend_module is set, import it so plugins can register + # in CheckpointEngineRegistry before the backend is instantiated. + import_external_libs(checkpoint_engine_config.custom_backend_module or None) self.checkpoint_engine = CheckpointEngineRegistry.new( backend, is_master=(torch.distributed.get_rank() == 0), bucket_size=bucket_size, **engine_kwargs ) @@ -665,36 +685,34 @@ async def update_weights(self, global_steps: int = None): await self.rollout.resume(tags=["weights"]) log_gpu_memory_usage("After resume weights", logger=logger) - # 2. get per tensor params from engine, this will load model to gpu + # 2. determine if we need a base weight sync (adapter path only) per_tensor_param, peft_config = self.actor.engine.get_per_tensor_param( layered_summon=self.layered_summon, base_sync_done=True ) - await self.rollout.update_weights( - per_tensor_param, peft_config=peft_config, base_sync_done=True, global_steps=global_steps - ) - do_lora_base_sync = False if not self.peft_merge and peft_config is not None: - # set sleep level for LoRA adapter weights only sync - # TODO: make this configurable so that users with small - # main memory can trade sync time to avoid OOM self.rollout.sleep_level = 1 + do_lora_base_sync = not self.base_sync_done - do_lora_base_sync = (not self.base_sync_done) or ( - self.rollout.sleep_level != 1 and self.config.rollout.free_cache_engine - ) - + # 3. 
sync weights: For SGLang, we need base first (when needed), then adapter/merged if do_lora_base_sync: - per_tensor_base_params, _ = self.actor.engine.get_per_tensor_param( + per_tensor_param_base, peft_config = self.actor.engine.get_per_tensor_param( layered_summon=self.layered_summon, base_sync_done=False ) - await self.rollout.update_weights(per_tensor_base_params, peft_config=peft_config, base_sync_done=False) + await self.rollout.update_weights( + per_tensor_param_base, peft_config=peft_config, base_sync_done=False, global_steps=global_steps + ) + + await self.rollout.update_weights( + per_tensor_param, peft_config=peft_config, base_sync_done=True, global_steps=global_steps + ) log_gpu_memory_usage("After update_weights", logger=logger) # 3. offload model to cpu - self.actor.engine.to("cpu", model=True, optimizer=False, grad=False) + if self.actor.engine.is_param_offload_enabled: + self.actor.engine.to("cpu", model=True, optimizer=False, grad=False) aggressive_empty_cache(force_sync=True) # 4. 
resume kv_cache diff --git a/verl/workers/fsdp_workers.py b/verl/workers/fsdp_workers.py index 7e6cd2df480..f052448dfe3 100644 --- a/verl/workers/fsdp_workers.py +++ b/verl/workers/fsdp_workers.py @@ -35,6 +35,8 @@ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP from torch.distributed.fsdp.api import FullStateDictConfig, ShardedStateDictConfig, StateDictType +from verl.utils.import_utils import deprecated + try: # for torch 2.5+ from torch.distributed.tensor import DTensor @@ -141,6 +143,7 @@ def get_vl_model_vision_tower(vl_model_instance): return None +@deprecated("legacy worker implementation is deprecated and will be removed in v0.8.0") class ActorRolloutRefWorker(Worker, DistProfilerExtension): """ This worker can be instantiated as a standalone actor or a standalone rollout or a standalone reference policy @@ -226,7 +229,7 @@ def __init__(self, config: DictConfig, role: str, **kwargs): # omega_profiler_config is DictConfig # profiler_config is a ProfilerConfig dataclass profiler_config = omega_conf_to_dataclass(omega_profiler_config, dataclass_type=ProfilerConfig) - if omega_profiler_config.get("tool", None) in ["npu", "nsys", "torch", "torch_memory"]: + if omega_profiler_config.get("tool", None) in ["npu", "nsys", "torch", "torch_memory", "precision_debugger"]: tool_config = omega_conf_to_dataclass( omega_profiler_config.get("tool_config", {}).get(omega_profiler_config.get("tool")) ) @@ -1306,12 +1309,13 @@ def dump_memory_snapshot(self, tag: str = "manual", sub_dir: str = None) -> None pass +@deprecated("legacy worker implementation is deprecated and will be removed in v0.8.0") class CriticWorker(Worker, DistProfilerExtension): def __init__(self, config: FSDPCriticConfig): Worker.__init__(self) omega_profiler_config = config.get("profiler", {}) profiler_config = omega_conf_to_dataclass(omega_profiler_config, dataclass_type=ProfilerConfig) - if omega_profiler_config.get("tool", None) in ["npu", "nsys", "torch", "torch_memory"]: + if 
omega_profiler_config.get("tool", None) in ["npu", "nsys", "torch", "torch_memory", "precision_debugger"]: tool_config = omega_conf_to_dataclass( omega_profiler_config.get("tool_config", {}).get(omega_profiler_config.get("tool")) ) diff --git a/verl/workers/megatron_workers.py b/verl/workers/megatron_workers.py index bc0fd1fafba..6693df70cfd 100644 --- a/verl/workers/megatron_workers.py +++ b/verl/workers/megatron_workers.py @@ -52,6 +52,7 @@ from verl.utils.distributed import set_numa_affinity from verl.utils.flops_counter import FlopsCounter from verl.utils.fs import copy_to_local +from verl.utils.import_utils import deprecated from verl.utils.megatron.router_replay_patch import RouterReplay, RouterReplayAction, apply_router_replay_patch from verl.utils.megatron_peft_utils import add_base_layer_suffix, build_peft_config_for_vllm from verl.utils.megatron_utils import ( @@ -103,6 +104,7 @@ def set_random_seed(seed, only_rollout=False): # os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8' +@deprecated("legacy worker implementation is deprecated and will be removed in v0.8.0") class MegatronWorker(Worker): def _init_hf_config_and_tf_config( self, @@ -261,6 +263,7 @@ def _init_hf_config_and_tf_config( ) +@deprecated("legacy worker implementation is deprecated and will be removed in v0.8.0") class ActorRolloutRefWorker(MegatronWorker, DistProfilerExtension): """ This worker can be instantiated as a standalone actor or a standalone rollout or a standalone reference policy @@ -346,7 +349,7 @@ def __init__(self, config: DictConfig, role: str, **kwargs): # omega_profiler_config is DictConfig # profiler_config is a ProfilerConfig dataclass profiler_config = omega_conf_to_dataclass(omega_profiler_config, dataclass_type=ProfilerConfig) - if omega_profiler_config.get("tool", None) in ["npu", "nsys", "torch", "torch_memory"]: + if omega_profiler_config.get("tool", None) in ["npu", "nsys", "torch", "torch_memory", "precision_debugger"]: tool_config = omega_conf_to_dataclass( 
omega_profiler_config.get("tool_config", {}).get(omega_profiler_config.get("tool")) ) @@ -1023,7 +1026,7 @@ def __init__(self, config: McoreCriticConfig): omega_profiler_config = config.get("profiler", {}) profiler_config = omega_conf_to_dataclass(omega_profiler_config, dataclass_type=ProfilerConfig) - if omega_profiler_config.get("tool", None) in ["npu", "nsys", "torch", "torch_memory"]: + if omega_profiler_config.get("tool", None) in ["npu", "nsys", "torch", "torch_memory", "precision_debugger"]: tool_config = omega_conf_to_dataclass( omega_profiler_config.get("tool_config", {}).get(omega_profiler_config.get("tool")) ) diff --git a/verl/workers/rollout/replica.py b/verl/workers/rollout/replica.py index 4f05fd1d1de..1a40d29af82 100644 --- a/verl/workers/rollout/replica.py +++ b/verl/workers/rollout/replica.py @@ -20,7 +20,7 @@ import ray from omegaconf import DictConfig -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict from ray.actor import ActorHandle from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup, ResourcePoolManager @@ -52,9 +52,11 @@ class TokenOutput(BaseModel): class DiffusionOutput(BaseModel): - diffusion_output: list[list[list[float]]] | list[list[list[list[float]]]] + model_config = ConfigDict(arbitrary_types_allowed=True) + + diffusion_output: Any """generated image tensor (CHW format) / video tensor (TCHW format)""" - log_probs: Optional[list[float]] = None + log_probs: Optional[Any] = None """logprobs of generated image/video""" stop_reason: Optional[str] = None """stop reason: 'completed', 'aborted', or None for unknown""" @@ -325,7 +327,10 @@ def _load_vllm(): def _load_vllm_omni(): - from verl.workers.rollout.vllm_rollout.vllm_omni_async_server import vLLMOmniReplica + try: + from verl.workers.rollout.vllm_rollout.vllm_omni_async_server import vLLMOmniReplica + except ImportError as err: + raise ImportError("vllm-omni rollout requires vllm-omni to be installed.") from err return 
vLLMOmniReplica diff --git a/verl/workers/rollout/sglang_rollout/async_sglang_server.py b/verl/workers/rollout/sglang_rollout/async_sglang_server.py index 90c4717916c..b504cb9db06 100644 --- a/verl/workers/rollout/sglang_rollout/async_sglang_server.py +++ b/verl/workers/rollout/sglang_rollout/async_sglang_server.py @@ -156,8 +156,6 @@ async def launch_server(self, master_address: str = None, master_port: int = Non assert master_address and master_port, "non-master node should provide master address and port" self._master_address = master_address self._master_port = master_port - else: - self._master_sock.close() engine_kwargs = self.config.get("engine_kwargs", {}).get("sglang", {}) or {} attention_backend = engine_kwargs.pop("attention_backend", None) @@ -323,15 +321,29 @@ async def wake_up(self): await self.tokenizer_manager.resume_memory_occupation(obj, None) await self.tokenizer_manager.flush_cache() + @property + def lora_as_adapter(self) -> bool: + return ( + self.model_config.lora_rank > 0 or self.model_config.lora.get("rank", 0) > 0 + ) and not self.model_config.lora.get("merge", False) + async def sleep(self): if self.node_rank != 0 or not self.config.free_cache_engine: return + # When using LoRA as adapter (merge=False), only release kv_cache — + # keep base weights in GPU so we only need to sync adapter deltas. + # Mirrors the vLLM sleep() pattern in vllm_async_server.py. 
+ if self.lora_as_adapter: + tags = ["kv_cache"] + else: + tags = ["kv_cache", "weights"] + if self.rollout_mode == RolloutMode.HYBRID: - obj = ReleaseMemoryOccupationReqInput(tags=["kv_cache", "weights"]) + obj = ReleaseMemoryOccupationReqInput(tags=tags) await self.tokenizer_manager.release_memory_occupation(obj, None) elif self.rollout_mode == RolloutMode.COLOCATED: - obj = ReleaseMemoryOccupationReqInput(tags=["kv_cache", "weights"]) + obj = ReleaseMemoryOccupationReqInput(tags=tags) await self.tokenizer_manager.release_memory_occupation(obj, None) elif self.rollout_mode == RolloutMode.STANDALONE: # In standalone mode, resume kv_cache if free_cache_engine is enabled diff --git a/verl/workers/rollout/sglang_rollout/sglang_rollout.py b/verl/workers/rollout/sglang_rollout/sglang_rollout.py index 4329bf346d4..0b7bcff6108 100644 --- a/verl/workers/rollout/sglang_rollout/sglang_rollout.py +++ b/verl/workers/rollout/sglang_rollout/sglang_rollout.py @@ -107,7 +107,8 @@ def __init__( device_mesh: DeviceMesh, replica_rank: int = -1, ): - if config.get("quantization", None) == "fp8": + super().__init__(config, model_config, device_mesh) + if self.config.get("quantization", None) == "fp8": import sglang from packaging import version @@ -121,8 +122,7 @@ def __init__( "weight_block_size": [128, 128], } fp8_block_quant_kwargs = dict(FP8_BLOCK_QUANT_KWARGS) - model_config.hf_config.quantization_config = fp8_block_quant_kwargs - super().__init__(config, model_config, device_mesh) + self.model_config.hf_config.quantization_config = fp8_block_quant_kwargs self._engine: AsyncHttpServerAdapter = None rank = int(os.environ["RANK"]) diff --git a/tests/trainer/config/__init__.py b/verl/workers/rollout/trtllm_rollout/__init__.py similarity index 91% rename from tests/trainer/config/__init__.py rename to verl/workers/rollout/trtllm_rollout/__init__.py index 1ce90c5eb35..d828409b82e 100644 --- a/tests/trainer/config/__init__.py +++ b/verl/workers/rollout/trtllm_rollout/__init__.py @@ 
-1,4 +1,4 @@ -# Copyright 2024 Bytedance Ltd. and/or its affiliates +# Copyright 2026 Bytedance Ltd. and/or its affiliates # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/verl/workers/rollout/trtllm_rollout/trtllm_rollout.py b/verl/workers/rollout/trtllm_rollout/trtllm_rollout.py index 4f6b3a88eee..606cb4b019f 100644 --- a/verl/workers/rollout/trtllm_rollout/trtllm_rollout.py +++ b/verl/workers/rollout/trtllm_rollout/trtllm_rollout.py @@ -22,7 +22,7 @@ import pickle import threading from contextlib import asynccontextmanager -from typing import Any, Generator, Optional +from typing import Any, AsyncGenerator, Generator, Optional import aiohttp import pynvml @@ -36,6 +36,7 @@ from verl.utils.net_utils import is_valid_ipv6_address from verl.workers.config import HFModelConfig, RolloutConfig from verl.workers.rollout.base import BaseRollout +from verl.workers.rollout.utils import ensure_async_iterator logger = logging.getLogger(__file__) logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) @@ -411,7 +412,10 @@ async def update_weights_from_ipc_handles(self, device_handles): await asyncio.to_thread(dist.barrier, group=self.hybrid_device_mesh["exclude_dp"].get_group()) async def update_weights( - self, weights: Generator[tuple[str, torch.Tensor], None, None], global_steps: int = None, **kwargs + self, + weights: Generator[tuple[str, torch.Tensor], None, None] | AsyncGenerator[tuple[str, torch.Tensor], None], + global_steps: int = None, + **kwargs, ): assert self.hybrid_device_mesh is not None, "hybrid_device_mesh is not set" @@ -453,11 +457,11 @@ async def flush(): cur_available_bytes = total_available_bytes cur_handles = [] - # Non-VLM never supports partial loading. For VLM, leader queries and broadcasts to all + # For non-VLM, always use partial loading. 
For VLM, leader queries and broadcasts to all # ranks in the DP replica; use get_global_rank(group, 0) since each replica has a different leader. is_vlm = self.model_config.hf_config is not None and hasattr(self.model_config.hf_config, "vision_config") if not is_vlm: - supports_partial_loading = False + supports_partial_loading = True else: exclude_dp_group = self.hybrid_device_mesh["exclude_dp"].get_group() spl_tensor = torch.zeros(1, dtype=torch.int32) @@ -468,7 +472,7 @@ async def flush(): await asyncio.to_thread(dist.broadcast, spl_tensor, src=leader_global_rank, group=exclude_dp_group) supports_partial_loading = bool(spl_tensor.item()) - for name, param in weights: + async for name, param in ensure_async_iterator(weights): if supports_partial_loading: size_in_bytes = param.element_size() * param.numel() if size_in_bytes > cur_available_bytes: diff --git a/verl/workers/rollout/utils.py b/verl/workers/rollout/utils.py index efff4af2852..6a5022b6f3c 100644 --- a/verl/workers/rollout/utils.py +++ b/verl/workers/rollout/utils.py @@ -93,7 +93,11 @@ def qwen2_5_vl_dedup_image_tokens(prompt_ids: list[int], processor): <|vision_start|><|image_pad|><|vision_end|> ``` """ - if processor is not None and "Qwen2VLImageProcessor" in processor.image_processor.__class__.__name__: + if ( + processor is not None + and hasattr(processor, "image_processor") + and "Qwen2VLImageProcessor" in processor.image_processor.__class__.__name__ + ): prompt_ids = np.array(prompt_ids) mask = np.ones(len(prompt_ids), dtype=bool) is_value = (prompt_ids == processor.image_token_id) | (prompt_ids == processor.video_token_id) diff --git a/verl/workers/rollout/vllm_rollout/utils.py b/verl/workers/rollout/vllm_rollout/utils.py index 437ffe9f0b2..fd2de7e2391 100644 --- a/verl/workers/rollout/vllm_rollout/utils.py +++ b/verl/workers/rollout/vllm_rollout/utils.py @@ -35,7 +35,7 @@ from verl.utils.vllm_omni import OmniTensorLoRARequest, VLLMOmniHijack _VLLM_OMNI_AVAILABLE = True -except ImportError: # 
vllm_omni and related utilities are optional +except (ImportError, RuntimeError): # optional stack; ImportError if missing, RuntimeError e.g. diffusers/transformers CustomPipelineWorkerExtension = None # type: ignore[assignment] OmniTensorLoRARequest = None # type: ignore[assignment] VLLMOmniHijack = None # type: ignore[assignment] diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py index 371b2f5cb0e..bfa8dd5d7a4 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py @@ -282,8 +282,8 @@ async def launch_server(self, master_address: str = None, master_port: int = Non served_model_name = served_model_name.split("/")[-1] args["served_model_name"] = served_model_name - # mtp - if self.config.mtp.enable and self.config.mtp.enable_rollout: + # mtp (None for diffusion models; only LLM models use speculative decoding) + if self.config.mtp is not None and self.config.mtp.enable and self.config.mtp.enable_rollout: speculative_config = { "method": self.config.mtp.method, "num_speculative_tokens": self.config.mtp.num_speculative_tokens, @@ -384,13 +384,8 @@ async def launch_server(self, master_address: str = None, master_port: int = Non # 3. launch server if self.node_rank == 0: - self._master_sock.close() - self._dp_rpc_sock.close() - self._dp_master_sock.close() await self.run_server(server_args) else: - # TODO: avoid connect before master_sock close - await asyncio.sleep(3) await self.run_headless(server_args) async def run_server(self, args: argparse.Namespace): @@ -796,6 +791,7 @@ def _apply_quantization(self) -> tuple[Optional[str], dict]: """Process quantization config. 
Returns (quantization_str, hf_overrides).""" quantization = self.config.quantization hf_overrides = {} + if is_torch_npu_available(check_device=False): from verl.utils.vllm.npu_vllm_patch import check_vllm_ascend_before_server_launch @@ -827,7 +823,7 @@ def _apply_quantization(self) -> tuple[Optional[str], dict]: hf_overrides["quantization_config"] = quantization_config_dict elif quantization is not None: # Handle other quantization methods (fp8, torchao) - _SUPPORTED_QUANTIZATION = ["fp8", "torchao"] + _SUPPORTED_QUANTIZATION = ["fp8", "torchao", "ascend"] if quantization not in _SUPPORTED_QUANTIZATION: raise ValueError(f"Currently only support {_SUPPORTED_QUANTIZATION} quantization, got: {quantization}") diff --git a/verl/workers/rollout/vllm_rollout/vllm_omni_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_omni_async_server.py index c6afe28e568..a5f24de7eaf 100644 --- a/verl/workers/rollout/vllm_rollout/vllm_omni_async_server.py +++ b/verl/workers/rollout/vllm_rollout/vllm_omni_async_server.py @@ -187,14 +187,14 @@ async def generate( final_res = output assert final_res is not None - diffusion_output = (self._to_tensor(final_res.images[0]) / 255.0).tolist() + diffusion_output = self._to_tensor(final_res.images[0]).float() / 255.0 # Extract extra data from custom_output (populated by DiffusionEngine) mm_output = final_res.custom_output or {} if sampling_params.get("logprobs", False): all_log_probs = mm_output.get("all_log_probs") - log_probs = all_log_probs[0].tolist() if all_log_probs is not None else None + log_probs = all_log_probs[0] if all_log_probs is not None else None else: log_probs = None diff --git a/verl/workers/utils/losses.py b/verl/workers/utils/losses.py index 9273088e31a..7b344407a7d 100644 --- a/verl/workers/utils/losses.py +++ b/verl/workers/utils/losses.py @@ -17,6 +17,7 @@ from tensordict import TensorDict from verl.trainer.ppo.core_algos import agg_loss, compute_value_loss, get_policy_loss_fn, kl_penalty +from 
verl.trainer.ppo.diffusion_algos import kl_penalty_image from verl.utils import tensordict_utils as tu from verl.utils.dataset.dataset_utils import DatasetPadMode from verl.utils.metric import AggregationType, Metric @@ -54,7 +55,6 @@ def sft_loss(config: ActorConfig, model_output, data: TensorDict, dp_group=None) return loss, {} - def ppo_loss(config: ActorConfig, model_output, data: TensorDict, dp_group=None): """Computes ppo loss from model output (log_prob, entropy, values, etc. ) and old_log_probs from data.""" log_prob = no_padding_2_padding(model_output["log_probs"], data) @@ -175,3 +175,55 @@ def value_loss(config: CriticConfig, model_output, data: TensorDict, dp_group=No ) return vf_loss, metrics + + +def diffusion_loss(config: ActorConfig, model_output, data: TensorDict, dp_group=None): + """Compute loss for diffusion model""" + log_prob = model_output["log_probs"] + + config.global_batch_info["loss_scale_factor"] = config.loss_scale_factor + + metrics = {} + + response_mask = data["response_mask"].to(bool) + # compute policy loss + old_log_prob = data["old_log_probs"] + advantages = data["advantages"] + + loss_agg_mode = config.loss_agg_mode + + loss_mode = config.policy_loss.get("loss_mode", "flow_grpo") + + policy_loss_fn = get_policy_loss_fn(loss_mode) + pg_loss, pg_metrics = policy_loss_fn( + old_log_prob=old_log_prob, + log_prob=log_prob, + advantages=advantages, + response_mask=response_mask, + loss_agg_mode=loss_agg_mode, + config=config, + rollout_is_weights=None, + ) + + pg_metrics = Metric.from_dict(pg_metrics, aggregation=AggregationType.MEAN) + + metrics.update(pg_metrics) + metrics["actor/pg_loss"] = Metric(value=pg_loss, aggregation=AggregationType.MEAN) + policy_loss = pg_loss + + if config.use_kl_loss: + ref_prev_sample_mean = data["ref_prev_sample_mean"] + prev_sample_mean = model_output["prev_sample_mean"] + std_dev_t = model_output["std_dev_t"] + kl_loss = kl_penalty_image( + prev_sample_mean=prev_sample_mean, 
ref_prev_sample_mean=ref_prev_sample_mean, std_dev_t=std_dev_t + ) + + policy_loss += kl_loss * config.kl_loss_coef + metrics["kl_loss"] = Metric(value=kl_loss, aggregation=AggregationType.MEAN) + metrics["kl_coef"] = config.kl_loss_coef + + gradient_accumulation_steps = tu.get_non_tensor_data(data, "gradient_accumulation_steps", default=None) + policy_loss = policy_loss / gradient_accumulation_steps + + return policy_loss, metrics diff --git a/verl/workers/utils/padding.py b/verl/workers/utils/padding.py index 1adb37f5f1d..c7137d05573 100644 --- a/verl/workers/utils/padding.py +++ b/verl/workers/utils/padding.py @@ -141,3 +141,43 @@ def no_padding_2_padding(tensor: torch.Tensor, data: TensorDict) -> torch.Tensor output = torch.stack(response_list, dim=0) return output + + +def embeds_padding_2_no_padding(data: TensorDict) -> TensorDict: + """ + Convert TensorDict from prompt embeds with padding to no-padding format. + For diffusion model training only. + + Currently we expect the prompt embedding mask to be [1111000...] format, + which means the valid tokens are continuous and start from the left. 
+ + Args: + data: TensorDict with "prompt_embeds", "prompt_embeds_mask", + "negative_prompt_embeds", "negative_prompt_embeds_mask" + + Returns: + data: TensorDict with + - Tensor includes NestedTensors "prompt_embeds", "prompt_embeds_mask", + "negative_prompt_embeds", "negative_prompt_embeds_mask" + """ + + def _to_nested(embeds: torch.Tensor, mask: torch.Tensor): + """Strip padding from (bs, seq_len, dim) embeds using the boolean mask and return nested tensors.""" + embeds_list, mask_list = [], [] + for i in range(mask.shape[0]): + curr_mask = mask[i].bool() + embeds_list.append(embeds[i, curr_mask, :]) + mask_list.append(curr_mask[curr_mask]) + return ( + torch.nested.as_nested_tensor(embeds_list, layout=torch.jagged), + torch.nested.as_nested_tensor(mask_list, layout=torch.jagged), + ) + + data["prompt_embeds"], data["prompt_embeds_mask"] = _to_nested(data["prompt_embeds"], data["prompt_embeds_mask"]) + + if isinstance(data.get("negative_prompt_embeds", None), torch.Tensor): + data["negative_prompt_embeds"], data["negative_prompt_embeds_mask"] = _to_nested( + data["negative_prompt_embeds"], data["negative_prompt_embeds_mask"] + ) + + return data From 8349c49ccf5c90d4700143a289037f4e0167ee38 Mon Sep 17 00:00:00 2001 From: aleskeymalakhov11 Date: Wed, 8 Apr 2026 17:30:51 +0300 Subject: [PATCH 3/8] fix docs --- verl/trainer/config/rollout/rollout.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/verl/trainer/config/rollout/rollout.yaml b/verl/trainer/config/rollout/rollout.yaml index e53d3cbb62c..c2c56c10e93 100644 --- a/verl/trainer/config/rollout/rollout.yaml +++ b/verl/trainer/config/rollout/rollout.yaml @@ -424,13 +424,28 @@ quantization_config_file: null # MTP configuration, reuse model configuration mtp: ${oc.select:actor_rollout_ref.model.mtp, null} +# Speculative decoding configuration for vLLM rollout using an external draft model. 
speculative_decoding: + + # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs _target_: verl.workers.config.rollout.SpeculativeDecodingConfig + + # Whether to enable inference-only speculative decoding for vLLM rollout enable: False + + # Speculative decoding method supported by vLLM, e.g. eagle or eagle3 method: eagle3 + + # Number of speculative decoding steps num_steps: 1 + + # Number of draft tokens proposed by the draft model num_draft_tokens: 4 + + # Path to the draft model used for speculative decoding draft_model_path: null + + # Tensor parallel size for the draft model, should be 1 or match tensor_model_parallel_size draft_tensor_parallel_size: 1 # QAT configuration (inherited from actor's engine config) From 81e23381d4db8a11abf5d1ddfea5bbca35574169 Mon Sep 17 00:00:00 2001 From: aleskeymalakhov11 Date: Wed, 8 Apr 2026 17:35:23 +0300 Subject: [PATCH 4/8] remove config --- config/plain.yaml | 129 ---------------------------------------------- 1 file changed, 129 deletions(-) delete mode 100644 config/plain.yaml diff --git a/config/plain.yaml b/config/plain.yaml deleted file mode 100644 index fe0c6d14d73..00000000000 --- a/config/plain.yaml +++ /dev/null @@ -1,129 +0,0 @@ -hydra: - searchpath: - - file://verl/trainer/config - -defaults: - - ppo_megatron_trainer - - _self_ - -algorithm: - adv_estimator: grpo - use_kl_in_reward: false - -data: - train_files: /from_s3/dataset/train.parquet - val_files: /from_s3/dataset/test.parquet - train_batch_size: 128 - max_prompt_length: 512 - max_response_length: 2048 - filter_overlong_prompts: true - truncation: error - return_raw_chat: true - -actor_rollout_ref: - model: - path: /from_s3/models - use_fused_kernels: false # true - use_remove_padding: false - trust_remote_code: true - - actor: - use_dynamic_bsz: false - ppo_max_token_len_per_gpu: 2048 - ppo_mini_batch_size: 16 - ppo_micro_batch_size_per_gpu: 1 - optim: - lr: 1e-6 - use_kl_loss: false - kl_loss_coef: 0.001 - 
kl_loss_type: low_var_kl - entropy_coeff: 0 - - # USE_LEGACY_WORKER_IMPL=enable path from the bash snippet - router_replay: - # mode: R3 - mode: disabled - record_file: null # Path for recording routing decisions - replay_file: null # Path for replaying recorded decisions - - megatron: - param_offload: true - optimizer_offload: true - grad_offload: true - pipeline_model_parallel_size: 1 - # tensor_model_parallel_size: 4 - # expert_model_parallel_size: 4 - # expert_tensor_parallel_size: 2 - - tensor_model_parallel_size: 1 - expert_model_parallel_size: 1 - expert_tensor_parallel_size: 1 - - use_mbridge: true - override_transformer_config: - # moe_enable_deepep: true - # moe_token_dispatcher_type: flex - apply_rope_fusion: false # true - bias_activation_fusion: false # true - # moe_router_dtype: fp32 - recompute_method: uniform - recompute_granularity: full - recompute_num_layers: 1 - gradient_accumulation_fusion: false # true - moe_permute_fusion: false - -# Using a large number of experts (e.g. >=32) without fp32 routing. Consider enabling moe_router_dtype for better numerical stability. [repeated 7x across cluster] -# UserWarning: moe_enable_deepep is deprecated.Please use --moe-flex-dispatcher-backend=deepep instead. 
[repeated 7x across cluster] - - rollout: - name: vllm - mode: async - temperature: 1.0 - tensor_model_parallel_size: 1 - gpu_memory_utilization: 0.75 - n: 8 - - enable_chunked_prefill: true # not tested yet with true - enable_prefix_caching: true # not tested yet with true - enforce_eager: false # not tested yet with false - - max_num_batched_tokens: 1024 - skip_tokenizer_init: true - enable_rollout_routing_replay: false # not tested yet with true - log_prob_use_dynamic_bsz: false - log_prob_micro_batch_size_per_gpu: 1 - log_prob_max_token_len_per_gpu: 2048 - - load_format: "safetensors" - - disable_log_stats: false - prometheus: - enable: true - - speculative_decoding: - enable: true - - method: EAGLE3 - - num_steps: 1 - num_draft_tokens: 4 - - draft_model_path: /from_s3/eagle_vllm - draft_tensor_parallel_size: 1 - - -trainer: - critic_warmup: 0 - logger: ["console", "clearml"] - project_name: verl_megatron_moe - experiment_name: cuda-graphs-eagle3-4tokens-qwen3-lighteval-MATH - nnodes: 1 - n_gpus_per_node: 8 - save_freq: -1 - test_freq: 10 - total_training_steps: 50000 - balance_batch: false - use_legacy_worker_impl: enable - val_before_train: false - log_val_generations: 10 - # rollout_data_dir: /from_s3/train_rollout_data/ From 2731f389b4a6d808d93371f660872e015b29ca5d Mon Sep 17 00:00:00 2001 From: aleskeymalakhov11 Date: Wed, 8 Apr 2026 17:48:13 +0300 Subject: [PATCH 5/8] fix assert --- verl/workers/config/rollout.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/verl/workers/config/rollout.py b/verl/workers/config/rollout.py index d055c6e07fb..d3c34c90341 100644 --- a/verl/workers/config/rollout.py +++ b/verl/workers/config/rollout.py @@ -362,9 +362,9 @@ def __post_init__(self): stacklevel=2, ) - if ( - self.speculative_decoding.draft_tensor_parallel_size != 1 - or self.speculative_decoding.draft_tensor_parallel_size != self.tensor_model_parallel_size + if not ( + self.speculative_decoding.draft_tensor_parallel_size == 
self.tensor_model_parallel_size + or self.speculative_decoding.draft_tensor_parallel_size == 1 ): raise ValueError( f"draft_tensor_parallel_size={self.speculative_decoding.draft_tensor_parallel_size} " From 4c3396b3bce6347ce923f48132a34a5397e4eb79 Mon Sep 17 00:00:00 2001 From: aleskeymalakhov11 Date: Wed, 8 Apr 2026 18:47:33 +0300 Subject: [PATCH 6/8] fix async requests --- verl/experimental/agent_loop/agent_loop.py | 4 +- .../agent_loop/prometheus_utils.py | 38 ++++++++++++++----- verl/workers/rollout/vllm_rollout/utils.py | 8 +++- 3 files changed, 38 insertions(+), 12 deletions(-) diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index 7dd814e1ef6..c16bf631a6a 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -1151,7 +1151,7 @@ async def generate_sequences(self, prompts: DataProto) -> DataProto: spec_before = None if self.rollout_config.name == "vllm" and self.rollout_config.speculative_decoding.enable: try: - spec_before = read_spec_decoding_metrics_from_prometheus(self.server_addresses) + spec_before = await read_spec_decoding_metrics_from_prometheus(self.server_addresses) except Exception as e: print(f"speculative decoding unavailable: {e}") @@ -1174,7 +1174,7 @@ async def generate_sequences(self, prompts: DataProto) -> DataProto: if spec_before is not None: try: - spec_after = read_spec_decoding_metrics_from_prometheus(self.server_addresses) + spec_after = await read_spec_decoding_metrics_from_prometheus(self.server_addresses) spec_delta = {key: spec_after[key] - spec_before[key] for key in spec_before} acceptance_rate = ( spec_delta["num_accepted_tokens"] / spec_delta["num_draft_tokens"] diff --git a/verl/experimental/agent_loop/prometheus_utils.py b/verl/experimental/agent_loop/prometheus_utils.py index a11d050d3d7..167d2530791 100644 --- a/verl/experimental/agent_loop/prometheus_utils.py +++ b/verl/experimental/agent_loop/prometheus_utils.py @@ 
-12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
+import asyncio
 import logging
 import os
 
@@ -110,7 +110,7 @@ def reload_prometheus(port):
         logger.error(f"Failed to update Prometheus configuration: {e}")
 
 
-def read_spec_decoding_metrics_from_prometheus(server_adresses: list[str]) -> dict[str, float]:
+def _read_spec_decoding_metrics_from_prometheus_for_address(address: str) -> dict[str, float]:
     import requests
     from prometheus_client.parser import text_string_to_metric_families
 
@@ -123,11 +123,31 @@ def read_spec_decoding_metrics_from_prometheus(server_adresses: list[str]) -> di
     session = requests.Session()
     session.trust_env = False
 
-    for address in server_adresses:
-        metrics_text = session.get(f"http://{address}/metrics", timeout=5).text
-        for family in text_string_to_metric_families(metrics_text):
-            for sample in family.samples:
-                key = metric_name_to_key.get(sample.name)
-                if key is not None:
-                    totals[key] += float(sample.value)
+    metrics_text = session.get(f"http://{address}/metrics", timeout=5).text
+    for family in text_string_to_metric_families(metrics_text):
+        for sample in family.samples:
+            key = metric_name_to_key.get(sample.name)
+            if key is not None:
+                totals[key] += float(sample.value)
+    return totals
+
+
+async def read_spec_decoding_metrics_from_prometheus(server_addresses: list[str]) -> dict[str, float]:
+    totals = {
+        "num_drafts": 0.0,
+        "num_draft_tokens": 0.0,
+        "num_accepted_tokens": 0.0,
+    }
+
+    results = await asyncio.gather(
+        *[
+            asyncio.to_thread(_read_spec_decoding_metrics_from_prometheus_for_address, address)
+            for address in server_addresses
+        ]
+    )
+
+    for metrics in results:
+        for key, value in metrics.items():
+            totals[key] += value
+    return totals
 
diff --git a/verl/workers/rollout/vllm_rollout/utils.py b/verl/workers/rollout/vllm_rollout/utils.py
index fd2de7e2391..12bf30b17ce 100644
--- a/verl/workers/rollout/vllm_rollout/utils.py
+++ 
b/verl/workers/rollout/vllm_rollout/utils.py @@ -320,7 +320,13 @@ def __new__(cls, **kwargs): return super().__new__(cls) - def update_weights_from_ipc(self, peft_config: dict = None, base_sync_done=False, use_shm: bool = False): + def update_weights_from_ipc( + self, + peft_config: dict = None, + base_sync_done=False, + use_shm: bool = False, + use_speculative_decoding: bool = False, + ): """Update the weights of the rollout model.""" from verl.workers.rollout.vllm_rollout.bucketed_weight_transfer import BucketedWeightReceiver From b91f6da18692f6a53e60ef9bf7ebd3b22014e9ff Mon Sep 17 00:00:00 2001 From: aleskeymalakhov11 Date: Wed, 8 Apr 2026 20:28:20 +0300 Subject: [PATCH 7/8] add warning --- verl/workers/config/rollout.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/verl/workers/config/rollout.py b/verl/workers/config/rollout.py index d3c34c90341..4f89cdc452b 100644 --- a/verl/workers/config/rollout.py +++ b/verl/workers/config/rollout.py @@ -372,6 +372,20 @@ def __post_init__(self): "tensor_parallel_size={self.tensor_model_parallel_size} " ) + if self.speculative_decoding.method.lower() in {"eagle", "eagle3"} and ( + self.enable_chunked_prefill or self.enable_prefix_caching or not self.enforce_eager + ): + warnings.warn( + "vLLM speculative decoding with EAGLE/EAGLE3 may regress throughput when " + "enable_chunked_prefill=True, enable_prefix_caching=True, or enforce_eager=False. 
" + "Overriding to enable_chunked_prefill=False, enable_prefix_caching=False, " + "and enforce_eager=True for now.", + stacklevel=2, + ) + self.enable_chunked_prefill = False + self.enable_prefix_caching = False + self.enforce_eager = True + if self.speculative_decoding.enable and self.mtp.enable_rollout: raise ValueError("Use either speculative_decoding or mtp, but not both simultaneously") From e2f472dc4a662588bc920ff8c1f82a69aa65ab1a Mon Sep 17 00:00:00 2001 From: aleskeymalakhov11 Date: Thu, 9 Apr 2026 19:03:31 +0300 Subject: [PATCH 8/8] add asserts, optional metric logging and other fixes --- verl/experimental/agent_loop/agent_loop.py | 6 ++- .../config/_generated_diffusion_trainer.yaml | 1 + .../_generated_ppo_megatron_trainer.yaml | 1 + .../_generated_ppo_torchtitan_trainer.yaml | 1 + .../config/_generated_ppo_trainer.yaml | 1 + .../config/_generated_ppo_veomni_trainer.yaml | 1 + verl/trainer/config/rollout/rollout.yaml | 4 ++ verl/workers/config/rollout.py | 37 +++++++++++++++---- 8 files changed, 44 insertions(+), 8 deletions(-) diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py index c16bf631a6a..bd9ccf4c378 100644 --- a/verl/experimental/agent_loop/agent_loop.py +++ b/verl/experimental/agent_loop/agent_loop.py @@ -1149,7 +1149,11 @@ async def generate_sequences(self, prompts: DataProto) -> DataProto: await self.teacher_model_manager.wake_up() spec_before = None - if self.rollout_config.name == "vllm" and self.rollout_config.speculative_decoding.enable: + if ( + self.rollout_config.name == "vllm" + and self.rollout_config.speculative_decoding.enable + and self.rollout_config.speculative_decoding.log_metrics + ): try: spec_before = await read_spec_decoding_metrics_from_prometheus(self.server_addresses) except Exception as e: diff --git a/verl/trainer/config/_generated_diffusion_trainer.yaml b/verl/trainer/config/_generated_diffusion_trainer.yaml index 10640606970..e8a81951f2a 100644 --- 
a/verl/trainer/config/_generated_diffusion_trainer.yaml +++ b/verl/trainer/config/_generated_diffusion_trainer.yaml @@ -356,6 +356,7 @@ actor_rollout_ref: num_draft_tokens: 4 draft_model_path: null draft_tensor_parallel_size: 1 + log_metrics: false qat: ${oc.select:actor_rollout_ref.actor.fsdp_config.qat,${oc.select:actor_rollout_ref.actor.megatron.qat,null}} height: 512 width: 512 diff --git a/verl/trainer/config/_generated_ppo_megatron_trainer.yaml b/verl/trainer/config/_generated_ppo_megatron_trainer.yaml index aa74bb5cfd7..5de8b63ece3 100644 --- a/verl/trainer/config/_generated_ppo_megatron_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_megatron_trainer.yaml @@ -376,6 +376,7 @@ actor_rollout_ref: num_draft_tokens: 4 draft_model_path: null draft_tensor_parallel_size: 1 + log_metrics: false qat: ${oc.select:actor_rollout_ref.actor.fsdp_config.qat,${oc.select:actor_rollout_ref.actor.megatron.qat,null}} model: _target_: verl.workers.config.HFModelConfig diff --git a/verl/trainer/config/_generated_ppo_torchtitan_trainer.yaml b/verl/trainer/config/_generated_ppo_torchtitan_trainer.yaml index a6b8dd9ecf1..3339ca0a9b5 100644 --- a/verl/trainer/config/_generated_ppo_torchtitan_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_torchtitan_trainer.yaml @@ -343,6 +343,7 @@ actor_rollout_ref: num_draft_tokens: 4 draft_model_path: null draft_tensor_parallel_size: 1 + log_metrics: false qat: ${oc.select:actor_rollout_ref.actor.fsdp_config.qat,${oc.select:actor_rollout_ref.actor.megatron.qat,null}} model: _target_: verl.workers.config.HFModelConfig diff --git a/verl/trainer/config/_generated_ppo_trainer.yaml b/verl/trainer/config/_generated_ppo_trainer.yaml index 9d2ed2858e7..4176750402e 100644 --- a/verl/trainer/config/_generated_ppo_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_trainer.yaml @@ -352,6 +352,7 @@ actor_rollout_ref: num_draft_tokens: 4 draft_model_path: null draft_tensor_parallel_size: 1 + log_metrics: false qat: 
${oc.select:actor_rollout_ref.actor.fsdp_config.qat,${oc.select:actor_rollout_ref.actor.megatron.qat,null}} model: _target_: verl.workers.config.HFModelConfig diff --git a/verl/trainer/config/_generated_ppo_veomni_trainer.yaml b/verl/trainer/config/_generated_ppo_veomni_trainer.yaml index a30d1c012b2..2de4df5269d 100644 --- a/verl/trainer/config/_generated_ppo_veomni_trainer.yaml +++ b/verl/trainer/config/_generated_ppo_veomni_trainer.yaml @@ -322,6 +322,7 @@ actor_rollout_ref: num_draft_tokens: 4 draft_model_path: null draft_tensor_parallel_size: 1 + log_metrics: false qat: ${oc.select:actor_rollout_ref.actor.fsdp_config.qat,${oc.select:actor_rollout_ref.actor.megatron.qat,null}} model: _target_: verl.workers.config.HFModelConfig diff --git a/verl/trainer/config/rollout/rollout.yaml b/verl/trainer/config/rollout/rollout.yaml index c2c56c10e93..c472a6b7631 100644 --- a/verl/trainer/config/rollout/rollout.yaml +++ b/verl/trainer/config/rollout/rollout.yaml @@ -448,5 +448,9 @@ speculative_decoding: # Tensor parallel size for the draft model, should be 1 or match tensor_model_parallel_size draft_tensor_parallel_size: 1 + # Whether to scrape and log speculative decoding metrics such as acceptance rate. + # Disabled by default because this currently requires Prometheus requests around rollout batches. + log_metrics: false + # QAT configuration (inherited from actor's engine config) qat: ${oc.select:actor_rollout_ref.actor.fsdp_config.qat,${oc.select:actor_rollout_ref.actor.megatron.qat,null}} diff --git a/verl/workers/config/rollout.py b/verl/workers/config/rollout.py index 4f89cdc452b..4ef644acda7 100644 --- a/verl/workers/config/rollout.py +++ b/verl/workers/config/rollout.py @@ -11,6 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import logging +import os import warnings from dataclasses import dataclass, field from typing import Optional @@ -37,6 +39,9 @@ "SkipConfig", ] +logger = logging.getLogger(__name__) +logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) + @dataclass class SkipConfig(BaseConfig): @@ -176,6 +181,9 @@ class SpeculativeDecodingConfig(BaseConfig): draft_model_path: str | None = None draft_tensor_parallel_size: int = 1 + # Disabled by default because speculative decoding metrics currently require + # Prometheus scrapes around rollout batches, which can slightly reduce speedup. + log_metrics: bool = False @dataclass @@ -362,6 +370,12 @@ def __post_init__(self): stacklevel=2, ) + if self.load_format != "safetensors": + raise ValueError( + "vLLM speculative decoding currently requires rollout.load_format='safetensors', " + f"but got {self.load_format}" + ) + if not ( self.speculative_decoding.draft_tensor_parallel_size == self.tensor_model_parallel_size or self.speculative_decoding.draft_tensor_parallel_size == 1 @@ -369,22 +383,31 @@ def __post_init__(self): raise ValueError( f"draft_tensor_parallel_size={self.speculative_decoding.draft_tensor_parallel_size} " "cannot be other value than 1 or target model " - "tensor_parallel_size={self.tensor_model_parallel_size} " + f"tensor_parallel_size={self.tensor_model_parallel_size} " + ) + + if self.speculative_decoding.log_metrics: + msg = ( + "speculative_decoding.log_metrics defaults to False because enabling it can slightly " + "reduce rollout speedup. vLLM does not currently expose speculative decoding metrics " + "directly, so this path scrapes Prometheus around rollout batches." 
+ ) + logger.warning(msg) + warnings.warn( + msg, + stacklevel=2, ) if self.speculative_decoding.method.lower() in {"eagle", "eagle3"} and ( self.enable_chunked_prefill or self.enable_prefix_caching or not self.enforce_eager ): warnings.warn( - "vLLM speculative decoding with EAGLE/EAGLE3 may regress throughput when " + "vLLM speculative decoding with EAGLE/EAGLE3 can regress throughput under " "enable_chunked_prefill=True, enable_prefix_caching=True, or enforce_eager=False. " - "Overriding to enable_chunked_prefill=False, enable_prefix_caching=False, " - "and enforce_eager=True for now.", + "For better performance, set enable_chunked_prefill=False, " + "enable_prefix_caching=False, and enforce_eager=True.", stacklevel=2, ) - self.enable_chunked_prefill = False - self.enable_prefix_caching = False - self.enforce_eager = True if self.speculative_decoding.enable and self.mtp.enable_rollout: raise ValueError("Use either speculative_decoding or mtp, but not both simultaneously")