Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 103 additions & 0 deletions examples/ascend_examples/qwen3_4B_dpo_megatron.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# Qwen3-4B DPO example for Ascend NPU, training with Megatron (via MindSpeed).
# Scrape note: original indentation was lost; nesting below follows the ROLL
# worker-config convention (device_mapping/infer_batch_size at worker level) —
# TODO confirm against a pristine copy of this file.
defaults:
  - ../config/deepspeed_zero@_here_
  - ../config/deepspeed_zero2@_here_
  - ../config/deepspeed_zero3@_here_
  - ../config/deepspeed_zero3_cpuoffload@_here_

hydra:
  run:
    dir: .
  output_subdir: null

exp_name: "qwen3-4B-dpo-config"
seed: 42
logging_dir: ./output/logs
output_dir: ./output
system_envs:
  # Kept as the string '1' (quoted) — env vars are strings, not ints.
  USE_MODELSCOPE: '1'

checkpoint_config:
  type: file_system
  output_dir: ./ckpt

# FIX: was `track_name: None`, which YAML parses as the string "None", not a
# null. The file's own null convention is `~` (see model_args.model_type).
# NOTE(review): confirm the consumer does not special-case the literal "None".
track_name: ~

max_steps: 500
save_steps: 500
logging_steps: 1
eval_steps: 100
resume_from_checkpoint: false

sequence_length: 512
train_batch_size: 64
val_batch_size: 64

# local_rank: -1
num_nodes: 1
# 4 devices total: train on 0-1, reference on 2-3 (see device_mapping below).
num_gpus_per_node: 4

pretrain: Qwen/Qwen3-4B

# DPO loss settings.
ipo: false
beta: 0.1
label_smoothing: 0.0

# Keys of the preference pairs in the dataset records.
chosen_key: chosen
rejected_key: rejected

validation:
  data_args:
    template: qwen3
    file_name: data/comparison_gpt4_data_zh.json

actor_train:
  model_args:
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: ~
  training_args:
    lr_scheduler_type: constant
    learning_rate: 1.0e-6
    weight_decay: 0
    per_device_train_batch_size: 16
    gradient_accumulation_steps: 1
    warmup_steps: 20
    num_train_epochs: 10
  data_args:
    template: qwen3
    file_name:
      - data/comparison_gpt4_data_zh.json
    dataset_dir: data
    preprocessing_num_workers: 1
  strategy_args:
    strategy_name: megatron_train
    strategy_config:
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      use_distributed_optimizer: true
      recompute_granularity: full
  # Evaluated by the framework, not YAML: kept as a plain-scalar expression.
  device_mapping: list(range(0,2))
  infer_batch_size: 16

reference:
  model_args:
    disable_gradient_checkpointing: true
    dtype: bf16
    model_type: ~
  data_args:
    template: qwen3
  strategy_args:
    strategy_name: megatron_infer
    strategy_config:
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      # MindSpeed (NPU) extras — presumably forwarded to megatron_adaptor;
      # TODO confirm additional_configs nests under strategy_config.
      additional_configs:
        mindspeed_args:
          attention_mask_type: general
  device_mapping: list(range(2,4))
  infer_batch_size: 16
166 changes: 166 additions & 0 deletions examples/ascend_examples/qwen3_8b_rlvr_deepspeed.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
# Qwen3-8B RLVR example for Ascend NPU: DeepSpeed ZeRO-3 training, vLLM rollout,
# HF reference, rule-based math reward. 8 devices: 0-3 train, 4-5 rollout, 6-7 ref.
# Scrape note: original indentation was lost; nesting below follows the ROLL
# worker-config convention — TODO confirm against a pristine copy of this file.
defaults:
  - ../config/deepspeed_zero@_here_
  - ../config/deepspeed_zero2@_here_
  - ../config/deepspeed_zero3@_here_
  - ../config/deepspeed_zero3_cpuoffload@_here_

hydra:
  run:
    dir: .
  output_subdir: null

# Single knob for the policy-gradient variant; interpolated into exp_name and
# actor_train.pg_variant below so the two can never drift apart.
pg_variant: ppo  # topr, vanilla, tis, cispo, kimi15, ppo
exp_name: Qwen3-8B-RLVR-${pg_variant}
seed: 42
logging_dir: ./output/logs
output_dir: ./output
system_envs:
  USE_MODELSCOPE: '1'

checkpoint_config:
  type: file_system
  output_dir: ./ckpt

num_gpus_per_node: 8

max_steps: 200
save_steps: 100
logging_steps: 1
eval_steps: 10
resume_from_checkpoint: false

rollout_batch_size: 64  # prompts per rollout step
prompt_length: 1024
response_length: 2048

num_return_sequences_in_group: 8
ppo_epochs: 1
adv_estimator: "reinforce"

# clip
value_clip: 0.5
reward_clip: 10
advantage_clip: 2.0
dual_clip_loss: true

# normalize
norm_mean_type: batch
norm_std_type: batch

# data mask — drop samples outside the solve-rate band [0.2, 0.95]
max_len_mask: true
difficulty_mask: true
difficulty_low_threshold: 0.2
difficulty_high_threshold: 0.95
error_max_len_clip: false

# data weight
difficulty_loss_weight: false
length_loss_weight: false

# reward
add_token_level_kl: false

# advantage
whiten_advantages: true

# dynamic sampling scheduler
# use_additional_prompts: true
# max_running_requests: 256
# is_num_return_sequences_expand: false

pretrain: Qwen/Qwen3-8B-Base
reward_pretrain: Qwen/Qwen3-8B-Base

validation:
  data_args:
    template: qwen3
    file_name:
      - data/math_benchmarks.jsonl
  generating_args:
    top_p: 0.6
    top_k: 50
    num_beams: 1
    temperature: 0.6
    num_return_sequences: 1
  eval_steps: 10

actor_train:
  worker_cls: roll.pipeline.rlvr.actor_pg_worker.ActorPGWorker
  # FIX: was an independent literal `ppo` duplicating the top-level knob
  # (same "topr, vanilla, ..." comment on both); interpolate instead so
  # changing pg_variant above updates exp_name AND the worker together.
  pg_variant: ${pg_variant}
  model_args:
    flash_attn: fa2
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: ~
  training_args:
    learning_rate: 1.0e-6
    weight_decay: 0
    per_device_train_batch_size: 1
    gradient_accumulation_steps: 16
    warmup_steps: 20
    num_train_epochs: 50
  data_args:
    template: qwen3
    file_name:
      - data/math_deepmath_deal.jsonl
    domain_interleave_probs:
      math_rule: 1
    dataset_dir: data
    messages: messages
    interleave_probs: "1.0"
    preprocessing_num_workers: 16
  strategy_args:
    strategy_name: deepspeed_train
    # Resolved from the deepspeed_zero3 config pulled in via `defaults`.
    strategy_config: ${deepspeed_zero3}
  device_mapping: list(range(0,4))
  infer_batch_size: 4

actor_infer:
  model_args:
    flash_attn: fa2
    disable_gradient_checkpointing: true
    dtype: bf16
  generating_args:
    max_new_tokens: ${response_length}
    top_p: 0.99
    top_k: 100
    num_beams: 1
    temperature: 0.99
    num_return_sequences: ${num_return_sequences_in_group}
  data_args:
    template: qwen3
  strategy_args:
    strategy_name: vllm
    strategy_config:
      gpu_memory_utilization: 0.6
      block_size: 16
      # > prompt_length + response_length (1024 + 2048), with headroom.
      max_model_len: 8000
  device_mapping: list(range(4,6))
  infer_batch_size: 4

reference:
  model_args:
    flash_attn: fa2
    disable_gradient_checkpointing: true
    dtype: bf16
    model_type: ~
  data_args:
    template: qwen3
  strategy_args:
    strategy_name: hf_infer
    strategy_config: ~
  device_mapping: list(range(6,8))
  infer_batch_size: 4

rewards:
  math_rule:
    worker_cls: roll.pipeline.rlvr.rewards.math_rule_reward_worker.MathRuleRewardWorker
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen3
    tag_included: [deepmath_103k, 'MATH-500', 'OlympiadBench', 'minervamath', 'aime2025', 'gsm8k', 'aime', 'amc23', 'math_rule']
    world_size: 8
    infer_batch_size: 1
5 changes: 5 additions & 0 deletions examples/ascend_examples/run_dpo_pipeline.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash
# Launch the Qwen3-4B DPO (Megatron/NPU) pipeline. The Hydra config path is the
# name of the directory this script lives in (e.g. "ascend_examples").
# set +x keeps xtrace disabled even if inherited from the invoking shell.
set +x

# FIX: quote $0 and the command substitution so paths containing spaces do not
# undergo word splitting; quote $CONFIG_PATH at the call site for the same reason.
CONFIG_PATH=$(basename "$(dirname "$0")")
python examples/start_dpo_pipeline.py --config_path "$CONFIG_PATH" --config_name qwen3_4B_dpo_megatron
6 changes: 6 additions & 0 deletions mcore_adapter/src/mcore_adapter/training_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@
from dataclasses import dataclass, field, fields
from typing import Literal, Optional, Union

try:
# NPU patch
import mindspeed.megatron_adaptor
except ImportError:
pass

from megatron.core.transformer.pipeline_parallel_layer_layout import PipelineParallelLayerLayout
from transformers import Seq2SeqTrainingArguments as HFSeq2SeqTrainingArguments
from transformers import TrainingArguments as HFTrainingArguments
Expand Down
7 changes: 7 additions & 0 deletions roll/distributed/strategy/megatron_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,13 @@ def inner_forward_step(self, loss_func, data_iterator: Iterator[DataProto], mode
else:
input_ids = self._get_feature_on_this_cp_rank(input_ids, "input_ids")
attention_mask = self._get_feature_on_this_cp_rank(attention_mask, "attention_mask")

if hasattr(torch, "npu") and torch.npu.is_available() and attention_mask is not None:
attention_mask = attention_mask.bool()
B, S = attention_mask.shape
attention_mask = attention_mask[:, None, None, :] # [B,1,1,S]
attention_mask = attention_mask.expand(B, 1, S, S) # [B,1,S,S]

if labels is not None:
labels = self._get_feature_on_this_cp_rank(labels, "labels")
position_ids = None
Expand Down
21 changes: 13 additions & 8 deletions roll/platforms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,26 +25,31 @@ def _init_platform() -> Platform:
Returns:
An instance of a subclass of Platform corresponding to the detected hardware.
"""
try:
import torch_npu # noqa: F401

if hasattr(torch, "npu") and torch.npu.is_available():
logger.debug("Detected NPU (torch_npu). Initializing NPU platform.")
return NpuPlatform()
except ImportError:
pass

if torch.cuda.is_available():
device_name = torch.cuda.get_device_name().upper()
logger.debug(f"Detected CUDA device: {device_name}")

if "NVIDIA" in device_name:
logger.debug("Initializing CUDA platform (NVIDIA).")
return CudaPlatform()
elif "AMD" in device_name:
logger.debug("Initializing ROCm platform (AMD).")
return RocmPlatform()

logger.warning("Unrecognized CUDA device. Falling back to UnknownPlatform.")
return UnknownPlatform()
else:
try:
import torch_npu # noqa: F401

logger.debug("Detected torch_npu. Initializing NPU platform.")
return NpuPlatform()
except ImportError:
logger.debug("No supported accelerator detected. Initializing CPU platform.")
return CpuPlatform()
logger.debug("No supported accelerator detected. Initializing CPU platform.")
return CpuPlatform()


# Global singleton representing the current platform in use.
Expand Down
10 changes: 8 additions & 2 deletions roll/third_party/megatron/optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,15 @@ def get_megatron_optimizer(
optimizers = []
model_chunk_offset = 0
kwargs = {}
if "config_overrides" in inspect.signature(_get_param_groups_and_buffers).parameters:
_param_groups_sig = inspect.signature(_get_param_groups_and_buffers).parameters
if "config_overrides" in _param_groups_sig:
# config_overrides is required in mcore-core>=0.16
kwargs = {"config_overrides": None}
kwargs["config_overrides"] = None
if "no_weight_decay_cond" in _param_groups_sig:
# no_weight_decay_cond, scale_lr_cond, lr_mult are required in newer mcore versions
kwargs["no_weight_decay_cond"] = no_weight_decay_cond
kwargs["scale_lr_cond"] = scale_lr_cond
kwargs["lr_mult"] = lr_mult
for dense_model_chunks, overlap_param_gather_with_optimizer_step in zip(
all_dense_model_chunks, overlap_param_gather_with_optimizer_step_flags
):
Expand Down
Loading