Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 103 additions & 0 deletions examples/ascend_examples/qwen3_4B_dpo_megatron.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# Qwen3-4B DPO example for Ascend NPU, training with Megatron (via MindSpeed).
# Scrape note: original indentation was lost; nesting below follows the ROLL
# worker-config convention (device_mapping/infer_batch_size at worker level) —
# TODO confirm against a pristine copy of this file.
defaults:
  - ../config/deepspeed_zero@_here_
  - ../config/deepspeed_zero2@_here_
  - ../config/deepspeed_zero3@_here_
  - ../config/deepspeed_zero3_cpuoffload@_here_

hydra:
  run:
    dir: .
  output_subdir: null

exp_name: "qwen3-4B-dpo-config"
seed: 42
logging_dir: ./output/logs
output_dir: ./output
system_envs:
  # Kept as the string '1' (quoted) — env vars are strings, not ints.
  USE_MODELSCOPE: '1'

checkpoint_config:
  type: file_system
  output_dir: ./ckpt

# FIX: was `track_name: None`, which YAML parses as the string "None", not a
# null. The file's own null convention is `~` (see model_args.model_type).
# NOTE(review): confirm the consumer does not special-case the literal "None".
track_name: ~

max_steps: 500
save_steps: 500
logging_steps: 1
eval_steps: 100
resume_from_checkpoint: false

sequence_length: 512
train_batch_size: 64
val_batch_size: 64

# local_rank: -1
num_nodes: 1
# 4 devices total: train on 0-1, reference on 2-3 (see device_mapping below).
num_gpus_per_node: 4

pretrain: Qwen/Qwen3-4B

# DPO loss settings.
ipo: false
beta: 0.1
label_smoothing: 0.0

# Keys of the preference pairs in the dataset records.
chosen_key: chosen
rejected_key: rejected

validation:
  data_args:
    template: qwen3
    file_name: data/comparison_gpt4_data_zh.json

actor_train:
  model_args:
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: ~
  training_args:
    lr_scheduler_type: constant
    learning_rate: 1.0e-6
    weight_decay: 0
    per_device_train_batch_size: 16
    gradient_accumulation_steps: 1
    warmup_steps: 20
    num_train_epochs: 10
  data_args:
    template: qwen3
    file_name:
      - data/comparison_gpt4_data_zh.json
    dataset_dir: data
    preprocessing_num_workers: 1
  strategy_args:
    strategy_name: megatron_train
    strategy_config:
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      use_distributed_optimizer: true
      recompute_granularity: full
  # Evaluated by the framework, not YAML: kept as a plain-scalar expression.
  device_mapping: list(range(0,2))
  infer_batch_size: 16

reference:
  model_args:
    disable_gradient_checkpointing: true
    dtype: bf16
    model_type: ~
  data_args:
    template: qwen3
  strategy_args:
    strategy_name: megatron_infer
    strategy_config:
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      # MindSpeed (NPU) extras — presumably forwarded to megatron_adaptor;
      # TODO confirm additional_configs nests under strategy_config.
      additional_configs:
        mindspeed_args:
          attention_mask_type: general
  device_mapping: list(range(2,4))
  infer_batch_size: 16
166 changes: 166 additions & 0 deletions examples/ascend_examples/qwen3_8b_rlvr_deepspeed.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
# Qwen3-8B RLVR example for Ascend NPU: DeepSpeed ZeRO-3 training, vLLM rollout,
# HF reference, rule-based math reward. 8 devices: 0-3 train, 4-5 rollout, 6-7 ref.
# Scrape note: original indentation was lost; nesting below follows the ROLL
# worker-config convention — TODO confirm against a pristine copy of this file.
defaults:
  - ../config/deepspeed_zero@_here_
  - ../config/deepspeed_zero2@_here_
  - ../config/deepspeed_zero3@_here_
  - ../config/deepspeed_zero3_cpuoffload@_here_

hydra:
  run:
    dir: .
  output_subdir: null

# Single knob for the policy-gradient variant; interpolated into exp_name and
# actor_train.pg_variant below so the two can never drift apart.
pg_variant: ppo  # topr, vanilla, tis, cispo, kimi15, ppo
exp_name: Qwen3-8B-RLVR-${pg_variant}
seed: 42
logging_dir: ./output/logs
output_dir: ./output
system_envs:
  USE_MODELSCOPE: '1'

checkpoint_config:
  type: file_system
  output_dir: ./ckpt

num_gpus_per_node: 8

max_steps: 200
save_steps: 100
logging_steps: 1
eval_steps: 10
resume_from_checkpoint: false

rollout_batch_size: 64  # prompts per rollout step
prompt_length: 1024
response_length: 2048

num_return_sequences_in_group: 8
ppo_epochs: 1
adv_estimator: "reinforce"

# clip
value_clip: 0.5
reward_clip: 10
advantage_clip: 2.0
dual_clip_loss: true

# normalize
norm_mean_type: batch
norm_std_type: batch

# data mask — drop samples outside the solve-rate band [0.2, 0.95]
max_len_mask: true
difficulty_mask: true
difficulty_low_threshold: 0.2
difficulty_high_threshold: 0.95
error_max_len_clip: false

# data weight
difficulty_loss_weight: false
length_loss_weight: false

# reward
add_token_level_kl: false

# advantage
whiten_advantages: true

# dynamic sampling scheduler
# use_additional_prompts: true
# max_running_requests: 256
# is_num_return_sequences_expand: false

pretrain: Qwen/Qwen3-8B-Base
reward_pretrain: Qwen/Qwen3-8B-Base

validation:
  data_args:
    template: qwen3
    file_name:
      - data/math_benchmarks.jsonl
  generating_args:
    top_p: 0.6
    top_k: 50
    num_beams: 1
    temperature: 0.6
    num_return_sequences: 1
  eval_steps: 10

actor_train:
  worker_cls: roll.pipeline.rlvr.actor_pg_worker.ActorPGWorker
  # FIX: was an independent literal `ppo` duplicating the top-level knob
  # (same "topr, vanilla, ..." comment on both); interpolate instead so
  # changing pg_variant above updates exp_name AND the worker together.
  pg_variant: ${pg_variant}
  model_args:
    flash_attn: fa2
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: ~
  training_args:
    learning_rate: 1.0e-6
    weight_decay: 0
    per_device_train_batch_size: 1
    gradient_accumulation_steps: 16
    warmup_steps: 20
    num_train_epochs: 50
  data_args:
    template: qwen3
    file_name:
      - data/math_deepmath_deal.jsonl
    domain_interleave_probs:
      math_rule: 1
    dataset_dir: data
    messages: messages
    interleave_probs: "1.0"
    preprocessing_num_workers: 16
  strategy_args:
    strategy_name: deepspeed_train
    # Resolved from the deepspeed_zero3 config pulled in via `defaults`.
    strategy_config: ${deepspeed_zero3}
  device_mapping: list(range(0,4))
  infer_batch_size: 4

actor_infer:
  model_args:
    flash_attn: fa2
    disable_gradient_checkpointing: true
    dtype: bf16
  generating_args:
    max_new_tokens: ${response_length}
    top_p: 0.99
    top_k: 100
    num_beams: 1
    temperature: 0.99
    num_return_sequences: ${num_return_sequences_in_group}
  data_args:
    template: qwen3
  strategy_args:
    strategy_name: vllm
    strategy_config:
      gpu_memory_utilization: 0.6
      block_size: 16
      # > prompt_length + response_length (1024 + 2048), with headroom.
      max_model_len: 8000
  device_mapping: list(range(4,6))
  infer_batch_size: 4

reference:
  model_args:
    flash_attn: fa2
    disable_gradient_checkpointing: true
    dtype: bf16
    model_type: ~
  data_args:
    template: qwen3
  strategy_args:
    strategy_name: hf_infer
    strategy_config: ~
  device_mapping: list(range(6,8))
  infer_batch_size: 4

rewards:
  math_rule:
    worker_cls: roll.pipeline.rlvr.rewards.math_rule_reward_worker.MathRuleRewardWorker
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen3
    tag_included: [deepmath_103k, 'MATH-500', 'OlympiadBench', 'minervamath', 'aime2025', 'gsm8k', 'aime', 'amc23', 'math_rule']
    world_size: 8
    infer_batch_size: 1
5 changes: 5 additions & 0 deletions examples/ascend_examples/run_dpo_pipeline.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash
# Launch the Qwen3-4B DPO (Megatron/NPU) pipeline. The Hydra config path is the
# name of the directory this script lives in (e.g. "ascend_examples").
# set +x keeps xtrace disabled even if inherited from the invoking shell.
set +x

# FIX: quote $0 and the command substitution so paths containing spaces do not
# undergo word splitting; quote $CONFIG_PATH at the call site for the same reason.
CONFIG_PATH=$(basename "$(dirname "$0")")
python examples/start_dpo_pipeline.py --config_path "$CONFIG_PATH" --config_name qwen3_4B_dpo_megatron
6 changes: 6 additions & 0 deletions mcore_adapter/src/mcore_adapter/training_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@
from dataclasses import dataclass, field, fields
from typing import Literal, Optional, Union

try:
# NPU patch
import mindspeed.megatron_adaptor
except ImportError:
pass

from megatron.core.transformer.pipeline_parallel_layer_layout import PipelineParallelLayerLayout
from transformers import Seq2SeqTrainingArguments as HFSeq2SeqTrainingArguments
from transformers import TrainingArguments as HFTrainingArguments
Expand Down
7 changes: 7 additions & 0 deletions roll/distributed/strategy/megatron_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,13 @@ def inner_forward_step(self, loss_func, data_iterator: Iterator[DataProto], mode
else:
input_ids = self._get_feature_on_this_cp_rank(input_ids, "input_ids")
attention_mask = self._get_feature_on_this_cp_rank(attention_mask, "attention_mask")

if hasattr(torch, "npu") and torch.npu.is_available() and attention_mask is not None:
attention_mask = attention_mask.bool()
B, S = attention_mask.shape
attention_mask = attention_mask[:, None, None, :] # [B,1,1,S]
attention_mask = attention_mask.expand(B, 1, S, S) # [B,1,S,S]

if labels is not None:
labels = self._get_feature_on_this_cp_rank(labels, "labels")
position_ids = None
Expand Down
21 changes: 13 additions & 8 deletions roll/platforms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,26 +25,31 @@ def _init_platform() -> Platform:
Returns:
An instance of a subclass of Platform corresponding to the detected hardware.
"""
try:
import torch_npu # noqa: F401

if hasattr(torch, "npu") and torch.npu.is_available():
logger.debug("Detected NPU (torch_npu). Initializing NPU platform.")
return NpuPlatform()
except ImportError:
pass

if torch.cuda.is_available():
device_name = torch.cuda.get_device_name().upper()
logger.debug(f"Detected CUDA device: {device_name}")

if "NVIDIA" in device_name:
logger.debug("Initializing CUDA platform (NVIDIA).")
return CudaPlatform()
elif "AMD" in device_name:
logger.debug("Initializing ROCm platform (AMD).")
return RocmPlatform()

logger.warning("Unrecognized CUDA device. Falling back to UnknownPlatform.")
return UnknownPlatform()
else:
try:
import torch_npu # noqa: F401

logger.debug("Detected torch_npu. Initializing NPU platform.")
return NpuPlatform()
except ImportError:
logger.debug("No supported accelerator detected. Initializing CPU platform.")
return CpuPlatform()
logger.debug("No supported accelerator detected. Initializing CPU platform.")
return CpuPlatform()


# Global singleton representing the current platform in use.
Expand Down
10 changes: 8 additions & 2 deletions roll/third_party/megatron/optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,15 @@ def get_megatron_optimizer(
optimizers = []
model_chunk_offset = 0
kwargs = {}
if "config_overrides" in inspect.signature(_get_param_groups_and_buffers).parameters:
_param_groups_sig = inspect.signature(_get_param_groups_and_buffers).parameters
if "config_overrides" in _param_groups_sig:
# config_overrides is required in mcore-core>=0.16
kwargs = {"config_overrides": None}
kwargs["config_overrides"] = None
if "no_weight_decay_cond" in _param_groups_sig:
# no_weight_decay_cond, scale_lr_cond, lr_mult are required in newer mcore versions
kwargs["no_weight_decay_cond"] = no_weight_decay_cond
kwargs["scale_lr_cond"] = scale_lr_cond
kwargs["lr_mult"] = lr_mult
for dense_model_chunks, overlap_param_gather_with_optimizer_step in zip(
all_dense_model_chunks, overlap_param_gather_with_optimizer_step_flags
):
Expand Down
Loading