diff --git a/examples/ascend_examples/qwen3_4B_dpo_megatron.yaml b/examples/ascend_examples/qwen3_4B_dpo_megatron.yaml new file mode 100644 index 000000000..d41176232 --- /dev/null +++ b/examples/ascend_examples/qwen3_4B_dpo_megatron.yaml @@ -0,0 +1,103 @@ +defaults: + - ../config/deepspeed_zero@_here_ + - ../config/deepspeed_zero2@_here_ + - ../config/deepspeed_zero3@_here_ + - ../config/deepspeed_zero3_cpuoffload@_here_ + +hydra: + run: + dir: . + output_subdir: null + +exp_name: "qwen3-4B-dpo-config" +seed: 42 +logging_dir: ./output/logs +output_dir: ./output +system_envs: + USE_MODELSCOPE: '1' + +checkpoint_config: + type: file_system + output_dir: ./ckpt + + +track_name: None + + +max_steps: 500 +save_steps: 500 +logging_steps: 1 +eval_steps: 100 +resume_from_checkpoint: false + +sequence_length: 512 +train_batch_size: 64 +val_batch_size: 64 + +# local_rank: -1 +num_nodes: 1 +num_gpus_per_node: 4 + +pretrain: Qwen/Qwen3-4B + +ipo: false +beta: 0.1 +label_smoothing: 0.0 + +chosen_key: chosen +rejected_key: rejected + +validation: + data_args: + template: qwen3 + file_name: data/comparison_gpt4_data_zh.json + +actor_train: + model_args: + disable_gradient_checkpointing: false + dtype: bf16 + model_type: ~ + training_args: + lr_scheduler_type: constant + learning_rate: 1.0e-6 + weight_decay: 0 + per_device_train_batch_size: 16 + gradient_accumulation_steps: 1 + warmup_steps: 20 + num_train_epochs: 10 + data_args: + template: qwen3 + file_name: + - data/comparison_gpt4_data_zh.json + dataset_dir: data + preprocessing_num_workers: 1 + strategy_args: + strategy_name: megatron_train + strategy_config: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 1 + use_distributed_optimizer: true + recompute_granularity: full + device_mapping: list(range(0,2)) + infer_batch_size: 16 + + +reference: + model_args: + disable_gradient_checkpointing: true + dtype: bf16 + model_type: ~ + data_args: + template: qwen3 + strategy_args: + 
strategy_name: megatron_infer + strategy_config: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 1 + additional_configs: + mindspeed_args: + attention_mask_type: general + device_mapping: list(range(2,4)) + infer_batch_size: 16 \ No newline at end of file diff --git a/examples/ascend_examples/qwen3_8b_rlvr_deepspeed.yaml b/examples/ascend_examples/qwen3_8b_rlvr_deepspeed.yaml new file mode 100644 index 000000000..5ddfdde66 --- /dev/null +++ b/examples/ascend_examples/qwen3_8b_rlvr_deepspeed.yaml @@ -0,0 +1,166 @@ +defaults: + - ../config/deepspeed_zero@_here_ + - ../config/deepspeed_zero2@_here_ + - ../config/deepspeed_zero3@_here_ + - ../config/deepspeed_zero3_cpuoffload@_here_ + +hydra: + run: + dir: . + output_subdir: null + +pg_variant: ppo # topr, vanilla, tis, cispo, kimi15, ppo +exp_name: Qwen3-8B-RLVR-${pg_variant} +seed: 42 +logging_dir: ./output/logs +output_dir: ./output +system_envs: + USE_MODELSCOPE: '1' + +checkpoint_config: + type: file_system + output_dir: ./ckpt + +num_gpus_per_node: 8 + +max_steps: 200 +save_steps: 100 +logging_steps: 1 +eval_steps: 10 +resume_from_checkpoint: false + + +rollout_batch_size: 64 # prompt +prompt_length: 1024 +response_length: 2048 + +num_return_sequences_in_group: 8 +ppo_epochs: 1 +adv_estimator: "reinforce" + +# clip +value_clip: 0.5 +reward_clip: 10 +advantage_clip: 2.0 +dual_clip_loss: true + +# normalize +norm_mean_type: batch +norm_std_type: batch + +# data mask +max_len_mask: true +difficulty_mask: true +difficulty_low_threshold: 0.2 +difficulty_high_threshold: 0.95 +error_max_len_clip: false + +# data weight +difficulty_loss_weight: false +length_loss_weight: false + +# reward +add_token_level_kl: false + +# advantage +whiten_advantages: true + +# dynamic sampling scheduler +# use_additional_prompts: true +# max_running_requests: 256 +# is_num_return_sequences_expand: false + +pretrain: Qwen/Qwen3-8B-Base +reward_pretrain: Qwen/Qwen3-8B-Base + +validation: + 
data_args: + template: qwen3 + file_name: + - data/math_benchmarks.jsonl + generating_args: + top_p: 0.6 + top_k: 50 + num_beams: 1 + temperature: 0.6 + num_return_sequences: 1 + eval_steps: 10 + +actor_train: + worker_cls: roll.pipeline.rlvr.actor_pg_worker.ActorPGWorker + pg_variant: ppo # topr, vanilla, tis, cispo, kimi15, ppo + model_args: + flash_attn: fa2 + disable_gradient_checkpointing: false + dtype: bf16 + model_type: ~ + training_args: + learning_rate: 1.0e-6 + weight_decay: 0 + per_device_train_batch_size: 1 + gradient_accumulation_steps: 16 + warmup_steps: 20 + num_train_epochs: 50 + data_args: + template: qwen3 + file_name: + - data/math_deepmath_deal.jsonl + domain_interleave_probs: + math_rule: 1 + dataset_dir: data + messages: messages + interleave_probs: "1.0" + preprocessing_num_workers: 16 + strategy_args: + strategy_name: deepspeed_train + strategy_config: ${deepspeed_zero3} + device_mapping: list(range(0,4)) + infer_batch_size: 4 + +actor_infer: + model_args: + flash_attn: fa2 + disable_gradient_checkpointing: true + dtype: bf16 + generating_args: + max_new_tokens: ${response_length} + top_p: 0.99 + top_k: 100 + num_beams: 1 + temperature: 0.99 + num_return_sequences: ${num_return_sequences_in_group} + data_args: + template: qwen3 + strategy_args: + strategy_name: vllm + strategy_config: + gpu_memory_utilization: 0.6 + block_size: 16 + max_model_len: 8000 + device_mapping: list(range(4,6)) + infer_batch_size: 4 + +reference: + model_args: + flash_attn: fa2 + disable_gradient_checkpointing: true + dtype: bf16 + model_type: ~ + data_args: + template: qwen3 + strategy_args: + strategy_name: hf_infer + strategy_config: ~ + device_mapping: list(range(6,8)) + infer_batch_size: 4 + +rewards: + math_rule: + worker_cls: roll.pipeline.rlvr.rewards.math_rule_reward_worker.MathRuleRewardWorker + model_args: + model_name_or_path: ${reward_pretrain} + data_args: + template: qwen3 + tag_included: [deepmath_103k, 'MATH-500', 'OlympiadBench', 'minervamath', 
'aime2025', 'gsm8k', 'aime', 'amc23', 'math_rule'] + world_size: 8 + infer_batch_size: 1 \ No newline at end of file diff --git a/examples/ascend_examples/run_dpo_pipeline.sh b/examples/ascend_examples/run_dpo_pipeline.sh new file mode 100644 index 000000000..a79132ca4 --- /dev/null +++ b/examples/ascend_examples/run_dpo_pipeline.sh @@ -0,0 +1,5 @@ +#!/bin/bash +set +x + +CONFIG_PATH=$(basename $(dirname $0)) +python examples/start_dpo_pipeline.py --config_path $CONFIG_PATH --config_name qwen3_4B_dpo_megatron diff --git a/mcore_adapter/src/mcore_adapter/training_args.py b/mcore_adapter/src/mcore_adapter/training_args.py index 412322d11..794d10cc7 100644 --- a/mcore_adapter/src/mcore_adapter/training_args.py +++ b/mcore_adapter/src/mcore_adapter/training_args.py @@ -2,6 +2,12 @@ from dataclasses import dataclass, field, fields from typing import Literal, Optional, Union +try: + # NPU patch + import mindspeed.megatron_adaptor +except ImportError: + pass + from megatron.core.transformer.pipeline_parallel_layer_layout import PipelineParallelLayerLayout from transformers import Seq2SeqTrainingArguments as HFSeq2SeqTrainingArguments from transformers import TrainingArguments as HFTrainingArguments diff --git a/roll/distributed/strategy/megatron_strategy.py b/roll/distributed/strategy/megatron_strategy.py index 4eeb4cc74..c12e7dff0 100644 --- a/roll/distributed/strategy/megatron_strategy.py +++ b/roll/distributed/strategy/megatron_strategy.py @@ -422,6 +422,13 @@ def inner_forward_step(self, loss_func, data_iterator: Iterator[DataProto], mode else: input_ids = self._get_feature_on_this_cp_rank(input_ids, "input_ids") attention_mask = self._get_feature_on_this_cp_rank(attention_mask, "attention_mask") + + if hasattr(torch, "npu") and torch.npu.is_available() and attention_mask is not None: + attention_mask = attention_mask.bool() + B, S = attention_mask.shape + attention_mask = attention_mask[:, None, None, :] # [B,1,1,S] + attention_mask = attention_mask.expand(B, 1, S, 
S) # [B,1,S,S] + if labels is not None: labels = self._get_feature_on_this_cp_rank(labels, "labels") position_ids = None diff --git a/roll/platforms/__init__.py b/roll/platforms/__init__.py index 6869621f4..c9dff3f15 100644 --- a/roll/platforms/__init__.py +++ b/roll/platforms/__init__.py @@ -25,26 +25,31 @@ def _init_platform() -> Platform: Returns: An instance of a subclass of Platform corresponding to the detected hardware. """ + try: + import torch_npu # noqa: F401 + + if hasattr(torch, "npu") and torch.npu.is_available(): + logger.debug("Detected NPU (torch_npu). Initializing NPU platform.") + return NpuPlatform() + except ImportError: + pass + if torch.cuda.is_available(): device_name = torch.cuda.get_device_name().upper() logger.debug(f"Detected CUDA device: {device_name}") + if "NVIDIA" in device_name: logger.debug("Initializing CUDA platform (NVIDIA).") return CudaPlatform() elif "AMD" in device_name: logger.debug("Initializing ROCm platform (AMD).") return RocmPlatform() + logger.warning("Unrecognized CUDA device. Falling back to UnknownPlatform.") return UnknownPlatform() - else: - try: - import torch_npu # noqa: F401 - logger.debug("Detected torch_npu. Initializing NPU platform.") - return NpuPlatform() - except ImportError: - logger.debug("No supported accelerator detected. Initializing CPU platform.") - return CpuPlatform() + logger.debug("No supported accelerator detected. Initializing CPU platform.") + return CpuPlatform() # Global singleton representing the current platform in use. 
diff --git a/roll/third_party/megatron/optimizer.py b/roll/third_party/megatron/optimizer.py
index 888dc7a87..4f860ac2e 100644
--- a/roll/third_party/megatron/optimizer.py
+++ b/roll/third_party/megatron/optimizer.py
@@ -69,9 +69,19 @@ def get_megatron_optimizer(
     optimizers = []
     model_chunk_offset = 0
     kwargs = {}
-    if "config_overrides" in inspect.signature(_get_param_groups_and_buffers).parameters:
+    # Look the signature up once; every compatibility check below reuses it.
+    _param_groups_sig = inspect.signature(_get_param_groups_and_buffers).parameters
+    if "config_overrides" in _param_groups_sig:
         # config_overrides is required in mcore-core>=0.16
-        kwargs = {"config_overrides": None}
+        kwargs["config_overrides"] = None
+    # Forward each optional argument only when the installed signature declares it;
+    # gating all three on one name would raise TypeError on a partial signature.
+    if "no_weight_decay_cond" in _param_groups_sig:
+        kwargs["no_weight_decay_cond"] = no_weight_decay_cond
+    if "scale_lr_cond" in _param_groups_sig:
+        kwargs["scale_lr_cond"] = scale_lr_cond
+    if "lr_mult" in _param_groups_sig:
+        kwargs["lr_mult"] = lr_mult
     for dense_model_chunks, overlap_param_gather_with_optimizer_step in zip(
         all_dense_model_chunks, overlap_param_gather_with_optimizer_step_flags
     ):