As shown in the figure:
Running purely on-policy, approxkl fluctuates sharply during training, and the gap between log_prob and old_log_probs becomes huge, which causes training to collapse.
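To make the metric concrete, here is a minimal sketch of how a PPO-style approx KL and importance ratio are usually computed (the function name `ppo_ratio_stats`, the tensor names, and the k1 estimator are my assumptions for illustration; this is not necessarily how ROLL computes its `approxkl`). With `ppo_epochs: 1` the data are supposed to be fully on-policy, so `old_log_probs` recorded by the rollout engine (sglang) and the log-probs recomputed by the training engine (megatron) should agree almost exactly; a large gap therefore points at a train/infer mismatch rather than a stale policy.

```python
# Sketch only: names and the k1 estimator are assumptions, not ROLL's exact metric.
import torch

def ppo_ratio_stats(logprobs: torch.Tensor,      # log-probs recomputed by the training engine
                    old_logprobs: torch.Tensor,  # log-probs recorded by the rollout engine
                    mask: torch.Tensor):         # 1 for response tokens, 0 for padding
    log_ratio = logprobs - old_logprobs
    ratio = torch.exp(log_ratio)
    # k1 estimator of KL(pi_old || pi_new); on-policy with ppo_epochs=1 this should stay near 0
    approx_kl = (-log_ratio * mask).sum() / mask.sum()
    # largest per-token log-prob gap: a direct measure of train/infer mismatch
    max_gap = (log_ratio.abs() * mask).max()
    return approx_kl.item(), max_gap.item(), ratio
```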
The configuration is as follows:
hydra:
  run:
    dir: .
  output_subdir: null

pg_variant: ppo  # topr, vanilla, tis, cispo, kimi15, ppo
exp_name: Qwen2.5-7B-RLVR-finial_true_infer_correction-${pg_variant}-${now:%Y%m%d-%H%M%S}
seed: 42
logging_dir: ./output/logs/${exp_name}
output_dir: ./output
system_envs:
  USE_MODELSCOPE: '1'

checkpoint_config:
  type: file_system
  output_dir: /data/cpfs_0/rl_examples/models/${exp_name}

# track_with: wandb
# tracker_kwargs:
#   api_key: xxx
#   project: roll_examples
#   name: ${exp_name}
#   notes: roll_examples
#   tags:
#     - rlvr
#     - baseline
track_with: tensorboard
tracker_kwargs:
  log_dir: ./tensorboard/roll_exp/${exp_name}

num_gpus_per_node: 8
max_steps: 500
save_steps: 100
logging_steps: 1
eval_steps: 20
resume_from_checkpoint: false

rollout_batch_size: 64  # prompt
prompt_length: 2048
response_length: 12288
num_return_sequences_in_group: 8
ppo_epochs: 1
adv_estimator: "grpo"

# data mask
max_len_mask: true
error_max_len_clip: false
# data weight
difficulty_loss_weight: false
length_loss_weight: false
# reward
add_token_level_kl: false
# advantage
whiten_advantages: true

use_pg_clip_range: true
pg_clip_low: 0.2
pg_clip_high: 0.28

# dynamic sampling scheduler
# use_additional_prompts: true
# max_running_requests: 256
# is_num_return_sequences_expand: false

pretrain: /Qwen2.5-7B
reward_pretrain: /Qwen2.5-7B

# infer_correction: true
# infer_is_mode: token
# infer_is_threshold_min: 0.0
# infer_is_threshold_max: 2.0  # 1.5~5.0
# enable_token_reject: false
# infer_token_mask_threshold_min: 0.0
# infer_token_mask_threshold_max: 2.0  # 2~10
# enable_catastrophic_reject: false
# infer_catastrophic_threshold: 1e-4
# enable_seq_reject: None

validation:
  data_args:
    template: qwen2_5
    file_name:
      - data/math_benchmarks_0710.jsonl
  generating_args:
    top_p: 0.6
    top_k: 50
    num_beams: 1
    temperature: 0.6
    num_return_sequences: 1
  eval_steps: 20

actor_train:
  worker_cls: roll.pipeline.rlvr.actor_pg_worker.ActorPGWorker
  pg_variant: ppo  # topr, vanilla, tis, cispo, kimi15, ppo
  model_args:
    flash_attn: fa2
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: ~
  training_args:
    learning_rate: 1.0e-6
    weight_decay: 0
    per_device_train_batch_size: 1
    gradient_accumulation_steps: 64
    warmup_steps: 20
    num_train_epochs: 50
  data_args:
    template: qwen2_5
    file_name:
      - data/math_deepmath_deal.jsonl
    domain_interleave_probs:
      math_rule: 1
      # math_rule: 0.4
      # code_sandbox: 0.3
      # llm_judge: 0.1
      # crossthinkqa: 0.1
      # ifeval: 0.1
    dataset_dir: data
    messages: messages
    interleave_probs: "1.0"
    preprocessing_num_workers: 16
  strategy_args:
    strategy_name: megatron_train
    strategy_config:
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      use_distributed_optimizer: true
      recompute_granularity: full
  device_mapping: list(range(0,8))
  infer_batch_size: 2
  use_dynamic_batching_in_train: true
  max_tokens_per_microbatch_in_train: 30732
  sequence_length_round_in_train: 128
  use_dynamic_batching_in_infer: true
  max_tokens_per_microbatch_in_infer: 43008
  sequence_length_round_in_infer: 128

actor_infer:
  model_args:
    flash_attn: fa2
    disable_gradient_checkpointing: true
    dtype: bf16
  generating_args:
    max_new_tokens: ${response_length}
    top_p: 1
    top_k: 100
    num_beams: 1
    temperature: 1
    num_return_sequences: ${num_return_sequences_in_group}
    logprobs: 1
  data_args:
    template: qwen2_5
  strategy_args:
    # strategy_name: vllm
    # strategy_config:
    #   gpu_memory_utilization: 0.6
    #   block_size: 16
    #   max_model_len: 8000
    strategy_name: sglang
    strategy_config:
      mem_fraction_static: 0.8
      load_format: dummy
  device_mapping: list(range(0,8))
  infer_batch_size: 2
  use_dynamic_batching_in_infer: true
  max_tokens_per_microbatch_in_infer: 43008
  sequence_length_round_in_infer: 128

reference:
  model_args:
    flash_attn: fa2
    disable_gradient_checkpointing: true
    dtype: bf16
    model_type: ~
  data_args:
    template: qwen2_5
  strategy_args:
    strategy_name: megatron_infer
    strategy_config:
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
  device_mapping: list(range(0,8))
  infer_batch_size: 2
  use_dynamic_batching_in_infer: true
  max_tokens_per_microbatch_in_infer: 43008
  sequence_length_round_in_infer: 128

rewards:
  math_rule:
    worker_cls: roll.pipeline.rlvr.rewards.math_rule_reward_worker.MathRuleRewardWorker
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen2_5
    tag_included: [deepmath_103k, 'MATH-500', 'OlympiadBench', 'minervamath', 'aime2025', 'gsm8k', 'aime', 'amc23', 'math_rule']
    world_size: 8
    infer_batch_size: 1
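For completeness, a sketch of how an asymmetric clip range such as `pg_clip_low: 0.2` / `pg_clip_high: 0.28` typically enters the clipped surrogate loss (standard PPO-style clipping shown for illustration only, not necessarily `ActorPGWorker`'s exact implementation). Note that for tokens with negative advantage the pessimistic `min` keeps the unclipped term, so when log_prob and old_log_probs diverge the exploding ratio translates directly into exploding gradients, which is consistent with the collapse described above.

```python
# Sketch only: a standard PPO clipped surrogate with asymmetric bounds; not ROLL's exact code.
import torch

def clipped_pg_loss(logprobs, old_logprobs, advantages, mask,
                    pg_clip_low: float = 0.2, pg_clip_high: float = 0.28):
    ratio = torch.exp(logprobs - old_logprobs)                        # pi_new / pi_old per token
    clipped = torch.clamp(ratio, 1.0 - pg_clip_low, 1.0 + pg_clip_high)
    # pessimistic (min) surrogate, then masked mean over response tokens
    loss = -torch.min(ratio * advantages, clipped * advantages)
    return (loss * mask).sum() / mask.sum()
```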