As shown above, I am using what is essentially the official example script:
hydra:
  run:
    dir: .
  output_subdir: null

pg_variant: ppo # topr, vanilla, tis, cispo, kimi15, ppo
exp_name: Qwen2.5-7B-RLVR-finial_true_infer_correction-${pg_variant}-${now:%Y%m%d-%H%M%S}
seed: 42
logging_dir: ./output/logs/${exp_name}
output_dir: ./output
system_envs:
  USE_MODELSCOPE: '1'

checkpoint_config:
  type: file_system
  output_dir: /data/cpfs_0/rl_examples/models/${exp_name}

track_with: tensorboard
tracker_kwargs:
  log_dir: ./tensorboard/roll_exp/${exp_name}

num_gpus_per_node: 8

max_steps: 500
save_steps: 100
logging_steps: 1
eval_steps: 20
resume_from_checkpoint: false

max_grad_norm: 1.0

rollout_batch_size: 64 # prompt
prompt_length: 2048
response_length: 8196
num_return_sequences_in_group: 8
ppo_epochs: 1
adv_estimator: "grpo"

# data mask
max_len_mask: false
error_max_len_clip: false
# data weight
difficulty_loss_weight: false
length_loss_weight: false
# reward
add_token_level_kl: false
# advantage
whiten_advantages: false

use_pg_clip_range: true
pg_clip_low: 0.2
pg_clip_high: 0.28

pretrain: /Qwen2.5-7B
reward_pretrain: /Qwen2.5-7B

use_kl_loss: false
init_kl_coef: 0
enable_reference: false
enable_old_logprobs_recompute: false
force_disable_old_logprobs_recompute: true

validation:
  data_args:
    template: qwen2_5
    file_name:
      - data/math_benchmarks_0710.jsonl
  generating_args:
    top_p: 0.6
    top_k: 50
    num_beams: 1
    temperature: 0.6
    num_return_sequences: 8
  eval_steps: 20

actor_train:
  worker_cls: roll.pipeline.rlvr.actor_worker.ActorWorker
  pg_variant: ppo # topr, vanilla, tis, cispo, kimi15, ppo
  model_args:
    flash_attn: fa2
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: ~
  training_args:
    learning_rate: 1.0e-6
    weight_decay: 0
    per_device_train_batch_size: 2
    gradient_accumulation_steps: 32
    warmup_steps: 20
    num_train_epochs: 50
  data_args:
    template: qwen2_5
    file_name:
      - data/math8k.jsonl
    domain_interleave_probs:
      math_rule: 1
      # math_rule: 0.4
      # code_sandbox: 0.3
      # llm_judge: 0.1
      # crossthinkqa: 0.1
      # ifeval: 0.1
    dataset_dir: data
    messages: messages
    interleave_probs: "1.0"
    preprocessing_num_workers: 16
  strategy_args:
    strategy_name: megatron_train
    strategy_config:
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      use_distributed_optimizer: true
      recompute_granularity: full
  device_mapping: list(range(0,8))
  infer_batch_size: 4

actor_infer:
  model_args:
    flash_attn: fa2
    disable_gradient_checkpointing: true
    dtype: bf16
  generating_args:
    max_new_tokens: ${response_length}
    top_p: 0.99
    top_k: 100
    num_beams: 1
    temperature: 0.99
    num_return_sequences: ${num_return_sequences_in_group}
    logprobs: 1
  data_args:
    template: qwen2_5
  strategy_args:
    strategy_name: vllm
    strategy_config:
      gpu_memory_utilization: 0.6
      block_size: 16
      max_model_len: 8000

reference:
  model_args:
    flash_attn: fa2
    disable_gradient_checkpointing: true
    dtype: bf16
    model_type: ~
  data_args:
    template: qwen2_5
  strategy_args:
    strategy_name: megatron_infer
    strategy_config:
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
  device_mapping: list(range(0,8))
  infer_batch_size: 4

rewards:
  math_rule:
    worker_cls: roll.pipeline.rlvr.rewards.math_rule_reward_worker.MathRuleRewardWorker
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen2_5
    tag_included: [deepmath_103k, 'MATH-500', 'OlympiadBench', 'minervamath', 'aime2025', 'gsm8k', 'aime', 'amc23', 'math_rule']
    world_size: 8
    infer_batch_size: 1
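For reference, this is roughly how I double-check the key length and sampling settings from the config above before launching (a minimal PyYAML sketch, not part of ROLL; the file name rlvr_config.yaml is just my local placeholder):

```python
import yaml

# Placeholder file name; the content is the config pasted above.
with open("rlvr_config.yaml") as f:
    cfg = yaml.safe_load(f)

gen = cfg["actor_infer"]["generating_args"]
vllm = cfg["actor_infer"]["strategy_args"]["strategy_config"]

print("prompt_length       :", cfg["prompt_length"])                    # 2048
print("response_length     :", cfg["response_length"])                  # 8196
print("vllm max_model_len  :", vllm["max_model_len"])                   # 8000
print("rollout temperature :", gen["temperature"])                      # 0.99
print("rollout top_p/top_k :", gen["top_p"], gen["top_k"])              # 0.99 / 100
print("pg clip low/high    :", cfg["pg_clip_low"], cfg["pg_clip_high"]) # 0.2 / 0.28
```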
With this configuration, training collapses partway through; the screenshot below shows what the collapse looks like.
I went through all of the monitored metrics, and they only seem to break down after the model itself has already collapsed. The one thing that stands out is that the model's response length keeps shrinking, which is the opposite of my previous projects, where response length steadily grows during normal training. With ROLL, the overall response length keeps decreasing, the length of incorrect responses keeps decreasing, the length of correct responses rises only slightly, and once the collapse point is reached the training breaks down. The sketch below shows how I aggregate these length statistics.
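For clarity, this is how I compute the per-step length statistics mentioned above from the sampled responses (a standalone NumPy sketch, not ROLL code; the function name and the reward == 1.0 convention for a correct math_rule answer are my own assumptions):

```python
import numpy as np

def response_length_stats(response_lengths, rewards):
    """Mean response length overall and split by whether the rollout was judged correct.

    response_lengths / rewards: one entry per sampled response in the step
    (reward == 1.0 is treated as a correct answer under the math_rule reward).
    """
    lengths = np.asarray(response_lengths, dtype=float)
    correct = np.asarray(rewards, dtype=float) == 1.0
    return {
        "response_len/mean": lengths.mean(),
        "response_len/correct_mean": lengths[correct].mean() if correct.any() else float("nan"),
        "response_len/incorrect_mean": lengths[~correct].mean() if (~correct).any() else float("nan"),
    }

# Example: 8 samples for one prompt group
print(response_length_stats([512, 430, 388, 350, 900, 310, 280, 260],
                            [0, 0, 0, 0, 1, 0, 0, 1]))
```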
I would like to know whether any default behavior was changed when you implemented the framework that would cause the overall response length to decrease, and whether you have observed this kind of training collapse yourselves, since this is a very basic configuration.