
Training with the default RLVR config breaks: response length keeps shrinking until it hits 0 or the model stops answering, and model performance collapses #303

@millioniron

Description

As described in the title, I am using what is essentially the stock official script:

hydra:
  run:
    dir: .
  output_subdir: null

pg_variant: ppo # topr, vanilla, tis, cispo, kimi15, ppo
exp_name: Qwen2.5-7B-RLVR-finial_true_infer_correction-${pg_variant}-${now:%Y%m%d-%H%M%S}
seed: 42
logging_dir: ./output/logs/${exp_name}
output_dir: ./output
system_envs:
  USE_MODELSCOPE: '1'

checkpoint_config:
  type: file_system
  output_dir: /data/cpfs_0/rl_examples/models/${exp_name}
track_with: tensorboard
tracker_kwargs:
  log_dir: ./tensorboard/roll_exp/${exp_name}

num_gpus_per_node: 8

max_steps: 500
save_steps: 100
logging_steps: 1
eval_steps: 20
resume_from_checkpoint: false

max_grad_norm: 1.0

rollout_batch_size: 64  # prompt
prompt_length: 2048
response_length: 8196

num_return_sequences_in_group: 8
ppo_epochs: 1
adv_estimator: "grpo"

# data mask
max_len_mask: false
error_max_len_clip: false

# data weight
difficulty_loss_weight: false
length_loss_weight: false

# reward
add_token_level_kl: false

# advantage
whiten_advantages: false

use_pg_clip_range: true  
pg_clip_low: 0.2   
pg_clip_high: 0.28  



pretrain: /Qwen2.5-7B
reward_pretrain: /Qwen2.5-7B

use_kl_loss: false

init_kl_coef: 0

enable_reference: false

enable_old_logprobs_recompute: false

force_disable_old_logprobs_recompute: true  


validation:
  data_args:
    template: qwen2_5
    file_name:
      - data/math_benchmarks_0710.jsonl
  generating_args:
    top_p: 0.6
    top_k: 50
    num_beams: 1
    temperature: 0.6
    num_return_sequences: 8
  eval_steps: 20

actor_train:
  worker_cls: roll.pipeline.rlvr.actor_worker.ActorWorker
  pg_variant: ppo # topr, vanilla, tis, cispo, kimi15, ppo
  model_args:
    flash_attn: fa2
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: ~
  training_args:
    learning_rate: 1.0e-6
    weight_decay: 0
    per_device_train_batch_size: 2
    gradient_accumulation_steps: 32
    warmup_steps: 20
    num_train_epochs: 50
  data_args:
    template: qwen2_5
    file_name:
      - data/math8k.jsonl
    domain_interleave_probs:
      math_rule: 1
      # math_rule: 0.4
      # code_sandbox: 0.3
      # llm_judge: 0.1
      # crossthinkqa: 0.1
      # ifeval: 0.1
    dataset_dir: data
    messages: messages
    interleave_probs: "1.0"
    preprocessing_num_workers: 16
  strategy_args:
    strategy_name: megatron_train
    strategy_config:
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      use_distributed_optimizer: true
      recompute_granularity: full
  device_mapping: list(range(0,8))
  infer_batch_size: 4
actor_infer:
  model_args:
    flash_attn: fa2
    disable_gradient_checkpointing: true
    dtype: bf16
  generating_args:
    max_new_tokens: ${response_length}
    top_p: 0.99
    top_k: 100
    num_beams: 1
    temperature: 0.99
    num_return_sequences: ${num_return_sequences_in_group}
    logprobs: 1
  data_args:
    template: qwen2_5
  strategy_args:
    strategy_name: vllm
    strategy_config:
      gpu_memory_utilization: 0.6
      block_size: 16
      max_model_len: 8000
reference:
  model_args:
    flash_attn: fa2
    disable_gradient_checkpointing: true
    dtype: bf16
    model_type: ~
  data_args:
    template: qwen2_5
  strategy_args:
    strategy_name: megatron_infer
    strategy_config:
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
  device_mapping: list(range(0,8))
  infer_batch_size: 4

rewards:
  math_rule:
    worker_cls: roll.pipeline.rlvr.rewards.math_rule_reward_worker.MathRuleRewardWorker
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen2_5
    tag_included: [deepmath_103k, 'MATH-500', 'OlympiadBench', 'minervamath', 'aime2025', 'gsm8k', 'aime', 'amc23', 'math_rule']
    world_size: 8
    infer_batch_size: 1

With this config, training collapses partway through. Below is a screenshot of the collapsed result.

[Image: screenshot of the collapsed model output]

I went through all the monitored metrics, and they all seem to break down only after the model has already collapsed. The one thing that looks different is that the model's response length keeps decreasing. This is the opposite of my previous projects, where response length normally keeps growing during training. With ROLL it looks like this: the overall response length keeps dropping, the length of incorrect responses keeps dropping, the length of correct responses rises slightly, and then everything falls apart at the collapse point.

[Images: response-length curves over training — overall, incorrect-response, and correct-response lengths]

I would like to know whether, when implementing the framework, you made any default changes that could cause the overall response length to decrease, or whether you have observed this training-collapse phenomenon yourselves, since this is a very basic configuration.
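For reference, this is roughly how the per-step length statistics in the plots above can be computed from a batch of rollouts. This is only a minimal sketch, not ROLL's actual logging code; the function name response_length_stats and the (response_token_ids, reward) tuple layout are illustrative assumptions.

# Minimal sketch (illustrative, not ROLL's API): per-step response-length stats,
# split by whether the rule-based reward judged the response correct.
from statistics import mean

def response_length_stats(rollouts):
    """rollouts: iterable of (response_token_ids, reward) pairs, reward in {0, 1}."""
    lengths = [(len(tokens), reward) for tokens, reward in rollouts]
    correct = [l for l, r in lengths if r == 1]
    incorrect = [l for l, r in lengths if r == 0]
    return {
        "response_length/mean": mean(l for l, _ in lengths),
        "response_length/correct_mean": mean(correct) if correct else 0.0,
        "response_length/incorrect_mean": mean(incorrect) if incorrect else 0.0,
    }

# Example: three sampled responses for one prompt, only the first judged correct.
print(response_length_stats([([0] * 120, 1), ([0] * 40, 0), ([0] * 15, 0)]))

These are the three quantities plotted above: the overall mean keeps falling, the incorrect-response mean keeps falling, and the correct-response mean rises slightly until the collapse point.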
