
Training with the default RLVR config breaks: response length keeps shrinking until it hits 0 or the model stops answering, and model performance collapses #303

@millioniron

Description

As described in the title, I am using what is essentially the stock official script:

hydra:
  run:
    dir: .
  output_subdir: null

pg_variant: ppo # topr, vanilla, tis, cispo, kimi15, ppo
exp_name: Qwen2.5-7B-RLVR-finial_true_infer_correction-${pg_variant}-${now:%Y%m%d-%H%M%S}
seed: 42
logging_dir: ./output/logs/${exp_name}
output_dir: ./output
system_envs:
  USE_MODELSCOPE: '1'

checkpoint_config:
  type: file_system
  output_dir: /data/cpfs_0/rl_examples/models/${exp_name}
track_with: tensorboard
tracker_kwargs:
  log_dir: ./tensorboard/roll_exp/${exp_name}

num_gpus_per_node: 8

max_steps: 500
save_steps: 100
logging_steps: 1
eval_steps: 20
resume_from_checkpoint: false

max_grad_norm: 1.0

rollout_batch_size: 64  # prompt
prompt_length: 2048
response_length: 8196

num_return_sequences_in_group: 8
ppo_epochs: 1
adv_estimator: "grpo"

# data mask
max_len_mask: false
error_max_len_clip: false

# data weight
difficulty_loss_weight: false
length_loss_weight: false

# reward
add_token_level_kl: false

# advantage
whiten_advantages: false

use_pg_clip_range: true  
pg_clip_low: 0.2   
pg_clip_high: 0.28  



pretrain: /Qwen2.5-7B
reward_pretrain: /Qwen2.5-7B

use_kl_loss: false

init_kl_coef: 0

enable_reference: false

enable_old_logprobs_recompute: false

force_disable_old_logprobs_recompute: true  


validation:
  data_args:
    template: qwen2_5
    file_name:
      - data/math_benchmarks_0710.jsonl
  generating_args:
    top_p: 0.6
    top_k: 50
    num_beams: 1
    temperature: 0.6
    num_return_sequences: 8
  eval_steps: 20

actor_train:
  worker_cls: roll.pipeline.rlvr.actor_worker.ActorWorker
  pg_variant: ppo # topr, vanilla, tis, cispo, kimi15, ppo
  model_args:
    flash_attn: fa2
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: ~
  training_args:
    learning_rate: 1.0e-6
    weight_decay: 0
    per_device_train_batch_size: 2
    gradient_accumulation_steps: 32
    warmup_steps: 20
    num_train_epochs: 50
  data_args:
    template: qwen2_5
    file_name:
      - data/math8k.jsonl
    domain_interleave_probs:
      math_rule: 1
      # math_rule: 0.4
      # code_sandbox: 0.3
      # llm_judge: 0.1
      # crossthinkqa: 0.1
      # ifeval: 0.1
    dataset_dir: data
    messages: messages
    interleave_probs: "1.0"
    preprocessing_num_workers: 16
  strategy_args:
    strategy_name: megatron_train
    strategy_config:
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      use_distributed_optimizer: true
      recompute_granularity: full
  device_mapping: list(range(0,8))
  infer_batch_size: 4
actor_infer:
  model_args:
    flash_attn: fa2
    disable_gradient_checkpointing: true
    dtype: bf16
  generating_args:
    max_new_tokens: ${response_length}
    top_p: 0.99
    top_k: 100
    num_beams: 1
    temperature: 0.99
    num_return_sequences: ${num_return_sequences_in_group}
    logprobs: 1
  data_args:
    template: qwen2_5
  strategy_args:
    strategy_name: vllm
    strategy_config:
      gpu_memory_utilization: 0.6
      block_size: 16
      max_model_len: 8000
reference:
  model_args:
    flash_attn: fa2
    disable_gradient_checkpointing: true
    dtype: bf16
    model_type: ~
  data_args:
    template: qwen2_5
  strategy_args:
    strategy_name: megatron_infer
    strategy_config:
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
  device_mapping: list(range(0,8))
  infer_batch_size: 4

rewards:
  math_rule:
    worker_cls: roll.pipeline.rlvr.rewards.math_rule_reward_worker.MathRuleRewardWorker
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen2_5
    tag_included: [deepmath_103k, 'MATH-500', 'OlympiadBench', 'minervamath', 'aime2025', 'gsm8k', 'aime', 'amc23', 'math_rule']
    world_size: 8
    infer_batch_size: 1

With this config, training collapses partway through. Below is a screenshot of the collapsed result.

[Image: screenshot of the collapsed model output]

I went through all the monitored metrics, and they all seem to break down only after the model has already collapsed. The one thing that looks different is that the model's response length keeps decreasing. This is the opposite of my previous projects, where response length normally keeps growing during training. With ROLL it looks like this: the overall response length keeps dropping, the length of incorrect responses keeps dropping, the length of correct responses rises slightly, and then everything falls apart at the collapse point.

[Images: response-length curves over training — overall, incorrect-response, and correct-response lengths]

I would like to know whether, when implementing the framework, you made any default changes that could cause the overall response length to decrease, or whether you have observed this training-collapse phenomenon yourselves, since this is a very basic configuration.
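For reference, this is roughly how the per-step length statistics in the plots above can be computed from a batch of rollouts. This is only a minimal sketch, not ROLL's actual logging code; the function name response_length_stats and the (response_token_ids, reward) tuple layout are illustrative assumptions.

# Minimal sketch (illustrative, not ROLL's API): per-step response-length stats,
# split by whether the rule-based reward judged the response correct.
from statistics import mean

def response_length_stats(rollouts):
    """rollouts: iterable of (response_token_ids, reward) pairs, reward in {0, 1}."""
    lengths = [(len(tokens), reward) for tokens, reward in rollouts]
    correct = [l for l, r in lengths if r == 1]
    incorrect = [l for l, r in lengths if r == 0]
    return {
        "response_length/mean": mean(l for l, _ in lengths),
        "response_length/correct_mean": mean(correct) if correct else 0.0,
        "response_length/incorrect_mean": mean(incorrect) if incorrect else 0.0,
    }

# Example: three sampled responses for one prompt, only the first judged correct.
print(response_length_stats([([0] * 120, 1), ([0] * 40, 0), ([0] * 15, 0)]))

These are the three quantities plotted above: the overall mean keeps falling, the incorrect-response mean keeps falling, and the correct-response mean rises slightly until the collapse point.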
