Under on-policy training, log_prob and old_log_probs values do not match, and training partially collapses #284

@millioniron

Description

As shown in the screenshots:

Image

Under purely on-policy training, approx_kl fluctuates wildly during training, and log_prob and old_log_probs diverge substantially,

Image

which causes training to collapse.

Image
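
For reference, the gap can be measured directly from the two per-token log-prob tensors: old_log_probs returned by the rollout engine (sglang here, via logprobs: 1) and log_probs recomputed by the training engine. A minimal diagnostic sketch in PyTorch, with hypothetical tensor names and shapes (not the framework's own API):

```python
import torch

def logprob_mismatch_stats(log_probs: torch.Tensor,
                           old_log_probs: torch.Tensor,
                           response_mask: torch.Tensor):
    """Quantify the trainer-vs-rollout log-prob gap.

    log_probs / old_log_probs: [batch, seq_len] per-token log-probs,
    response_mask: [batch, seq_len], 1 on response tokens, 0 elsewhere.
    """
    delta = (old_log_probs - log_probs) * response_mask
    n_tokens = response_mask.sum().clamp(min=1)
    approx_kl = delta.sum() / n_tokens          # k1 estimator: E[log pi_old - log pi]
    ratio = torch.exp(log_probs - old_log_probs)
    # Set non-response positions to 1 so the max reflects only response tokens.
    max_ratio = (ratio * response_mask + (1 - response_mask)).max()
    return approx_kl.item(), max_ratio.item()
```

On truly on-policy data both numbers should stay near 0 and 1 respectively; large spikes point at a numerical mismatch between the inference and training engines rather than at a stale policy.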

The config is as follows:

hydra:
  run:
    dir: .
  output_subdir: null

pg_variant: ppo # topr, vanilla, tis, cispo, kimi15, ppo
exp_name: Qwen2.5-7B-RLVR-finial_true_infer_correction-${pg_variant}-${now:%Y%m%d-%H%M%S}
seed: 42
logging_dir: ./output/logs/${exp_name}
output_dir: ./output
system_envs:
  USE_MODELSCOPE: '1'

checkpoint_config:
  type: file_system
  output_dir: /data/cpfs_0/rl_examples/models/${exp_name}

# track_with: wandb
# tracker_kwargs:
#  api_key: xxx
#  project: roll_examples
#  name: ${exp_name}
#  notes: roll_examples
#  tags: rlvr
#    - rlvr
#    - baseline
track_with: tensorboard
tracker_kwargs:
  log_dir: ./tensorboard/roll_exp/${exp_name}

num_gpus_per_node: 8

max_steps: 500
save_steps: 100
logging_steps: 1
eval_steps: 20
resume_from_checkpoint: false


rollout_batch_size: 64  # prompt
prompt_length: 2048
response_length: 12288

num_return_sequences_in_group: 8
ppo_epochs: 1
adv_estimator: "grpo"

# data mask
max_len_mask: true
error_max_len_clip: false

# data weight
difficulty_loss_weight: false
length_loss_weight: false

# reward
add_token_level_kl: false

# advantage
whiten_advantages: true

use_pg_clip_range: true  
pg_clip_low: 0.2   
pg_clip_high: 0.28  


# dynamic sampling scheduler
# use_additional_prompts: true
# max_running_requests: 256
# is_num_return_sequences_expand: false

pretrain: /Qwen2.5-7B
reward_pretrain: /Qwen2.5-7B

# infer_correction: true 

# infer_is_mode: token 
# infer_is_threshold_min: 0.0
# infer_is_threshold_max: 2.0     # 1.5~5.0

# enable_token_reject: false
# infer_token_mask_threshold_min: 0.0
# infer_token_mask_threshold_max: 2.0 # 2~10

# enable_catastrophic_reject: false
# infer_catastrophic_threshold: 1e-4

# enable_seq_reject: None

validation:
  data_args:
    template: qwen2_5
    file_name:
      - data/math_benchmarks_0710.jsonl
  generating_args:
    top_p: 0.6
    top_k: 50
    num_beams: 1
    temperature: 0.6
    num_return_sequences: 1
  eval_steps: 20

actor_train:
  worker_cls: roll.pipeline.rlvr.actor_pg_worker.ActorPGWorker
  pg_variant: ppo # topr, vanilla, tis, cispo, kimi15, ppo
  model_args:
    flash_attn: fa2
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: ~
  training_args:
    learning_rate: 1.0e-6
    weight_decay: 0
    per_device_train_batch_size: 1
    gradient_accumulation_steps: 64
    warmup_steps: 20
    num_train_epochs: 50
  data_args:
    template: qwen2_5
    file_name:
      - data/math_deepmath_deal.jsonl
    domain_interleave_probs:
      math_rule: 1
      # math_rule: 0.4
      # code_sandbox: 0.3
      # llm_judge: 0.1
      # crossthinkqa: 0.1
      # ifeval: 0.1
    dataset_dir: data
    messages: messages
    interleave_probs: "1.0"
    preprocessing_num_workers: 16
  strategy_args:
    strategy_name: megatron_train
    strategy_config:
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      use_distributed_optimizer: true
      recompute_granularity: full
  device_mapping: list(range(0,8))
  infer_batch_size: 2
  use_dynamic_batching_in_train: true
  max_tokens_per_microbatch_in_train: 30732
  sequence_length_round_in_train: 128
  use_dynamic_batching_in_infer: true
  max_tokens_per_microbatch_in_infer: 43008
  sequence_length_round_in_infer: 128

actor_infer:
  model_args:
    flash_attn: fa2
    disable_gradient_checkpointing: true
    dtype: bf16
  generating_args:
    max_new_tokens: ${response_length}
    top_p: 1
    top_k: 100
    num_beams: 1
    temperature: 1
    num_return_sequences: ${num_return_sequences_in_group}
    logprobs: 1
  data_args:
    template: qwen2_5
  strategy_args:
    # strategy_name: vllm
    # strategy_config:
    #   gpu_memory_utilization: 0.6
    #   block_size: 16
    #   max_model_len: 8000
    strategy_name: sglang
    strategy_config:
      mem_fraction_static: 0.8
      load_format: dummy
  device_mapping: list(range(0,8))
  infer_batch_size: 2
  use_dynamic_batching_in_infer: true
  max_tokens_per_microbatch_in_infer: 43008
  sequence_length_round_in_infer: 128

reference:
  model_args:
    flash_attn: fa2
    disable_gradient_checkpointing: true
    dtype: bf16
    model_type: ~
  data_args:
    template: qwen2_5
  strategy_args:
    strategy_name: megatron_infer
    strategy_config:
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
  device_mapping: list(range(0,8))
  infer_batch_size: 2
  use_dynamic_batching_in_infer: true
  max_tokens_per_microbatch_in_infer: 43008
  sequence_length_round_in_infer: 128

rewards:
  math_rule:
    worker_cls: roll.pipeline.rlvr.rewards.math_rule_reward_worker.MathRuleRewardWorker
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen2_5
    tag_included: [deepmath_103k, 'MATH-500', 'OlympiadBench', 'minervamath', 'aime2025', 'gsm8k', 'aime', 'amc23', 'math_rule']
    world_size: 8
    infer_batch_size: 1
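
The commented-out infer_correction / infer_is_* options above appear to describe a token-level truncated importance-sampling correction for exactly this rollout-vs-trainer mismatch. As a rough illustration only (hypothetical function, not the framework's actual implementation), such a correction typically clamps the per-token ratio before it scales the policy-gradient loss:

```python
import torch

def truncated_is_weights(log_probs: torch.Tensor,
                         old_log_probs: torch.Tensor,
                         is_min: float = 0.0,
                         is_max: float = 2.0) -> torch.Tensor:
    """Token-level importance weights pi_train / pi_rollout, clamped to [is_min, is_max].

    Multiplying the per-token loss by these detached weights down-weights tokens
    where the rollout engine's log-probs disagree with the trainer's recomputed
    log-probs, instead of letting the ratio blow up the gradient.
    """
    ratio = torch.exp(log_probs - old_log_probs)
    return ratio.clamp(min=is_min, max=is_max).detach()
```

The 1.5–5.0 range noted next to infer_is_threshold_max would then be the cap applied to that ratio.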
