Under on-policy training, log_prob and old_log_probs values do not match, and training partially collapses #284

@millioniron

Description

As shown in the screenshots:

Image

Under purely on-policy training, approx_kl fluctuates wildly during training, and log_prob and old_log_probs diverge substantially,

Image

which causes training to collapse.

Image
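
For reference, the gap can be measured directly from the two per-token log-prob tensors: old_log_probs returned by the rollout engine (sglang here, via logprobs: 1) and log_probs recomputed by the training engine. A minimal diagnostic sketch in PyTorch, with hypothetical tensor names and shapes (not the framework's own API):

```python
import torch

def logprob_mismatch_stats(log_probs: torch.Tensor,
                           old_log_probs: torch.Tensor,
                           response_mask: torch.Tensor):
    """Quantify the trainer-vs-rollout log-prob gap.

    log_probs / old_log_probs: [batch, seq_len] per-token log-probs,
    response_mask: [batch, seq_len], 1 on response tokens, 0 elsewhere.
    """
    delta = (old_log_probs - log_probs) * response_mask
    n_tokens = response_mask.sum().clamp(min=1)
    approx_kl = delta.sum() / n_tokens          # k1 estimator: E[log pi_old - log pi]
    ratio = torch.exp(log_probs - old_log_probs)
    # Set non-response positions to 1 so the max reflects only response tokens.
    max_ratio = (ratio * response_mask + (1 - response_mask)).max()
    return approx_kl.item(), max_ratio.item()
```

On truly on-policy data both numbers should stay near 0 and 1 respectively; large spikes point at a numerical mismatch between the inference and training engines rather than at a stale policy.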

The config is as follows:

hydra:
  run:
    dir: .
  output_subdir: null

pg_variant: ppo # topr, vanilla, tis, cispo, kimi15, ppo
exp_name: Qwen2.5-7B-RLVR-finial_true_infer_correction-${pg_variant}-${now:%Y%m%d-%H%M%S}
seed: 42
logging_dir: ./output/logs/${exp_name}
output_dir: ./output
system_envs:
  USE_MODELSCOPE: '1'

checkpoint_config:
  type: file_system
  output_dir: /data/cpfs_0/rl_examples/models/${exp_name}

# track_with: wandb
# tracker_kwargs:
#  api_key: xxx
#  project: roll_examples
#  name: ${exp_name}
#  notes: roll_examples
#  tags: rlvr
#    - rlvr
#    - baseline
track_with: tensorboard
tracker_kwargs:
  log_dir: ./tensorboard/roll_exp/${exp_name}

num_gpus_per_node: 8

max_steps: 500
save_steps: 100
logging_steps: 1
eval_steps: 20
resume_from_checkpoint: false


rollout_batch_size: 64  # prompt
prompt_length: 2048
response_length: 12288

num_return_sequences_in_group: 8
ppo_epochs: 1
adv_estimator: "grpo"

# data mask
max_len_mask: true
error_max_len_clip: false

# data weight
difficulty_loss_weight: false
length_loss_weight: false

# reward
add_token_level_kl: false

# advantage
whiten_advantages: true

use_pg_clip_range: true  
pg_clip_low: 0.2   
pg_clip_high: 0.28  


# dynamic sampling scheduler
# use_additional_prompts: true
# max_running_requests: 256
# is_num_return_sequences_expand: false

pretrain: /Qwen2.5-7B
reward_pretrain: /Qwen2.5-7B

# infer_correction: true 

# infer_is_mode: token 
# infer_is_threshold_min: 0.0
# infer_is_threshold_max: 2.0     # 1.5~5.0

# enable_token_reject: false
# infer_token_mask_threshold_min: 0.0
# infer_token_mask_threshold_max: 2.0 # 2~10

# enable_catastrophic_reject: false
# infer_catastrophic_threshold: 1e-4

# enable_seq_reject: None

validation:
  data_args:
    template: qwen2_5
    file_name:
      - data/math_benchmarks_0710.jsonl
  generating_args:
    top_p: 0.6
    top_k: 50
    num_beams: 1
    temperature: 0.6
    num_return_sequences: 1
  eval_steps: 20

actor_train:
  worker_cls: roll.pipeline.rlvr.actor_pg_worker.ActorPGWorker
  pg_variant: ppo # topr, vanilla, tis, cispo, kimi15, ppo
  model_args:
    flash_attn: fa2
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: ~
  training_args:
    learning_rate: 1.0e-6
    weight_decay: 0
    per_device_train_batch_size: 1
    gradient_accumulation_steps: 64
    warmup_steps: 20
    num_train_epochs: 50
  data_args:
    template: qwen2_5
    file_name:
      - data/math_deepmath_deal.jsonl
    domain_interleave_probs:
      math_rule: 1
      # math_rule: 0.4
      # code_sandbox: 0.3
      # llm_judge: 0.1
      # crossthinkqa: 0.1
      # ifeval: 0.1
    dataset_dir: data
    messages: messages
    interleave_probs: "1.0"
    preprocessing_num_workers: 16
  strategy_args:
    strategy_name: megatron_train
    strategy_config:
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      use_distributed_optimizer: true
      recompute_granularity: full
  device_mapping: list(range(0,8))
  infer_batch_size: 2
  use_dynamic_batching_in_train: true
  max_tokens_per_microbatch_in_train: 30732
  sequence_length_round_in_train: 128
  use_dynamic_batching_in_infer: true
  max_tokens_per_microbatch_in_infer: 43008
  sequence_length_round_in_infer: 128

actor_infer:
  model_args:
    flash_attn: fa2
    disable_gradient_checkpointing: true
    dtype: bf16
  generating_args:
    max_new_tokens: ${response_length}
    top_p: 1
    top_k: 100
    num_beams: 1
    temperature: 1
    num_return_sequences: ${num_return_sequences_in_group}
    logprobs: 1
  data_args:
    template: qwen2_5
  strategy_args:
    # strategy_name: vllm
    # strategy_config:
    #   gpu_memory_utilization: 0.6
    #   block_size: 16
    #   max_model_len: 8000
    strategy_name: sglang
    strategy_config:
      mem_fraction_static: 0.8
      load_format: dummy
  device_mapping: list(range(0,8))
  infer_batch_size: 2
  use_dynamic_batching_in_infer: true
  max_tokens_per_microbatch_in_infer: 43008
  sequence_length_round_in_infer: 128

reference:
  model_args:
    flash_attn: fa2
    disable_gradient_checkpointing: true
    dtype: bf16
    model_type: ~
  data_args:
    template: qwen2_5
  strategy_args:
    strategy_name: megatron_infer
    strategy_config:
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
  device_mapping: list(range(0,8))
  infer_batch_size: 2
  use_dynamic_batching_in_infer: true
  max_tokens_per_microbatch_in_infer: 43008
  sequence_length_round_in_infer: 128

rewards:
  math_rule:
    worker_cls: roll.pipeline.rlvr.rewards.math_rule_reward_worker.MathRuleRewardWorker
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen2_5
    tag_included: [deepmath_103k, 'MATH-500', 'OlympiadBench', 'minervamath', 'aime2025', 'gsm8k', 'aime', 'amc23', 'math_rule']
    world_size: 8
    infer_batch_size: 1
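
The commented-out infer_correction / infer_is_* options above appear to describe a token-level truncated importance-sampling correction for exactly this rollout-vs-trainer mismatch. As a rough illustration only (hypothetical function, not the framework's actual implementation), such a correction typically clamps the per-token ratio before it scales the policy-gradient loss:

```python
import torch

def truncated_is_weights(log_probs: torch.Tensor,
                         old_log_probs: torch.Tensor,
                         is_min: float = 0.0,
                         is_max: float = 2.0) -> torch.Tensor:
    """Token-level importance weights pi_train / pi_rollout, clamped to [is_min, is_max].

    Multiplying the per-token loss by these detached weights down-weights tokens
    where the rollout engine's log-probs disagree with the trainer's recomputed
    log-probs, instead of letting the ratio blow up the gradient.
    """
    ratio = torch.exp(log_probs - old_log_probs)
    return ratio.clamp(min=is_min, max=is_max).detach()
```

The 1.5–5.0 range noted next to infer_is_threshold_max would then be the cap applied to that ratio.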
