# Hydra runtime settings: run from the current directory and do not create
# the .hydra output subdirectory.
hydra:
  run:
    dir: .
  output_subdir: null
5+
# Experiment identity and output locations.
exp_name: "qwen3-235BA22B-rlvr-config"
seed: 42
logging_dir: ./output/logs
output_dir: ./output
# Environment variables exported to worker processes; kept as a quoted string
# so the consumer receives "1", not the integer 1.
system_envs:
  USE_MODELSCOPE: '1'
12+
# Checkpoints are written to the local file system under the experiment name.
checkpoint_config:
  type: file_system
  output_dir: ./rl_examples/models/${exp_name}
16+
# Metric tracking backend and its arguments.
track_with: tensorboard
tracker_kwargs:
  log_dir: ./rl_examples/llm/tensorboard/roll_exp/rlvr
20+
num_gpus_per_node: 8

# Training schedule: total optimizer steps and the cadence of saving,
# logging, and evaluation.
max_steps: 500
save_steps: 100
logging_steps: 1
eval_steps: 10
resume_from_checkpoint: false
28+
29+
rollout_batch_size: 64  # prompts per rollout batch
prompt_length: 2048
response_length: 4096

num_return_sequences_in_group: 8
ppo_epochs: 1
adv_estimator: "reinforce"

# clip
value_clip: 0.5
reward_clip: 10
advantage_clip: 2.0
dual_clip_loss: true

# normalize
reward_norm: null
reward_shift: false
reward_scale: false

# data mask: drop over-length samples and filter prompts outside the
# configured difficulty band.
max_len_mask: true
difficulty_mask: true
difficulty_low_threshold: 0.1
difficulty_high_threshold: 0.95
error_max_len_clip: false

# data weight
difficulty_loss_weight: false
length_loss_weight: false

# reward
add_token_level_kl: false

# advantage
whiten_advantages: true

# dynamic sampling scheduler (disabled)
# use_additional_prompts: true
# max_running_requests: 256
# is_num_return_sequences_expand: false
70+
# Base model for the actor and for the reward workers (referenced below
# via ${reward_pretrain}).
pretrain: Qwen/Qwen3-235B-A22B
reward_pretrain: Qwen/Qwen3-235B-A22B
73+
# Periodic evaluation on held-out math benchmarks with low-temperature
# single-sample decoding.
validation:
  data_args:
    template: qwen3
    file_name:
      - data/math_benchmarks.jsonl
  generating_args:
    top_p: 0.6
    top_k: 50
    num_beams: 1
    temperature: 0.6
    num_return_sequences: 1
  eval_steps: 10
86+
# Actor training worker: Megatron training strategy across 256 GPUs with
# tensor/pipeline/expert parallelism sized for the 235B MoE model.
actor_train:
  model_args:
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: null
  training_args:
    learning_rate: 1.0e-6
    weight_decay: 0
    per_device_train_batch_size: 1
    gradient_accumulation_steps: 64
    warmup_steps: 20
    num_train_epochs: 50
  data_args:
    template: qwen3
    file_name:
      - data/code_KodCode_data.jsonl
      # - data/llm_judge_Multi-subject-RLVR_deal_new.jsonl
      - data/math_deepmath_deal.jsonl
      - data/general_ifeval_train_deal.jsonl
      - data/general_CrossThink-QA_deal.jsonl
    # NOTE(review): with llm_judge commented out these probabilities sum to
    # 0.9, not 1.0 — confirm the data loader renormalizes.
    domain_interleave_probs:
      math_rule: 0.4
      code_sandbox: 0.3
      # llm_judge: 0.1
      crossthinkqa: 0.1
      ifeval: 0.1
    dataset_dir: data
    messages: messages
    interleave_probs: "1.0"
    preprocessing_num_workers: 16
  strategy_args:
    strategy_name: megatron_train
    strategy_config:
      tensor_model_parallel_size: 4
      pipeline_model_parallel_size: 8
      virtual_pipeline_model_parallel_size: 6
      expert_model_parallel_size: 8
      context_parallel_size: 1
      account_for_loss_in_pipeline_split: true
      account_for_embedding_in_pipeline_split: true
      use_distributed_optimizer: true
      sequence_parallel: true
      overlap_grad_reduce: true
      bias_activation_fusion: true
      apply_rope_fusion: true
      moe_grouped_gemm: true
      moe_layer_recompute: true
      moe_token_dispatcher_type: "alltoall"
  device_mapping: list(range(0,256))
  infer_batch_size: 2
137+
# Actor rollout worker: vLLM generation with high-temperature sampling,
# ${num_return_sequences_in_group} responses per prompt.
actor_infer:
  model_args:
    disable_gradient_checkpointing: true
    dtype: bf16
  generating_args:
    max_new_tokens: ${response_length}
    top_p: 0.99
    top_k: 100
    num_beams: 1
    temperature: 0.99
    num_return_sequences: ${num_return_sequences_in_group}
  data_args:
    template: qwen3
  strategy_args:
    strategy_name: vllm
    strategy_config:
      gpu_memory_utilization: 0.75
      load_format: dummy
      tensor_parallel_size: 8
    # NOTE(review): placement of num_gpus_per_worker under strategy_args
    # reconstructed from key order — confirm against the worker schema.
    num_gpus_per_worker: 8
  device_mapping: list(range(0,200))  # devices shared with the LLM reward worker
  infer_batch_size: 1
160+
# Frozen reference model for KL/log-prob computation, served with the
# Megatron inference strategy on the same 256-GPU mapping as actor_train.
reference:
  model_args:
    dtype: bf16
    model_type: null
  data_args:
    template: qwen3
  strategy_args:
    strategy_name: megatron_infer
    strategy_config:
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 8
      virtual_pipeline_model_parallel_size: 6
      expert_model_parallel_size: 8
      account_for_loss_in_pipeline_split: true
      account_for_embedding_in_pipeline_split: true
      use_distributed_optimizer: true
      sequence_parallel: true
      bias_activation_fusion: true
      apply_rope_fusion: true
      moe_grouped_gemm: true
      moe_token_dispatcher_type: "alltoall"
  device_mapping: list(range(0,256))
  infer_batch_size: 2
184+
# Reward workers, one per data domain; samples are routed by tag_included.
rewards:
  crossthinkqa:
    worker_cls: roll.pipeline.rlvr.rewards.crossthinkqa_rule_reward_worker.CrossThinkQARuleRewardWorker
    reward_type: soft
    response_length_penalty_coef: 0.0
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen3
    tag_included: [crossthinkqa]
    world_size: 8
    infer_batch_size: 4
  ifeval:
    worker_cls: roll.pipeline.rlvr.rewards.ifeval_rule_reward_worker.GeneralRuleRewardWorker
    reward_type: soft
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen3
    tag_included: [ifeval]
    world_size: 8
    infer_batch_size: 4
  math_rule:
    worker_cls: roll.pipeline.rlvr.rewards.math_rule_reward_worker.MathRuleRewardWorker
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen3
    tag_included: [deepmath_103k, aime]
    world_size: 8
    infer_batch_size: 1
    # dynamic filter config (disabled)
    # query_filter_config:
    #   type: mean_filter
    #   filter_args:
    #     threshold_up: 0.9
    #     threshold_down: 0.1
  code_sandbox:
    use_local: true
    worker_cls: roll.pipeline.rlvr.rewards.code_sandbox_reward_worker.CodeSandboxRewardWorker
    tag_included: [KodCode]
    model_args:
      model_name_or_path: ${reward_pretrain}
    data_args:
      template: qwen3
    world_size: 8
    infer_batch_size: 1
    # query_filter_config:
    #   type: std_filter
    #   filter_args:
    #     std_threshold: 0
  llm_judge:
    # NOTE: llm-as-judge also needs GPUs and must not share GPUs with actor_infer
    # (hence device_mapping starts at 200, past actor_infer's range).
    worker_cls: roll.pipeline.rlvr.rewards.llm_judge_reward_worker.LLMJudgeRewardWorker
    judge_prompt: Qwen2.5-7B-Instruct-RLVR-prompt
    judge_model_type: inference
    tag_included: [RLVR]
    model_args:
      model_name_or_path: virtuoussy/Qwen2.5-7B-Instruct-RLVR
      attn_implementation: fa2
      disable_gradient_checkpointing: true
      dtype: bf16
      model_type: trl
    generating_args:
      max_new_tokens: 100
      top_p: 0.8
      top_k: 50
      num_beams: 1
      temperature: 0.8
      num_return_sequences: 1
    data_args:
      template: qwen3
    strategy_args:
      strategy_name: hf_infer
      strategy_config: null
    device_mapping: list(range(200,256))
    infer_batch_size: 4