Merged
Commits
50 commits
b42b828
(fix): update math rule reward worker.
Oct 28, 2025
f609508
(feat): set RAY_CGRAPH_get_timeout=600.
PanAndy Oct 29, 2025
3f18fda
(fix): vllm 0.11.0 import
emiedon Oct 29, 2025
4dc7f5b
(fix): fix train infer ratio/diff mean & add train infer ratio/diff t…
ToasterSC Nov 5, 2025
eb0d106
(feat): support vllm beam_search.
PanAndy Nov 5, 2025
45414fa
(fix): ensure compatibility with transformers version check for causa…
chocoded Nov 5, 2025
9aea395
(feat): support pytorch280 docker.
PanAndy Dec 5, 2025
a8019db
(fix): fix agentic val get_batch state in redundancy env.
PanAndy Nov 7, 2025
c35095b
(feat): Add support for Qwen-3-next on AMD GPUs.
Nov 18, 2025
76c10fb
fix: fix tokenizer usage in llm judge reward worker.
guoshengCS Nov 28, 2025
ed79308
(feat): add vlm option.
PanAndy Dec 5, 2025
020e909
(feat): agentic-spec actor worker.
Oct 30, 2025
01db3d4
(feat): agentic_filter_task.
PanAndy Dec 2, 2025
3423f76
(refactor): agentic pipeline modify.
Oct 31, 2025
2306a0f
(fix): update error logging for image loading failure.
chocoded Oct 31, 2025
e59bd18
(fix): fix max_len_mask key.
Oct 31, 2025
2e63003
(feat): add infer_log_probs in agentic.
PanAndy Dec 2, 2025
c5cdbed
(feat): update mcore_adapter.
PanAndy Dec 5, 2025
654caec
(fix): fix bugs in data fetching for face embeddings.
Nov 5, 2025
51e5358
(feat): add agentic chunk.
PanAndy Dec 2, 2025
a45150a
(feat): add sglang 0.4.6.post5.
PanAndy Dec 5, 2025
1a46d50
(feat): support offload nccl to save gpu memory.
xuehuanran Nov 7, 2025
98ec5d6
(feat): support pytorch280 docker.
PanAndy Dec 5, 2025
3bf1810
(fix): fix vllm 0110 model_config.
PanAndy Nov 10, 2025
f698891
(refactor): refactor agentic norm.
Nov 11, 2025
86297d9
(feat): add agentic profile metrics.
PanAndy Dec 2, 2025
601a761
(feat): sglang 054 patch.
emiedon Nov 11, 2025
ccae407
(feat): add enable_reference option.
PanAndy Nov 11, 2025
e0e6408
(fix): fix agentic reference.
PanAndy Nov 12, 2025
2dae7c1
(feat): add flash-linear-attention.
PanAndy Dec 5, 2025
742efe4
(fix): vllm _generate_standard missing prompt_token_ids input args in…
HuangJoJo Nov 13, 2025
ce5331a
(fix): sglang 054post2 tp worker init wrong.
emiedon Nov 13, 2025
5faa728
(fix): vllm add missing argument is_lora in function update_parameter.
hydrozhao Nov 14, 2025
88a8366
(feat): update mcore_adapter.
PanAndy Dec 5, 2025
7f9785d
(fix): fix get_cached_module_file.
PanAndy Dec 5, 2025
e224453
(fix): fix bugs with metrics recording in the DPO pipeline.
Schnabel-8 Nov 17, 2025
eac3dad
(feat): add enable_old_logprobs, opt old log probs by cache.
PanAndy Nov 17, 2025
deb3758
(fix): update image loading logic for byte data in rlvr_vlm_pipeline.py
chocoded Nov 18, 2025
19c1769
(feat): mcore_adapter support qwen3vl.
liu-zichen Nov 18, 2025
743d2b0
(fix): add force_vit flags for image and video processing in Qwen3 VL…
chocoded Nov 18, 2025
e974e40
(feat): add qwen3-vl example.
PanAndy Dec 5, 2025
c21475f
(feat): mock infer.
Nov 21, 2025
0625adb
(feat): add qwen3-vl 32B example.
PanAndy Dec 5, 2025
9c2ae46
(feat): add sequence packing for sft pipeline and distill pipeline, o…
Schnabel-8 Nov 24, 2025
9e03c4c
(feat): add alive check.
PanAndy Nov 24, 2025
5ddf0ad
(feat): sglang support dp-attention.
emiedon Nov 25, 2025
b082a82
(fix): set broadcast_non_tensor_batch for old_logprobs.
PanAndy Dec 3, 2025
82a0477
(fix): fix vllm get_metrics exception.
PanAndy Dec 4, 2025
58208c1
(fix): fix vllm 0110.
PanAndy Dec 4, 2025
af34922
(fix): fix AgenticAcotrWorker import.
PanAndy Dec 4, 2025
14 changes: 14 additions & 0 deletions docs_roll/docs/User Guides/Configuration/vllm.md
@@ -74,6 +74,20 @@ In the configuration example, we can see:

This design allows different components to choose the most suitable inference engine according to their needs.

### beam_search Configuration
RLVRPipeline supports vLLM's beam_search generation method, configured as follows:
```yaml
generate_opt_level: 0 # falls back to the batch_generate generation mode; generate_opt_level=1 is the prompt-level parallel mode
num_return_sequences_in_group: 8
actor_infer:
generating_args:
num_beams: ${num_return_sequences_in_group}
num_return_sequences: ${num_return_sequences_in_group}
```
Note:
- generating_args.num_beams and generating_args.num_return_sequences must be set to the same value (the sketch below illustrates why).
- The generating_args under the validate configuration is set in the same way.
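
For intuition, here is a minimal, self-contained sketch of what these settings roughly map to at the vLLM level. It is not the RLVRPipeline implementation; it assumes the offline `LLM.beam_search` / `BeamSearchParams` API available in recent vLLM releases (exact names and output attributes may vary by version), and it reuses the values from the example above (beam width 8, 128 new tokens, the Qwen2.5-0.5B-Instruct model used elsewhere in this repo's examples).

```python
# Hedged illustration only: assumes vLLM's offline beam-search API
# (LLM.beam_search + BeamSearchParams); this is not ROLL's actual code path.
from vllm import LLM
from vllm.sampling_params import BeamSearchParams

llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", gpu_memory_utilization=0.6)

# beam_width plays the role of both num_beams and num_return_sequences:
# beam search keeps beam_width hypotheses and returns all of them,
# which is why the two config values must match.
params = BeamSearchParams(beam_width=8, max_tokens=128)

outputs = llm.beam_search([{"prompt": "1 + 1 ="}], params)
tokenizer = llm.get_tokenizer()
for beam in outputs[0].sequences:  # one entry per returned beam
    print(tokenizer.decode(beam.tokens))
```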

## Performance Optimization Recommendations

1. **Memory Management**:
@@ -74,6 +74,21 @@ actor_infer:

This design allows different components to choose the most suitable inference engine according to their needs.

### beam_search Configuration
RLVRPipeline supports vLLM's beam_search generation method, configured as follows:
```yaml
generate_opt_level: 0 # falls back to the batch_generate generation mode; generate_opt_level=1 is the prompt-level parallel mode
num_return_sequences_in_group: 8
actor_infer:
generating_args:
num_beams: ${num_return_sequences_in_group}
num_return_sequences: ${num_return_sequences_in_group}
```
Note:
- generating_args.num_beams and generating_args.num_return_sequences must be set to the same value.
- The generating_args under the validate configuration is set in the same way.


## Performance Optimization Recommendations

1. **Memory Management**:
5 changes: 2 additions & 3 deletions examples/qwen2.5-0.5B-agentic/agent_val_frozen_lake_amd.yaml
@@ -107,7 +107,7 @@ actor_infer:
strategy_args:
strategy_name: vllm
strategy_config:
gpu_memory_utilization: 0.4
gpu_memory_utilization: 0.6
block_size: 16
load_format: auto
device_mapping: list(range(0,8))
Expand All @@ -131,7 +131,6 @@ reward_normalization:
method: mean_std # asym_clip / identity / mean_std

train_env_manager:
format_penalty: -0.15 # sokoban env penalty_for_step=-0.1
max_env_num_per_worker: 16
num_env_groups: 128
# under the same group, the env config and env seed are ensured to be equal
@@ -163,8 +162,8 @@ custom_envs:
${custom_env.FrozenLakeThink}
FrozenLakeLocallyDefineExamples: # Can import from unified envs config or define dict locally
env_type: frozen_lake
max_steps: ${max_actions_per_traj}
max_tokens_per_step: ${max_tokens_per_step}
user_prompt_format: ${user_prompt_think_format}
env_manager_cls: ${env_manager_cls}
use_thread_lock: true
env_config:
163 changes: 163 additions & 0 deletions examples/qwen2.5-0.5B-agentic/agent_val_frozen_lake_async_amd.yaml
@@ -0,0 +1,163 @@
defaults:
- ../config/traj_envs@_here_
- ../config/deepspeed_zero@_here_
- ../config/deepspeed_zero2@_here_
- ../config/deepspeed_zero3@_here_
- ../config/deepspeed_zero3_cpuoffload@_here_

hydra:
run:
dir: .
output_subdir: null

exp_name: "agentic_pipeline_async"
seed: 42
logging_dir: ./output/logs
output_dir: ./output
render_save_dir: ./output/render
system_envs:
USE_MODELSCOPE: '1'

#track_with: wandb
#tracker_kwargs:
# api_key:
# project: roll-agentic
# name: ${exp_name}_sokoban
# notes: "agentic_pipeline"
# tags:
# - agentic
# - roll
# - baseline

track_with: tensorboard
tracker_kwargs:
log_dir: /data/oss_bucket_0/yali/llm/tensorboard/roll_exp/agentic_frozen_lake_async

checkpoint_config:
type: file_system
output_dir: /data/cpfs_0/rl_examples/models/${exp_name}

num_gpus_per_node: 8

max_steps: 1024
save_steps: 10000
logging_steps: 1
eval_steps: 10
resume_from_checkpoint: false

async_generation_ratio: 1

rollout_batch_size: 1024
val_batch_size: 1024
sequence_length: 8192

advantage_clip: 0.2
ppo_epochs: 1
adv_estimator: "grpo"
#pg_clip: 0.1
#dual_clip_loss: True
init_kl_coef: 0.0
whiten_advantages: true
entropy_loss_coef: 0
max_grad_norm: 1.0

pretrain: Qwen/Qwen2.5-0.5B-Instruct
reward_pretrain: Qwen/Qwen2.5-0.5B-Instruct

actor_train:
model_args:
attn_implementation: fa2
disable_gradient_checkpointing: false
dtype: bf16
model_type: ~
training_args:
learning_rate: 1.0e-6
weight_decay: 0
per_device_train_batch_size: 2
gradient_accumulation_steps: 128
warmup_steps: 10
lr_scheduler_type: cosine
data_args:
template: qwen2_5
strategy_args:
# strategy_name: deepspeed_train
# strategy_config: ${deepspeed_zero3}
strategy_name: megatron_train
strategy_config:
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
expert_model_parallel_size: 1
use_distributed_optimizer: true
recompute_granularity: full
device_mapping: list(range(0,4))
infer_batch_size: 2

actor_infer:
model_args:
disable_gradient_checkpointing: true
dtype: bf16
generating_args:
max_new_tokens: 128 # single-turn response length
top_p: 0.99
top_k: 100
num_beams: 1
temperature: 0.99
num_return_sequences: 1
data_args:
template: qwen2_5
strategy_args:
strategy_name: vllm
strategy_config:
gpu_memory_utilization: 0.6
block_size: 16
load_format: auto
device_mapping: list(range(4,8))

reference:
model_args:
attn_implementation: fa2
disable_gradient_checkpointing: true
dtype: bf16
model_type: ~
data_args:
template: qwen2_5
strategy_args:
strategy_name: hf_infer
strategy_config: ~
device_mapping: list(range(0,4))
infer_batch_size: 2

reward_normalization:
grouping: traj_group_id # can be tags(env_type) / traj_group_id(group) / batch(rollout_batch)...; the group_by key used when computing reward/adv
method: mean_std # asym_clip / identity / mean_std

train_env_manager:
format_penalty: -0.15 # sokoban env penalty_for_step=-0.1
max_env_num_per_worker: 16
num_env_groups: 128
# under the same group, the env config and env seed are ensured to be equal
group_size: 8
tags: [FrozenLake]
num_groups_partition: [128] # If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation

val_env_manager:
max_env_num_per_worker: 32
num_env_groups: 1024
group_size: 1 # should be set to 1 because val temperature is set to 0 and same prompt leads to same output
tags: [SimpleSokoban, LargerSokoban, SokobanDifferentGridVocab, FrozenLake]
num_groups_partition: [256, 256, 256, 256] # TODO: If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation

# Here you can override variables defined in the imported envs; max_tokens_per_step is 128 in custom_env.SimpleSokoban and is overridden to 64 here
max_tokens_per_step: 64

custom_envs:
SimpleSokoban:
${custom_env.SimpleSokoban}
LargerSokoban:
${custom_env.LargerSokoban}
SokobanDifferentGridVocab:
${custom_env.SokobanDifferentGridVocab}
FrozenLake:
${custom_env.FrozenLake}
FrozenLakeThink:
${custom_env.FrozenLakeThink}
49 changes: 49 additions & 0 deletions examples/qwen2.5-0.5B-agentic/submit_pipeline_amd.sh
@@ -0,0 +1,49 @@
#!/bin/bash
set +x
source "examples/scripts/config.sh"

WORKER_COUNT=1
CONFIG_FILE="agent_val_frozen_lake_amd.yaml"
# Replace with the mos URI
NEBULA_MODEL=""
ENTRY_FILE="examples/start_agentic_pipeline.py"

CONFIG_PATH=$(basename $(dirname $0))
CONFIG_NAME="${CONFIG_FILE%.yaml}"
JOB_NAME="$CONFIG_PATH-$CONFIG_NAME"


QUEUE="nebula_test2_308x_gpu_hang"
# QUEUE="nebula_test_308x"
ENVS="NCCL_PF_UCM_TIMEOUT=600000,NCCL_SOCKET_IFNAME=bond0"
# ENVS="NCCL_PF_UCM_TIMEOUT=600000"

echo "JOB_NAME: ${JOB_NAME}"
echo "WORKER_COUNT: ${WORKER_COUNT}"
echo "CONFIG_NAME: ${CONFIG_NAME}"
echo "CONFIG_PATH: ${CONFIG_PATH}"
echo "ENTRY_FILE: ${ENTRY_FILE}"

args="--config_name ${CONFIG_NAME} --config_path ${CONFIG_PATH}"

mdl_args="--queue=${QUEUE} \
--entry=${ENTRY_FILE} \
--worker_count=${WORKER_COUNT} \
--file.cluster_file=examples/scripts/cluster.json \
--job_name=${JOB_NAME} \
--algo_name=pytorch280 \
--requirements_file_name=nebula_patch/requirements/requirements_torch280_vllm_amd.txt \
--oss_appendable=true \
--_NEBULA_MODEL=${NEBULA_MODEL} \
--nebula_model=${NEBULA_MODEL} \
--env=${ENVS} \
--force \
"
if [ -n "${OPENLM_TOKEN}" ]; then
mdl_args="${mdl_args} --env=OPENLM_TOKEN=${OPENLM_TOKEN}"
fi

echo ${args}
echo ${mdl_args}

nebulactl run mdl --user_params="${args}" $mdl_args
49 changes: 49 additions & 0 deletions examples/qwen2.5-0.5B-agentic/submit_pipeline_amd_async.sh
@@ -0,0 +1,49 @@
#!/bin/bash
set +x
source "examples/scripts/config.sh"

WORKER_COUNT=1
CONFIG_FILE="agent_val_frozen_lake_async_amd.yaml"
# Replace with the mos URI
NEBULA_MODEL=""
ENTRY_FILE="examples/start_agentic_pipeline.py"

CONFIG_PATH=$(basename $(dirname $0))
CONFIG_NAME="${CONFIG_FILE%.yaml}"
JOB_NAME="$CONFIG_PATH-$CONFIG_NAME"


QUEUE="nebula_test2_308x_gpu_hang"
# QUEUE="nebula_test_308x"
ENVS="NCCL_PF_UCM_TIMEOUT=600000,NCCL_SOCKET_IFNAME=bond0"
# ENVS="NCCL_PF_UCM_TIMEOUT=600000"

echo "JOB_NAME: ${JOB_NAME}"
echo "WORKER_COUNT: ${WORKER_COUNT}"
echo "CONFIG_NAME: ${CONFIG_NAME}"
echo "CONFIG_PATH: ${CONFIG_PATH}"
echo "ENTRY_FILE: ${ENTRY_FILE}"

args="--config_name ${CONFIG_NAME} --config_path ${CONFIG_PATH}"

mdl_args="--queue=${QUEUE} \
--entry=${ENTRY_FILE} \
--worker_count=${WORKER_COUNT} \
--file.cluster_file=examples/scripts/cluster.json \
--job_name=${JOB_NAME} \
--algo_name=pytorch280 \
--requirements_file_name=nebula_patch/requirements/requirements_torch280_vllm_amd.txt \
--oss_appendable=true \
--_NEBULA_MODEL=${NEBULA_MODEL} \
--nebula_model=${NEBULA_MODEL} \
--env=${ENVS} \
--force \
"
if [ -n "${OPENLM_TOKEN}" ]; then
mdl_args="${mdl_args} --env=OPENLM_TOKEN=${OPENLM_TOKEN}"
fi

echo ${args}
echo ${mdl_args}

nebulactl run mdl --user_params="${args}" $mdl_args
12 changes: 8 additions & 4 deletions examples/qwen2.5-7B-distill_megatron/distill_megatron.yaml
@@ -28,7 +28,7 @@ distill_on_prompt: False

logits_transfer_backend: "nccl-only" # support "ipc+nccl", "nccl_only" and "ray"

sequence_length: 1024
sequence_length: 2048
max_grad_norm: 1.0

question_key: question_zh
@@ -43,8 +43,8 @@ student:
training_args:
learning_rate: 2.0e-5
lr_scheduler_type: constant
per_device_train_batch_size: 2
gradient_accumulation_steps: 1
per_device_train_batch_size: 8
gradient_accumulation_steps: 4
warmup_steps: 0
num_train_epochs: 1

@@ -57,10 +57,12 @@ student:
strategy_name: megatron_train
strategy_config:
tensor_model_parallel_size: 2
sequence_parallel: True
pipeline_model_parallel_size: 2
context_parallel_size: 2
use_distributed_optimizer: true
recompute_granularity: full
use_sequence_packing: True
device_mapping: list(range(0,8))

teacher:
Expand All @@ -72,14 +74,16 @@ teacher:
template: qwen2_5
training_args:
# teacher forward micro_batch_size
per_device_train_batch_size: 1
per_device_train_batch_size: 8
strategy_args:
strategy_name: megatron_infer
strategy_config:
tensor_model_parallel_size: 2
sequence_parallel: True
pipeline_model_parallel_size: 2
context_parallel_size: 2
bf16: true
use_sequence_packing: True
device_mapping: list(range(0,8))

system_envs: