Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions 3.test_cases/pytorch/verl/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
kustomization.yaml
18 changes: 15 additions & 3 deletions 3.test_cases/pytorch/verl/rlvr/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Dockerfile for VERL with EFA support
# Using hiyouga/verl base image and adding EFA capabilities
FROM hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.4-flashinfer0.2.2-cxx11abi0

ARG TAG=vllm011.latest
FROM verlai/verl:${TAG}
ARG MEGATRON_LM_VERSION=core_v0.13.1
# EFA configuration
ARG OPEN_MPI_PATH=/opt/amazon/openmpi/
ENV EFA_VERSION=1.43.3
Expand Down Expand Up @@ -115,10 +116,21 @@ RUN git clone https://github.com/volcengine/verl.git && \
cd verl && \
git checkout v0.6.1
WORKDIR /workspace/verl

# Install VERL in development mode
RUN pip install -e .

#####################
# Install megatron-lm
#####################
RUN pip install -U setuptools
RUN cd /workspace && git clone --depth 1 --branch ${MEGATRON_LM_VERSION} https://github.com/NVIDIA/Megatron-LM.git \
&& cd Megatron-LM \
&& pip install nltk \
&& pip install .
# Check if speparate Apex is needed
# https://verl.readthedocs.io/en/latest/start/install.html#install-from-docker-image


# Set working directory
WORKDIR /workspace

Expand Down
10 changes: 8 additions & 2 deletions 3.test_cases/pytorch/verl/rlvr/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,15 @@ Build a Docker image with verl, EFA networking support, and push to ECR:
./setup/build-push.sh
```

Deploy the Ray cluster with head and worker pods configured for distributed training:
Generate kustomization.yaml from your environment variables and deploy the Ray cluster:
```bash
envsubst < setup/raycluster.yaml | kubectl apply -f -
./setup/generate-kustomization.sh
kubectl apply -k setup/
```

Alternatively, you can combine both steps:
```bash
./setup/generate-kustomization.sh && kubectl apply -k setup/
```

> **Note**: Considerations before applying raycluster.yaml
Expand Down
182 changes: 182 additions & 0 deletions 3.test_cases/pytorch/verl/rlvr/recipe/run_qwen3-235b_megatron_96gb.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
#!/usr/bin/env bash
set -xeuo pipefail

## !!!!!!!important!!!!!!
## set the following environment variables on all your nodes
# env_vars:
# CUDA_DEVICE_MAX_CONNECTIONS: "1"
# NCCL_NVLS_ENABLE: "0"
# VLLM_USE_V1: 1
# install mbridge=0.1.13 on all your node with the following command:
# pip3 install git+https://github.com/ISEEKYAN/mbridge

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
[ -f "${SCRIPT_DIR}/env.sh" ] && source "${SCRIPT_DIR}/env.sh"

adv_estimator=grpo

use_kl_in_reward=False
kl_coef=0.0
use_kl_loss=True
kl_loss_coef=0.001

clip_ratio_low=0.2
clip_ratio_high=0.28

max_prompt_length=$((1024 * 2))
max_response_length=$((1204 * 8))
enable_overlong_buffer=True
overlong_buffer_len=$((1024 * 1))
overlong_penalty_factor=1.0

loss_agg_mode="token-mean"

train_prompt_bsz=${TRAIN_BS:-32}
n_resp_per_prompt=8
train_prompt_mini_bsz=16

# minimum nodes need for qwen3-235B-A22B
NNODES=${NNODES:-4}
# Paths

RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}

MODEL_PATH=$RAY_DATA_HOME/models/Qwen3-235B-A22B
MODEL_PATH="${MODEL_PATH:-Qwen/Qwen3-VL-235B-A22B-Instruct}"

TRAIN_FILE=/fsx/verl/data/geo3k/train.parquet
TEST_FILE=/fsx/verl/data/geo3k/test.parquet

# Algorithm
temperature=1.0
top_p=1.0
top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
val_top_p=0.7
# Performance Related Parameter
use_dynamic_bsz=True
actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 10 / 10))
infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 1))
offload=True
OPTIM_OFFLOAD=${OPTIM_OFFLOAD:-True}
gen_tp=8
train_tp=${TP:-4}
train_pp=${PP:-8}

EP=${EP:-4}
ETP=1
CP=1
optimizer_offload_fraction=${OFFLOAD_FRACTION:-1.}
last_layer=${LAST_LAYER:-10}

project_name='verl-qwen3'
exp_name="235B-${NNODES}-pp${train_pp}-tp${train_tp}-ep${EP}-actor-length${actor_ppo_max_token_len}"
CKPTS_DIR=$RAY_DATA_HOME/ckpt/${project_name}/${exp_name}

# TODO: support cuda graph for rollout by setting the following config
# actor_rollout_ref.rollout.cudagraph_capture_sizes=[1,2,4,8,16,32]
# actor_rollout_ref.rollout.enforce_eager=False

python3 -m verl.trainer.main_ppo \
--config-path=config \
--config-name='ppo_megatron_trainer.yaml' \
data.train_files="${TRAIN_FILE}" \
data.val_files="${TEST_FILE}" \
data.prompt_key=prompt \
data.truncation='left' \
data.max_prompt_length=${max_prompt_length} \
data.max_response_length=${max_response_length} \
data.train_batch_size=${train_prompt_bsz} \
actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.enforce_eager=True \
actor_rollout_ref.rollout.free_cache_engine=True \
algorithm.adv_estimator=${adv_estimator} \
algorithm.use_kl_in_reward=${use_kl_in_reward} \
algorithm.kl_ctrl.kl_coef=${kl_coef} \
actor_rollout_ref.model.use_fused_kernels=True \
actor_rollout_ref.actor.megatron.use_mbridge=True \
actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
actor_rollout_ref.actor.clip_ratio_c=10.0 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
actor_rollout_ref.model.path="${MODEL_PATH}" \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
actor_rollout_ref.actor.optim.weight_decay=0.1 \
+actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=${optimizer_offload_fraction} \
+actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True \
+actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True \
+actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True \
actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
actor_rollout_ref.actor.megatron.param_offload=${offload} \
actor_rollout_ref.actor.megatron.optimizer_offload=${OPTIM_OFFLOAD} \
actor_rollout_ref.actor.megatron.grad_offload=${offload} \
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
actor_rollout_ref.actor.megatron.expert_model_parallel_size=$EP \
actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=$ETP \
actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.actor.optim.clip_grad=1.0 \
actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
actor_rollout_ref.rollout.gpu_memory_utilization=0.85 \
actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
actor_rollout_ref.rollout.enable_chunked_prefill=True \
actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
actor_rollout_ref.rollout.temperature=${temperature} \
actor_rollout_ref.rollout.top_p=${top_p} \
actor_rollout_ref.rollout.top_k=${top_k} \
actor_rollout_ref.nccl_timeout=1200 \
actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
actor_rollout_ref.rollout.val_kwargs.do_sample=True \
actor_rollout_ref.rollout.val_kwargs.n=1 \
actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
actor_rollout_ref.ref.megatron.expert_model_parallel_size=$EP \
actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=$ETP \
actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \
actor_rollout_ref.ref.megatron.param_offload=${offload} \
+actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \
+actor_rollout_ref.actor.megatron.override_transformer_config.masked_softmax_fusion=True \
+actor_rollout_ref.actor.megatron.override_transformer_config.bias_activation_fusion=True \
+actor_rollout_ref.actor.megatron.override_transformer_config.bias_dropout_fusion=True \
+actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
+actor_rollout_ref.actor.megatron.override_transformer_config.deallocate_pipeline_outputs=True \
+actor_rollout_ref.actor.megatron.override_transformer_config.persist_layer_norm=True \
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_grouped_gemm=True \
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type="flex" \
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \
+actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=True \
+actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=True \
reward_model.reward_manager=dapo \
+reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+reward_model.reward_kwargs.max_resp_len=${max_response_length} \
trainer.logger=['console','wandb'] \
trainer.project_name="${project_name}" \
trainer.experiment_name="${exp_name}" \
trainer.n_gpus_per_node=8 \
trainer.nnodes="${NNODES}" \
trainer.val_before_train=False \
trainer.test_freq=10 \
trainer.save_freq=100 \
trainer.total_epochs=10 \
trainer.default_local_dir="${CKPTS_DIR}" \
trainer.resume_mode=auto \
trainer.log_val_generations=10
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#laAdopted from https://github.com/volcengine/verl/blob/249c0831b793751f488d80bcfeafb8f7c544cd65/examples/grpo_trainer/run_qwen3_vl-235b-megatron.sh
set -x
ENGINE=${1:-vllm}
export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping

export VLLM_ALLREDUCE_USE_SYMM_MEM=0 # for vllm0.11.0 with TP


HF_MODEL_PATH=${HF_MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3-VL-235B-A22B-Instruct"}

GEN_TP=${GEN_TP:-16}
CP=${CP:-2}
TP=${TP:-1}
PP=${PP:-8}
EP=${EP:-8}
ETP=${ETP:-1}

# Use env-provided data paths or default to Geometry3k under RAY_DATA_HOME
train_path=${TRAIN_FILE:-${RAY_DATA_HOME}/data/geo3k/train.parquet}
test_path=${TEST_FILE:-${RAY_DATA_HOME}/data/geo3k/test.parquet}

python3 -m verl.trainer.main_ppo --config-path=config \
--config-name='ppo_megatron_trainer.yaml'\
algorithm.adv_estimator=grpo \
data.train_files="$train_path" \
data.val_files="$test_path" \
data.train_batch_size=512 \
data.max_prompt_length=1024 \
data.max_response_length=2048 \
data.filter_overlong_prompts=True \
data.truncation='error' \
actor_rollout_ref.model.path=$HF_MODEL_PATH \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP \
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP \
actor_rollout_ref.actor.megatron.context_parallel_size=$CP \
actor_rollout_ref.actor.megatron.expert_model_parallel_size=$EP \
actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=$ETP \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.01 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
actor_rollout_ref.rollout.tensor_model_parallel_size=$GEN_TP \
actor_rollout_ref.actor.use_dynamic_bsz=True \
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=4096 \
actor_rollout_ref.ref.log_prob_use_dynamic_bsz=True \
actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=4096 \
actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=4096 \
actor_rollout_ref.rollout.name=$ENGINE \
+actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
actor_rollout_ref.actor.megatron.use_mbridge=True \
actor_rollout_ref.actor.megatron.param_offload=True \
actor_rollout_ref.actor.megatron.optimizer_offload=True \
actor_rollout_ref.actor.megatron.grad_offload=True \
actor_rollout_ref.ref.megatron.param_offload=True \
+actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=1 \
+actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True \
+actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True \
+actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True \
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=flex \
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
+actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
+actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=True \
+actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=True \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger='["console","wandb"]' \
trainer.project_name='verl_grpo_example_geo3k' \
trainer.experiment_name='qwen3_vl_235b_megatron' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=4 \
trainer.save_freq=20 \
trainer.test_freq=5 \
trainer.total_epochs=15 $@
Loading