
Commit 56e8fcb

feat: add dapo recipe and test (#1617)
Signed-off-by: Zhiyu Li <[email protected]>
1 parent: 02d5142 · commit: 56e8fcb

File tree

3 files changed: +139 -0 lines changed
Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
defaults: ../../../grpo_math_1B.yaml
grpo:
  num_prompts_per_step: 256
  num_generations_per_prompt: 16
  batch_multiplier: 3 # Multiplier for dataloader batch size calculation
  max_rollout_turns: 1
  max_num_steps: 10
  use_leave_one_out_baseline: false
  val_period: 5
  max_val_samples: 256
  val_batch_size: 256
  use_dynamic_sampling: true
  dynamic_sampling_max_gen_batches: 10
  reward_scaling:
    enabled: true
    source_min: 0.0
    source_max: 1.0
    target_min: -1.0
    target_max: 1.0
  reward_shaping:
    enabled: true
    overlong_buffer_length: 512
    max_response_length: 1024
loss_fn:
  reference_policy_kl_penalty: 0.0 # Corresponds to actor_rollout_ref.actor.kl_loss_coef
  ratio_clip_max: 0.28 # clip_ratio_high
  ratio_clip_min: 0.2 # clip_ratio_low
  ratio_clip_c: 10.0
checkpointing:
  checkpoint_dir: results/grpo_dapomath17k_dsv3_megatron
  keep_top_k: 100
policy:
  model_name: /path/to/dsv3-bf16-checkpoint
  hf_config_overrides:
    max_position_embeddings: 1536
  train_micro_batch_size: 1
  logprob_batch_size: 1
  max_total_sequence_length: 1536
  dtensor_cfg:
    enabled: false
  make_sequence_length_divisible_by: ${mul:${policy.dtensor_cfg.tensor_parallel_size},
    ${mul:2, ${policy.dtensor_cfg.context_parallel_size}}}
  megatron_cfg:
    empty_unused_memory_level: 2
    enabled: true
    activation_checkpointing: true
    tensor_model_parallel_size: 8
    expert_model_parallel_size: 32
    pipeline_model_parallel_size: 8
    num_layers_in_first_pipeline_stage: 7
    num_layers_in_last_pipeline_stage: 6
    context_parallel_size: 4
    sequence_parallel: true
    moe_permute_fusion: true
    apply_rope_fusion: false
    optimizer:
      lr: 5.0e-07
      min_lr: 5.0e-08
    scheduler:
      lr_warmup_init: 5.0e-08
  sequence_packing:
    enabled: true
  generation:
    max_new_tokens: 1536
    vllm_cfg:
      async_engine: false
      tensor_parallel_size: 64
      enforce_eager: true
data:
  max_input_seq_length: 512 # max_prompt_length
  prompt_file: null
  dataset_name: DAPOMath17K
env:
  dapo:
    num_workers: 64
  math:
    num_workers: 64
    math_verify_impl: "dapo_math_verify"

logger:
  monitor_gpus: true
  wandb:
    project: DAPO
    name: DAPO-DeepSeek-671b-megatron
  mlflow:
    experiment_name: DAPO
    run_name: DAPO-DeepSeek-671b-megatron
cluster:
  gpus_per_node: 8
  num_nodes: 64
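For context on the DAPO-specific knobs above, here is a minimal Python sketch of the two reward transforms the recipe enables, under the assumption that reward_scaling is a linear map from [source_min, source_max] to [target_min, target_max] and that reward_shaping applies DAPO-style soft overlong punishment over the last overlong_buffer_length tokens of max_response_length. The helper names are illustrative, not NeMo RL's internal API.

# Illustrative only: these helpers are not NeMo RL's internal API.

def scale_reward(r: float,
                 source_min: float = 0.0, source_max: float = 1.0,
                 target_min: float = -1.0, target_max: float = 1.0) -> float:
    """Linearly rescale a reward from [source_min, source_max] to [target_min, target_max]."""
    frac = (r - source_min) / (source_max - source_min)
    return target_min + frac * (target_max - target_min)


def overlong_penalty(response_length: int,
                     overlong_buffer_length: int = 512,
                     max_response_length: int = 1024,
                     penalty_scale: float = 1.0) -> float:
    """Assumed DAPO-style soft overlong punishment: responses shorter than
    (max_response_length - overlong_buffer_length) are untouched; within the
    buffer the penalty grows linearly and saturates at -penalty_scale."""
    expected_length = max_response_length - overlong_buffer_length  # 512 with this recipe
    exceed = response_length - expected_length
    if exceed <= 0:
        return 0.0
    return -min(exceed / overlong_buffer_length, 1.0) * penalty_scale


# With this recipe's numbers: a correct answer (raw reward 1.0) that runs 768 tokens
# long is rescaled to +1.0 and then penalized by (768 - 512) / 512 = 0.5.
print(scale_reward(1.0) + overlong_penalty(768))  # 0.5

The asymmetric ratio_clip_min: 0.2 / ratio_clip_max: 0.28 pair in loss_fn mirrors DAPO's clip-higher setting, which widens only the upper clipping bound of the importance ratio.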
Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
#!/bin/bash
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
source $SCRIPT_DIR/common.env
# Disable NVLS to avoid OOM issues
export NCCL_NVLS_ENABLE=0
export NRT_REBUILD_VENVS=true
# Allow the user to pass an existing HF checkpoint path, following the instructions in https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/deepseek.md
export MODEL_NAME=${NRL_DEEPSEEK_V3_HF_CKPT:-"unsloth/DeepSeek-V3-0324-BF16"}

# ===== BEGIN CONFIG =====
NUM_NODES=64
STEPS_PER_RUN=10
MAX_STEPS=10
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
NUM_MINUTES=240
# ===== END CONFIG =====

exit_if_max_steps_reached

# Run the experiment
cd $PROJECT_ROOT
uv run examples/run_grpo_math.py \
    --config $CONFIG_PATH \
    grpo.num_prompts_per_step=64 \
    grpo.num_generations_per_prompt=8 \
    grpo.max_num_steps=$MAX_STEPS \
    policy.model_name=$MODEL_NAME \
    policy.tokenizer.name=$MODEL_NAME \
    cluster.num_nodes=$NUM_NODES \
    logger.log_dir=$LOG_DIR \
    logger.wandb_enabled=True \
    logger.wandb.project=nemo-rl \
    logger.wandb.name=$EXP_NAME \
    logger.monitor_gpus=True \
    logger.tensorboard_enabled=True \
    $@ \
    2>&1 | tee $RUN_LOG

# Convert TensorBoard logs to JSON
uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS

# Only run the metric checks if the target step was reached
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
    uv run tests/check_metrics.py $JSON_METRICS \
        'mean(data["train/token_mult_prob_error"]) < 1.1' \
        'data["train/token_mult_prob_error"]["10"] < 1.1'
fi
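The jq guard above only runs the metric assertions once training has actually logged the target step. Below is a small Python sketch of the same gate, assuming json_dump_tb_logs.py emits a JSON object mapping each metric name to a {step: value} dictionary; the metrics.json path is a stand-in for $JSON_METRICS, not a path the test defines.

import json

MAX_STEPS = 10

# Assumed layout, inferred from the jq query:
# {"train/loss": {"1": ..., "10": ...}, "train/token_mult_prob_error": {...}, ...}
with open("metrics.json") as f:  # stand-in for $JSON_METRICS
    data = json.load(f)

# Equivalent of: to_entries | select(.key == "train/loss") | .value | keys | map(tonumber) | max
last_logged_step = max(int(step) for step in data["train/loss"])

if last_logged_step >= MAX_STEPS:
    errors = list(data["train/token_mult_prob_error"].values())
    assert sum(errors) / len(errors) < 1.1                   # mean(data[...]) < 1.1
    assert data["train/token_mult_prob_error"]["10"] < 1.1   # value at the final step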

tests/test_suites/performance_h100.txt

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@ tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g.sh
 tests/test_suites/llm/performance/grpo-deepseek-v3-32n8g.sh
 tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh
 tests/test_suites/llm/performance/grpo-qwen3-235b-16n8g.sh
+tests/test_suites/llm/performance/dapo-deepseek-v3-64n8g.sh

 ## ASYNC 1-off
 tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-async-1off.sh