
Commit 56e8fcb

feat: add dapo recipe and test (#1617)
Signed-off-by: Zhiyu Li <[email protected]>
1 parent: 02d5142 · commit: 56e8fcb

File tree

3 files changed: +139 -0 lines changed
Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
defaults: ../../../grpo_math_1B.yaml
grpo:
  num_prompts_per_step: 256
  num_generations_per_prompt: 16
  batch_multiplier: 3 # Multiplier for dataloader batch size calculation
  max_rollout_turns: 1
  max_num_steps: 10
  use_leave_one_out_baseline: false
  val_period: 5
  max_val_samples: 256
  val_batch_size: 256
  use_dynamic_sampling: true
  dynamic_sampling_max_gen_batches: 10
  reward_scaling:
    enabled: true
    source_min: 0.0
    source_max: 1.0
    target_min: -1.0
    target_max: 1.0
  reward_shaping:
    enabled: true
    overlong_buffer_length: 512
    max_response_length: 1024
loss_fn:
  reference_policy_kl_penalty: 0.0 # Corresponds to actor_rollout_ref.actor.kl_loss_coef
  ratio_clip_max: 0.28 # clip_ratio_high
  ratio_clip_min: 0.2 # clip_ratio_low
  ratio_clip_c: 10.0
checkpointing:
  checkpoint_dir: results/grpo_dapomath17k_dsv3_megatron
  keep_top_k: 100
policy:
  model_name: /path/to/dsv3-bf16-checkpoint
  hf_config_overrides:
    max_position_embeddings: 1536
  train_micro_batch_size: 1
  logprob_batch_size: 1
  max_total_sequence_length: 1536
  dtensor_cfg:
    enabled: false
  make_sequence_length_divisible_by: ${mul:${policy.dtensor_cfg.tensor_parallel_size},
    ${mul:2, ${policy.dtensor_cfg.context_parallel_size}}}
  megatron_cfg:
    empty_unused_memory_level: 2
    enabled: true
    activation_checkpointing: true
    tensor_model_parallel_size: 8
    expert_model_parallel_size: 32
    pipeline_model_parallel_size: 8
    num_layers_in_first_pipeline_stage: 7
    num_layers_in_last_pipeline_stage: 6
    context_parallel_size: 4
    sequence_parallel: true
    moe_permute_fusion: true
    apply_rope_fusion: false
    optimizer:
      lr: 5.0e-07
      min_lr: 5.0e-08
    scheduler:
      lr_warmup_init: 5.0e-08
  sequence_packing:
    enabled: true
  generation:
    max_new_tokens: 1536
    vllm_cfg:
      async_engine: false
      tensor_parallel_size: 64
      enforce_eager: true
data:
  max_input_seq_length: 512 # max_prompt_length
  prompt_file: null
  dataset_name: DAPOMath17K
env:
  dapo:
    num_workers: 64
  math:
    num_workers: 64
    math_verify_impl: "dapo_math_verify"

logger:
  monitor_gpus: true
  wandb:
    project: DAPO
    name: DAPO-DeepSeek-671b-megatron
  mlflow:
    experiment_name: DAPO
    run_name: DAPO-DeepSeek-671b-megatron
cluster:
  gpus_per_node: 8
  num_nodes: 64
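For context on the DAPO-specific knobs above, here is a minimal Python sketch of the two reward transforms the recipe enables, under the assumption that reward_scaling is a linear map from [source_min, source_max] to [target_min, target_max] and that reward_shaping applies DAPO-style soft overlong punishment over the last overlong_buffer_length tokens of max_response_length. The helper names are illustrative, not NeMo RL's internal API.

# Illustrative only: these helpers are not NeMo RL's internal API.

def scale_reward(r: float,
                 source_min: float = 0.0, source_max: float = 1.0,
                 target_min: float = -1.0, target_max: float = 1.0) -> float:
    """Linearly rescale a reward from [source_min, source_max] to [target_min, target_max]."""
    frac = (r - source_min) / (source_max - source_min)
    return target_min + frac * (target_max - target_min)


def overlong_penalty(response_length: int,
                     overlong_buffer_length: int = 512,
                     max_response_length: int = 1024,
                     penalty_scale: float = 1.0) -> float:
    """Assumed DAPO-style soft overlong punishment: responses shorter than
    (max_response_length - overlong_buffer_length) are untouched; within the
    buffer the penalty grows linearly and saturates at -penalty_scale."""
    expected_length = max_response_length - overlong_buffer_length  # 512 with this recipe
    exceed = response_length - expected_length
    if exceed <= 0:
        return 0.0
    return -min(exceed / overlong_buffer_length, 1.0) * penalty_scale


# With this recipe's numbers: a correct answer (raw reward 1.0) that runs 768 tokens
# long is rescaled to +1.0 and then penalized by (768 - 512) / 512 = 0.5.
print(scale_reward(1.0) + overlong_penalty(768))  # 0.5

The asymmetric ratio_clip_min: 0.2 / ratio_clip_max: 0.28 pair in loss_fn mirrors DAPO's clip-higher setting, which widens only the upper clipping bound of the importance ratio.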
Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
#!/bin/bash
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
source $SCRIPT_DIR/common.env
# Disable NVLS to avoid OOM issues
export NCCL_NVLS_ENABLE=0
export NRT_REBUILD_VENVS=true
# Allow the user to pass an existing HF checkpoint path, following the instructions in https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/deepseek.md
export MODEL_NAME=${NRL_DEEPSEEK_V3_HF_CKPT:-"unsloth/DeepSeek-V3-0324-BF16"}

# ===== BEGIN CONFIG =====
NUM_NODES=64
STEPS_PER_RUN=10
MAX_STEPS=10
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
NUM_MINUTES=240
# ===== END CONFIG =====

exit_if_max_steps_reached

# Run the experiment
cd $PROJECT_ROOT
uv run examples/run_grpo_math.py \
    --config $CONFIG_PATH \
    grpo.num_prompts_per_step=64 \
    grpo.num_generations_per_prompt=8 \
    grpo.max_num_steps=$MAX_STEPS \
    policy.model_name=$MODEL_NAME \
    policy.tokenizer.name=$MODEL_NAME \
    cluster.num_nodes=$NUM_NODES \
    logger.log_dir=$LOG_DIR \
    logger.wandb_enabled=True \
    logger.wandb.project=nemo-rl \
    logger.wandb.name=$EXP_NAME \
    logger.monitor_gpus=True \
    logger.tensorboard_enabled=True \
    $@ \
    2>&1 | tee $RUN_LOG

# Convert TensorBoard logs to JSON
uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS

# Only run the metric checks if the target step was reached
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
    uv run tests/check_metrics.py $JSON_METRICS \
        'mean(data["train/token_mult_prob_error"]) < 1.1' \
        'data["train/token_mult_prob_error"]["10"] < 1.1'
fi
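The jq guard above only runs the metric assertions once training has actually logged the target step. Below is a small Python sketch of the same gate, assuming json_dump_tb_logs.py emits a JSON object mapping each metric name to a {step: value} dictionary; the metrics.json path is a stand-in for $JSON_METRICS, not a path the test defines.

import json

MAX_STEPS = 10

# Assumed layout, inferred from the jq query:
# {"train/loss": {"1": ..., "10": ...}, "train/token_mult_prob_error": {...}, ...}
with open("metrics.json") as f:  # stand-in for $JSON_METRICS
    data = json.load(f)

# Equivalent of: to_entries | select(.key == "train/loss") | .value | keys | map(tonumber) | max
last_logged_step = max(int(step) for step in data["train/loss"])

if last_logged_step >= MAX_STEPS:
    errors = list(data["train/token_mult_prob_error"].values())
    assert sum(errors) / len(errors) < 1.1                   # mean(data[...]) < 1.1
    assert data["train/token_mult_prob_error"]["10"] < 1.1   # value at the final step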

tests/test_suites/performance_h100.txt

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@ tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g.sh
 tests/test_suites/llm/performance/grpo-deepseek-v3-32n8g.sh
 tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh
 tests/test_suites/llm/performance/grpo-qwen3-235b-16n8g.sh
+tests/test_suites/llm/performance/dapo-deepseek-v3-64n8g.sh

 ## ASYNC 1-off
 tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-async-1off.sh