perf: Add qwen3 30b-a3b async-8-off recipe (#1642)

youngeunkwon0405 · web-flow · commit 7dd9a01bae1f · 2025-12-16T22:27:37.000-08:00
Signed-off-by: Youngeun Kwon <youngeunk@nvidia.com>
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.yaml
@@ -0,0 +1,33 @@
+defaults: ./grpo-qwen3-30ba3b-4n8g.yaml  # base recipe; presumably the keys below override it (confirm with the config loader)
+grpo:
+  async_grpo:
+    enabled: true
+    max_trajectory_age_steps: 8  # presumably the "8off" in the recipe name -- TODO confirm
+    in_flight_weight_updates: true
+loss_fn:
+  use_importance_sampling_correction: true
+checkpointing:
+  checkpoint_dir: results/grpo-qwen3-30ba3b-24n8g-async-8off
+policy:
+  megatron_cfg:
+    tensor_model_parallel_size: 1
+    pipeline_model_parallel_size: 1
+    expert_model_parallel_size: 8
+    sequence_parallel: false
+  generation:
+    colocated:
+      enabled: false  # colocation off; generation gets its own resources below
+      resources:
+        num_nodes: 8  # 8 of the 24 cluster nodes; presumably the "8G" in the logger names
+        gpus_per_node: 8
+    vllm_cfg:
+      async_engine: true
+      tensor_parallel_size: 2
+      gpu_memory_utilization: 0.8
+logger:
+  log_dir: logs/grpo-qwen3-30ba3b-24n8g-16T8G-async-8off  # "16T8G": presumably 16 training + 8 generation nodes
+  wandb:
+    name: grpo-qwen3-30ba3b-24n8g-16T8G-async-8off
+cluster:
+  gpus_per_node: 8
+  num_nodes: 24  # total node count; same value as NUM_NODES in the matching test script
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+source "$SCRIPT_DIR/common.env"  # presumably defines the paths, EXP_NAME and exit_if_max_steps_reached used below
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=24
+STEPS_PER_RUN=10
+MAX_STEPS=10
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
+NUM_MINUTES=100
+# ===== END CONFIG =====
+
+exit_if_max_steps_reached
+
+# Run the experiment; extra command-line arguments are forwarded unchanged via "$@"
+cd "$PROJECT_ROOT"
+uv run examples/run_grpo_math.py \
+    --config "$CONFIG_PATH" \
+    grpo.max_num_steps="$MAX_STEPS" \
+    logger.log_dir="$LOG_DIR" \
+    logger.wandb_enabled=True \
+    logger.wandb.project=nemo-rl \
+    logger.wandb.name="$EXP_NAME" \
+    logger.monitor_gpus=True \
+    logger.tensorboard_enabled=True \
+    checkpointing.enabled=True \
+    checkpointing.checkpoint_dir="$CKPT_DIR" \
+    "$@" \
+    2>&1 | tee "$RUN_LOG"
+
+# Convert tensorboard logs to json
+uv run tests/json_dump_tb_logs.py "$LOG_DIR" --output_path "$JSON_METRICS"
+
+# Only check metrics if the highest logged train/loss step is at least MAX_STEPS
+if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' "$JSON_METRICS") -ge $MAX_STEPS ]]; then
+    uv run tests/check_metrics.py "$JSON_METRICS" \
+        'mean(data["train/token_mult_prob_error"]) < 1.1' \
+        'data["train/token_mult_prob_error"]["10"] < 1.1'
+fi
diff --git a/tests/test_suites/performance.txt b/tests/test_suites/performance.txt
@@ -12,4 +12,6 @@ tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-async-1off.sh
 tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.sh
 tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh
 tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g-async-1off.sh
-tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.sh
+tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.sh
+
+tests/test_suites/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.sh