Skip to content

Commit 7dd9a01

Browse files
perf: Add qwen3 30b-a3b async-8-off recipe (#1642)
Signed-off-by: Youngeun Kwon <youngeunk@nvidia.com>
1 parent 0bddd47 commit 7dd9a01

File tree

3 files changed

+75
-1
lines changed

3 files changed

+75
-1
lines changed
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Async GRPO recipe for Qwen3 30B-A3B with max_trajectory_age_steps = 8.
# Only the keys below are set here; everything else is expected to come from
# the file named in `defaults`.
#
# NOTE(review): this text was recovered from a flattened diff view in which all
# leading indentation was lost. The nesting below is reconstructed from the key
# names - confirm it against the file in the repository before relying on it.
defaults: ./grpo-qwen3-30ba3b-4n8g.yaml
grpo:
  async_grpo:
    enabled: true
    max_trajectory_age_steps: 8
    in_flight_weight_updates: true
loss_fn:
  use_importance_sampling_correction: true
checkpointing:
  checkpoint_dir: results/grpo-qwen3-30ba3b-24n8g-async-8off
policy:
  megatron_cfg:
    tensor_model_parallel_size: 1
    pipeline_model_parallel_size: 1
    expert_model_parallel_size: 8
    sequence_parallel: false
  generation:
    # Generation is not colocated with training; it gets its own resources.
    colocated:
      enabled: false
      resources:
        num_nodes: 8
        gpus_per_node: 8
    vllm_cfg:
      async_engine: true
      tensor_parallel_size: 2
      gpu_memory_utilization: 0.8
logger:
  # NOTE(review): "16T8G" presumably means 16 training + 8 generation nodes
  # (cluster.num_nodes 24 minus the 8 generation nodes above) - confirm.
  log_dir: logs/grpo-qwen3-30ba3b-24n8g-16T8G-async-8off
  wandb:
    name: grpo-qwen3-30ba3b-24n8g-16T8G-async-8off
cluster:
  gpus_per_node: 8
  num_nodes: 24
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#!/bin/bash
# Test driver for the qwen3 30b-a3b async GRPO recipe (24 nodes).
#
# Flow:
#   1. Source common.env, located next to this script.
#   2. Launch examples/run_grpo_math.py with logging/checkpointing overrides,
#      forwarding any extra command-line arguments as further overrides.
#   3. Dump the tensorboard logs to JSON.
#   4. If training reached MAX_STEPS, assert on train/token_mult_prob_error.
#
# PROJECT_ROOT, CONFIG_PATH, LOG_DIR, EXP_NAME, CKPT_DIR, RUN_LOG, JSON_METRICS
# and exit_if_max_steps_reached are not defined in this file; they are
# presumably provided by common.env - verify there.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
source "$SCRIPT_DIR/common.env"

# NOTE(review): NUM_NODES, NUM_RUNS and NUM_MINUTES are not read anywhere in
# this script; presumably an external launcher consumes the CONFIG section, so
# the BEGIN/END marker lines are kept verbatim - confirm.
# ===== BEGIN CONFIG =====
NUM_NODES=24
STEPS_PER_RUN=10
MAX_STEPS=10
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
NUM_MINUTES=100
# ===== END CONFIG =====

exit_if_max_steps_reached

# Run the experiment
# Abort rather than launch from the wrong directory: the relative paths below
# (examples/..., tests/...) only resolve from the project root.
cd "$PROJECT_ROOT" || exit 1
# "$@" is quoted so caller-supplied overrides containing spaces or glob
# characters reach the program as the exact words the caller passed.
uv run examples/run_grpo_math.py \
    --config "$CONFIG_PATH" \
    grpo.max_num_steps="$MAX_STEPS" \
    logger.log_dir="$LOG_DIR" \
    logger.wandb_enabled=True \
    logger.wandb.project=nemo-rl \
    logger.wandb.name="$EXP_NAME" \
    logger.monitor_gpus=True \
    logger.tensorboard_enabled=True \
    checkpointing.enabled=True \
    checkpointing.checkpoint_dir="$CKPT_DIR" \
    "$@" \
    2>&1 | tee "$RUN_LOG"

# Convert tensorboard logs to json
uv run tests/json_dump_tb_logs.py "$LOG_DIR" --output_path "$JSON_METRICS"

# Only run metrics if the target step is reached
# The jq filter yields the largest numeric step key recorded under "train/loss".
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' "$JSON_METRICS") -ge $MAX_STEPS ]]; then
    # The per-step check is keyed on MAX_STEPS so it cannot drift from the gate
    # above; with MAX_STEPS=10 it expands to ...["10"] < 1.1.
    uv run tests/check_metrics.py "$JSON_METRICS" \
        'mean(data["train/token_mult_prob_error"]) < 1.1' \
        "data[\"train/token_mult_prob_error\"][\"${MAX_STEPS}\"] < 1.1"
fi

tests/test_suites/performance.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,6 @@ tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-async-1off.sh
1212
tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.sh
1313
tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh
1414
tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g-async-1off.sh
15-
tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.sh
15+
tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.sh
16+
17+
tests/test_suites/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.sh

0 commit comments

Comments
 (0)