File tree Expand file tree Collapse file tree 3 files changed +75
-1
lines changed
examples/configs/recipes/llm/performance Expand file tree Collapse file tree 3 files changed +75
-1
# Recipe overrides layered on top of the 4-node Qwen3-30B-A3B GRPO config.
# NOTE(review): nesting below was reconstructed from a flattened rendering of
# this file; verify the key hierarchy against the base config before merging.
defaults: ./grpo-qwen3-30ba3b-4n8g.yaml

grpo:
  async_grpo:
    enabled: true
    # Upper bound (in steps) on trajectory age for the async setup.
    max_trajectory_age_steps: 8
    in_flight_weight_updates: true

loss_fn:
  use_importance_sampling_correction: true

checkpointing:
  # NOTE(review): this name omits the "16T8G" tag used by the logger entries
  # below; confirm whether the difference is intentional.
  checkpoint_dir: results/grpo-qwen3-30ba3b-24n8g-async-8off

policy:
  megatron_cfg:
    tensor_model_parallel_size: 1
    pipeline_model_parallel_size: 1
    expert_model_parallel_size: 8
    sequence_parallel: false
  generation:
    colocated:
      # Generation is not colocated; it gets the dedicated resources below.
      enabled: false
      resources:
        num_nodes: 8
        gpus_per_node: 8
    vllm_cfg:
      async_engine: true
      tensor_parallel_size: 2
      gpu_memory_utilization: 0.8

logger:
  log_dir: logs/grpo-qwen3-30ba3b-24n8g-16T8G-async-8off
  wandb:
    name: grpo-qwen3-30ba3b-24n8g-16T8G-async-8off

cluster:
  # Total allocation: 24 nodes x 8 GPUs. With 8 nodes set aside for generation
  # above, presumably 16 remain for training ("16T8G") -- TODO confirm.
  gpus_per_node: 8
  num_nodes: 24
#!/bin/bash
# Performance test driver for the async (8-step off-policy) GRPO recipe on
# 24 nodes. Runs the GRPO math example, dumps TensorBoard logs to JSON and,
# if the target step was reached, checks the training metrics.
#
# PROJECT_ROOT, CONFIG_PATH, LOG_DIR, EXP_NAME, CKPT_DIR, RUN_LOG, JSON_METRICS
# and exit_if_max_steps_reached are not defined here; they are presumably
# provided by the sourced common.env -- confirm.
# Any extra command-line arguments are forwarded to the training script.

# Resolve the directory containing this script so common.env is found
# regardless of the caller's working directory.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
source "$SCRIPT_DIR/common.env"

# ===== BEGIN CONFIG =====
NUM_NODES=24
STEPS_PER_RUN=10
MAX_STEPS=10
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
NUM_MINUTES=100
# ===== END CONFIG =====

exit_if_max_steps_reached

# Run the experiment
cd "$PROJECT_ROOT" || exit 1
uv run examples/run_grpo_math.py \
    --config "$CONFIG_PATH" \
    grpo.max_num_steps="$MAX_STEPS" \
    logger.log_dir="$LOG_DIR" \
    logger.wandb_enabled=True \
    logger.wandb.project=nemo-rl \
    logger.wandb.name="$EXP_NAME" \
    logger.monitor_gpus=True \
    logger.tensorboard_enabled=True \
    checkpointing.enabled=True \
    checkpointing.checkpoint_dir="$CKPT_DIR" \
    "$@" \
    2>&1 | tee "$RUN_LOG"

# Convert tensorboard logs to json
uv run tests/json_dump_tb_logs.py "$LOG_DIR" --output_path "$JSON_METRICS"

# Only run metrics if the target step is reached: take the largest step index
# recorded under "train/loss" and compare it with MAX_STEPS.
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' "$JSON_METRICS") -ge $MAX_STEPS ]]; then
    uv run tests/check_metrics.py "$JSON_METRICS" \
        'mean(data["train/token_mult_prob_error"]) < 1.1' \
        'data["train/token_mult_prob_error"]["10"] < 1.1'
fi
Original file line number Diff line number Diff line change @@ -12,4 +12,6 @@ tests/test_suites/llm/performance/grpo-deepseek-v3-64n8g-async-1off.sh
1212tests/test_suites/llm/performance/grpo-llama3.1-8b-instruct-2n8g-async-1off.sh
1313tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh
1414tests/test_suites/llm/performance/grpo-qwen3-235b-32n8g-async-1off.sh
15- tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.sh
15+ tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-async-1off.sh
16+
17+ tests/test_suites/llm/performance/grpo-qwen3-30ba3b-24n8g-async-8off.sh
You can’t perform that action at this time.
0 commit comments