Skip to content

Commit 95ec444

Browse files
committed
update nightly test
Signed-off-by: Yuki Huang <yukih@nvidia.com>
1 parent ccef1a1 commit 95ec444

7 files changed

+36
-79
lines changed
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
# GRPO recipe: Llama 3.2 1B Instruct, 1 node x 8 GPUs, FSDP2 with
# tensor parallelism 2, and explicit sampling parameters
# (temperature 0.8, top-p 0.9, top-k 50).
# Overrides the fsdp2tp1 v3 base recipe; only deltas are listed here.
# NOTE(review): nesting reconstructed from a flattened diff view —
# `generation` is assumed to sit under `policy` and `wandb` under
# `logger`; confirm against the base config's schema.
defaults: grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.yaml
checkpointing:
  checkpoint_dir: results/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp2-temp0.8-topp0.9-topk50
policy:
  dtensor_cfg:
    # Bump TP from the base recipe's 1 to 2 (recipe name says fsdp2tp2).
    tensor_parallel_size: 2
  generation:
    temperature: 0.8
    top_p: 0.9
    top_k: 50
logger:
  log_dir: logs/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp2-temp0.8-topp0.9-topk50
  wandb:
    name: grpo-llama3.2-1b-instruct-1n8g-fsdp2tp2-temp0.8-topp0.9-topk50

examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron-sampling.yaml

Lines changed: 0 additions & 37 deletions
This file was deleted.

examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-megatron-temp0.6.yaml

Lines changed: 0 additions & 35 deletions
This file was deleted.
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
# GRPO recipe: Llama 3.2 1B Instruct, 1 node x 8 GPUs, Megatron backend
# with tensor model parallelism 2, and explicit sampling parameters
# (temperature 0.8, top-p 0.9, top-k 50).
# Overrides the base megatron recipe; only deltas are listed here.
# NOTE(review): nesting reconstructed from a flattened diff view —
# `generation` is assumed to sit under `policy` and `wandb` under
# `logger`; confirm against the base config's schema.
defaults: grpo-llama3.2-1b-instruct-1n8g-megatron.yaml
checkpointing:
  checkpoint_dir: results/grpo-llama3.2-1b-instruct-1n8g-megatron-temp0.8-topp0.9-topk50
policy:
  megatron_cfg:
    # Megatron uses tensor_model_parallel_size (cf. dtensor's
    # tensor_parallel_size in the fsdp2 variant of this recipe).
    tensor_model_parallel_size: 2
  generation:
    temperature: 0.8
    top_p: 0.9
    top_k: 50
logger:
  log_dir: logs/grpo-llama3.2-1b-instruct-1n8g-megatron-temp0.8-topp0.9-topk50
  wandb:
    name: grpo-llama3.2-1b-instruct-1n8g-megatron-temp0.8-topp0.9-topk50

tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron-sampling.sh renamed to tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp2-temp0.8-topp0.9-topk50.sh

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ NUM_NODES=1
77
STEPS_PER_RUN=500
88
MAX_STEPS=500
99
NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up
10-
NUM_MINUTES=180
10+
NUM_MINUTES=120
1111
# ===== END CONFIG =====
1212

1313
exit_if_max_steps_reached
@@ -34,10 +34,9 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
3434
# Only run metrics if the target step is reached
3535
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
3636
uv run tests/check_metrics.py $JSON_METRICS \
37-
'mean(data["train/token_mult_prob_error"], ignore_top_p=0.01) < 1.05' \
37+
'median(data["train/token_mult_prob_error"]) < 1.1' \
3838
'data["train/token_mult_prob_error"]["500"] < 1.1' \
39-
'data["train/reward"]["500"] > 0.1' \
40-
'mean(data["timing/train/total_step_time"], -6, -1) < 12.5'
39+
'mean(data["timing/train/total_step_time"], -6, -1) < 10'
4140

4241
# Clean up checkpoint directory after successful run to save space.
4342
rm -rf "$CKPT_DIR"

tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron-temp0.6.sh renamed to tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron-temp0.8-topp0.9-topk50.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
3434
# Only run metrics if the target step is reached
3535
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
3636
uv run tests/check_metrics.py $JSON_METRICS \
37-
'mean(data["train/token_mult_prob_error"], ignore_top_p=0.01) < 1.05' \
37+
'median(data["train/token_mult_prob_error"]) < 1.1' \
3838
'data["train/token_mult_prob_error"]["500"] < 1.1' \
3939
'data["train/reward"]["500"] > 0.1' \
4040
'mean(data["timing/train/total_step_time"], -6, -1) < 10.5'

tests/test_suites/nightly.txt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,6 @@ tests/test_suites/llm/grpo-moonlight-16b-automodel-1n8g-ep8.sh
2323
# Megatron
2424
tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron.sh
2525
tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.sh
26-
tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron-sampling.sh
27-
tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron-temp0.6.sh
2826

2927
# Functional 32b run
3028
tests/test_suites/llm/grpo-qwen2.5-32b-32n8g-fsdp2tp8-actckpt.v3.sh
@@ -80,6 +78,10 @@ tests/test_suites/llm/grpo-nanov3-30BA3B-2n8g-megatron-lora.sh
8078
tests/test_suites/llm/grpo-qwen3-8B-base-1n8g-fsdp2-lora.sh
8179
tests/test_suites/llm/grpo-qwen3-8b-base-1n8g-megatron-lora.sh
8280

81+
# Sampling (temperature, top-p, top-k)
82+
tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp2-temp0.8-topp0.9-topk50.sh
83+
tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron-temp0.8-topp0.9-topk50.sh
84+
8385
#######
8486
# SFT #
8587
#######

0 commit comments

Comments (0)