Skip to content

Commit 868a898

Browse files
committed
17.5 step time
Signed-off-by: Terry Kong <[email protected]>
1 parent 1fce978 commit 868a898

File tree

1 file changed

+2
-1
lines changed

1 file changed

+2
-1
lines changed

tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.sh

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -34,11 +34,12 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
3434

3535
# Only run metrics if the target step is reached
3636
if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
37+
# total_step_time was observed at ~16, so the threshold is 17.5 to leave a buffer
3738
uv run tests/check_metrics.py $JSON_METRICS \
3839
'median(data["train/token_mult_prob_error"]) < 1.1' \
3940
'data["train/token_mult_prob_error"]["500"] < 1.1' \
4041
'data["train/reward"]["500"] > 0.1' \
41-
'mean(data["timing/train/total_step_time"], -6, -1) < 10.5'
42+
'mean(data["timing/train/total_step_time"], -6, -1) < 17.5'
4243

4344
# Clean up checkpoint directory after successful run to save space.
4445
rm -rf "$CKPT_DIR"

0 commit comments

Comments
 (0)