Commit d149a62

test: enable 8k/16k/24k deepscaler nightly tests (#934)
Signed-off-by: Terry Kong <[email protected]>
1 parent b721703
File tree: 17 files changed, +177 −94 lines changed

docs/guides/eval.md

Lines changed: 1 addition & 1 deletion

````diff
@@ -79,7 +79,7 @@ When you complete the evaluation, you will receive a summary similar to the foll
 ```
 ============================================================
 model_name='Qwen2.5-Math-1.5B-Instruct' dataset_name='aime2024'
-max_new_tokens=2048 temperature=0.0 top_p=1.0 top_k=-1
+max_new_tokens=2048 temperature=0.0 top_p=1.0 top_k=-1 seed=42
 
 metric=pass@1 num_tests_per_prompt=1
 
````

docs/guides/grpo-deepscaler.md

Lines changed: 7 additions & 1 deletion

````diff
@@ -35,11 +35,17 @@ Throughout training, the checkpoints of the model will be saved to the `results`
 uv run examples/run_eval.py \
     generation.model_name=results/grpo-deepscaler-1.5b-8K/step_240/hf \
     data.prompt_file=examples/prompts/cot.txt \
-    generation.vllm_cfg.max_model_len=32768
+    generation.vllm_cfg.max_model_len=32768 \
+    generation.vllm_cfg.enforce_eager=True \
+    generation.temperature=1.0
 ```
 
 Use `generation.model_name` to specify the path to the Hugging Face checkpoint. In addition, we use AIME24 as the validation dataset and calculate pass@1 on it throughout training.
 
+> [!NOTE]
+> AIME24 only has 30 examples, so the accuracy can be very noisy.
+> To reduce the variance, consider running `run_eval.py` with `eval.num_tests_per_prompt=16`.
+
 ## Evaluation Results
 Using the above instructions to train DeepSeek-R1-Distill-Qwen-1.5B on the DeepScaleR dataset, we can track the model's performance on the AIME24 benchmark throughout training. The following plot shows the evaluation metrics as training progresses:
````
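The note about noisy AIME24 accuracy can be made concrete: averaging `k` sampled attempts per problem shrinks the sampling-noise component of mean pass@1 by roughly `sqrt(k)`. A minimal Monte Carlo sketch, using hypothetical per-problem pass rates rather than real benchmark data:

```python
import random

random.seed(0)
# Hypothetical per-problem pass rates for a 30-problem benchmark like AIME24.
rates = [random.random() for _ in range(30)]

def run(num_tests):
    # Mean pass@1 estimate with `num_tests` sampled attempts per problem.
    return sum(
        sum(random.random() < r for _ in range(num_tests)) / num_tests
        for r in rates
    ) / len(rates)

def spread(num_tests, trials=200):
    # Standard deviation of the score across repeated evaluations.
    scores = [run(num_tests) for _ in range(trials)]
    mu = sum(scores) / trials
    return (sum((s - mu) ** 2 for s in scores) / trials) ** 0.5

# Sampling noise shrinks roughly 4x when averaging 16 tests per prompt.
print(spread(1), spread(16))
```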

nemo_rl/data/processors.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -64,7 +64,9 @@ def math_data_processor(
         add_generation_prompt=True,
         add_special_tokens=False,
     )
-    user_message["token_ids"] = tokenizer(message, return_tensors="pt")["input_ids"][0]
+    user_message["token_ids"] = tokenizer(
+        message, return_tensors="pt", add_special_tokens=False
+    )["input_ids"][0]
     user_message["content"] = message
     message_log.append(user_message)
```

nemo_rl/evals/eval.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -485,14 +485,15 @@ def _print_results(
     dataset_name = os.path.basename(master_config["data"]["dataset_name"])
     model_name = os.path.basename(generation_config["model_name"])
     max_new_tokens = generation_config["vllm_cfg"]["max_model_len"]
+    seed = master_config["eval"]["seed"]
     temperature = generation_config["temperature"]
     top_p = generation_config["top_p"]
     top_k = generation_config["top_k"]
     average_score = score / dataset_size
 
     print("\n" + "=" * 60)
     print(f"{model_name=} {dataset_name=}")
-    print(f"{max_new_tokens=} {temperature=} {top_p=} {top_k=}\n")
+    print(f"{max_new_tokens=} {temperature=} {top_p=} {top_k=} {seed=}\n")
     print(f"metric={metric[:-1]}{k_value} {num_tests_per_prompt=}\n")
     print(f"score={average_score:.4f} ({score}/{dataset_size})")
     print("=" * 60 + "\n")
```
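The printing code uses Python's self-documenting f-string expressions (Python 3.8+): `f"{seed=}"` expands to the variable name, an equals sign, and the `repr` of its value, which is why adding `{seed=}` to the format string is the entire change. A quick illustration:

```python
# The "=" specifier inside an f-string renders "name=repr(value)".
seed = 42
temperature = 0.0
summary = f"{temperature=} {seed=}"
print(summary)  # temperature=0.0 seed=42
```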

tests/functional/eval.sh

Lines changed: 1 addition & 1 deletion

```diff
@@ -27,4 +27,4 @@ uv run coverage run -a --data-file=$PROJECT_ROOT/tests/.coverage --source=$PROJE
 cat $RUN_LOG | grep "score=" | sed 's/.*score=\([^ ]*\).*/{"score": \1}/' > $JSON_METRICS
 
 uv run tests/check_metrics.py $JSON_METRICS \
-    'data["score"] == 0.1' \
+    'data["score"] == 0.1'
```

tests/functional/eval_async.sh

Lines changed: 1 addition & 1 deletion

```diff
@@ -29,4 +29,4 @@ uv run coverage run -a --data-file=$PROJECT_ROOT/tests/.coverage --source=$PROJE
 cat $RUN_LOG | grep "score=" | sed 's/.*score=\([^ ]*\).*/{"score": \1}/' > $JSON_METRICS
 
 uv run tests/check_metrics.py $JSON_METRICS \
-    'data["score"] == 0.1' \
+    'data["score"] == 0.1'
```

tests/functional/grpo.sh

Lines changed: 1 addition & 1 deletion

```diff
@@ -38,5 +38,5 @@ uv run coverage run -a --data-file=$PROJECT_ROOT/tests/.coverage --source=$PROJE
 uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 
 uv run tests/check_metrics.py $JSON_METRICS \
-    'max(data["train/token_mult_prob_error"]) < 1.05' \
+    'max(data["train/token_mult_prob_error"]) < 1.05'
 
```

tests/functional/grpo_megatron.sh

Lines changed: 1 addition & 1 deletion

```diff
@@ -41,5 +41,5 @@ uv run coverage run -a --data-file=$PROJECT_ROOT/tests/.coverage --source=$PROJE
 uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 
 uv run tests/check_metrics.py $JSON_METRICS \
-    'max(data["train/token_mult_prob_error"]) < 1.05' \
+    'max(data["train/token_mult_prob_error"]) < 1.05'
 
```

tests/functional/grpo_multiturn.sh

Lines changed: 1 addition & 1 deletion

```diff
@@ -41,5 +41,5 @@ uv run coverage run -a --data-file=$PROJECT_ROOT/tests/.coverage --source=$PROJE
 uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 
 uv run tests/check_metrics.py $JSON_METRICS \
-    'max(data["train/token_mult_prob_error"]) < 1.1' \
+    'max(data["train/token_mult_prob_error"]) < 1.1'
 
```

tests/functional/grpo_non_colocated.sh

Lines changed: 1 addition & 1 deletion

```diff
@@ -39,5 +39,5 @@ uv run coverage run -a --data-file=$PROJECT_ROOT/tests/.coverage --source=$PROJE
 uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 
 uv run tests/check_metrics.py $JSON_METRICS \
-    'max(data["train/token_mult_prob_error"]) < 1.05' \
+    'max(data["train/token_mult_prob_error"]) < 1.05'
 
```
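All six script changes drop a dangling backslash after the final argument. In POSIX shells, a line-ending `\` splices the next line into the same command, so a stray trailing backslash can silently swallow whatever line follows. A small demonstration, driving `sh -c` from Python on a hypothetical snippet:

```python
import subprocess

# With a trailing backslash, the second line becomes arguments to echo
# instead of running as its own command.
spliced = "echo a \\\necho b"
out = subprocess.run(["sh", "-c", spliced], capture_output=True, text=True)
print(out.stdout)  # a echo b

# Without it, both commands run separately.
clean = "echo a\necho b"
out2 = subprocess.run(["sh", "-c", clean], capture_output=True, text=True)
print(out2.stdout)  # a, then b, on separate lines
```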
