cp: feat: Add Nemotron‑3 Nano 30B A3B BF16 SFT nightly tests (FSDP2, +LoRA) (1648) into r0.5.0 (#1697)

chtruong814 · RayenTian · web-flow · commit e883ac420a00 · 2025-12-24T14:43:36.000Z
Signed-off-by: ruit <ruit@nvidia.com>
Signed-off-by: NeMo Bot <nemo-bot@nvidia.com>
Co-authored-by: Rayen <ruit@nvidia.com>
diff --git a/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml b/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.yaml
@@ -0,0 +1,26 @@
+defaults: ../../sft.yaml
+sft:
+  max_num_steps: 100
+checkpointing:
+  enabled: false
+policy:
+  model_name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16
+  train_global_batch_size: 16
+  max_total_sequence_length: 2048
+  dtensor_cfg:
+    lora_cfg:
+      enabled: true
+      dim: 256
+      alpha: 512
+      use_triton: false
+logger:
+  wandb:
+    project: nemo-rl
+    name: sft-nanov3-30BA3B-2n8g-fsdp2-lora
+  tensorboard:
+    log_dir: tb_logs-sft-nanov3-30BA3B-2n8g-fsdp2-lora
+  mlflow:
+    run_name: sft-nanov3-30BA3B-2n8g-fsdp2-lora
+cluster:
+  gpus_per_node: 8
+  num_nodes: 2
diff --git a/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2.yaml b/examples/configs/recipes/llm/sft-nanov3-30BA3B-2n8g-fsdp2.yaml
@@ -0,0 +1,20 @@
+defaults: ../../sft.yaml
+sft:
+  max_num_steps: 100
+checkpointing:
+  enabled: false
+policy:
+  model_name: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-Base-BF16
+  train_global_batch_size: 16
+  max_total_sequence_length: 2048
+logger:
+  wandb:
+    project: nemo-rl
+    name: sft-nanov3-30BA3B-2n8g-fsdp2
+  tensorboard:
+    log_dir: tb_logs-sft-nanov3-30BA3B-2n8g-fsdp2
+  mlflow:
+    run_name: sft-nanov3-30BA3B-2n8g-fsdp2
+cluster:
+  gpus_per_node: 8
+  num_nodes: 2
diff --git a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+source $SCRIPT_DIR/common.env
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=2
+STEPS_PER_RUN=20  # step_time ~ 10sec
+MAX_STEPS=20
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
+NUM_MINUTES=15 # Usually 15 minutes is enough for 20 steps, but we add a buffer of 3 minutes in metrics check
+# ===== END CONFIG =====
+
+exit_if_max_steps_reached
+
+# Run the experiment
+cd $PROJECT_ROOT
+uv run examples/run_sft.py \
+    --config $CONFIG_PATH \
+    sft.max_num_steps=$MAX_STEPS \
+    logger.log_dir=$LOG_DIR \
+    logger.wandb_enabled=True \
+    logger.wandb.project=nemo-rl \
+    logger.wandb.name=$EXP_NAME \
+    logger.monitor_gpus=True \
+    logger.tensorboard_enabled=True \
+    checkpointing.checkpoint_dir=$CKPT_DIR \
+    $@ \
+    2>&1 | tee $RUN_LOG
+
+# Convert tensorboard logs to json
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+
+# Only run metrics if the target step is reached
+if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
+    uv run tests/check_metrics.py $JSON_METRICS \
+        'data["train/loss"]["20"] < 2.03' \
+        'mean(data["timing/train/total_step_time"], 2) < 18'
+fi
diff --git a/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh b/tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
+source $SCRIPT_DIR/common.env
+
+# ===== BEGIN CONFIG =====
+NUM_NODES=2
+STEPS_PER_RUN=20  # step_time ~ 15sec
+MAX_STEPS=20
+NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
+NUM_MINUTES=15
+# ===== END CONFIG =====
+
+exit_if_max_steps_reached
+
+# Run the experiment
+cd $PROJECT_ROOT
+uv run examples/run_sft.py \
+    --config $CONFIG_PATH \
+    sft.max_num_steps=$MAX_STEPS \
+    logger.log_dir=$LOG_DIR \
+    logger.wandb_enabled=True \
+    logger.wandb.project=nemo-rl \
+    logger.wandb.name=$EXP_NAME \
+    logger.monitor_gpus=True \
+    logger.tensorboard_enabled=True \
+    checkpointing.checkpoint_dir=$CKPT_DIR \
+    $@ \
+    2>&1 | tee $RUN_LOG
+
+# Convert tensorboard logs to json
+uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
+
+# Only run metrics if the target step is reached
+if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
+    uv run tests/check_metrics.py $JSON_METRICS \
+        'data["train/loss"]["20"] < 1.98' \
+        'mean(data["timing/train/total_step_time"], 2) < 15'
+fi
diff --git a/tests/test_suites/nightly.txt b/tests/test_suites/nightly.txt
@@ -90,6 +90,10 @@ tests/test_suites/llm/sft-qwen2.5-math7b-2n8g-megatron.sh
 # gpt-oss 20b DeepEP test
 tests/test_suites/llm/sft-gpt-oss-20b-1n8g-fsdp8ep8-automodel.sh
 
+# Nemotron 3 Nano 30B A3B Base BF16 tests
+tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2.sh
+tests/test_suites/llm/sft-nanov3-30BA3B-2n8g-fsdp2-lora.sh
+
 #######
 # DPO #
 #######
diff --git a/tests/unit/test_recipes_and_test_suites.py b/tests/unit/test_recipes_and_test_suites.py
@@ -180,7 +180,7 @@ def test_all_recipe_yamls_accounted_for_in_test_suites(
     )
 
 
-def test_nightly_compute_stays_below_1130_hours(nightly_test_suite, tracker):
+def test_nightly_compute_stays_below_1140_hours(nightly_test_suite, tracker):
     command = f"DRYRUN=1 HF_HOME=... HF_DATASETS_CACHE=... CONTAINER= ACCOUNT= PARTITION= ./tools/launch {' '.join(nightly_test_suite)}"
 
     print(f"Running command: {command}")
@@ -212,8 +212,8 @@ def test_nightly_compute_stays_below_1130_hours(nightly_test_suite, tracker):
         f"Last line of output was not as expected: '{last_line}'"
     )
     total_gpu_hours = float(last_line.split(":")[-1].strip())
-    assert total_gpu_hours <= 1130, (
-        f"Total GPU hours exceeded 1130: {last_line}. We should revisit the test suites to reduce the total GPU hours."
+    assert total_gpu_hours <= 1140, (
+        f"Total GPU hours exceeded 1140: {last_line}. We should revisit the test suites to reduce the total GPU hours."
     )
     tracker.track("total_nightly_gpu_hours", total_gpu_hours)
 

Original file line number	Diff line number	Diff line change
`@@ -180,7 +180,7 @@ def test_all_recipe_yamls_accounted_for_in_test_suites(`
`180`	`180`	`)`
`181`	`181`
`182`	`182`
`183`		`-def test_nightly_compute_stays_below_1130_hours(nightly_test_suite, tracker):`
	`183`	`+def test_nightly_compute_stays_below_1140_hours(nightly_test_suite, tracker):`
`184`	`184`	`command = f"DRYRUN=1 HF_HOME=... HF_DATASETS_CACHE=... CONTAINER= ACCOUNT= PARTITION= ./tools/launch {' '.join(nightly_test_suite)}"`
`185`	`185`
`186`	`186`	`print(f"Running command: {command}")`
`@@ -212,8 +212,8 @@ def test_nightly_compute_stays_below_1130_hours(nightly_test_suite, tracker):`
`212`	`212`	`f"Last line of output was not as expected: '{last_line}'"`
`213`	`213`	`)`
`214`	`214`	`total_gpu_hours = float(last_line.split(":")[-1].strip())`
`215`		`- assert total_gpu_hours <= 1130, (`
`216`		`- f"Total GPU hours exceeded 1130: {last_line}. We should revisit the test suites to reduce the total GPU hours."`
	`215`	`+ assert total_gpu_hours <= 1140, (`
	`216`	`+ f"Total GPU hours exceeded 1140: {last_line}. We should revisit the test suites to reduce the total GPU hours."`
`217`	`217`	`)`
`218`	`218`	`tracker.track("total_nightly_gpu_hours", total_gpu_hours)`
`219`	`219`