Skip to content

Commit 20b69a9

Browse files
galagam and nvchenghaoz authored
[NVIDIA#10056][test] AutoDeploy: Add accuracy test for Nemotron SuperV3 (NVIDIA#10131)
Signed-off-by: Gal Hubara Agam <[email protected]> Signed-off-by: Chenghao Zhang <[email protected]> Co-authored-by: Chenghao Zhang <[email protected]>
1 parent 5489d18 commit 20b69a9

File tree

3 files changed

+51
-0
lines changed

3 files changed

+51
-0
lines changed

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,8 @@ bigcode/starcoder2-15b:
289289
- accuracy: 54.5
290290
mistral/Mistral-Large-3-675B:
291291
- accuracy: 90.83
292+
nvidia/Nemotron-Super-V3:
293+
- accuracy: 84.38
292294
nvidia/Nemotron-3-Nano:
293295
- accuracy: 69.37
294296
- quant_algo: FP8

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,8 @@ mistralai/Mistral-Nemo-12b-Base:
346346
accuracy: 69.66
347347
mistral/Mistral-Large-3-675B:
348348
- accuracy: 87.54
349+
nvidia/Nemotron-Super-V3:
350+
- accuracy: 79.41
349351
nvidia/Nemotron-3-Nano:
350352
- accuracy: 73.85
351353
- quant_algo: FP8

tests/integration/defs/accuracy/test_llm_api_autodeploy.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,3 +232,50 @@ def test_fp8(self):
232232
task.evaluate(llm, sampling_params=sampling_params)
233233
task = GSM8K(self.MODEL_NAME)
234234
task.evaluate(llm)
235+
236+
237+
class TestNemotronSuperV3(LlmapiAccuracyTestHarness):
    """Accuracy harness for Nemotron Super V3 via the AutoDeploy LLM API.

    Runs MMLU and GSM8K against the BF16 checkpoint and compares the
    scores with the reference values recorded in the accuracy YAMLs.
    """

    MODEL_NAME = "nvidia/Nemotron-Super-V3"
    # TODO(review): hard-coded scratch path — original author noted it should
    # move under llm_models_root once permissions allow.
    MODEL_PATH_BF16 = "/scratch/models/super-v3-iter_0440000/hf"

    def get_default_kwargs(self):
        """Return the keyword arguments used to construct the AutoDeploy LLM."""
        # Sharding picked up from both the model factory and heuristics;
        # restricted to expert-parallel and bmm dimensions.
        sharding_cfg = {
            "detect_sharding": {
                "sharding_source": ['factory', 'heuristic'],
                "sharding_dims": ['ep', 'bmm'],
            },
        }
        return {
            "skip_tokenizer_init": False,
            "trust_remote_code": True,
            "skip_loading_weights": False,
            "compile_backend": "torch-cudagraph",
            # NOTE(review): conservative KV-cache budget — original comment
            # suggests this could be raised.
            "free_mem_ratio": 0.5,
            "max_batch_size": 128,
            "max_seq_len": 8192,
            "max_num_tokens": 8192,
            "cuda_graph_batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128],
            "transforms": sharding_cfg,
        }

    def get_default_sampling_params(self):
        """Build the SamplingParams shared by the accuracy tasks.

        Uses -1 for both end and pad ids and greedy single-beam decoding.
        """
        end_token = -1
        num_beams = 1
        return SamplingParams(
            end_id=end_token,
            pad_id=end_token,
            n=num_beams,
            use_beam_search=num_beams > 1,
        )

    # NOTE(review): 32 GB per device may be insufficient per the original
    # author's comment — confirm before relying on this gate.
    @pytest.mark.skip_less_device_memory(
        32000)
    @pytest.mark.skip_less_device(8)
    def test_bf16(self):
        """Evaluate the BF16 checkpoint on MMLU then GSM8K with 8-way TP."""
        llm_kwargs = self.get_default_kwargs()
        sampling = self.get_default_sampling_params()
        with AutoDeployLLM(model=self.MODEL_PATH_BF16,
                           tokenizer=self.MODEL_PATH_BF16,
                           world_size=8,
                           **llm_kwargs) as llm:
            for task in (MMLU(self.MODEL_NAME), ):
                task.evaluate(llm, sampling_params=sampling)
            gsm8k = GSM8K(self.MODEL_NAME)
            gsm8k.evaluate(llm)

0 commit comments

Comments
 (0)