 import pytest
 import torch
 from utils.llm_data import llm_models_root
-from utils.util import similar, skip_gpu_memory_less_than
+from utils.util import skip_fp8_pre_ada, skip_gpu_memory_less_than
 
 from tensorrt_llm import LLM
 from tensorrt_llm.llmapi import KvCacheConfig
 from tensorrt_llm.llmapi.llm import RequestOutput
-from tensorrt_llm.llmapi.llm_args import CudaGraphConfig
+from tensorrt_llm.llmapi.llm_args import CudaGraphConfig, LoadFormat
 from tensorrt_llm.sampling_params import SamplingParams
 
 
@@ -37,9 +37,15 @@ def create_nemotron_h_llm(model_folder,
                           max_batch_size,
                           mamba_ssm_cache_dtype=None,
                           enable_chunked_prefill=False,
-                          max_num_tokens=8192):
+                          max_num_tokens=8192,
+                          load_format=None):
     """Create LLM with specific overlap scheduler setting"""
     model_dir = f"{llm_models_root(check=True)}/{model_folder}"
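+    # Collect optional kwargs so values left as None fall back to the LLM
+    # constructor's own defaults instead of being passed through explicitly.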
+    kwargs = {}
+    if max_num_tokens is not None:
+        kwargs["max_num_tokens"] = max_num_tokens
+    if load_format is not None:
+        kwargs["load_format"] = load_format
     return LLM(
         model=model_dir,
         tensor_parallel_size=1,
@@ -52,19 +58,71 @@ def create_nemotron_h_llm(model_folder,
             if mamba_ssm_cache_dtype is None else mamba_ssm_cache_dtype),
         sampler_type="TRTLLMSampler",
         enable_chunked_prefill=enable_chunked_prefill,
-        **({} if max_num_tokens is None else {
-            "max_num_tokens": max_num_tokens
-        }),
+        **kwargs,
     )
 
 
+@pytest.mark.parametrize("mamba_ssm_cache_dtype", [None, "float32"],
+                         ids=lambda n: f"mamba_ssm_cache_dtype:{n}")
+@pytest.mark.parametrize("model_folder", [
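+    # Memory guards: roughly 2 bytes/param (BF16) vs. 1 byte/param (FP8) for
+    # the 30B checkpoints, plus 1 GiB of headroom.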
+    pytest.param("NVIDIA-Nemotron-3-Nano-30B-A3B-BF16",
+                 marks=skip_gpu_memory_less_than((2 * 30 + 1) * 2**30)),
+    pytest.param("NVIDIA-Nemotron-3-Nano-30B-A3B-FP8",
+                 marks=skip_gpu_memory_less_than((30 + 1) * 2**30)),
+])
+def test_nemotron_h_sanity(mamba_ssm_cache_dtype, model_folder):
+    # Skip test if FP8 is not supported on the current architecture.
+    use_fp8 = model_folder == "NVIDIA-Nemotron-3-Nano-30B-A3B-FP8"
+    skip_fp8_pre_ada(use_fp8)
+
+    torch.cuda.empty_cache()
+
+    text_prompts = [
+        "The future of AI is",
+        "The president of the United States is",
+    ]
+    num_prompts = len(text_prompts)
+
+    with create_nemotron_h_llm(
+            model_folder=model_folder,
+            use_cuda_graph=False,
+            disable_overlap_scheduler=False,
+            max_batch_size=num_prompts,
+            mamba_ssm_cache_dtype=mamba_ssm_cache_dtype,
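+            # Dummy weights skip checkpoint loading; this sanity test only
+            # exercises the execution paths, not output quality.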
+            load_format=LoadFormat.DUMMY,
+    ) as nemotron_h:
+        sampling_params = SamplingParams(max_tokens=9,
+                                         temperature=0.0,
+                                         add_special_tokens=False,
+                                         return_context_logits=True,
+                                         return_generation_logits=True)
+
+        # Non-batching prefill sanity check.
+        _ = [
+            nemotron_h.generate(text_prompt, sampling_params)
+            for text_prompt in text_prompts
+        ]
+
+        # Batching prefill sanity check.
+        results_batching = nemotron_h.generate(text_prompts, sampling_params)
+        completions_batching = [
+            result.outputs[0].text for result in results_batching
+        ]
+
+        # Decoding sanity check.
+        text_prompts_with_completions = [
+            f"{text_prompts[i]} {completions_batching[i]}"
+            for i in range(num_prompts)
+        ]
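+        # A single extra token per extended prompt is enough to exercise the
+        # decode path on top of the batched completions above.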
+        sampling_params.max_tokens = 1
+        nemotron_h.generate(text_prompts_with_completions, sampling_params)
+
+
 @pytest.mark.parametrize("mamba_ssm_cache_dtype", [None, "float32"],
                          ids=lambda n: f"mamba_ssm_cache_dtype:{n}")
 @pytest.mark.parametrize("model_folder", [
     pytest.param("Nemotron-H-8B-Base-8K",
                  marks=skip_gpu_memory_less_than((2 * 8 + 1) * 2**30)),
-    pytest.param("Nemotron-Nano-3-30B-A3.5B-dev-1024",
-                 marks=skip_gpu_memory_less_than((2 * 30 + 1) * 2**30)),
 ])
 def test_nemotron_h_correctness(mamba_ssm_cache_dtype, model_folder):
     torch.cuda.empty_cache()
@@ -152,50 +210,6 @@ def test_nemotron_h_correctness(mamba_ssm_cache_dtype, model_folder):
                 -0.04291720315814018
             ])
         ]
-    elif model_folder == "Nemotron-Nano-3-30B-A3.5B-dev-1024":
-
-        expected_completions = [
-            " bright, with endless possibilities for innovation and growth",
-            " the head of state and head of government of",
-        ]
-
-        # Copied from prefill_logprobs_no_batching[0] directly.
-        prefill_logprobs_ref_mcore = torch.tensor(
-            [-8.5145, -0.8952, -2.3531, -1.6690])
-
-        # reference logprobs from initial implementation (commit e4e42e0ec30227866ce30fc9c93d5e49352bb79c on single H200).
-        initial_impl_atol = 2.0
-        batching_atol = 2.0
-
-        prefill_logprobs_ref_initial_no_batching = [
-            torch.tensor([-8.5145, -0.8952, -2.3531, -1.6690]),
-            torch.tensor([-9.9306, -1.4935, -0.4787, -1.4945, -0.0195, -1.5253])
-        ]
-        prefill_logprobs_ref_initial_with_batching = [
-            torch.tensor([-8.5221, -0.8114, -2.4334, -1.6909]),
-            torch.tensor([-9.9466, -1.5095, -0.5282, -1.4701, -0.0185, -1.4108])
-        ]
-
-        decode_logprobs_ref_initial_no_batching = [
-            torch.tensor([
-                -9.2718e-01, -9.7786e-01, -7.5823e-01, -3.3243e-01, -8.7978e-01,
-                -3.2046e-02, -9.5047e-01, -9.2678e-01, -2.5973e-04
-            ]),
-            torch.tensor([
-                -1.6836, -0.8289, -0.0063, -0.5166, -0.1798, -0.6075, -1.0987,
-                -0.9075, -0.0025
-            ])
-        ]
-        decode_logprobs_ref_initial_with_batching = [
-            torch.tensor([
-                -9.0849e-01, -9.3238e-01, -8.2788e-01, -3.5542e-01, -9.0881e-01,
-                -3.4794e-02, -9.4975e-01, -9.2631e-01, -2.4041e-04
-            ]),
-            torch.tensor([
-                -1.6331, -0.7666, -0.0063, -0.5110, -0.1617, -0.6578, -1.1073,
-                -1.1447, -0.0024
-            ])
-        ]
     else:
         raise ValueError(f"Invalid model folder: {model_folder}")
 
@@ -255,14 +269,9 @@ def test_nemotron_h_correctness(mamba_ssm_cache_dtype, model_folder):
             atol=initial_impl_atol,
             rtol=0.0)
 
-        if model_folder == "Nemotron-H-8B-Base-8K":
-            # compare expected completion
-            assert completions_batching[i] == expected_completions[i]
-            assert completions_no_batching[i] == expected_completions[i]
-        else:
-            assert similar(completions_batching[i],
-                           completions_no_batching[i],
-                           threshold=0.5)
+        # compare expected completion
+        assert completions_batching[i] == expected_completions[i]
+        assert completions_no_batching[i] == expected_completions[i]
 
         # compare decode logprobs with initial implementation
         torch.testing.assert_close(