Skip to content

Commit 8cfb363

Browse files
committed
Update llama torch model + llm api pytorch test
Signed-off-by: jintaop <jintaop@nvidia.com>
1 parent 2790927 commit 8cfb363

File tree

2 files changed

+14
-1
lines changed

tensorrt_llm/_torch/models/modeling_llama.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -635,7 +635,7 @@ def __init__(
635635
)
636636
self.is_nvfp4 = self.is_quanted and model_config.quant_config.quant_mode.has_nvfp4(
637637
)
638-
638+
# Self Attention
639639
self.self_attn = LlamaAttention(
640640
model_config,
641641
layer_idx=layer_idx,

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,19 @@ def test_nvfp4(self):
9595
task.evaluate(llm)
9696
task = MMLU(self.MODEL_NAME)
9797
task.evaluate(llm)
98+
99+
def test_nvfp4_with_norm_quant(self, monkeypatch):
    """Accuracy test for the NVFP4-quantized Llama 3.1 8B with the
    norm+quant (layernorm/NVFP4 fusion) path enabled.

    Runs CnnDailymail and MMLU through the PyTorch LLM API and checks the
    loaded checkpoint reports the NVFP4 quant algo.
    """
    # Check hardware support BEFORE constructing the LLM: building the
    # engine/model is expensive, and the original placement of this skip
    # inside the `with LLM(...)` block paid that cost only to skip.
    sm_version = get_sm_version()
    if sm_version not in (100, 103):
        # NOTE(review): plain string — the original used an f-string with
        # no placeholders.
        pytest.skip("test_nvfp4_with_norm_quant supports SM 100 and 103 only")

    # The fusion toggle must be set BEFORE the LLM is constructed so the
    # model-build/fusion pass observes it; setting it after construction
    # (as the original did) has no effect on the already-built model.
    # "0" means the NVFP4 layernorm fusion is NOT disabled, i.e. enabled.
    monkeypatch.setenv("TRTLLM_DISABLE_NVFP4_LAYERNORM_FUSION", "0")

    model_path = f"{llm_models_root()}/nvfp4-quantized/Meta-Llama-3.1-8B"
    with LLM(model_path) as llm:
        # Sanity-check the checkpoint actually loaded as NVFP4.
        assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
        task = CnnDailymail(self.MODEL_NAME)
        task.evaluate(llm)
        task = MMLU(self.MODEL_NAME)
        task.evaluate(llm)
98111

99112
@skip_pre_blackwell
100113
@pytest.mark.parametrize("stream_interval", [4, 64],

0 commit comments

Comments
 (0)