Commit 2f3e3ae

[https://nvbugs/5516710][fix] fix Llama 3.3 TP PP case (#7717)
Signed-off-by: Yan Chunwei <[email protected]>
1 parent: 015e149

File tree: 4 files changed, +35 −4 lines

tensorrt_llm/_torch/models/modeling_llama.py (4 additions, 2 deletions)

```diff
@@ -560,7 +560,8 @@ def forward(
         # Adjust the scale and fusion pattern.
         if self.next_attn is not None and (self.is_nvfp4
                                            or self.is_fp8_quant):
-            scale = self.next_attn.qkv_proj.input_scale
+            scale = self.next_attn.qkv_proj.input_scale if hasattr(
+                self.next_attn.qkv_proj, 'input_scale') else None
         else:
             self.post_feed_forward_fusion_op = AllReduceFusionOp.RESIDUAL_RMS_NORM
             scale = None
@@ -769,7 +770,8 @@ def forward(
         # Adjust the scale and fusion pattern.
         if self.next_attn is not None and (self.is_nvfp4
                                            or self.is_fp8_quant):
-            scale = self.next_attn.qkv_proj.input_scale
+            scale = self.next_attn.qkv_proj.input_scale if hasattr(
+                self.next_attn.qkv_proj, 'input_scale') else None
         else:
             self.post_mlp_fusion_op = AllReduceFusionOp.RESIDUAL_RMS_NORM
             scale = None
```
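Both hunks apply the same defensive lookup: when the next attention layer's QKV projection carries no `input_scale` attribute (e.g. a projection that is not quantized, which the Llama 3.3 TP+PP run appears to have hit), the scale now falls back to `None` instead of raising `AttributeError`. A self-contained sketch of the pattern, using hypothetical stand-in objects rather than the repository's types:

```python
from types import SimpleNamespace

# Hypothetical stand-ins: a quantized projection exposes input_scale,
# an unquantized one does not.
quantized_proj = SimpleNamespace(input_scale=0.5)
unquantized_proj = SimpleNamespace()


def resolve_scale(qkv_proj):
    # Mirrors the guard in the diff: use the quantization scale when
    # present, otherwise fall back to None.
    return qkv_proj.input_scale if hasattr(qkv_proj, "input_scale") else None


assert resolve_scale(quantized_proj) == 0.5
# An unguarded attribute access here would raise AttributeError.
assert resolve_scale(unquantized_proj) is None
```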

tensorrt_llm/_torch/models/modeling_llama_min_latency.py (4 additions, 2 deletions)

```diff
@@ -826,7 +826,8 @@ def forward(
                 fusion_op=AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_FP8,
                 residual=residual,
                 norm_weight=self.next_layer_layernorm.weight,
-                scale=self.next_attn.qkv_proj.input_scale,
+                scale=self.next_attn.qkv_proj.input_scale if hasattr(
+                    self.next_attn.qkv_proj, 'input_scale') else None,
                 eps=self.next_layer_layernorm.variance_epsilon,
             ))
         elif use_fp4_allreduce and self.next_attn is not None:
@@ -837,7 +838,8 @@
                 RESIDUAL_RMS_NORM_QUANT_NVFP4,
                 residual=residual,
                 norm_weight=self.next_layer_layernorm.weight,
-                scale=self.next_attn.qkv_proj.input_scale,
+                scale=self.next_attn.qkv_proj.input_scale if hasattr(
+                    self.next_attn.qkv_proj, 'input_scale') else None,
                 eps=self.next_layer_layernorm.variance_epsilon,
             ))
         else:
```
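The min-latency path embeds the same guard inline in the `scale=` keyword of both fused-allreduce parameter constructions (FP8 and NVFP4). As a design note, `getattr` with a default is an equivalent, more compact spelling of the conditional; a sketch with a plain dict standing in for the real parameter object (hypothetical helper, not the repository's API):

```python
def fused_allreduce_kwargs(next_attn, residual, norm_weight, eps):
    # Hypothetical helper; the real code constructs AllReduce fusion
    # parameter objects. A plain dict stands in here.
    return {
        "residual": residual,
        "norm_weight": norm_weight,
        # Equivalent to: x.input_scale if hasattr(x, "input_scale") else None
        "scale": getattr(next_attn.qkv_proj, "input_scale", None),
        "eps": eps,
    }
```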

tests/integration/test_lists/test-db/l0_dgx_b200.yml (1 addition, 0 deletions)

```diff
@@ -15,6 +15,7 @@ l0_dgx_b200:
     backend: pytorch
   tests:
   - unittest/_torch/multi_gpu_modeling -k "deepseek"
+  - unittest/_torch/multi_gpu_modeling/test_llama3.py::test_llama_3_3
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=FLASHINFER-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[pp4-attn_backend=TRTLLM-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=False-attn_backend=FLASHINFER-torch_compile=False]
```
tests/unittest/_torch/multi_gpu_modeling/test_llama3.py (new file, 26 additions; path inferred from the test-db entry above)

```diff
@@ -0,0 +1,26 @@
+from utils.llm_data import llm_models_root
+from utils.util import similar
+
+from tensorrt_llm import LLM
+
+
+def test_llama_3_3():
+    model_dir = llm_models_root(
+    ) / "llama-3.3-models" / "Llama-3.3-70B-Instruct-FP8"
+    tp = 2
+    pp = 2
+
+    llm = LLM(model_dir, tensor_parallel_size=tp, pipeline_parallel_size=pp)
+    prompts = [
+        "The capital of France is",
+        "The president of the United States is",
+    ]
+
+    outputs = llm.generate(prompts)
+
+    expected_outputs = [
+        " a city of romance, art, fashion, and cuisine. Paris, also known as the City of Light, is a must-visit destination for anyone interested in",
+        " the head of state and head of government of the United States. The president is also the commander-in-chief of the armed forces. The president is elected by the",
+    ]
+    for i, output in enumerate(outputs):
+        assert similar(output.outputs[0].text, expected_outputs[i])
```
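The test runs the FP8 Llama 3.3 70B checkpoint with tensor_parallel_size=2 and pipeline_parallel_size=2 (four GPUs total, matching the l0_dgx_b200 list above) and compares outputs with the test utilities' `similar` helper rather than exact string equality, tolerating minor decoding drift. A rough standard-library sketch of such a fuzzy comparison (an assumption about `utils.util.similar`, not its actual implementation):

```python
from difflib import SequenceMatcher


def similar_sketch(a: str, b: str, threshold: float = 0.8) -> bool:
    # SequenceMatcher.ratio() returns a similarity score in [0.0, 1.0];
    # treat anything at or above the threshold as a match.
    return SequenceMatcher(None, a, b).ratio() >= threshold


assert similar_sketch(
    "The president is elected by the people.",
    "The president is elected by the voters.",
)
```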
