Commit 98966cb

test: Unwaive Llama 3.1 with torch compile test (NVIDIA#3475)
* Fix log info

  Signed-off-by: Yi Zhang <[email protected]>

* Revert "test: Waive torch compile tests (NVIDIA#3471)"

  This reverts commit 410f563.

  Signed-off-by: Yi Zhang <[email protected]>

* Update test_llm_api_pytorch.py

  Signed-off-by: Yi Zhang <[email protected]>

---------

Signed-off-by: Yi Zhang <[email protected]>
1 parent a32389b · commit 98966cb

File tree: 2 files changed (+2 −10 lines)

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 1 addition & 1 deletion
Lines changed: 1 addition & 1 deletion

@@ -521,7 +521,7 @@ def _create_extra_inputs(bs, num_tokens_per_request):
                 # No KV cache space!
                 continue
             logger.info(
-                f"Run warmup for batch size={bs}, pure {'context' if num_tokens_per_request is not None else 'generation'} phase"
+                f"Run warmup for batch size={bs}, pure {'context' if num_tokens_per_request > 1 else 'generation'} phase"
             )
             self.forward(
                 batch,
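The fix hinges on what num_tokens_per_request encodes during warmup: a value greater than one corresponds to a pure context (prefill) pass, while one token per request corresponds to pure generation (decode), so the old "is not None" check presumably mislabeled the generation warmup as "context". A minimal sketch of the corrected phase selection, outside the real model_engine.py; the warmup_phase helper below is a hypothetical illustration, not TensorRT-LLM API:

def warmup_phase(num_tokens_per_request: int) -> str:
    # Mirrors the corrected log condition: more than one token per request
    # means a pure context (prefill) warmup, otherwise pure generation.
    return "context" if num_tokens_per_request > 1 else "generation"


for bs, num_tokens in [(1, 1), (8, 1), (1, 256)]:
    print(f"Run warmup for batch size={bs}, pure {warmup_phase(num_tokens)} phase")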

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 1 addition & 9 deletions
@@ -58,8 +58,6 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
     @parametrize_with_ids("torch_compile", [False, True])
     @parametrize_with_ids("attn_backend", ["TRTLLM", "FLASHINFER"])
     def test_bfloat16(self, attn_backend, torch_compile):
-        if torch_compile:
-            pytest.skip("https://nvbugs/5216737")
         pytorch_config = PyTorchConfig(
             torch_compile_enabled=torch_compile,
             cuda_graph_padding_enabled=torch_compile,
@@ -84,8 +82,6 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, attn_backend,
                 "Pipeline parallel with torch.compile is not supported yet.\n"
                 "Issue: Unfusing flashinfer_fused_add_rmsnorm causes outputs to be "
                 "discarded at graph breaks.")
-        if torch_compile:
-            pytest.skip("https://nvbugs/5216737")
         pytorch_config = PyTorchConfig(
             torch_compile_enabled=torch_compile,
             cuda_graph_padding_enabled=torch_compile,
@@ -107,8 +103,6 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, attn_backend,
     @parametrize_with_ids("attn_backend", ["TRTLLM", "FLASHINFER"])
     @parametrize_with_ids("fp8kv", [False, True])
     def test_fp8(self, fp8kv, attn_backend, torch_compile):
-        if torch_compile:
-            pytest.skip("https://nvbugs/5216737")
         quant_config = QuantConfig(QuantAlgo.FP8)
         pytorch_config = PyTorchConfig(
             torch_compile_enabled=torch_compile,
@@ -140,9 +134,7 @@ def test_fp8(self, fp8kv, attn_backend, torch_compile):
                            ids=["tp4", "tp2pp2"])
     def test_fp8_4gpus(self, tp_size, pp_size, fp8kv, attn_backend,
                        torch_compile):
-        if torch_compile:
-            pytest.skip("https://nvbugs/5216737")
-        if torch_compile and pp_size > 1:
+        if pp_size > 1:
             pytest.skip(
                 "Pipeline parallel with torch.compile is not supported yet.\n"
                 "Issue: Unfusing flashinfer_fused_add_rmsnorm causes outputs to be "
