From 17b443acfc577a5fa62925c21440f729472f2c21 Mon Sep 17 00:00:00 2001 From: Ludwig Schneider Date: Tue, 6 Jan 2026 09:45:25 -0800 Subject: [PATCH 1/3] activate NCCL_SYMMETRIC auto-tuning Signed-off-by: Ludwig Schneider --- tensorrt_llm/_torch/custom_ops/torch_custom_ops.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py b/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py index e3b725b3930..0465ccc7806 100644 --- a/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py @@ -1690,8 +1690,7 @@ def get_valid_tactics( **kwargs, ) -> List[int]: valid_strategies = [ - # TODO: NCCL_SYMMETRIC will cause hang during tuning process - # AllReduceStrategy.NCCL_SYMMETRIC.value, + AllReduceStrategy.NCCL_SYMMETRIC.value, AllReduceStrategy.NCCL.value, ] # Fallback in allreduceOp is set to NCCL_SYMMETRIC as default @@ -1720,7 +1719,7 @@ def forward( input, residual, norm_weight, scale, bias, workspace = inputs if tactic == -1: # TODO: Use NCCL instead of NCCL_SYMMETRIC to avoid hanging during tuning process - tactic = AllReduceStrategy.NCCL.value + tactic = AllReduceStrategy.NCCL_SYMMETRIC.value return torch.ops.trtllm.allreduce( input, From 411d6e1de68f12218b361d966395cf36c16969f7 Mon Sep 17 00:00:00 2001 From: Ludwig Schneider Date: Wed, 7 Jan 2026 12:44:01 -0600 Subject: [PATCH 2/3] unwaive test, a previous PR may have resolved hang Signed-off-by: Ludwig Schneider --- tensorrt_llm/_torch/custom_ops/torch_custom_ops.py | 5 +++-- tests/integration/test_lists/waives.txt | 1 - 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py b/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py index 0465ccc7806..e3b725b3930 100644 --- a/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py @@ -1690,7 +1690,8 @@ def get_valid_tactics( **kwargs, ) -> List[int]: valid_strategies = [ - AllReduceStrategy.NCCL_SYMMETRIC.value, + # TODO: NCCL_SYMMETRIC will cause hang during tuning process + # AllReduceStrategy.NCCL_SYMMETRIC.value, AllReduceStrategy.NCCL.value, ] # Fallback in allreduceOp is set to NCCL_SYMMETRIC as default @@ -1719,7 +1720,7 @@ def forward( input, residual, norm_weight, scale, bias, workspace = inputs if tactic == -1: # TODO: Use NCCL instead of NCCL_SYMMETRIC to avoid hanging during tuning process - tactic = AllReduceStrategy.NCCL_SYMMETRIC.value + tactic = AllReduceStrategy.NCCL.value return torch.ops.trtllm.allreduce( input, diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 5000daf6338..26264603842 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -269,7 +269,6 @@ full:sm89/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ full:sm89/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp2pp2] SKIP (https://nvbugs/5596337) accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] SKIP (https://nvbugs/5721672) accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=True-attn_backend=FLASHINFER-torch_compile=True] SKIP (https://nvbugs/5741304) -unittest/_torch/multi_gpu/test_allreduce.py::test_allreduce_fusion_patterns[2-residual_rms_norm_out_quant_fp8-hidden:7168-seqlen:8192] SKIP (https://nvbugs/5741392) unittest/executor/test_rpc.py::TestRpcCorrectness::test_incremental_task_async SKIP (https://nvbugs/5741476) accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5740377) accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False] SKIP (https://nvbugs/5740377) From e1c1947155626de15b1b15cfef79b1810c93615f Mon Sep 17 00:00:00 2001 From: Ludwig Schneider Date: Mon, 12 Jan 2026 14:38:02 -0600 Subject: [PATCH 3/3] unwaived other failing test as well (same suspected root cause) Signed-off-by: Ludwig Schneider --- tests/integration/test_lists/waives.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 26264603842..86aa6eefba8 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -295,7 +295,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUT accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5759338) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5759338) test_e2e.py::test_ptp_quickstart_advanced_2gpus_sm120[Nemotron-Super-49B-v1-BF16-nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1] SKIP (https://nvbugs/5670469) -unittest/_torch/multi_gpu/test_mnnvl_allreduce.py::test_row_linear_residual_norm_fusion[no_fusion-strategy:8-dtype:bfloat16-hidden:8192-seqlen:[15]] SKIP (https://nvbugs/5761364) triton_server/test_triton.py::test_gpt_speculative_decoding[gpt-speculative-decoding] SKIP (https://nvbugs/5762854) accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B_Instruct_RocketKV::test_auto_dtype SKIP (https://nvbugs/5762822) unittest/_torch/sampler/test_return_logits.py SKIP (https://nvbugs/5764627)