
Commit 94dc97a

[feat][test] reuse MPI pool executor across tests (NVIDIA#5566)
Signed-off-by: Omer Ullman Argov <118735753+omera-nv@users.noreply.github.com>
1 parent 6000380 · commit 94dc97a

4 files changed, +130 -110 lines changed
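
The commit replaces per-test MPIPoolExecutor construction with a shared mpi_pool_executor fixture that tests request through indirect parametrization, passing the desired worker count as the parameter. The fixture definition itself lives in one of the changed files outside this excerpt; the following is only a minimal sketch of what it might look like, inferred from how the rewritten tests use it (the session scope and everything else here is an assumption):

import pytest
from mpi4py.futures import MPIPoolExecutor


# Hypothetical sketch (e.g. a conftest.py fixture); the real definition
# ships in this commit but is not shown below.
@pytest.fixture(scope="session")
def mpi_pool_executor(request):
    # indirect=True routes the parametrized worker count to request.param.
    with MPIPoolExecutor(max_workers=request.param) as executor:
        # Tests read executor.num_workers and submit work via executor.map.
        yield executor

Reusing one pool across tests is also why both files disable the thread-leak check: the first test to touch the shared pool appears to leak its worker thread.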

tests/unittest/_torch/multi_gpu/test_allreduce.py

Lines changed: 37 additions & 35 deletions
@@ -21,7 +21,6 @@
 import pytest
 import torch
 from mpi4py import MPI
-from mpi4py.futures import MPIPoolExecutor
 from utils.util import skip_pre_blackwell
 
 import tensorrt_llm
@@ -40,6 +39,9 @@
     pickle.HIGHEST_PROTOCOL,
 )
 
+# needed since we reuse the mpi executor pool, first test running will leak a thread
+pytestmark = pytest.mark.threadleak(enabled=False)
+
 
 def fp8_quant(input, scale):
     finfo = torch.finfo(torch.float8_e4m3fn)
@@ -278,22 +280,23 @@ def ref_residual_rms_norm_out_quant_nvfp4(x, res):
             marks=skip_pre_blackwell),
     ],
 )
-def test_allreduce_fusion_patterns(seq_len, hidden_size, fusion_op):
+@pytest.mark.parametrize("mpi_pool_executor", [2], indirect=True)
+def test_allreduce_fusion_patterns(seq_len, hidden_size, fusion_op,
+                                   mpi_pool_executor):
     torch.manual_seed(0)
     dtype = torch.bfloat16
-    tensor_parallel_size = 2
+    tensor_parallel_size = mpi_pool_executor.num_workers
     x = torch.randn((seq_len, hidden_size), dtype=dtype)
     residual = torch.randn_like(x)
     linear_weight = torch.randn((hidden_size, hidden_size), dtype=dtype)
-    with MPIPoolExecutor(max_workers=tensor_parallel_size) as executor:
-        results = executor.map(
-            run_single_rank,
-            *zip(*[(tensor_parallel_size, run_allreduce_op, x, residual,
-                    [linear_weight], hidden_size, dtype, fusion_op)] *
-                 tensor_parallel_size),
-        )
-        for r in results:
-            assert r is True
+    results = mpi_pool_executor.map(
+        run_single_rank,
+        *zip(*[(tensor_parallel_size, run_allreduce_op, x, residual,
+                [linear_weight], hidden_size, dtype, fusion_op)] *
+             tensor_parallel_size),
+    )
+    for r in results:
+        assert r is True
 
 
 @torch.inference_mode()
@@ -426,13 +429,14 @@ def run_moe_allreduce_op(token_input: torch.Tensor, residual: torch.Tensor,
 
 
 @torch.inference_mode()
-def test_moe_allreduce_patterns():
+@pytest.mark.parametrize("mpi_pool_executor", [2], indirect=True)
+def test_moe_allreduce_patterns(mpi_pool_executor):
     torch.manual_seed(42)
 
     seq_len = 16
     hidden_size = 7168
     dtype = torch.bfloat16
-    tensor_parallel_size = 2
+    tensor_parallel_size = mpi_pool_executor.num_workers
     num_global_experts = 4
 
     # [num_token, 7168]
@@ -448,15 +452,14 @@ def test_moe_allreduce_patterns():
     residual = torch.randn_like(token_input)
 
     l0_weight = torch.randn((hidden_size, hidden_size), dtype=dtype)
-    with MPIPoolExecutor(max_workers=tensor_parallel_size) as executor:
-        results = executor.map(
-            run_moe_single_rank,
-            *zip(*[(tensor_parallel_size, run_moe_allreduce_op, token_input,
-                    residual, active_experts_token_input, scale, l0_weight)] *
-                 tensor_parallel_size),
-        )
-        for r in results:
-            assert r is True
+    results = mpi_pool_executor.map(
+        run_moe_single_rank,
+        *zip(*[(tensor_parallel_size, run_moe_allreduce_op, token_input,
+                residual, active_experts_token_input, scale, l0_weight)] *
+             tensor_parallel_size),
+    )
+    for r in results:
+        assert r is True
 
 
 def run_moe_finalize_single_rank(tensor_parallel_size, single_rank_forward_func,
@@ -544,13 +547,14 @@ def run_moe_finalize_allreduce_op(
 
 
 @torch.inference_mode()
-def test_moe_finalize_allreduce_patterns():
+@pytest.mark.parametrize("mpi_pool_executor", [2], indirect=True)
+def test_moe_finalize_allreduce_patterns(mpi_pool_executor):
     torch.manual_seed(42)
 
     seq_len = 16
     hidden_size = 7168
     dtype = torch.bfloat16
-    tensor_parallel_size = 2
+    tensor_parallel_size = mpi_pool_executor.num_workers
     top_k = 8
 
     shared_expert_output = torch.randn((seq_len, hidden_size), dtype=dtype)
@@ -562,13 +566,11 @@ def test_moe_finalize_allreduce_patterns():
                                                 dtype=torch.int32)
     residual = torch.randn_like(shared_expert_output)
 
-    with MPIPoolExecutor(max_workers=tensor_parallel_size) as executor:
-        results = executor.map(
-            run_moe_finalize_single_rank,
-            *zip(*[(tensor_parallel_size, run_moe_finalize_allreduce_op,
-                    fc2_output, residual, shared_expert_output,
-                    expanded_idx_to_permuted_idx, scale)] *
-                 tensor_parallel_size),
-        )
-        for r in results:
-            assert r is True
+    results = mpi_pool_executor.map(
+        run_moe_finalize_single_rank,
+        *zip(*[(tensor_parallel_size, run_moe_finalize_allreduce_op, fc2_output,
+                residual, shared_expert_output, expanded_idx_to_permuted_idx,
+                scale)] * tensor_parallel_size),
+    )
+    for r in results:
+        assert r is True
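
Every rewritten test fans a single argument tuple out to all ranks with *zip(*[args] * tp_size): executor.map expects one iterable per function parameter, so transposing tp_size copies of the tuple makes each worker receive the identical call. A small, executor-agnostic demonstration of the idiom (ThreadPoolExecutor and work are illustrative stand-ins, not part of the commit):

from concurrent.futures import ThreadPoolExecutor


def work(tensor_parallel_size, value):
    return value * tensor_parallel_size


tp_size = 2
args = (tp_size, 10)
# [args] * tp_size -> [(2, 10), (2, 10)]
# zip(*...)        -> (2, 2), (10, 10): one iterable per parameter of work()
with ThreadPoolExecutor(max_workers=tp_size) as executor:
    results = executor.map(work, *zip(*[args] * tp_size))
print(list(results))  # [20, 20]: work(2, 10) was called tp_size times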

tests/unittest/_torch/multi_gpu/test_lowprecision_allreduce.py

Lines changed: 15 additions & 11 deletions
@@ -19,7 +19,6 @@
 import pytest
 import torch
 from mpi4py import MPI
-from mpi4py.futures import MPIPoolExecutor
 
 from tensorrt_llm._torch.distributed import AllReduceStrategy
 
@@ -30,6 +29,9 @@
     pickle.HIGHEST_PROTOCOL,
 )
 
+# needed since we reuse the mpi executor pool, first test running will leak a thread
+pytestmark = pytest.mark.threadleak(enabled=False)
+
 
 def run_single_rank(dtype, strategy, message_size):
     import numpy as np
@@ -243,18 +245,20 @@ def test(self, mode="acc"):
     [1024 * 1024 * x for x in [2, 4, 16, 32, 64, 132, 144]] + [64 * 70000],
     ids=lambda x: f"size{x}")
 @pytest.mark.parametrize(
-    "tp_size",
+    "mpi_pool_executor",
     [2, 4],  # 8
-    ids=["tp_size_2", "tp_size_4"])  # "tp_size_8"
-def test_lowprecision_allreduce_acc(dtype, strategy, message_size, tp_size):
+    ids=["tp_size_2", "tp_size_4"],
+    indirect=True)  # "tp_size_8"
+def test_lowprecision_allreduce_acc(dtype, strategy, message_size,
+                                    mpi_pool_executor):
     """
     Only test for accuracy. For performance testing,
     manually call TestLowPrecisionAllreduce(...).test('perf')
     """
-    with MPIPoolExecutor(max_workers=tp_size) as executor:
-        results = executor.map(
-            run_single_rank,
-            *zip(*[(dtype, strategy, message_size)] * tp_size),
-        )
-        for r in results:
-            assert r is True
+    tp_size = mpi_pool_executor.num_workers
+    results = mpi_pool_executor.map(
+        run_single_rank,
+        *zip(*[(dtype, strategy, message_size)] * tp_size),
+    )
+    for r in results:
+        assert r is True
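
Both files also illustrate indirect parametrization: with indirect=True, the values in @pytest.mark.parametrize are no longer handed to the test verbatim but are routed into the like-named fixture as request.param, and the test receives whatever the fixture yields. A self-contained illustration with hypothetical names:

import pytest


@pytest.fixture
def pool(request):
    # indirect=True delivers the parametrize value here, so the fixture
    # can build (or look up) a resource of the requested size.
    return {"num_workers": request.param}


@pytest.mark.parametrize("pool", [2, 4], indirect=True)
def test_pool_size(pool):
    # The test sees the fixture's return value, not the raw parameter.
    assert pool["num_workers"] in (2, 4)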
