
Commit 2a61472

feat: auto deduce use_oneshot from token_num in all-reduce (#1365)
## 🚀 Pull Request Checklist

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).
1 parent b9f218f commit 2a61472

File tree: 2 files changed, +11 −9 lines


flashinfer/comm/trtllm_ar.py (9 additions, 7 deletions)

```diff
@@ -775,10 +775,10 @@ def trtllm_allreduce_fusion(
     hidden_dim: int,
     workspace_ptrs: torch.Tensor,
     launch_with_pdl: bool,
-    use_oneshot: bool,
     trigger_completion_at_end: bool,
     fp32_acc: bool,
     pattern_code: AllReduceFusionPattern,
+    use_oneshot: Optional[bool],
     allreduce_out: Optional[torch.Tensor],
     residual_in: Optional[torch.Tensor],
     residual_out: Optional[torch.Tensor],
@@ -815,14 +815,16 @@ def trtllm_allreduce_fusion(
     - layout_code: the layout code.

     Note:
-        Regarding the `use_oneshot` parameter:
+        Regarding the `use_oneshot` parameter, you could force to use the one-shot strategy based on your use case.
+        Otherwise, it would be enabled if token_num is less than the one-shot max token number (currently 128) for min-latency mode.
+    """

-        It should only be enabled when:
-        (1) Force to use the one-shot strategy based on your use case.
-        (2) In min-latency mode, the sequence length is less than the one-shot max token number (currently 128).
+    if use_oneshot is None:
+        logging.warning(
+            f"use_oneshot is not specified. It would be enabled if token_num is less than the one-shot max token number (currently 128) for min-latency mode."
+        )
+        use_oneshot = token_num <= 128

-        Otherwise, it should be disabled (as False).
-    """
     if not use_oneshot:
         assert token_num > world_size, "sequence length should be larger than tp_size"
```
tests/test_trtllm_allreduce_fusion.py (2 additions, 2 deletions)

```diff
@@ -49,7 +49,7 @@ def _run_correctness_worker(world_size, rank, dtype, hidden_dim, distributed_ini
         comm.FP4QuantizationSFLayout.SWIZZLED,
     ]
     launch_with_pdls = [True, False]
-    use_oneshots = [True, False]
+    use_oneshots = [True, False, None]
     trigger_completion_at_ends = [True, False]
     fp32_accs = [True, False]

@@ -315,7 +315,7 @@ def multi_process_parallel(
     ), f"Process {i} failed with exit code {procs[i].exitcode}"


-@pytest.mark.parametrize("world_size", [2, 4])
+@pytest.mark.parametrize("world_size", [2, 4, 8])
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("hidden_dim", [1024, 2048, 4096, 7168, 8192])
 def test_trtllm_allreduce_fusion(world_size, dtype, hidden_dim):
```
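Adding `None` to `use_oneshots` grows the per-test sweep. Assuming the worker iterates the cartesian product of the option lists shown in the diff (the exact loop structure inside `_run_correctness_worker` is not visible here), the grid per `(world_size, dtype, hidden_dim)` combination can be enumerated like this:

```python
import itertools

# Option lists taken from the updated test file.
use_oneshots = [True, False, None]  # None now exercises auto-deduction
launch_with_pdls = [True, False]
trigger_completion_at_ends = [True, False]
fp32_accs = [True, False]

# 3 * 2 * 2 * 2 = 24 combinations per parametrized test case.
grid = list(
    itertools.product(
        use_oneshots, launch_with_pdls, trigger_completion_at_ends, fp32_accs
    )
)
```

With the new `world_size=8` axis, the pytest parametrization itself also grows from 2×2×5 to 3×2×5 outer cases.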
