Skip to content

Commit 872bb3c

Browse files
committed
[TRTLLM-8129][feat] Apply AutoTuner to AllReduce Op for strategy tuning.
Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com>
1 parent 3a5845e commit 872bb3c

File tree

5 files changed

+285
-35
lines changed

5 files changed

+285
-35
lines changed

tensorrt_llm/_torch/autotuner.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import torch
1616

1717
import tensorrt_llm
18+
from tensorrt_llm._utils import mpi_barrier
1819
from tensorrt_llm.bindings.internal.runtime import delay_kernel
1920
from tensorrt_llm.logger import logger
2021

@@ -534,8 +535,6 @@ def __init__(self, warmup=3, repeat=10, stream_delay_micro_secs=1000):
534535
# Add statistics tracking
535536
self.stats = AutoTunerStatistics()
536537

537-
self.profiling_debug = True
538-
539538
@classmethod
540539
def get(cls):
541540
if cls._instance is None:
@@ -745,6 +744,10 @@ def _profile_single_kernel(
745744
are used to ensure accurate timing.
746745
"""
747746
stream = torch.cuda.current_stream()
747+
748+
if self._is_sync_op(runner):
749+
mpi_barrier()
750+
748751
# warm up, no timing
749752
for _ in range(self.warmup):
750753
runner(inputs, tactic=tactic, **kwargs)
@@ -757,6 +760,9 @@ def _profile_single_kernel(
757760
start = torch.cuda.Event(enable_timing=True)
758761
end = torch.cuda.Event(enable_timing=True)
759762

763+
if self._is_sync_op(runner):
764+
mpi_barrier()
765+
760766
start.record(stream=stream)
761767
for _ in range(self.repeat):
762768
runner(inputs, tactic=tactic, **kwargs)
@@ -939,6 +945,9 @@ def _prepare_input_tensors(
939945
tensors.append(tensor)
940946
return tensors
941947

948+
def _is_sync_op(self, runner: TunableRunner) -> bool:
949+
return runner.__class__.__name__ in ["AllReduceRunner"]
950+
942951
def clear_cache(self) -> None:
943952
"""Clear the profiling cache."""
944953
self.profiling_cache.clear()

tensorrt_llm/_torch/custom_ops/torch_custom_ops.py

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import tensorrt_llm.quantization.utils.fp8_utils as fp8_utils
99
from tensorrt_llm import deep_gemm
1010
from tensorrt_llm._utils import get_sm_version
11+
from tensorrt_llm.functional import AllReduceFusionOp, AllReduceStrategy
1112

1213
from ..autotuner import (AutoTuner, ConstraintSpec, DynamicTensorSpec,
1314
OptimizationProfile, TunableRunner, TuningConfig)
@@ -1139,6 +1140,172 @@ def _(
11391140
return x.new_empty((b, d), dtype=o_dtype)
11401141

11411142

1143+
class AllReduceRunner(TunableRunner):
    """Tunable runner that lets the AutoTuner profile AllReduce strategies.

    Each tactic is an ``AllReduceStrategy`` enum value; the AutoTuner times
    ``forward`` for every tactic from ``get_valid_tactics`` and caches the
    fastest one per input shape.
    """

    # Fusion patterns this runner is able to tune.
    all_support_ops = {
        AllReduceFusionOp.NONE.value,
        AllReduceFusionOp.RESIDUAL_RMS_NORM.value,
    }

    # Tune over power-of-2 token counts on dim 0 of the input; the residual
    # (input index 1) is constrained to match the input's dim-0 size.
    tuning_config = TuningConfig(
        dynamic_tensor_specs=(DynamicTensorSpec(
            0, 0,
            (8192, 4096, 2048, 1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1),
            last_positive_power_of_2), ),
        constraint_specs=(ConstraintSpec(1, 0, lambda shapes: shapes[0][0]), ),
    )

    def __init__(
        self,
        tp_size: int,
        group: List[int],
        op: int,
        eps: float,
        trigger_completion_at_end: bool,
    ):
        self.tp_size = tp_size
        self.op = op
        self._group = group
        self._eps = eps
        self._trigger_completion_at_end = trigger_completion_at_end

    def __hash__(self):
        return hash((self.tp_size, self.op))

    def __eq__(self, other):
        # Keep equality consistent with __hash__ so that value-equal runner
        # instances act as the same profiling-cache key (the original relied
        # on default identity equality, which defeats the value-based hash).
        return (isinstance(other, AllReduceRunner)
                and self.tp_size == other.tp_size and self.op == other.op)

    def get_valid_tactics(
        self,
        inputs: List[torch.Tensor],
        profile: OptimizationProfile,
        **kwargs,
    ) -> List[int]:
        """Return the AllReduce strategies worth profiling for these inputs."""
        valid_tactics = [
            AllReduceStrategy.NCCL.value,
            AllReduceStrategy.ONESHOT.value,
        ]
        # TWOSHOT is only valid when there is at least one token per rank.
        if inputs[0].shape[0] >= self.tp_size:
            valid_tactics.append(AllReduceStrategy.TWOSHOT.value)
        return valid_tactics

    def forward(
        self,
        inputs: List[torch.Tensor],
        tactic: int = -1,
    ) -> torch.Tensor:
        """Run one AllReduce with the given tactic (-1 falls back to NCCL)."""
        input, residual, norm_weight, scale, bias, workspace = inputs
        if tactic == -1:
            tactic = AllReduceStrategy.NCCL.value

        # Bug fix: return the op's outputs instead of discarding them — the
        # signature promises a Tensor result, and the original body returned
        # None implicitly.
        return torch.ops.trtllm.allreduce(
            input,
            residual,
            norm_weight,
            scale,
            bias,
            workspace,
            self._group,
            tactic,
            self.op,
            self._eps,
            self._trigger_completion_at_end,
        )
1210+
1211+
1212+
@torch.library.custom_op("trtllm::tunable_allreduce", mutates_args=())
def tunable_allreduce(
    input: torch.Tensor,
    residual: Optional[torch.Tensor],
    norm_weight: Optional[torch.Tensor],
    scale: Optional[torch.Tensor],
    bias: Optional[torch.Tensor],
    workspace: Optional[torch.Tensor],
    group: List[int],
    strategy: int,
    op: int,
    eps: float,
    tp_size: int,
    trigger_completion_at_end: bool,
) -> List[torch.Tensor]:
    """AllReduce whose strategy is selected by the AutoTuner.

    The ``strategy`` argument is accepted for signature parity with
    ``trtllm::allreduce`` but is not consulted here; the effective strategy
    is the tactic chosen by ``AutoTuner.choose_one``.
    """
    runner = AllReduceRunner(
        tp_size,
        group,
        op,
        eps,
        trigger_completion_at_end,
    )

    _, tactic = AutoTuner.get().choose_one(
        "trtllm::tunable_allreduce::allreduce",
        [runner],
        AllReduceRunner.tuning_config,
        [input, residual, norm_weight, scale, bias, workspace],
    )

    # -1 means tuning produced no choice; default to the NCCL strategy.
    tactic = AllReduceStrategy.NCCL.value if tactic == -1 else tactic

    return torch.ops.trtllm.allreduce(
        input,
        residual,
        norm_weight,
        scale,
        bias,
        workspace,
        group,
        tactic,
        op,
        eps,
        trigger_completion_at_end,
    )
1261+
1262+
1263+
@tunable_allreduce.register_fake
def _(
    input: torch.Tensor,
    residual: Optional[torch.Tensor],
    norm_weight: Optional[torch.Tensor],
    scale: Optional[torch.Tensor],
    bias: Optional[torch.Tensor],
    workspace: Optional[torch.Tensor],
    group: List[int],
    strategy: int,
    op: int,
    eps: float,
    tp_size: int,
    trigger_completion_at_end: bool,
) -> List[torch.Tensor]:
    """Fake (meta) implementation of ``trtllm::tunable_allreduce``.

    Allocates empty outputs matching the layout the real op produces for each
    fusion variant. Bug fixes vs. the original: the signature now includes
    ``tp_size`` (torch.library requires the fake's signature to match the
    op's), and the return annotation is ``List[torch.Tensor]`` to match what
    every branch actually returns.
    """
    if op == int(AllReduceFusionOp.NONE):
        return [torch.empty_like(input)]
    elif op == int(AllReduceFusionOp.RESIDUAL_RMS_NORM):
        norm_out = torch.empty_like(input)
        residual_out = torch.empty_like(input)
        return [norm_out, residual_out]
    elif op == int(AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_FP8):
        quant_out = torch.empty_like(input, dtype=torch.float8_e4m3fn)
        residual_out = torch.empty_like(input)
        return [quant_out, residual_out]
    elif op == int(AllReduceFusionOp.RESIDUAL_RMS_NORM_OUT_QUANT_FP8):
        norm_out = torch.empty_like(input)
        quant_out = torch.empty_like(input, dtype=torch.float8_e4m3fn)
        residual_out = torch.empty_like(input)
        return [norm_out, quant_out, residual_out]
    elif op == int(AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4):
        # NVFP4 packs values and per-block scales into uint8 buffers.
        fp4_shape, scale_shape = fp4_utils.get_fp4_shape(input.shape, 16)
        quant_fp4 = input.new_empty(fp4_shape, dtype=torch.uint8)
        scale_fp4 = input.new_empty(scale_shape, dtype=torch.uint8)
        residual_out = torch.empty_like(input)
        return [quant_fp4, scale_fp4, residual_out]
    elif op == int(AllReduceFusionOp.RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4):
        fp4_shape, scale_shape = fp4_utils.get_fp4_shape(input.shape, 16)
        quant_fp4 = input.new_empty(fp4_shape, dtype=torch.uint8)
        scale_fp4 = input.new_empty(scale_shape, dtype=torch.uint8)
        norm_out = torch.empty_like(input)
        residual_out = torch.empty_like(input)
        return [norm_out, quant_fp4, scale_fp4, residual_out]
    else:
        # Unknown fusion op: fall back to a single allreduce-shaped output.
        return [torch.empty_like(input)]
1307+
1308+
11421309
def get_event(event_idx: int):
11431310
from ..utils import get_model_extra_attrs
11441311
extra_attrs = get_model_extra_attrs()

tensorrt_llm/_torch/distributed/ops.py

Lines changed: 33 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -505,7 +505,6 @@ def __init__(self,
505505
self._disable_mpi = mpi_disabled()
506506

507507
self.all_reduce_op = torch.ops.trtllm.allreduce_pg if self._disable_mpi else torch.ops.trtllm.allreduce
508-
509508
if self.mapping.tp_size > 1:
510509
# When Strategy is UB, it is guaranteed that the workspace is not used.
511510
if self.strategy != AllReduceStrategy.UB:
@@ -574,6 +573,7 @@ def forward(
574573
input = input.contiguous() # Underlying op requires contiguous input
575574

576575
allreduce_strategy = self.strategy
576+
577577
if all_reduce_params is None:
578578
all_reduce_params = AllReduceParams()
579579

@@ -598,21 +598,38 @@ def forward(
598598
"pg": pg.boxed(),
599599
}
600600

601-
output = self.all_reduce_op(
602-
input=input,
603-
residual=all_reduce_params.residual,
604-
norm_weight=all_reduce_params.norm_weight,
605-
scale=all_reduce_params.scale,
606-
bias=all_reduce_params.bias,
607-
workspace=self.workspace,
608-
group=self.mapping.tp_group,
609-
strategy=allreduce_strategy,
610-
op=all_reduce_params.fusion_op,
611-
eps=all_reduce_params.eps,
612-
trigger_completion_at_end=all_reduce_params.
613-
trigger_completion_at_end,
614-
**additional_args,
615-
)
601+
if self.strategy == AllReduceStrategy.AUTOTUNE:
602+
output = torch.ops.trtllm.tunable_allreduce(
603+
input=input,
604+
residual=all_reduce_params.residual,
605+
norm_weight=all_reduce_params.norm_weight,
606+
scale=all_reduce_params.scale,
607+
bias=all_reduce_params.bias,
608+
workspace=self.workspace,
609+
group=self.mapping.tp_group,
610+
strategy=allreduce_strategy,
611+
op=all_reduce_params.fusion_op,
612+
eps=all_reduce_params.eps,
613+
tp_size=self.mapping.tp_size,
614+
trigger_completion_at_end=all_reduce_params.
615+
trigger_completion_at_end,
616+
)
617+
else:
618+
output = self.all_reduce_op(
619+
input=input,
620+
residual=all_reduce_params.residual,
621+
norm_weight=all_reduce_params.norm_weight,
622+
scale=all_reduce_params.scale,
623+
bias=all_reduce_params.bias,
624+
workspace=self.workspace,
625+
group=self.mapping.tp_group,
626+
strategy=allreduce_strategy,
627+
op=all_reduce_params.fusion_op,
628+
eps=all_reduce_params.eps,
629+
trigger_completion_at_end=all_reduce_params.
630+
trigger_completion_at_end,
631+
**additional_args,
632+
)
616633

617634
return output if len(output) > 1 else output[0]
618635

tensorrt_llm/functional.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3883,6 +3883,7 @@ class AllReduceStrategy(IntEnum):
38833883
LOWPRECISION = 6
38843884
MNNVL = 7
38853885
NCCL_SYMMETRIC = 8
3886+
AUTOTUNE = 9
38863887

38873888

38883889
class AllReduceFusionOp(IntEnum):

0 commit comments

Comments
 (0)