Added symm mem strategy

MrGeva · MrGeva · commit 3192b19b10cd · 2025-11-04T11:25:40.000-08:00
Signed-off-by: Eran Geva &lt;19514940+MrGeva@users.noreply.github.com&gt;
diff --git a/tensorrt_llm/_torch/distributed/ops.py b/tensorrt_llm/_torch/distributed/ops.py
@@ -7,6 +7,8 @@
 import torch
 from torch import nn
 
+from tensorrt_llm._torch.distributed.symm_mem_allreduce import \
+    SymmetricMemoryAllReduce
 from tensorrt_llm._utils import mpi_comm, mpi_disabled
 from tensorrt_llm.bindings.internal.runtime import McastGPUBuffer
 from tensorrt_llm.functional import (AllReduceFusionOp, AllReduceParams,
@@ -516,13 +518,19 @@ def __init__(self,
             strategy (AllReduceStrategy):
                 The following all-reduce strategies are supported:
 
+                - SYMM_MEM: Uses PyTorch's symmetric memory with MULTIMEM hardware instructions (H100+).
+                  Provides 3x faster performance on supported configurations (4/6/8 GPUs on H100).
+                  Currently only supports plain allreduce (NONE fusion op). Falls back automatically
+                  if not supported.
+
                 - UB: AllReduce uses user-buffer based all-reduce kernel.
 
                 - NCCL: Use NCCL allreduce.
 
                 - MIN_LATENCY: AllReduce uses MIN_LATENCY mode kernel.
 
-                - AUTO: AUTO chooses between NCCL and MIN_LATENCY mode based on a heuristic policy.
+                - AUTO: AUTO chooses the best available strategy. Will try SYMM_MEM first (if available),
+                  then MNNVL, then choose between NCCL and MIN_LATENCY based on a heuristic policy.
 
                 - LOWPRECISION: AllReduce quantizes data to lower precision for transmission.
                   Should only be used on topologies with PCIe switches and without NVLink.
@@ -551,6 +559,7 @@ def __init__(self,
         self.workspace = None
         self.strategy = strategy
         self.mnnvl_allreduce = None
+        self.symm_mem_allreduce = None
         self._disable_mpi = mpi_disabled()
 
         self.all_reduce_op = torch.ops.trtllm.allreduce_pg if self._disable_mpi else torch.ops.trtllm.allreduce
@@ -562,6 +571,29 @@ def __init__(self,
                     allocate_low_presicion_allreduce_workspace(self.mapping)
                 self.workspace = get_allreduce_workspace(self.mapping)
 
+            # Initialize Symmetric Memory AllReduce if needed (H100+ hardware acceleration)
+            if self.strategy in (AllReduceStrategy.AUTO,
+                                 AllReduceStrategy.SYMM_MEM):
+                try:
+                    symm_mem = SymmetricMemoryAllReduce(
+                        self.mapping,
+                        dtype=dtype if dtype else torch.bfloat16,
+                    )
+                    if not symm_mem.disabled:
+                        self.symm_mem_allreduce = symm_mem
+                        logger.info(
+                            f"SymmetricMemoryAllReduce (MULTIMEM) is enabled for world_size={self.mapping.tp_size}"
+                        )
+                    else:
+                        logger.debug(
+                            f"SymmetricMemoryAllReduce is disabled (not supported or unavailable)"
+                        )
+                except Exception as e:
+                    logger.debug(
+                        f"Symmetric Memory AllReduce can't be enabled due to {e}."
+                    )
+                    self.symm_mem_allreduce = None
+
             # Initialize MNNVL AllReduce if needed
             if self.strategy in (AllReduceStrategy.AUTO,
                                  AllReduceStrategy.MNNVL):
@@ -626,16 +658,27 @@ def forward(
         if all_reduce_params is None:
             all_reduce_params = AllReduceParams()
 
-        # Try MNNVL AllReduce first if available
+        # Try Symmetric Memory AllReduce first if available (H100+ hardware acceleration)
+        # Note: Currently only supports NONE fusion op (plain allreduce)
+        if self.symm_mem_allreduce and all_reduce_params.fusion_op == AllReduceFusionOp.NONE:
+            symm_mem_output = self.symm_mem_allreduce(input)
+            if symm_mem_output is not None:
+                logger.debug(
+                    f"Using SymmetricMemoryAllReduce (MULTIMEM) for input shape {input.shape}"
+                )
+                return symm_mem_output
+
+        # Try MNNVL AllReduce if symm_mem didn't handle it
         if self.mnnvl_allreduce:
             mnnvl_output = self.mnnvl_allreduce(
                 input, all_reduce_params=all_reduce_params)
             if mnnvl_output is not None:
                 return mnnvl_output
 
-        # Fall back to regular AllReduce if MNNVL is not available or not applicable
-        # Make sure the strategy is AUTO since allreduceOp does not have the branch for MNNVL
-        if allreduce_strategy == AllReduceStrategy.MNNVL:
+        # Fall back to regular AllReduce if specialized methods are not available or not applicable
+        # Make sure the strategy is AUTO since allreduceOp does not have the branch for MNNVL/SYMM_MEM
+        if allreduce_strategy in (AllReduceStrategy.MNNVL,
+                                  AllReduceStrategy.SYMM_MEM):
             allreduce_strategy = AllReduceStrategy.AUTO
 
         additional_args = {}
diff --git a/tensorrt_llm/_torch/distributed/symm_mem_allreduce.py b/tensorrt_llm/_torch/distributed/symm_mem_allreduce.py
@@ -0,0 +1,221 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+"""
+Symmetric Memory AllReduce for H100+ GPUs
+
+This module provides PyTorch Symmetric Memory-based allreduce operations,
+leveraging H100's MULTIMEM hardware instructions for 3x faster performance
+compared to custom CUDA kernels on supported configurations.
+"""
+
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+from torch import nn
+
+from tensorrt_llm.logger import logger
+from tensorrt_llm.mapping import Mapping
+
+try:
+    import torch.distributed._symmetric_memory as torch_symm_mem
+    SYMM_MEM_AVAILABLE = True
+except ImportError:
+    SYMM_MEM_AVAILABLE = False
+    logger.warning(
+        "PyTorch symmetric memory not available. Install PyTorch >= 2.8 for MULTIMEM support."
+    )
+
+
+class SymmetricMemoryAllReduce(nn.Module):
+    """
+    AllReduce implementation using PyTorch's symmetric memory operations.
+
+    This leverages H100's MULTIMEM hardware instructions for significantly faster
+    allreduce operations compared to software implementations.
+
+    Supported configurations (world_size):
+    - SM 9.0 (H100): 4, 6, 8 GPUs
+    - SM 10.0 (future): 6, 8 GPUs
+
+    Based on vLLM's implementation but integrated into TensorRT-LLM.
+    """
+
+    # World sizes that support MULTIMEM instructions
+    _WORLD_SIZES_MULTIMEM = {
+        "9.0": [4, 6, 8],  # H100
+        "10.0": [6, 8],  # Future architectures
+    }
+
+    # Maximum buffer sizes for symmetric memory (bytes)
+    _MAX_SIZES = {
+        "9.0": {
+            4: 8 * 1024 * 1024,  # 8MB for 4 GPUs
+            6: 6 * 1024 * 1024,  # 6MB for 6 GPUs
+            8: 4 * 1024 * 1024,  # 4MB for 8 GPUs
+        },
+        "10.0": {
+            6: 8 * 1024 * 1024,
+            8: 6 * 1024 * 1024,
+        }
+    }
+
+    def __init__(
+        self,
+        mapping: Mapping,
+        dtype: torch.dtype = torch.bfloat16,
+        group: Optional[dist.ProcessGroup] = None,
+    ):
+        super().__init__()
+
+        self.disabled = True
+        self.mapping = mapping
+        self.dtype = dtype
+        self.world_size = mapping.tp_size
+
+        if not SYMM_MEM_AVAILABLE:
+            logger.warning(
+                "SymmetricMemoryAllReduce: PyTorch symm_mem not available")
+            return
+
+        if not torch.cuda.is_available():
+            logger.warning("SymmetricMemoryAllReduce: CUDA not available")
+            return
+
+        # Get device capability
+        device = torch.device(f"cuda:{mapping.tp_rank}")
+        capability = torch.cuda.get_device_capability(device)
+        self.device_capability = f"{capability[0]}.{capability[1]}"
+
+        # Check if this configuration is supported
+        if self.device_capability not in self._MAX_SIZES:
+            logger.warning(
+                f"SymmetricMemoryAllReduce: Device capability {self.device_capability} not supported"
+            )
+            return
+
+        if self.world_size not in self._MAX_SIZES[self.device_capability]:
+            logger.info(
+                f"SymmetricMemoryAllReduce: World size {self.world_size} not supported "
+                f"for SM {self.device_capability}")
+            return
+
+        # Get max buffer size for this configuration
+        self.max_size = self._MAX_SIZES[self.device_capability][self.world_size]
+
+        # Set up process group
+        if group is None:
+            # Get or create TP group with correct ranks
+            # For TP parallelism, we need ranks [0, 1, 2, ..., tp_size-1] globally
+            # NOT starting from tp_rank!
+            if not dist.is_initialized():
+                logger.warning(
+                    "SymmetricMemoryAllReduce: torch.distributed not initialized"
+                )
+                self.disabled = True
+                return
+
+            # Assume contiguous TP ranks for now
+            # TODO: Get actual TP group from mapping if available
+            tp_group_ranks = list(range(mapping.tp_size))
+            self.group = dist.new_group(tp_group_ranks) if len(
+                tp_group_ranks) > 1 else None
+        else:
+            self.group = group
+
+        if self.group is None:
+            logger.warning("SymmetricMemoryAllReduce: No valid process group")
+            self.disabled = True
+            return
+
+        # Allocate symmetric memory buffer
+        try:
+            self.buffer = torch_symm_mem.empty(
+                self.max_size // self.dtype.itemsize,
+                device=device,
+                dtype=self.dtype,
+            )
+            # Pass group_name (string) not the group object
+            handle = torch_symm_mem.rendezvous(self.buffer,
+                                               self.group.group_name)
+
+            if handle.multicast_ptr == 0:
+                logger.warning(
+                    "SymmetricMemoryAllReduce: MULTIMEM operations not supported (multicast_ptr is 0)"
+                )
+                return
+
+            # Determine which algorithm to use
+            self.use_multimem = (self.world_size
+                                 in self._WORLD_SIZES_MULTIMEM.get(
+                                     self.device_capability, []))
+
+            self.disabled = False
+            logger.info(f"SymmetricMemoryAllReduce initialized: "
+                        f"world_size={self.world_size}, "
+                        f"max_size={self.max_size}, "
+                        f"SM={self.device_capability}, "
+                        f"use_multimem={self.use_multimem}")
+
+        except Exception as e:
+            logger.warning(
+                f"SymmetricMemoryAllReduce initialization failed: {e}")
+            return
+
+    def should_use_symm_mem(self, inp: torch.Tensor) -> bool:
+        """Check if symmetric memory can be used for this tensor."""
+        if self.disabled:
+            return False
+        if inp.dtype != self.dtype:
+            return False
+        inp_size = inp.numel() * inp.element_size()
+        if inp_size % 4 != 0:
+            return False
+        if inp_size >= self.max_size:
+            return False
+        return True
+
+    def forward(
+        self,
+        inp: torch.Tensor,
+        out: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        Perform allreduce using symmetric memory operations.
+
+        Args:
+            inp: Input tensor to reduce
+            out: Optional output tensor (if None, will be allocated)
+
+        Returns:
+            Reduced tensor
+        """
+        if not self.should_use_symm_mem(inp):
+            return None  # Caller should fall back to other strategy
+
+        if out is None:
+            out = torch.empty_like(inp)
+
+        # Copy input to symmetric memory buffer
+        self.buffer[:inp.numel()].copy_(inp.view(-1))
+
+        # Perform allreduce using appropriate algorithm
+        if self.use_multimem:
+            # Use MULTIMEM hardware instructions (faster)
+            torch.ops.symm_mem.multimem_all_reduce_(
+                self.buffer[:inp.numel()],
+                "sum",
+                self.group.group_name,
+            )
+        else:
+            # Use two-shot algorithm (fallback)
+            torch.ops.symm_mem.two_shot_all_reduce_(
+                self.buffer[:inp.numel()],
+                "sum",
+                self.group.group_name,
+            )
+
+        # Copy result back
+        out.copy_(self.buffer[:inp.numel()].view(out.shape))
+
+        return out
diff --git a/tensorrt_llm/functional.py b/tensorrt_llm/functional.py
@@ -3883,6 +3883,7 @@ class AllReduceStrategy(IntEnum):
     LOWPRECISION = 6
     MNNVL = 7
     NCCL_SYMMETRIC = 8
+    SYMM_MEM = 9  # PyTorch symmetric memory with MULTIMEM (H100+)
 
 
 class AllReduceFusionOp(IntEnum):