Commit 3c2e6f8

[KERNELS] vllm compatible version of CUDA Graph tracing for expert parallelism (#8563)
1 parent 2156b05 commit 3c2e6f8

5 files changed, +253 -137 lines changed

python/triton_kernels/bench/bench_mlp.py

Lines changed: 4 additions & 1 deletion
@@ -70,6 +70,8 @@ def bench_mlp(batch_per_expt, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_d
 
     input_x = torch.randn((batch // DP, dim1), device=dev)
     expt_assignment = triton_dist.create_expt_assignment(EP, n_expts_tot, torch.device(dev))
+    triton_dist.initialize_matmul_ogs(batch, dim1, dim2, n_expts_act, n_expts_tot, input_x.dtype)
+
     # run layer
     fpath = Path(tempfile.mktemp())
     proton.start(str(fpath), hook="triton")
@@ -79,7 +81,8 @@ def bench_mlp(batch_per_expt, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_d
     if n_expts_tot > 1: # sparse
         logits = matmul_ogs(xg, wg, bg, precision_config=pcg)
         x, rdata, gather_indx, scatter_indx, metadata = triton_dist.routing(input_x, logits, n_expts_act, EP=EP,
-                                                                            TP=TP, expt_assignment=expt_assignment)
+                                                                            TP=TP, expt_assignment=expt_assignment,
+                                                                            mode="ep_sharding")
     else: # dense
         x = triton_dist.all_gather(input_x, dim=0)
         rdata, gather_indx, scatter_indx, metadata = None, None, None, None
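
Taken together, the two hunks above change the benchmark setup: the symmetric-memory pool is sized once, right after the expert assignment is created, and the routing call then opts into expert-parallel sharding explicitly. A minimal sketch of the resulting call order; the shapes, parallelism factors, and the `import distributed as triton_dist` alias are illustrative assumptions rather than values taken from the benchmark:

```python
import torch
import distributed as triton_dist  # assumed alias for the bench helper module shown in the next file

# Illustrative configuration; bench_mlp derives these from its arguments.
batch, dim1, dim2 = 4096, 2048, 2048
n_expts_tot, n_expts_act = 128, 4
EP = TP = DP = 1
dev = "cuda"

input_x = torch.randn((batch // DP, dim1), device=dev)
expt_assignment = triton_dist.create_expt_assignment(EP, n_expts_tot, torch.device(dev))
# New in this commit: pre-size the symmetric-memory pool before any matmul_ogs
# call that may later run under CUDA graph capture.
triton_dist.initialize_matmul_ogs(batch, dim1, dim2, n_expts_act, n_expts_tot, input_x.dtype)

# Later, per layer, routing opts into expert-parallel sharding explicitly:
# x, rdata, gather_indx, scatter_indx, metadata = triton_dist.routing(
#     input_x, logits, n_expts_act, EP=EP, TP=TP,
#     expt_assignment=expt_assignment, mode="ep_sharding")
```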

python/triton_kernels/bench/distributed.py

Lines changed: 52 additions & 7 deletions
@@ -16,7 +16,7 @@
 from triton_kernels.target_info import get_cdna_version, is_hip, is_cuda, cuda_capability_geq
 from triton_kernels.tensor_details import layout
 from triton_kernels.tensor import make_ragged_tensor_metadata, remap_ragged_tensor_metadata
-from triton_kernels.distributed import make_expt_dict_uniform, make_expt_assignment, convert_dp_to_ep, convert_ep_to_dp, ExptAssignment
+from triton_kernels.distributed import make_expt_dict_uniform, make_expt_assignment, convert_dp_to_ep, convert_ep_to_dp, ExptAssignment, symm_mem_pool
 
 from bench_utils import quantize_weight
 
@@ -40,6 +40,31 @@ def create_expt_assignment(EP: int, n_expts_tot: int, device: torch.device) -> O
     return make_expt_assignment(EP, n_expts_tot, expt_dict, device)
 
 
+def initialize_matmul_ogs(
+    batch: int,
+    dim1: int,
+    dim2: int,
+    n_expts_act: int,
+    n_expts_tot: int,
+    dtype: torch.dtype,
+) -> None:
+    if not _is_distributed_launch():
+        return
+    world_size = dist.get_world_size()
+    device = torch.cuda.current_device()
+    symm_mem_pool.initialize_matmul_ogs(
+        n_tokens_global=batch,
+        d_input=dim1,
+        d_model=dim2,
+        n_expts_act=n_expts_act,
+        n_expts_tot=n_expts_tot,
+        n_ranks=world_size,
+        dtype=dtype,
+        group=dist.group.WORLD,
+        device=device,
+    )
+
+
 def setup() -> Tuple[int, int]:
     if _is_distributed_launch():
         world_size = int(os.environ["WORLD_SIZE"])
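
The new wrapper is a thin front end over `symm_mem_pool.initialize_matmul_ogs`: it is a no-op outside a distributed launch, and otherwise fills in the world size, process group, and device while mapping the benchmark's `batch`, `dim1`, and `dim2` onto the pool's `n_tokens_global`, `d_input`, and `d_model`. A hedged usage sketch; the launch command, shapes, and the assumption that `setup()` brings up the process group are illustrative:

```python
# Launched e.g. via `torchrun --nproc-per-node=8 my_driver.py` (launch command is an assumption).
import torch
from distributed import initialize_matmul_ogs, setup  # bench helpers from this file

setup()  # assumed to initialize the default process group when WORLD_SIZE/RANK are set
# Size the symmetric-memory pool once, before any matmul_ogs call or CUDA graph
# capture; in a plain single-process run the call simply returns.
initialize_matmul_ogs(batch=4096, dim1=2048, dim2=2048,
                      n_expts_act=4, n_expts_tot=128, dtype=torch.bfloat16)
```
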
@@ -112,11 +137,18 @@ def reduce_scatter(
 # TODO: clean up duplicate code with triton_kernels.test_distributed.py
 # TODO: Support nonuniform expert assignment
 def routing(
-    x, logits, n_expts_act, sm_first: bool = False, y_indx: Optional[torch.Tensor] = None, EP: int = 1, TP: int = 1,
-    expt_assignment: Optional[ExptAssignment] = None, mode: str = "ep_sharding"
+    x,
+    logits,
+    n_expts_act,
+    sm_first: bool = False,
+    y_indx: Optional[torch.Tensor] = None,
+    EP: int = 1,
+    TP: int = 1,
+    expt_assignment: Optional[ExptAssignment] = None,
+    mode: Optional[str] = None,
 ) -> Tuple[torch.Tensor, RoutingData, GatherIndx, ScatterIndx, Optional[ReduceScatterMetadata]]:
     n_expts_tot = logits.shape[-1]
-    if _is_distributed_launch():
+    if _is_distributed_launch() and mode:
         if mode == "ep_sharding":
             if not expt_assignment:
                 raise ValueError("expt_assignment must be provided for distributed routing.")
@@ -150,6 +182,7 @@ def routing(
         else:
             raise NotImplementedError(f"Distributed routing mode {mode} is not implemented yet.")
     else:
+        # If mode is not specified or we have a single process, we do single-GPU routing.
         logits = topk(logits, n_expts_act, y_indx=y_indx, apply_softmax=not sm_first)
         dispatch_indx = logits.mask_metadata.col_sorted_indx
         combine_indx = logits.mask_metadata.row_sorted_indx
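
The two routing hunks above combine into the dispatch sketched below: distributed routing is now strictly opt-in via `mode`, and `mode=None` (the new default) or a single-process launch falls back to plain top-k routing. This is a condensed view of the file's own function with bodies elided, not additional code from the commit; `_is_distributed_launch` and `topk` are the module's own helpers:

```python
def routing(x, logits, n_expts_act, sm_first=False, y_indx=None,
            EP=1, TP=1, expt_assignment=None, mode=None):
    if _is_distributed_launch() and mode:          # distributed path is opt-in
        if mode == "ep_sharding":
            if not expt_assignment:
                raise ValueError("expt_assignment must be provided for distributed routing.")
            ...  # EP-sharded routing path
        else:
            raise NotImplementedError(f"Distributed routing mode {mode} is not implemented yet.")
    else:
        # mode unset or single process: plain single-GPU top-k routing
        logits = topk(logits, n_expts_act, y_indx=y_indx, apply_softmax=not sm_first)
        ...
```
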
@@ -262,6 +295,17 @@ def distributed_run(rank, world_size, batch, dim1, dim2, n_expts_tot, n_expts_ac
     xd = torch.randn((batch // world_size, dim1), device=dev).to(dtype_map[x_dtype])
     x0 = all_gather(xd, dim=0)
     expt_assignment = create_expt_assignment(EP, n_expts_tot, torch.device(dev))
+    symm_mem_pool.initialize_matmul_ogs(
+        n_tokens_global=batch,
+        d_input=dim1,
+        d_model=dim2,
+        n_expts_act=n_expts_act,
+        n_expts_tot=n_expts_tot,
+        n_ranks=world_size,
+        dtype=x0.dtype,
+        group=dist.group.WORLD,
+        device=torch.cuda.current_device(),
+    )
 
     # single-GPU pass
     def single(x):
@@ -279,7 +323,8 @@ def distributed(x):
         xg = x.to(wg.dtype if n_expts_tot > 1 else x.dtype)
         if n_expts_tot > 1: # sparse
             logits = matmul_ogs(xg, wg, bg, precision_config=pcg)
-            x, rdata, gi, si, metadata = routing(x, logits, n_expts_act, EP=EP, TP=TP, expt_assignment=expt_assignment)
+            x, rdata, gi, si, metadata = routing(x, logits, n_expts_act, EP=EP, TP=TP, expt_assignment=expt_assignment,
+                                                 mode="ep_sharding")
         else: # dense
             x = all_gather(x, dim=0)
             rdata = gi = si = metadata = None
@@ -322,12 +367,12 @@ def distributed(x):
 )
 def test_mlp_mp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype, TP, EP, monkeypatch):
     parallelism = TP * EP
+    if is_hip():
+        pytest.skip("[TODO] HIP support for distributed MoE.")
     if torch.cuda.device_count() < parallelism:
         pytest.skip(f"Test requires at least {parallelism} GPUs.")
     if is_cuda() and not cuda_capability_geq(9, 0):
         pytest.skip("Test requires CUDA compute capability >= 9.0.")
-    if is_hip() and get_cdna_version() == 4 and EP > 1:
-        pytest.skip("[TODO] Unknown issue with CDNA 4 and EP > 1")
     if TP > 1:
         pytest.skip("[TODO] TP > 1 is not supported yet in distributed mode.")

python/triton_kernels/tests/test_distributed.py

Lines changed: 22 additions & 100 deletions
@@ -4,10 +4,9 @@
 
 import torch
 import torch.distributed as dist
-import torch.distributed._symmetric_memory as symm_mem
 import torch.multiprocessing as mp
 import triton
-from triton_kernels.distributed import convert_dp_to_ep, convert_ep_to_dp, make_expt_dict_uniform, make_expt_dict_random, make_expt_assignment
+from triton_kernels.distributed import convert_dp_to_ep, convert_ep_to_dp, make_expt_dict_uniform, make_expt_dict_random, make_expt_assignment, symm_mem_pool
 from triton_kernels.reduce import reduce
 from triton_kernels.topk import topk
 from triton_kernels.matmul_ogs import matmul_ogs, RoutingData, GatherIndx, ScatterIndx
@@ -166,99 +165,6 @@ def mixture_of_expt_epsharded(x_dp_local, l_dp_local, w_ep_local, b_ep_local, ex
     return z_dp_local
 
 
-def _capture_with_prepared_symm_mem(fn):
-    """
-    Run `fn` once to record symmetric-memory allocations, preallocate them outside the CUDA graph,
-    and capture a CUDA graph that reuses the recorded buffers.
-    """
-    orig_symm_empty = symm_mem.empty
-    orig_symm_rendezvous = symm_mem.rendezvous
-    recorded_empty_calls = []
-    recorded_rendezvous_calls = []
-    buffer_id_to_index = {}
-
-    def recording_empty(*args, **kwargs):
-        buf = orig_symm_empty(*args, **kwargs)
-        idx = len(recorded_empty_calls)
-        buffer_id_to_index[id(buf)] = idx
-        recorded_empty_calls.append((args, dict(kwargs)))
-        return buf
-
-    def recording_rendezvous(buf, *args, **kwargs):
-        buf_id = id(buf)
-        if buf_id not in buffer_id_to_index:
-            raise RuntimeError("symm_mem.rendezvous called on unknown buffer")
-        hdl = orig_symm_rendezvous(buf, *args, **kwargs)
-        recorded_rendezvous_calls.append((buffer_id_to_index[buf_id], args, dict(kwargs)))
-        return hdl
-
-    symm_mem.empty = recording_empty
-    symm_mem.rendezvous = recording_rendezvous
-    try:
-        warmup_result = fn()
-    finally:
-        symm_mem.empty = orig_symm_empty
-        symm_mem.rendezvous = orig_symm_rendezvous
-
-    prepared_empty_buffers = [orig_symm_empty(*args, **kwargs) for args, kwargs in recorded_empty_calls]
-    prepared_handles = [
-        orig_symm_rendezvous(prepared_empty_buffers[idx], *args, **kwargs)
-        for idx, args, kwargs in recorded_rendezvous_calls
-    ]
-
-    capture_stream = torch.cuda.Stream()
-    graph = torch.cuda.CUDAGraph()
-
-    if recorded_empty_calls:
-        empty_idx = 0
-        rendezvous_idx = 0
-
-        def reuse_empty(*args, **kwargs):
-            nonlocal empty_idx
-            if empty_idx >= len(prepared_empty_buffers):
-                raise RuntimeError("symm_mem.empty called more times than recorded")
-            expected_args, expected_kwargs = recorded_empty_calls[empty_idx]
-            if expected_args != args or expected_kwargs != kwargs:
-                raise RuntimeError("symm_mem.empty called with unexpected arguments")
-            buf = prepared_empty_buffers[empty_idx]
-            empty_idx += 1
-            return buf
-
-        def reuse_rendezvous(buf, *args, **kwargs):
-            nonlocal rendezvous_idx
-            if rendezvous_idx >= len(prepared_handles):
-                raise RuntimeError("symm_mem.rendezvous called more times than recorded")
-            expected_empty_idx, expected_args, expected_kwargs = recorded_rendezvous_calls[rendezvous_idx]
-            expected_buf = prepared_empty_buffers[expected_empty_idx]
-            if buf is not expected_buf:
-                raise RuntimeError("symm_mem.rendezvous received unexpected buffer")
-            if expected_args != args or expected_kwargs != kwargs:
-                raise RuntimeError("symm_mem.rendezvous called with unexpected arguments")
-            handle = prepared_handles[rendezvous_idx]
-            rendezvous_idx += 1
-            return handle
-
-        symm_mem.empty = reuse_empty
-        symm_mem.rendezvous = reuse_rendezvous
-        try:
-            with torch.cuda.stream(capture_stream):
-                with torch.cuda.graph(graph):
-                    fn()
-        finally:
-            symm_mem.empty = orig_symm_empty
-            symm_mem.rendezvous = orig_symm_rendezvous
-    else:
-        with torch.cuda.stream(capture_stream):
-            with torch.cuda.graph(graph):
-                fn()
-
-    # Keep references alive for as long as the graph exists.
-    graph._symm_mem_buffers = prepared_empty_buffers
-    graph._symm_mem_handles = prepared_handles
-    graph._capture_stream = capture_stream
-    return warmup_result, graph
-
-
 def _run_expert_sharding(rank, world_size, *, n_tokens, d_model, n_expts_tot, n_expts_act, affinity_mode):
     torch.manual_seed(0)
 
@@ -303,17 +209,33 @@ def run_mixture():
             y_indx=y_indx_global,
         )
 
-    # test cuda graph capture + replay with symmetric memory
-    y_dp_local_tri, graph = _capture_with_prepared_symm_mem(run_mixture)
+    symm_mem_pool.initialize_matmul_ogs(
+        n_tokens_global=n_tokens_global,
+        d_input=d_model,
+        d_model=d_model,
+        n_expts_act=n_expts_act,
+        n_expts_tot=n_expts_tot,
+        dtype=torch.bfloat16,
+        n_ranks=world_size,
+        group=dist.group.WORLD,
+        device=dev,
+    )
+    y_dp_local_tri = run_mixture()
     y_global_tri = torch.empty_like(y_global_ref)
 
     # Validate warmup run.
     dist.all_gather_into_tensor(y_global_tri, y_dp_local_tri)
     triton.testing.assert_close(y_global_ref, y_global_tri)
 
-    # Validate first replay with unchanged inputs.
-    graph.replay()
-    dist.all_gather_into_tensor(y_global_tri, y_dp_local_tri)
+    # Validate cuda graph capture + replay.
+    g = torch.cuda.CUDAGraph()
+    stream = torch.cuda.Stream()
+    with torch.cuda.stream(stream):
+        with torch.cuda.graph(g):
+            y_dp_local_tri_graph = run_mixture()
+
+    g.replay()
+    dist.all_gather_into_tensor(y_global_tri, y_dp_local_tri_graph)
     triton.testing.assert_close(y_global_ref, y_global_tri)
 
 
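Across the three files, the end state is the pattern the test now exercises: size the symmetric-memory pool up front, run the workload once eagerly, then capture and replay it with a plain `torch.cuda.CUDAGraph`, with no monkeypatching of `symm_mem.empty` or `symm_mem.rendezvous`. A generic helper along these lines illustrates the pattern; the helper and its name are assumptions for illustration, not part of the commit:

```python
import torch

def capture_after_warmup(fn, warmup_iters: int = 1):
    """Run `fn` eagerly, then capture a single call of it in a CUDA graph.

    Assumes every buffer `fn` needs (e.g. the pool sized by
    symm_mem_pool.initialize_matmul_ogs) was allocated beforehand, since no
    such allocation may happen during capture.
    """
    for _ in range(warmup_iters):
        eager_out = fn()                 # warms up kernels and autotuning
    graph = torch.cuda.CUDAGraph()
    stream = torch.cuda.Stream()
    with torch.cuda.stream(stream):
        with torch.cuda.graph(graph):
            graph_out = fn()             # output tensor is owned by the graph
    return graph, eager_out, graph_out

# Mirroring the test: graph.replay() refreshes graph_out in place, which can
# then be all-gathered and compared against the eager result.
```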