import triton.profiler as proton
from triton.profiler import viewer
import torch
+ import argparse
import triton_kernels
import triton_kernels.swiglu
- from triton_kernels.numerics_details.mxfp import downcast_to_mxfp
from triton_kernels.matmul_ogs import matmul_ogs, PrecisionConfig, FlexCtx, FnSpecs, FusedActivation
- from triton_kernels.numerics import InFlexData
- from triton_kernels.routing import routing
- from triton_kernels.target_info import is_cuda, is_hip, get_cdna_version, cuda_capability_geq
- from triton_kernels.tensor import convert_layout
- from triton_kernels.tensor import wrap_torch_tensor, FP4
+ from triton_kernels.target_info import is_hip, get_cdna_version
from dataclasses import dataclass
+ import distributed as triton_dist
from triton_kernels.tensor_details import layout
+ from bench_utils import quantize_weight

if torch.cuda.is_available() and not is_hip():
    from triton._C.libtriton import nvidia
+
    cublas_workspace = torch.empty(32 * 1024 * 1024, device="cuda", dtype=torch.uint8)
    cublas = nvidia.cublas.CublasLt(cublas_workspace)
else:
    cublas = None


- def quantize(w, dtype, **opt):
-     if dtype == "bf16":
-         wq = w.to(torch.bfloat16).transpose(-1, -2).contiguous().transpose(-1, -2)
-         return wq, InFlexData(), None
-     elif dtype == "fp8":
-         fp8e4_dtype = torch.float8_e4m3fn if get_cdna_version() != 3 \
-             else torch.float8_e4m3fnuz
-         wq = w.to(fp8e4_dtype)
-         if is_cuda() and not cuda_capability_geq(10, 0):
-             wq = wq.transpose(-1, -2).contiguous().transpose(-1, -2)
-         return wq, InFlexData(dtype=wq.dtype, scale=w.abs().max().unsqueeze(0)), None
-     else:
-         assert dtype == "mx4", f"{dtype=}"
-         w, w_scale = downcast_to_mxfp(w.to(torch.bfloat16), torch.uint8, axis=1)
-         if opt:
-             w = convert_layout(wrap_torch_tensor(w, dtype=FP4), opt["value_layout"], **opt["value_layout_opts"])
-             w_scale = convert_layout(wrap_torch_tensor(w_scale), opt["scale_layout"], **opt["scale_layout_opts"])
-         return w, InFlexData(), w_scale
-
-
@dataclass
class PerfData:
    time: float
@@ -69,13 +48,22 @@ def opint(self):

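+     # peak memory-bandwidth and compute ceilings for the roofline, scaled by 1e-12 into TB/s and TFLOP/s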
    @property
    def max_tbps(self):
-         return proton.specs.max_bps(self.device_type, self.device_info["arch"], self.device_info["bus_width"],
-                                     self.device_info["memory_clock_rate"]) * 1e-12
+         return (proton.specs.max_bps(
+             self.device_type,
+             self.device_info["arch"],
+             self.device_info["bus_width"],
+             self.device_info["memory_clock_rate"],
+         ) * 1e-12)

    @property
    def max_tflops(self):
-         return proton.specs.max_flops(self.device_type, self.device_info["arch"], self.bitwidth,
-                                       self.device_info["num_sms"], self.device_info["clock_rate"]) * 1e-12
+         return (proton.specs.max_flops(
+             self.device_type,
+             self.device_info["arch"],
+             self.bitwidth,
+             self.device_info["num_sms"],
+             self.device_info["clock_rate"],
+         ) * 1e-12)

    @property
    def util(self) -> float:
@@ -85,62 +73,83 @@ def util(self) -> float:
        return max(min_t_flop, min_t_bw) / self.time


+ def get_bench_path(name, rank, x_dtype, w_dtype, TP, EP):
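+     # e.g. get_bench_path("dense", 0, "fp8", "fp8", 1, 1) -> logs/dense/0/fp8-fp8-TP1-EP1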
+     return Path(f"logs/{name}/{rank}/{x_dtype}-{w_dtype}-TP{TP}-EP{EP}/")
+
+
def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype, TP, EP, name):
    assert n_expts_tot % EP == 0
    assert dim2 % TP == 0
-     dev = "cuda"
+     rank, world_size = triton_dist.setup()
+     dev = f"cuda:{rank}"
+     DP = world_size
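+     # data-parallel width: each rank benchmarks its own shard of the global batch (batch // DP rows)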
+
+     assert n_expts_tot % EP == 0, f"{n_expts_tot=}, {EP=}, n_expts_tot must be divisible by EP"
+     assert dim2 % TP == 0, f"{dim2=}, {TP=}, dim2 must be divisible by TP"

    # input
    # weights
-     wg = torch.randn((dim1, n_expts_tot), device=dev)
+     wg = triton_dist.broadcast(torch.randn((dim1, n_expts_tot), device=dev))
    w1 = torch.randn((n_expts_tot // EP, dim1, dim2 // TP), device=dev)
    w2 = torch.randn((n_expts_tot // EP, dim2 // TP // 2, dim1), device=dev)
+
    # biases
-     bg = torch.randn((n_expts_tot, ), device=dev)
+     bg = triton_dist.broadcast(torch.randn((n_expts_tot, ), device=dev))
    b1 = torch.randn((n_expts_tot // EP, dim2 // TP), device=dev)
    b2 = torch.randn((n_expts_tot // EP, dim1), device=dev)
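+     # expert-parallel groups are blocks of TP consecutive ranks, e.g. TP=2, EP=2 -> groups = [[0, 1], [2, 3]]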
+     ep_indx = (rank // TP) % EP
+     groups = [list(range(ep * TP, (ep + 1) * TP)) for ep in range(EP)]
+     b2 = triton_dist.broadcast(b2, src=ep_indx * TP, groups=groups, group_idx=ep_indx)

    # -- numerics --
-     optg = dict()
    opt1 = dict()
    opt2 = dict()
    if w_dtype == "mx4" and not is_hip():
        num_warps = 4 if batch <= 512 else 8
        value_layout, value_layout_opts = layout.make_default_matmul_mxfp4_w_layout(mx_axis=1)
        scale_layout, scale_layout_opts = layout.make_default_matmul_mxfp4_w_scale_layout(
            mx_axis=1, num_warps=num_warps)
-         opt1 = {"value_layout": value_layout, "value_layout_opts": value_layout_opts, \
-                 "scale_layout": scale_layout, "scale_layout_opts": scale_layout_opts}
+         opt1 = {
+             "value_layout": value_layout,
+             "value_layout_opts": value_layout_opts,
+             "scale_layout": scale_layout,
+             "scale_layout_opts": scale_layout_opts,
+         }
        opt2 = deepcopy(opt1)
-     wg, wg_flex, wg_scale = quantize(wg, "bf16", **optg)
-     w1, w1_flex, w1_scale = quantize(w1, w_dtype, **opt1)
-     w2, w2_flex, w2_scale = quantize(w2, w_dtype, **opt2)
+     wg, wg_flex, wg_scale = quantize_weight(wg, "bf16")
+     w1, w1_flex, w1_scale = quantize_weight(w1, w_dtype, **opt1)
+     w2, w2_flex, w2_scale = quantize_weight(w2, w_dtype, **opt2)
    pcg = PrecisionConfig(flex_ctx=FlexCtx(rhs_data=wg_flex), weight_scale=wg_scale)
    act = FusedActivation(FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit")), (1.0, 1.0), 2)
    pc1 = PrecisionConfig(flex_ctx=FlexCtx(rhs_data=w1_flex), weight_scale=w1_scale)
    pc2 = PrecisionConfig(flex_ctx=FlexCtx(rhs_data=w2_flex), weight_scale=w2_scale)

    # -- benchmark --
-     fpath = Path(f"logs/{name}/{x_dtype}-{w_dtype}-TP{TP}-EP{EP}/profiles/batch-{batch}.hatchet")
+     fpath = get_bench_path(name, rank, x_dtype, w_dtype, TP, EP) / f"profiles/batch-{batch}.hatchet"
    fpath.parent.mkdir(parents=True, exist_ok=True)
    x_dtype = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp8": torch.float8_e4m3fn}[x_dtype]
    # special treatment of fp8_e4m3 on AMD CDNA3 because it uses fp8_e4m3fnuz
    if x_dtype == torch.float8_e4m3fn and get_cdna_version() == 3:
        x_dtype = torch.float8_e4m3fnuz

-     x = torch.randn((batch, dim1), device=dev)
-     xg = x.to(wg.dtype if n_expts_tot > 1 else x_dtype)
-     x = x.to(x_dtype)
+     input_x = torch.randn((batch // DP, dim1), device=dev)
    # run layer
-     proton.start(str(fpath.with_suffix('')), hook="triton")
+     proton.start(str(fpath.with_suffix("")), hook="triton")
+     input_x = input_x.to(x_dtype)
+     xg = input_x.to(wg.dtype if n_expts_tot > 1 else input_x.dtype)
    for i in range(100):
-         if n_expts_tot > 1:
+         if n_expts_tot > 1:  # sparse
            logits = matmul_ogs(xg, wg, bg, precision_config=pcg)
-             rdata, gather_indx, scatter_indx = routing(logits, n_expts_act, simulated_ep=EP)
-         else:
-             rdata, gather_indx, scatter_indx = None, None, None
-         x = matmul_ogs(x, w1, b1, rdata, gather_indx=gather_indx, precision_config=pc1, fused_activation=act)
-         x = matmul_ogs(x, w2, b2, rdata, scatter_indx=scatter_indx, precision_config=pc2)
+             x, rdata, gather_indx, scatter_indx, metadata = triton_dist.routing(input_x, logits, n_expts_act, EP=EP,
+                                                                                 TP=TP)
+         else:  # dense
+             x = triton_dist.all_gather(input_x, dim=0)
+             rdata, gather_indx, scatter_indx, metadata = None, None, None, None
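+         # a rank can be left with zero assigned tokens after routing, so skip the matmuls on empty inputs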
+         if x.nelement() > 0:
+             x = matmul_ogs(x, w1, b1, rdata, gather_indx=gather_indx, precision_config=pc1, fused_activation=act)
+             x = matmul_ogs(x, w2, b2 if rank % TP == 0 else None, rdata, scatter_indx=scatter_indx,
+                            precision_config=pc2)
+         x = triton_dist.reduce_scatter(x, metadata=metadata, dim=0)
    proton.finalize()

    # -- analyze --
@@ -153,14 +162,21 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype, TP,
    device_type = matmuls["device_type"].iloc[0]
    device_id = matmuls["device_id"].iloc[0]
    device_info = info[device_type][device_id]
-     return PerfData(time=time, flops=flops, bytes=bytes, bitwidth=x.dtype.itemsize * 8, device_type=device_type,
-                     device_info=device_info)
+     return PerfData(
+         time=time,
+         flops=flops,
+         bytes=bytes,
+         bitwidth=x.dtype.itemsize * 8,
+         device_type=device_type,
+         device_info=device_info,
+     )


def roofline_mlp(batch_ranges, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype, TP=1, EP=1, name="",
                 verbose=True):
    from itertools import chain
    from bisect import bisect_left
+
    batches = list(chain(*[range(*r) for r in batch_ranges]))
    # collect performance data
    perfs = []
@@ -198,18 +214,13 @@ def roofline_mlp(batch_ranges, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_
    y_comp = [max_tflops] * len(x_comp)
    ax.plot(x_bw, y_bw, "--", label=f"BW-bound ({max_tbps:.1f} TB/s)", color="blue")
    ax.plot(x_comp, y_comp, "--", label=f"Compute-bound ({max_tflops:.0f} TFLOP/s)", color="orange")
-     x_bw, x_comp = xs[:knee], xs[knee:]
-     x_bw = [x_bw[0], x_comp[0]]
-     y_bw = [opints[0] * max_tbps, max_tflops]
-     y_comp = [max_tflops] * len(x_comp)
-     ax.plot(x_bw, y_bw, "--", label=f"BW-bound ({max_tbps:.1f} TB/s)")
-     ax.plot(x_comp, y_comp, "--", label=f"Compute-bound ({max_tflops:.0f} TFLOP/s)")
    # plot data
    ax.scatter(xs, perf, marker="+")
    ax.legend(frameon=False, loc="lower right")
    ax.grid(True, which="both", ls=":", lw=0.5)
    fig.tight_layout()
-     fpath = Path(f"logs/{name}/{x_dtype}-{w_dtype}-TP{TP}-EP{EP}/roofline.png")
+     rank, _ = triton_dist.setup()
+     fpath = get_bench_path(name, rank, x_dtype, w_dtype, TP, EP) / "roofline.png"
    plt.savefig(fpath)


@@ -219,7 +230,34 @@ def roofline_mlp(batch_ranges, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_
    batch_ranges_moe = [(128, 512, 32), (512, 32000, 128)]
    dense_dtypes = ["fp8", "fp8"]
    quantized_dtypes = ["fp8", "mx4"] if has_native_mx4 else ["bf16", "mx4"]
-     roofline_mlp(batch_ranges_dense, 8192, 8192, 1, 1, *dense_dtypes, TP=1, EP=1, name="dense")
-     roofline_mlp(batch_ranges_dense, 8192, 8192, 1, 1, *quantized_dtypes, TP=1, EP=1, name="dense")
-     roofline_mlp(batch_ranges_moe, 5120, 8192, 128, 4, *dense_dtypes, TP=1, EP=1, name="llama4-maverick")
-     roofline_mlp(batch_ranges_moe, 5120, 8192, 128, 4, *quantized_dtypes, TP=1, EP=1, name="llama4-maverick")
+     rank, world_size = triton_dist.setup()
+     if world_size > 1:
+         # Running all workloads at once may cause OOM on some GPUs such as H100 80GB.
+         # Thus we request users to run each workload separately.
+         # For example, all eligible combinations of options are listed below when four GPUs are used:
+         # torchrun --nproc-per-node=4 ./bench_mlp.py --tp 2 --ep 2 --name llama4-maverick
+         # torchrun --nproc-per-node=4 ./bench_mlp.py --tp 1 --ep 4 --name llama4-maverick
+         # torchrun --nproc-per-node=4 ./bench_mlp.py --tp 4 --ep 1 --name llama4-maverick
+         # torchrun --nproc-per-node=4 ./bench_mlp.py --tp 4 --ep 1 --name dense
+         # torchrun --nproc-per-node=4 ./bench_mlp.py --tp 2 --ep 2 --name llama4-maverick --quantized
+         # torchrun --nproc-per-node=4 ./bench_mlp.py --tp 1 --ep 4 --name llama4-maverick --quantized
+         # torchrun --nproc-per-node=4 ./bench_mlp.py --tp 4 --ep 1 --name llama4-maverick --quantized
+         # torchrun --nproc-per-node=4 ./bench_mlp.py --tp 4 --ep 1 --name dense --quantized
+         parser = argparse.ArgumentParser()
+         parser.add_argument("--tp", type=int, default=1)
+         parser.add_argument("--ep", type=int, default=1)
+         parser.add_argument("--name", type=str, choices=["dense", "llama4-maverick"])
+         parser.add_argument("--quantized", action="store_true", default=False)
+         args = parser.parse_args()
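+         # --quantized selects quantized_dtypes (mx4 weights); the default benchmarks dense_dtypes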
+         dtypes = quantized_dtypes if args.quantized else dense_dtypes
+         if args.name == "dense":
+             assert args.ep == 1, "EP must be 1 for dense"
+             roofline_mlp(batch_ranges_dense, 8192, 8192, 1, 1, *dtypes, TP=args.tp, EP=args.ep, name="dense")
+         else:
+             roofline_mlp(batch_ranges_moe, 5120, 8192, 128, 4, *dtypes, TP=args.tp, EP=args.ep, name="llama4-maverick")
+         triton_dist.cleanup()
+     else:
+         roofline_mlp(batch_ranges_dense, 8192, 8192, 1, 1, *dense_dtypes, TP=1, EP=1, name="dense")
+         roofline_mlp(batch_ranges_dense, 8192, 8192, 1, 1, *quantized_dtypes, TP=1, EP=1, name="dense")
+         roofline_mlp(batch_ranges_moe, 5120, 8192, 128, 4, *dense_dtypes, TP=1, EP=1, name="llama4-maverick")
+         roofline_mlp(batch_ranges_moe, 5120, 8192, 128, 4, *quantized_dtypes, TP=1, EP=1, name="llama4-maverick")