
Commit 0f5eccc

aeng-openai and ptillet authored
[BENCH] fused swiglu activation fn (#6797)
Fuse the activation function into the epilogue of the matmul, largely for free. Reopens and rebases #6756.

Co-authored-by: Phil Tillet <[email protected]>
1 parent f5274d4 commit 0f5eccc
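
In short: the first matmul of the MLP now applies SwiGLU inside its epilogue instead of launching triton_kernels.swiglu.swiglu as a separate kernel afterwards, so the intermediate matmul output no longer makes an extra round trip through global memory. A minimal sketch of the call-site change, with argument values taken verbatim from the bench_mlp.py diff below (the trailing 2 is the activation's reduction_n, discussed after that diff):

    # Before: matmul followed by a standalone swiglu kernel.
    pcs = triton_kernels.swiglu.PrecisionConfig(limit=1.0)
    x = matmul_ogs(x, w1, b1, rdata, gather_indx=gather_indx, precision_config=pc1)
    x = triton_kernels.swiglu.swiglu(x, 1.0, pcs, routing_data=rdata)

    # After: swiglu is evaluated in the matmul epilogue before the result is stored.
    act = FusedActivation(FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit")), (1.0, 1.0), 2)
    x = matmul_ogs(x, w1, b1, rdata, gather_indx=gather_indx, precision_config=pc1, fused_activation=act)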

File tree

10 files changed (+252, -85 lines changed)


python/triton_kernels/bench/bench_mlp.py

Lines changed: 3 additions & 4 deletions
@@ -7,7 +7,7 @@
 import triton_kernels
 import triton_kernels.swiglu
 from triton_kernels.numerics_details.mxfp import downcast_to_mxfp, SwizzlingType
-from triton_kernels.matmul_ogs import MicroscalingCtx, matmul_ogs, PrecisionConfig, FlexCtx
+from triton_kernels.matmul_ogs import MicroscalingCtx, matmul_ogs, PrecisionConfig, FlexCtx, FnSpecs, FusedActivation
 from triton_kernels.numerics import InFlexData
 from triton_kernels.routing import routing
 from triton_kernels.target_info import is_hip, get_cdna_version
@@ -143,7 +143,7 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype, TP,
     w1, w1_flex, w1_mx = quantize(w1, w_dtype, dev, **opt1)
     w2, w2_flex, w2_mx = quantize(w2, w_dtype, dev, **opt2)
     pcg = PrecisionConfig(mx_ctx=wg_mx, flex_ctx=FlexCtx(rhs_data=wg_flex))
-    pcs = triton_kernels.swiglu.PrecisionConfig(limit=1.0)
+    act = FusedActivation(FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit")), (1.0, 1.0), 2)
     pc1 = PrecisionConfig(mx_ctx=w1_mx, flex_ctx=FlexCtx(rhs_data=w1_flex))
     pc2 = PrecisionConfig(mx_ctx=w2_mx, flex_ctx=FlexCtx(rhs_data=w2_flex))

@@ -166,8 +166,7 @@ def bench_mlp(batch, dim1, dim2, n_expts_tot, n_expts_act, x_dtype, w_dtype, TP,
         rdata, gather_indx, scatter_indx = routing(logits, n_expts_act, simulated_ep=EP)
     else:
         rdata, gather_indx, scatter_indx = None, None, None
-    x = matmul_ogs(x, w1, b1, rdata, gather_indx=gather_indx, precision_config=pc1)
-    x = triton_kernels.swiglu.swiglu(x, 1.0, pcs, routing_data=rdata)
+    x = matmul_ogs(x, w1, b1, rdata, gather_indx=gather_indx, precision_config=pc1, fused_activation=act)
     x = matmul_ogs(x, w2, b2, rdata, scatter_indx=scatter_indx, precision_config=pc2)
     proton.finalize()

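FusedActivation (together with FnSpecs) is the new user-facing handle for the fused epilogue. A minimal sketch of how it is assembled, written with keyword arguments for clarity; the field comments describe how matmul_ogs.py (further down) consumes each field:

    from triton_kernels.matmul_ogs import FnSpecs, FusedActivation
    from triton_kernels.swiglu import swiglu_fn

    act = FusedActivation(
        specs=FnSpecs("swiglu", swiglu_fn, ("alpha", "limit")),  # cache name, JIT function, names of its extra scalar args
        fn_args=(1.0, 1.0),  # values bound positionally to ("alpha", "limit") at kernel launch
        reduction_n=2,       # swiglu folds 2 matmul output columns into 1 result column,
                             # so init_allocation sizes the output as N // reduction_n
    )
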
python/triton_kernels/tests/test_matmul.py

Lines changed: 72 additions & 2 deletions
@@ -7,9 +7,10 @@
 from triton_kernels.routing import routing
 # matmul utilities
 import triton_kernels.matmul_ogs_details.opt_flags as opt_flags
-from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig, MicroscalingCtx
+from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig, MicroscalingCtx, FusedActivation, FnSpecs
 from triton_kernels.matmul_ogs import can_use_persistent_tma
 from triton_kernels.matmul_ogs import matmul_ogs, matmul_ogs_torch
+from triton_kernels.swiglu import swiglu, swiglu_fn, PrecisionConfig as SwiGLUPrecisionConfig
 # numerics utilities
 from triton_kernels.numerics import InFlexData, OutFlexData
 from triton_kernels.numerics_details.mxfp import SwizzlingType, downcast_to_mxfp, upcast_from_mxfp
@@ -122,6 +123,13 @@ def dtype_str_to_torch(dtype_str: str) -> torch.dtype:
     return torch.uint8 if dtype_str == "float4_e2m1" else getattr(torch, dtype_str)


+# Scope to ensure that the opt_flags_constraints are reset after the test
+@pytest.fixture
+def opt_flags_scope(request):
+    yield
+    opt_flags.reset_opt_flags_constraints()
+
+
 # ---------------
 # unit tests
 # ---------------
@@ -218,7 +226,7 @@ class Case:
 @pytest.mark.parametrize("is_persistent", [False, True])
 def test_op(m, n, k, split_k, do_gather, do_scatter, fused_scatter, has_y_gammas, is_persistent, n_expts_tot,
             n_expts_act, n_expt_shards, mode, act_dtype_str, weight_dtype_str, block_m, hbm_swizzling, epilogue_subtile,
-            device):
+            device, opt_flags_scope):
     # TODO: remove when Triton FP8 supports proper RTNE
     if "float8" in weight_dtype_str and torch.cuda.get_device_capability()[0] < 9:
         pytest.skip("Float8 not tested on A100")
@@ -401,3 +409,65 @@ def round_x(x, idx):
     ref_y_scale = compute_actual_scale(ref_y, tri_y.dtype)
     assert (ref_y_scale -
             tri_y_scale).abs() < 1e-10, f"ref_y_scale: {ref_y_scale}, tri_y_scale: {tri_y_scale.item()}"
+
+
+@pytest.mark.parametrize("m, n, k, mode", [
+    (1200, 704, 608, "ragged"),
+    (800, 800, 400, "batched"),
+])
+@pytest.mark.parametrize("split_k", [1, 2])
+@pytest.mark.parametrize("do_gather, do_scatter, fused_scatter", [
+    (False, False, False),
+    (True, False, False),
+    (False, True, False),
+    (True, True, False),
+    (True, True, True),
+])
+@pytest.mark.parametrize("is_persistent, epilogue_subtile", [
+    (False, False),
+    (True, False),
+    (True, True),
+])
+@pytest.mark.parametrize("swiglu_alpha, swiglu_limit", [
+    (1.1, 1.4),
+    (1.0, 1.2),
+    (0.7, 1.0),
+])
+def test_fused_act(m, n, k, mode, split_k, do_gather, do_scatter, fused_scatter, is_persistent, epilogue_subtile,
+                   swiglu_alpha, swiglu_limit, device, opt_flags_scope):
+    if fused_scatter and split_k > 1:
+        pytest.skip("fused scatter scratchpad not supported with split_k")
+    torch.manual_seed(0)
+    constraints = {
+        "is_persistent": is_persistent,
+        "epilogue_subtile": epilogue_subtile,
+        "fused_scatter": fused_scatter,
+        "split_k": split_k,
+    }
+    n_expts_tot, n_expts_act, n_expt_shards = 1, 1, 1
+    opt_flags.update_opt_flags_constraints(constraints)
+
+    weight_dtype, act_dtype = torch.float16, torch.float16
+    if mode == "ragged":
+        m, rdata, gindx, sindx = init_routing_data(m, n_expts_tot, n_expts_act, n_expt_shards, do_gather, do_scatter,
+                                                   device=device)
+    else:
+        rdata = gindx = sindx = None
+
+    precision_opt = init_precision(act_dtype, False, False, n_expts_tot // n_expt_shards, device=device)
+    x, w, bias, _, _ = init_compute_data(m, n, k, gindx, sindx, n_expts_tot, n_expts_act, n_expt_shards, mode,
+                                         act_dtype, weight_dtype, False, requires_grad=False, device=device)
+
+    if is_persistent and not can_use_persistent_tma(x.view(1, x.shape[-2], x.shape[-1]),
+                                                    w.view(1, w.shape[-2], w.shape[-1]), gindx, precision_opt):
+        pytest.skip("persistent TMAs not supported for this test")
+
+    if mode == "batched":
+        rdata, gindx, sindx = None, None, None
+    a = swiglu(matmul_ogs(x, w, bias, rdata, gindx, sindx, precision_opt), swiglu_alpha,
+               precision_config=SwiGLUPrecisionConfig(swiglu_limit))
+    b = matmul_ogs(
+        x, w, bias, rdata, gindx, sindx, precision_opt,
+        fused_activation=FusedActivation(FnSpecs("swiglu", swiglu_fn, ("alpha", "limit")), (swiglu_alpha, swiglu_limit),
+                                         2))
+    assert_close(a, b)

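The new opt_flags_scope fixture exists because update_opt_flags_constraints mutates module-level state in opt_flags: without a teardown, the constraints pinned by one test (split_k, is_persistent, and so on) would leak into every test that runs afterwards in the same process. A rough sketch of the pattern the fixture wraps, using only the two opt_flags calls that appear in this diff:

    import triton_kernels.matmul_ogs_details.opt_flags as opt_flags

    opt_flags.update_opt_flags_constraints({"split_k": 2})  # global: pins the flag for subsequent launches
    try:
        ...  # the parametrized test body runs here
    finally:
        opt_flags.reset_opt_flags_constraints()  # what the fixture's teardown performs
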
python/triton_kernels/triton_kernels/matmul_ogs.py

Lines changed: 49 additions & 23 deletions
@@ -20,42 +20,62 @@


 @dataclass
-class EpilogueSpecs:
+class FnSpecs:
     name: str
     fn: "triton.runtime.jit.JITFunction"
     fn_arg_names: tuple[str]
     fn_arg_do_not_specialize: tuple[str] = tuple()

+    @staticmethod
+    def default():
+        return FnSpecs("dflt", None, tuple())
+
+
+@dataclass
+class FusedActivation:
+    specs: FnSpecs
+    fn_args: tuple[object]
+    reduction_n: int
+

 @dataclass
 class Epilogue:
-    specs: EpilogueSpecs
+    specs: FnSpecs
     fn_arg_values_matmul: tuple[object]
     fn_arg_values_finalize: tuple[object]
     is_expensive: bool = False


+EpilogueSpecs = FnSpecs  # TODO: remove this alias when callers are updated
+
 _kernels = dict()


-def get_kernels(epilogue: EpilogueSpecs):
+def get_kernels(epilogue: FnSpecs = FnSpecs.default(), fused_activation: FnSpecs = FnSpecs.default()):
     global _kernels
-    if epilogue.name in _kernels:
-        return _kernels[epilogue.name]
-    spec_constants = {"EPILOGUE_FN": epilogue.fn}
-    spec_tuples = {"epilogue_fn_args": epilogue.fn_arg_names}
-    do_not_specialize = epilogue.fn_arg_do_not_specialize
+    key = (fused_activation.name, epilogue.name)
+    if key in _kernels:
+        return _kernels[key]
+    spec_constants = {
+        "ACTIVATION_FN": fused_activation.fn,
+        "EPILOGUE_FN": epilogue.fn,
+    }
+    spec_tuples = {
+        "activation_fn_args": fused_activation.fn_arg_names,
+        "epilogue_fn_args": epilogue.fn_arg_names,
+    }
+    do_not_specialize = fused_activation.fn_arg_do_not_specialize + epilogue.fn_arg_do_not_specialize
     import types

-    module = types.ModuleType(f"matmul_ogs_{epilogue.name}")
+    module = types.ModuleType(f"matmul_ogs_{'_'.join(key)}")
     sys.modules[module.__name__] = module
     module._finalize_matmul = specialize(_finalize_matmul, module, spec_constants, spec_tuples,
                                          do_not_specialize=do_not_specialize)
     module._matmul_ogs = specialize(_matmul_ogs, module, spec_constants, spec_tuples,
                                     do_not_specialize=do_not_specialize)
     module._p_matmul_ogs = specialize(_p_matmul_ogs, module, spec_constants, spec_tuples,
                                       do_not_specialize=do_not_specialize)
-    _kernels[epilogue.name] = module
+    _kernels[key] = module
     return module


@@ -254,8 +274,8 @@ def can_use_persistent_tma(x, w, gather_indx, precision_config):
         and mx_ctx.swizzle_value is None
     )

-def can_use_fused_scatter(scatter_indx):
-    return scatter_indx is not None
+def can_use_fused_scatter(scatter_indx, fused_activation):
+    return scatter_indx is not None and fused_activation.specs.fn is None

 # ---------------------
 # Preprocessing
@@ -341,7 +361,7 @@ def init_postprocessing_features(routing_data, scatter_indx, opt_flags):
     return PostprocessingFeatures(finalize)

 def apply_postprocessing_features(scatter_indx, finalize_scatter_idxs, opt_flags, expt_offs, num_indx, precision_config, routing_data,
-                                  postprocess_features, memory, epilogue):
+                                  postprocess_features, memory, fused_activation, epilogue):
     out = memory["output"]
     flex_ctx = precision_config.flex_ctx
     if postprocess_features.finalize:
@@ -407,14 +427,15 @@ def compute_grid(BLOCK_N, num_warps):
     grid, (BLOCK_N, num_warps) = sorted([(compute_grid(*c), c) for c in candidates], key=lambda x: x[0][1])[0]
     STAGES = 1 if num_warps == 1 else min(triton.cdiv(triton.cdiv(N, BLOCK_N), grid[1]), 5)

-    kernels = get_kernels(epilogue.specs)
+    kernels = get_kernels(epilogue.specs, fused_activation.specs)
     kernels._finalize_matmul[grid](
         flex_ctx.out_data.reinterpret(out_scatter),
         *out_scatter_flex,
         flex_ctx.out_data.reinterpret(inp), inp.stride(0), inp.stride(2),
         inp_flex.expected_scale,
         scatter_src_indx, finalize_scatter_idxs,
         inp.shape[0], M, N, num_rows,
+        *fused_activation.fn_args, fused_activation.reduction_n,
         *epilogue.fn_arg_values_finalize,
         EXPT_PER_TOK=EXPT_PER_TOK,
         BLOCK_N=BLOCK_N,
@@ -443,7 +464,7 @@ class MatmulAllocation:
     output: tuple[tuple[int], torch.dtype]
     scratchpads: dict[str, tuple]

-def init_allocation(x, w, precision_config, routing_data, gather_indx, scatter_indx, opt_flags,
+def init_allocation(x, w, precision_config, fused_activation, routing_data, gather_indx, scatter_indx, opt_flags,
                     preprocessing_features, postprocessing_features):
     # ---- output ------
     N = precision_config.mx_ctx.get_packed_tensor_logical_shape(w)[-1]
@@ -462,7 +483,7 @@ def init_allocation(x, w, precision_config, routing_data, gather_indx, scatter_i
     else:
         Mc = scatter_indx.src_indx.shape[0] // routing_data.n_expts_act  # compressed number of rows
         y_rows = Mc
-    y_shape = (x.shape[0], y_rows, N)
+    y_shape = (x.shape[0], y_rows, N // fused_activation.reduction_n)
     out_dtype = precision_config.out_dtype or x.dtype
     output = (y_shape, out_dtype)
     # ---- scratchpad -----#
@@ -500,6 +521,7 @@ def matmul_ogs(x, w, bias,
                gammas: torch.Tensor | None = None,
                out_alpha: float | None = None,
                y: torch.Tensor | None = None,
+               fused_activation: FusedActivation | None = None,
                epilogue: Epilogue | None = None,
                ):
     """
@@ -516,9 +538,10 @@ def matmul_ogs(x, w, bias,
     assert w.ndim == 3 and w.shape[0] == x.shape[0]
     if precision_config is None:
         precision_config = PrecisionConfig()
+    if fused_activation is None:
+        fused_activation = FusedActivation(FnSpecs.default(), tuple(), 1)
     if epilogue is None:
-        epilogue_specs = EpilogueSpecs("dflt", None, tuple(), tuple())
-        epilogue = Epilogue(epilogue_specs, tuple(), tuple(), False)
+        epilogue = Epilogue(FnSpecs.default(), tuple(), tuple(), False)
     if w.ndim == 2:
         w = w.view(1, w.shape[-2], w.shape[-1])
     if x.ndim == 2:
@@ -540,7 +563,7 @@ def matmul_ogs(x, w, bias,
     opt_flags = make_opt_flags(out_dtype, x.dtype, w.dtype, precision_config,
                                M, N, K, routing_data,
                                can_use_persistent_tma(x, w, gather_indx, precision_config),
-                               can_use_fused_scatter(scatter_indx),
+                               can_use_fused_scatter(scatter_indx, fused_activation),
                                epilogue.is_expensive,
                                )
     # compute grid size
@@ -551,25 +574,27 @@ def matmul_ogs(x, w, bias,
     grid_n = triton.cdiv(N, opt_flags.block_n)
     assert n_expts_tot == routing_data.n_expts_tot
     assert grid_m > 0
-    assert x.dtype == w.dtype or mx_ctx.weight_scale is not None
     # determine necessary pre/post processing
     preprocessing_features = init_preprocessing_features(w, precision_config, opt_flags)
     postprocessing_features = init_postprocessing_features(routing_data, scatter_indx, opt_flags)
     # allocate output/scratchpad memory
-    allocation = init_allocation(x, w, precision_config, routing_data, gather_indx, scatter_indx, opt_flags,
+    allocation = init_allocation(x, w, precision_config, fused_activation, routing_data, gather_indx, scatter_indx, opt_flags,
                                  preprocessing_features, postprocessing_features)
     memory = apply_allocation(allocation, y)
     # TMA descriptors require a global memory allocation
     if opt_flags.is_persistent:
         triton.set_allocator(get_per_device_per_stream_alloc_fn(x.device))
     # Intermediate tensors and postprocess kernels for each situation
     out0, out0_flex = memory["output"], precision_config.flex_ctx.out_data
+    fused_postprocess_activation = FusedActivation(FnSpecs.default(), tuple(), 1)
     if postprocessing_features.finalize:
         if opt_flags.fused_scatter:
             out0 = memory["output"]
         else:
             out0 = memory["scratchpad"]["matmul"]
         out0_flex = OutFlexData() if out0.dtype == torch.float32 else precision_config.flex_ctx.out_data
+
+        fused_activation, fused_postprocess_activation = fused_postprocess_activation, fused_activation
     # pre-processing
     x, w, swap_xw, writeback_idxs, writeback_size, finalize_scatter_idxs, expt_data = apply_preprocessing_features(
         x, w, gather_indx, scatter_indx, routing_data, opt_flags, preprocessing_features
@@ -584,7 +609,7 @@ def matmul_ogs(x, w, bias,
     flex = precision_config.flex_ctx
     bias_stride = None if bias is None else bias.stride(0)
     num_indx = None if scatter_indx is None else scatter_indx.src_indx.shape[0]
-    kernels = get_kernels(epilogue.specs)
+    kernels = get_kernels(epilogue.specs, fused_activation.specs)
     (kernels._p_matmul_ogs if opt_flags.is_persistent else kernels._matmul_ogs)[(n_cta,)](
         flex.out_data.reinterpret(memory["output"]),
         flex.out_data.reinterpret(out0), *out0.stride(),
@@ -606,6 +631,7 @@ def matmul_ogs(x, w, bias,
         expt_data.hist, expt_data.offs, expt_data.offs_sum, expt_data.blocks,
         batch_size, grid_m, grid_n,
         out_alpha,
+        *fused_activation.fn_args, fused_activation.reduction_n,
         *epilogue.fn_arg_values_matmul,
         routing_data.n_expts_tot, routing_data.n_expts_act,
         precision_config.max_num_imprecise_acc,
@@ -635,7 +661,7 @@ def matmul_ogs(x, w, bias,
     # post-processing
     out = apply_postprocessing_features(scatter_indx, finalize_scatter_idxs, opt_flags, expt_data.offs,
                                         num_indx, precision_config, routing_data,
-                                        postprocessing_features, memory, epilogue)
+                                        postprocessing_features, memory, fused_postprocess_activation, epilogue)

     # remove split-k
     out = out.squeeze(0)

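Two behavioural notes on the matmul_ogs.py changes: specialized kernel modules are now cached per (activation name, epilogue name) pair instead of per epilogue alone, and when the scatter-finalize postprocessing path is taken the activation is swapped out of the matmul kernel and applied in the finalize kernel instead (the fused_postprocess_activation swap), which is also why can_use_fused_scatter now rejects the fused-scatter path whenever an activation is present. A minimal sketch of the caching behaviour, assuming only the names introduced in this diff:

    from triton_kernels.matmul_ogs import FnSpecs, get_kernels
    from triton_kernels.swiglu import swiglu_fn

    swiglu_specs = FnSpecs("swiglu", swiglu_fn, ("alpha", "limit"))

    k_default = get_kernels()                              # cached under the key ("dflt", "dflt")
    k_swiglu = get_kernels(fused_activation=swiglu_specs)  # cached under the key ("swiglu", "dflt")
    assert get_kernels(fused_activation=swiglu_specs) is k_swiglu  # second call hits the cache
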