Merge commit 'cda4229558c5dca7f7c4734bedd3e596ebcae0b8'

whitneywhtsang · whitneywhtsang · commit 328fd8a7b2b7 · 2025-06-05T05:06:49.000Z
diff --git a/python/test/unit/language/test_frontend.py b/python/test/unit/language/test_frontend.py
@@ -109,3 +109,20 @@ def test_list_of_functions():
     # CHECK-NEXT: call @anchor
     # CHECK-NEXT: call @forward
     list_of_functions_constexpr(tl.arange(0, 4), [anchor, forward])
+
+
+@triton.jit
+def accumulate(a, b):  # JIT helper with a return value; called from the loop in test_call_in_loop
+    return a + b
+
+
+# Check that a JIT function returning a value can be called from inside a loop.
+@filecheck_test
+@triton.jit
+def test_call_in_loop():
+    # CHECK-LABEL: test_call_in_loop
+    acc = 0
+    # CHECK: scf.for
+    # CHECK:   call @accumulate
+    for i in range(10):
+        acc = accumulate(acc, i)  # acc is reassigned from the call's result on every iteration
diff --git a/python/triton/compiler/code_generator.py b/python/triton/compiler/code_generator.py
@@ -135,10 +135,9 @@ def _visit_stmts(self, body) -> bool:
         return any(self.visit(s) for s in body)
 
     def _visit_function(self, fn) -> bool:
-        # Currently we only support JITFunctions defined in the global scope
-        if isinstance(fn, JITFunction) and not fn.noinline:
-            fn_node = fn.parse()
-            return ContainsReturnChecker(self.gscope).visit(fn_node)
+        # No need to look inside the called function: calling it won't cause an early return here.
+        # If the function itself has unstructured control flow, we may not be able to inline it,
+        # which causes poor performance. We should check for this and fail or emit a warning.
         return False
 
     def generic_visit(self, node) -> bool:
diff --git a/python/triton_kernels/tests/test_routing.py b/python/triton_kernels/tests/test_routing.py
@@ -44,7 +44,8 @@ def ref_expt_data(routing_data, n_gates, block_m):
 @pytest.mark.parametrize("n_expts_tot, n_expts_act", [(128, 4), (1500, 8)])
 @pytest.mark.parametrize("block_m", [64, 128])
 @pytest.mark.parametrize("use_expt_indx", [False, True])
-def test_op(n_tokens, n_expts_tot, n_expts_act, block_m, use_expt_indx, device):
+@pytest.mark.parametrize("renormalize", [True, False])
+def test_op(n_tokens, n_expts_tot, n_expts_act, renormalize, block_m, use_expt_indx, device):
     torch.manual_seed(2)
     tri_logits = init_data(n_tokens, n_expts_tot, device=device).detach()
     ref_logits = tri_logits.clone()
@@ -55,8 +56,11 @@ def test_op(n_tokens, n_expts_tot, n_expts_act, block_m, use_expt_indx, device):
         ref_expt_indx = tri_expt_indx[:n_tokens]
     else:
         tri_expt_indx = ref_expt_indx = None
-    ref_routing_data, ref_gather, ref_scatter = routing_torch(ref_logits, n_expts_act, ref_expt_indx)
-    tri_routing_data, tri_gather, tri_scatter = routing(tri_logits, n_expts_act, tri_expt_indx)
+    if not renormalize:
+        tri_logits = torch.softmax(tri_logits, dim=-1)
+        ref_logits = torch.softmax(ref_logits, dim=-1)
+    ref_routing_data, ref_gather, ref_scatter = routing_torch(ref_logits, n_expts_act, renormalize, ref_expt_indx)
+    tri_routing_data, tri_gather, tri_scatter = routing(tri_logits, n_expts_act, renormalize, tri_expt_indx)
     ref_metadata = ref_expt_data(ref_routing_data, n_tokens * n_expts_act, block_m)
     tri_metadata = compute_metadata(tri_routing_data, n_tokens * n_expts_act, block_m)
 
diff --git a/python/triton_kernels/triton_kernels/matmul_ogs_details/_p_matmul_ogs.py b/python/triton_kernels/triton_kernels/matmul_ogs_details/_p_matmul_ogs.py
@@ -514,7 +514,6 @@ def _p_matmul_ogs(
             if SWAP_XW:
                 acc_tile = acc_tile.T
             acc_tile = acc_tile + biases[a_i][None, :] * betas[:, None]
-            acc_tile *= gammas[:, None]
             if out_alpha is not None:
                 acc_tile *= out_alpha
 
@@ -525,6 +524,8 @@ def _p_matmul_ogs(
                 tl.static_assert(ACTIVATION_REDUCTION_N == 1, "Activation reduction must be 1 if no activation fn is provided")
                 out = acc_tile
 
+            out *= gammas[:, None]
+
             if MASK_ACC:
                 out = tl.where(mask_m[:, None], out, 0.0)
             # Flexpoint
diff --git a/python/triton_kernels/triton_kernels/routing.py b/python/triton_kernels/triton_kernels/routing.py
@@ -53,7 +53,7 @@ def n_blocks(self, n_rows, block_m):
 # --------------------------
 
 
-def routing(logits, n_expts_act, expt_indx=None, simulated_ep=1):
+def routing(logits, n_expts_act, renormalize=True, expt_indx=None, simulated_ep=1):
     from .topk import topk
     from .compaction import compaction
     cdiv = triton.cdiv
@@ -63,7 +63,7 @@ def routing(logits, n_expts_act, expt_indx=None, simulated_ep=1):
     n_tokens, n_expts_tot = logits.shape
     n_gates = n_tokens * n_expts_act
     device = logits.device
-    expt_scal, expt_indx, bitmatrix = topk(logits, n_expts_act, y_indx=expt_indx)
+    expt_scal, expt_indx, bitmatrix = topk(logits, n_expts_act, apply_softmax=renormalize, y_indx=expt_indx)
     # mutate bitmatrix
     if simulated_ep > 1:
         assert n_expts_tot % simulated_ep == 0
@@ -108,7 +108,7 @@ def routing(logits, n_expts_act, expt_indx=None, simulated_ep=1):
     return RoutingData(gate_scal, hist, n_expts_tot, n_expts_act), gather_indx, scatter_indx
 
 
-def routing_torch(logits, n_expts_act, expt_indx=None):
+def routing_torch(logits, n_expts_act, renormalize=True, expt_indx=None):
 
     def topk(vals, k, expt_indx):
         # topk of experts
@@ -121,7 +121,8 @@ def topk(vals, k, expt_indx):
 
     _, n_expts_tot = logits.shape
     expt_scal, expt_indx = topk(logits, n_expts_act, expt_indx)
-    expt_scal = torch.softmax(expt_scal, dim=-1)
+    if renormalize:
+        expt_scal = torch.softmax(expt_scal, dim=-1)
     # flatten topk data
     expt_scal = expt_scal.reshape(-1)
     expt_indx = expt_indx.reshape(-1).to(torch.int32)
diff --git a/python/triton_kernels/triton_kernels/topk.py b/python/triton_kernels/triton_kernels/topk.py
@@ -3,7 +3,7 @@
 from .bitmatrix import Bitmatrix
 
 
-def topk(x, k, dim=1, return_bitmatrix=True, y_indx=None):
+def topk(x, k, apply_softmax=True, dim=1, return_bitmatrix=True, y_indx=None):
     cdiv = lambda a, b: (a + b - 1) // b
     BLOCK_M = 32
     BLOCK_N = 32
@@ -39,5 +39,5 @@ def topk(x, k, dim=1, return_bitmatrix=True, y_indx=None):
         S, BLOCK_S, s_blocks,  # thing to memset to zero
         BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N,  # tunable parameter
         N_EXPTS_PAD=n_cols_pad, N_EXPTS_ACT=k,  # constants
-    )
+        APPLY_SOFTMAX=apply_softmax)
     return y_vals, y_indx, Bitmatrix(bitmatrix, [n_rows, n_cols], S)
diff --git a/python/triton_kernels/triton_kernels/topk_details/_topk.py b/python/triton_kernels/triton_kernels/topk_details/_topk.py
@@ -72,7 +72,8 @@ def _topk(X, stride_xm,  # inputs
           Yv, Yi, stride_ym,  # topk values/indices
           USE_PROVIDED_INDX: tl.constexpr, Bits, stride_rm: tl.constexpr, stride_rn: tl.constexpr, n_rows,  # bitmatrix
           n_expts_tot, S, BLOCK_S: tl.constexpr, s_blocks,  # thing to memset
-          BLOCK_M: tl.constexpr, N_EXPTS_PAD: tl.constexpr, N_EXPTS_ACT: tl.constexpr, BLOCK_N: tl.constexpr):
+          BLOCK_M: tl.constexpr, N_EXPTS_PAD: tl.constexpr, N_EXPTS_ACT: tl.constexpr, BLOCK_N: tl.constexpr,
+          APPLY_SOFTMAX: tl.constexpr):
 
     pid = tl.program_id(0)
 
@@ -105,8 +106,8 @@ def _topk(X, stride_xm,  # inputs
         y_indices = y & 0x0000FFFF
         y_values = (y >> x_nbits).to(x_utype).to(x_dtype, bitcast=True)
 
-    # normalize selected values
-    y_values = tl.softmax(y_values.to(tl.float32), dim=1, keep_dims=True).to(x_dtype)
+    if APPLY_SOFTMAX:
+        y_values = tl.softmax(y_values.to(tl.float32), dim=1, keep_dims=True).to(x_dtype)
 
     # write back
     Yv_ptrs = Yv + offs_m[:, None] * stride_ym + offs_y_n[None, :]