@@ -1,6 +1,5 @@
 import torch
 import triton
-import triton.language as tl
 import pytest
 import itertools
 
@@ -25,7 +24,7 @@
 # ===-----------------------------------------------------------------------===#
 
 
-@tl.constexpr_function
+@gl.constexpr_function
 def get_tmem_32x32b_reg_layout(instr_shape, shape, num_warps):
     assert len(shape) == 2, "expected a 2D tensor"
     assert num_warps in [4, 8], "expected 4 or 8 warps"
@@ -61,15 +60,15 @@ def get_tmem_32x32b_reg_layout(instr_shape, shape, num_warps):
     )
 
 
-@tl.constexpr_function
+@gl.constexpr_function
 def get_mma_instr_shape(shape, element_ty):
     m = 128 if shape[0] >= 128 else 64
     n = 256 if shape[1] >= 256 else shape[1]
     k = 256 // element_ty.primitive_bitwidth
     return (m, n, k)
 
 
-@tl.constexpr_function
+@gl.constexpr_function
 def get_nvmma_layout(shape, element_ty, order=[1, 0], fp4_padded=False):
     packing_factor = 2 if fp4_padded else 1
 
@@ -99,7 +98,7 @@ def get_nvmma_layout(shape, element_ty, order=[1, 0], fp4_padded=False):
     )
 
 
-@tl.constexpr_function
+@gl.constexpr_function
 def get_mma_reg_layout(shape, num_warps, dtype=gl.float32):
     instr_shape = get_mma_instr_shape(shape, dtype)
     return get_tmem_32x32b_reg_layout(instr_shape, shape, num_warps)
@@ -133,7 +132,7 @@ def alloc(shape: gl.constexpr, dtype: gl.constexpr, layout: gl.constexpr, num_bu
     mem = alloc_fn(dtype, [num_buffers] + shape, layout)
     ready_bars = gl.allocate_shared_memory(gl.int64, [num_buffers, 1], mbarrier.MBarrierLayout())
     empty_bars = gl.allocate_shared_memory(gl.int64, [num_buffers, 1], mbarrier.MBarrierLayout())
-    for i in tl.static_range(num_buffers):
+    for i in gl.static_range(num_buffers):
         mbarrier.init(ready_bars.index(i), count=1)
         mbarrier.init(empty_bars.index(i), count=num_consumers)
         mbarrier.arrive(empty_bars.index(i), count=num_consumers)
@@ -179,7 +178,7 @@ def create_consumer(self):
     def release(self):
         if isinstance(self.mem, gl.shared_memory_descriptor):
             self.mem._keep_alive()
-        for i in tl.static_range(self.num_buffers):
+        for i in gl.static_range(self.num_buffers):
             mbarrier.invalidate(self.ready_bars.index(i))
             mbarrier.invalidate(self.empty_bars.index(i))
 
@@ -847,7 +846,7 @@ def _attn_fwd_correction_rescale(config, s_tmem, corr_consumer, o_consumer):
     mbarrier.arrive(corr_bar, count=1)
     alpha = gl.convert_layout(alpha.reshape([config.SPLIT_M]), alpha_layout)
 
-    for i in tl.static_range(config.SPLIT_D_FACTOR):
+    for i in gl.static_range(config.SPLIT_D_FACTOR):
         o_ref = o_tmem.slice(i * config.SPLIT_D, config.SPLIT_D)
         o = o_ref.load(config.o_splitn_layout)
         o = _mul_f32x2(o, alpha[:, None])
@@ -882,7 +881,7 @@ def _attn_fwd_correction_epilogue(config, prog, s_tmem, M, corr_consumer, epi_pr
     SPLIT_N: gl.constexpr = o_smem.type.shape[1] // SPLIT_N_FACTOR
 
     scale = 1 / l_i
-    for i in tl.static_range(SPLIT_N_FACTOR):
+    for i in gl.static_range(SPLIT_N_FACTOR):
         o_ref = o_tmem.slice(i * SPLIT_N, SPLIT_N)
         o = o_ref.load(config.o_splitn_layout)
         o = _mul_f32x2(o, scale[:, None])
@@ -992,7 +991,7 @@ def attention_kernel( #
 def torch_dtype_to_triton(dtype):
     if dtype == torch.float8_e5m2:
         return gl.float8e5
-    return getattr(tl, str(dtype).split('.')[1])
+    return getattr(gl, str(dtype).split('.')[1])
 
 
 def make_tensor_desc(x, shape, strides, block_shape):
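Every hunk above makes the same mechanical change: the compile-time helpers move from the triton.language namespace (tl) to Gluon's language namespace (gl), and the now-unused `import triton.language as tl` is dropped. The gl and gluon bindings themselves sit outside the hunks shown, so the sketch below of the assumed imports is illustrative rather than a quote from the file, and `cdiv` is a hypothetical helper added only to show the decorator in use.

# Assumed imports (not visible in the hunks above) that bind the names
# this diff relies on.
from triton.experimental import gluon
from triton.experimental.gluon import language as gl


# gl.constexpr_function marks a plain Python helper that is evaluated at
# compile time, a drop-in for the tl.constexpr_function it replaces here.
@gl.constexpr_function
def cdiv(a, b):  # hypothetical example helper
    return (a + b - 1) // b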
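As a sanity check on one of the renamed helpers, get_mma_instr_shape can be re-traced in plain Python. The sketch below is a reference re-implementation for illustration only: the real helper takes a Gluon element type and reads its primitive_bitwidth, which is replaced here by an explicit bit width so the example runs standalone.

def get_mma_instr_shape_ref(shape, bitwidth):
    # Plain-Python mirror of the @gl.constexpr_function in the diff above.
    m = 128 if shape[0] >= 128 else 64
    n = 256 if shape[1] >= 256 else shape[1]
    k = 256 // bitwidth  # one MMA instruction consumes 256 bits along K
    return (m, n, k)


# fp16 (16 bits) on a 128x64 block keeps N and uses a K tile of 16:
assert get_mma_instr_shape_ref((128, 64), 16) == (128, 64, 16)
# fp8 (8 bits) doubles K; blocks under 128 rows fall back to m=64:
assert get_mma_instr_shape_ref((64, 256), 8) == (64, 256, 32)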