[Gluon] Fix memdesc_trans alloc shape inference and constexpr getitem (#7102)

Mogball · web-flow · commit b6dabff373e0 · 2025-06-06T12:46:52.000-07:00
diff --git a/python/test/gluon/test_frontend.py b/python/test/gluon/test_frontend.py
@@ -247,8 +247,8 @@ def shared_memory_cast_kernel():
                                                       rank=2)
     layout_T: ttgl.constexpr = ttgl.NVMMASharedLayout(swizzle_byte_width=64, transposed=True, element_bitwidth=8,
                                                       rank=2)
-    smem = ttgl.allocate_shared_memory(ttgl.int8, [256, 128], layout_a)
-    smem.permute((1, 0), layout_T)
+    smem = ttgl.allocate_shared_memory(ttgl.int8, [2, 256, 128], layout_a)
+    smem.subslice(0).permute((1, 0), layout_T)
 
     layout_b: ttgl.constexpr = ttgl.NVMMASharedLayout(swizzle_byte_width=64, transposed=False, element_bitwidth=16,
                                                       rank=4, cta_order=[3, 2, 1, 0])
@@ -271,11 +271,14 @@ def test_shared_memory_cast(fresh_knobs):
 #smem = #ttg.shared_memory
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 32 : i32} {
   tt.func public @shared_memory_cast_kernel() attributes {noinline = false} {
-    %0 = ttg.local_alloc : () -> !ttg.memdesc<256x128xi8, #shared, #smem, mutable>
-    %1 = ttg.memdesc_trans %0 {order = array<i32: 1, 0>} : !ttg.memdesc<256x128xi8, #shared, #smem, mutable> -> !ttg.memdesc<128x256xi8, #shared1, #smem, mutable>
-    %2 = ttg.local_alloc : () -> !ttg.memdesc<32x1x4x64xf16, #shared2, #smem, mutable>
-    %3 = ttg.memdesc_reshape %2 : !ttg.memdesc<32x1x4x64xf16, #shared2, #smem, mutable> -> !ttg.memdesc<128x64xf16, #shared3, #smem, mutable, 32x1x4x64>
-    %4 = ttg.memdesc_reinterpret %2 : !ttg.memdesc<32x1x4x64xf16, #shared2, #smem, mutable> -> !ttg.memdesc<1024xi8, #shared4, #smem, mutable>
+    %0 = ttg.local_alloc : () -> !ttg.memdesc<2x256x128xi8, #shared, #smem, mutable>
+    %c0_i32 = arith.constant 0 : i32
+    %c0_i32_0 = arith.constant 0 : i32
+    %1 = ttg.memdesc_subview %0[%c0_i32_0, %c0_i32, %c0_i32] : !ttg.memdesc<2x256x128xi8, #shared, #smem, mutable> -> !ttg.memdesc<256x128xi8, #shared, #smem, mutable, 2x256x128>
+    %2 = ttg.memdesc_trans %1 {order = array<i32: 1, 0>} : !ttg.memdesc<256x128xi8, #shared, #smem, mutable, 2x256x128> -> !ttg.memdesc<128x256xi8, #shared1, #smem, mutable, 2x128x256>
+    %3 = ttg.local_alloc : () -> !ttg.memdesc<32x1x4x64xf16, #shared2, #smem, mutable>
+    %4 = ttg.memdesc_reshape %3 : !ttg.memdesc<32x1x4x64xf16, #shared2, #smem, mutable> -> !ttg.memdesc<128x64xf16, #shared3, #smem, mutable, 32x1x4x64>
+    %5 = ttg.memdesc_reinterpret %3 : !ttg.memdesc<32x1x4x64xf16, #shared2, #smem, mutable> -> !ttg.memdesc<1024xi8, #shared4, #smem, mutable>
     tt.return
   }
 }
diff --git a/python/test/unit/language/test_frontend.py b/python/test/unit/language/test_frontend.py
@@ -257,3 +257,23 @@ def test_reassign_aggregate_with_constexpr():
         agg = agg.modify(tl.arange(8, 12))
     # CHECK: call @{{.*}}anchor{{.*}}([[AGG]])
     anchor(agg)
+
+
+@tl.constexpr_function
+def make_shape(m, n):  # test helper: packs its two arguments into a 2-tuple
+    return (m, n)
+
+
+@tl.constexpr_function
+def add_shape_dims(m, n):  # test helper: returns the sum of its two arguments
+    return m + n
+
+
+@filecheck_test
+@triton.jit
+def test_constexpr_getitem():  # subscripts a constexpr tuple and feeds the result to tl.arange
+    # CHECK-LABEL: test_constexpr_getitem
+    # CHECK: make_range {end = 12 : i32, start = 4 : i32}
+    shape: tl.constexpr = make_shape(4, 8)
+    dim_sum: tl.constexpr = add_shape_dims(shape[0], shape[1])  # avoids shadowing builtin sum
+    tl.arange(4, dim_sum)  # range end is 4 + 8 = 12, matching the expected make_range
diff --git a/python/triton/experimental/gluon/language/_semantic.py b/python/triton/experimental/gluon/language/_semantic.py
@@ -164,7 +164,7 @@ def memdesc_trans(self, mem_desc, order, layout):
         shape = [mem_desc.shape[i] for i in order]
         alloc_shape = mem_desc.type.alloc_shape
         new_alloc_shape = alloc_shape[:len(alloc_shape) - mem_desc.rank]
-        new_alloc_shape += [alloc_shape[:mem_desc.rank][i] for i in order]
+        new_alloc_shape += [alloc_shape[len(alloc_shape) - mem_desc.rank:][i] for i in order]
 
         ty = ttgl.shared_memory_descriptor_type(mem_desc.dtype, shape, layout, new_alloc_shape)
         handle = self.builder.create_memdesc_trans(ty.to_ir(self.builder), mem_desc.handle, order)
diff --git a/python/triton/language/core.py b/python/triton/language/core.py
@@ -329,6 +329,10 @@ def __iter__(self):
     def __call__(self, *args, **kwds):
         return self.value(*args, **kwds)
 
+    def __getitem__(self, *args):  # subscript support: forwards indexing to the wrapped value
+        args = (_unwrap_if_constexpr(x) for x in _normalize_tuple(args))  # unwrap each index first
+        return self.value.__getitem__(*args)
+
 
 def constexpr_function(f):
     """