[Gluon] Fix memdesc_trans alloc shape (#7149)

Mogball · web-flow · commit 268ead7dceef · 2025-06-13T16:11:15.000-04:00
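`memdesc_trans` passed `alloc_shape` instead of `new_alloc_shape` to the returned `shared_memory_descriptor`, so `new_alloc_shape` was not being returned.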
diff --git a/python/test/gluon/test_frontend.py b/python/test/gluon/test_frontend.py
@@ -252,6 +252,8 @@ def shared_memory_cast_kernel():
     smem = ttgl.allocate_shared_memory(ttgl.int8, [2, 256, 128], layout_a)
     perm = smem.index(0).permute((1, 0))
     ttgl.static_assert(perm.type.layout == layout_T)
+    # Check that the MLIR type and Gluon types match by emitting a call.
+    anchor_noinline(perm)
 
     layout_b: ttgl.constexpr = ttgl.NVMMASharedLayout(swizzle_byte_width=64, transposed=False, element_bitwidth=16,
                                                       rank=4, cta_order=[3, 2, 1, 0])
@@ -279,11 +281,15 @@ def test_shared_memory_cast(fresh_knobs):
     %c0_i32_0 = arith.constant 0 : i32
     %1 = ttg.memdesc_subview %0[%c0_i32_0, %c0_i32, %c0_i32] : !ttg.memdesc<2x256x128xi8, #shared, #smem, mutable> -> !ttg.memdesc<256x128xi8, #shared, #smem, mutable, 2x256x128>
     %2 = ttg.memdesc_trans %1 {order = array<i32: 1, 0>} : !ttg.memdesc<256x128xi8, #shared, #smem, mutable, 2x256x128> -> !ttg.memdesc<128x256xi8, #shared1, #smem, mutable, 2x128x256>
+    tt.call @"test_frontend.anchor_noinline__MDi8S128_256SLNVMMA_64_8_True_False_NVMMALAS[2, 128, 256]ASMD__"(%2) : (!ttg.memdesc<128x256xi8, #shared1, #smem, mutable, 2x128x256>) -> ()
     %3 = ttg.local_alloc : () -> !ttg.memdesc<32x1x4x64xf16, #shared2, #smem, mutable>
     %4 = ttg.memdesc_reshape %3 : !ttg.memdesc<32x1x4x64xf16, #shared2, #smem, mutable> -> !ttg.memdesc<128x64xf16, #shared3, #smem, mutable, 32x1x4x64>
     %5 = ttg.memdesc_reinterpret %3 : !ttg.memdesc<32x1x4x64xf16, #shared2, #smem, mutable> -> !ttg.memdesc<1024xi8, #shared4, #smem, mutable>
     tt.return
   }
+  tt.func private @"test_frontend.anchor_noinline__MDi8S128_256SLNVMMA_64_8_True_False_NVMMALAS[2, 128, 256]ASMD__"(%arg0: !ttg.memdesc<128x256xi8, #shared1, #smem, mutable, 2x128x256>) attributes {noinline = true} {
+    tt.return
+  }
 }
 """)
 
@@ -318,6 +324,11 @@ def anchor(x):
     pass
 
 
+@gluon.jit(noinline=True)
+def anchor_noinline(x):
+    pass
+
+
 @filecheck_test
 @gluon.jit
 def test_warp_specialize():
diff --git a/python/triton/experimental/gluon/language/_core.py b/python/triton/experimental/gluon/language/_core.py
@@ -173,7 +173,7 @@ def _flatten_ir_types(self, builder: GluonOpBuilder, out: List[ir.type]) -> None
         out.append(self.to_ir(builder))
 
     def __str__(self) -> str:
-        return f"shared_memory_descriptor<{self.element_ty}, {self.shape}, {self.layout}>"
+        return f"shared_memory_descriptor<{self.element_ty}, {self.shape}, {self.layout}, {self.alloc_shape}>"
 
     def __eq__(self, other) -> bool:
         return (type(self) is type(other) and self.shape == other.shape and self.layout == other.layout
diff --git a/python/triton/experimental/gluon/language/_semantic.py b/python/triton/experimental/gluon/language/_semantic.py
@@ -191,8 +191,8 @@ def memdesc_trans(self, mem_desc, order):
 
         handle = self.builder.create_memdesc_trans(mem_desc.handle, order)
         layout = self.builder.get_gluon_layout_from_memdesc(handle)
-        return ttgl.shared_memory_descriptor(handle, element_ty=mem_desc.dtype, shape=shape, alloc_shape=alloc_shape,
-                                             layout=layout)
+        return ttgl.shared_memory_descriptor(handle, element_ty=mem_desc.dtype, shape=shape,
+                                             alloc_shape=new_alloc_shape, layout=layout)
 
     def memdesc_reshape(self, mem_desc, shape, layout):
         ty = ttgl.shared_memory_descriptor_type(mem_desc.dtype, shape, layout, mem_desc.type.alloc_shape)