[Gluon] Add local_dealloc and enable local_alloc with no initializer (#6994)

Mogball · web-flow · commit d510a3db28c4 · 2025-05-30T12:11:57.000+01:00
`local_dealloc` is an unfortunate necessity because the compiler doesn't
correctly reason about the live ranges of shared memory used by async
operations. For now, users will need to manually keep shared memory
alive using `smem._keep_alive()`, which emits a `local_dealloc` at the
call site.
diff --git a/python/src/gluon_ir.cc b/python/src/gluon_ir.cc
@@ -94,6 +94,10 @@ void init_gluon_ir(py::module &&m) {
            [](GluonOpBuilder &self, Type resultTy, Value value) -> Value {
              return self.create<ttg::ConvertLayoutOp>(resultTy, value);
            })
+      .def("create_local_alloc",
+           [](GluonOpBuilder &self, Type resultTy) -> Value {
+             return self.create<ttg::LocalAllocOp>(resultTy);
+           })
       .def("create_local_alloc",
            [](GluonOpBuilder &self, Type resultTy, Value value) -> Value {
              return self.create<ttg::LocalAllocOp>(resultTy, value);
@@ -106,6 +110,11 @@ void init_gluon_ir(py::module &&m) {
            [](GluonOpBuilder &self, Type resultTy, Value memDesc) -> Value {
              return self.create<ttg::LocalLoadOp>(resultTy, memDesc);
            })
+      .def("create_local_dealloc",
+           [](GluonOpBuilder &self, Value memDesc) -> Operation * {
+             return self.create<ttg::LocalDeallocOp>(memDesc);
+           })
+
       .def("create_tmem_alloc",
            [](GluonOpBuilder &self, Type resultTy, Value value) -> Value {
              return self.create<ttng::TMEMAllocOp>(resultTy, value);
diff --git a/python/test/gluon/test_frontend.py b/python/test/gluon/test_frontend.py
@@ -41,10 +41,12 @@ def test_convert_layout(fresh_knobs):
 @gluon.jit
 def shared_memory_kernel(XBLOCK: ttgl.constexpr, YBLOCK: ttgl.constexpr, layout_a: ttgl.constexpr,
                          layout_b: ttgl.constexpr, smem_layout: ttgl.constexpr):
+    unused = ttgl.allocate_shared_memory(ttgl.int32, [XBLOCK, YBLOCK], smem_layout)
     a = ttgl.full([XBLOCK, YBLOCK], 0, ttgl.int32, layout_a)
     mem = ttgl.allocate_shared_memory(ttgl.int32, a.shape, smem_layout, a)
     b = mem.load(layout_b)  # noqa: F841
     mem.store(a)
+    unused._keep_alive()
 
 
 def test_shared_memory(fresh_knobs):
@@ -63,11 +65,13 @@ def test_shared_memory(fresh_knobs):
 #smem = #ttg.shared_memory
 module attributes {"ttg.num-warps" = 4 : i32} {
   tt.func public @shared_memory_kernel() attributes {noinline = false} {
+    %0 = ttg.local_alloc : () -> !ttg.memdesc<8x32xi32, #shared, #smem, mutable> loc(#loc)
     %c0_i32 = arith.constant 0 : i32 loc(#loc)
     %cst = arith.constant dense<0> : tensor<8x32xi32, #blocked> loc(#loc)
-    %0 = ttg.local_alloc %cst : (tensor<8x32xi32, #blocked>) -> !ttg.memdesc<8x32xi32, #shared, #smem, mutable> loc(#loc)
-    %1 = ttg.local_load %0 : !ttg.memdesc<8x32xi32, #shared, #smem, mutable> -> tensor<8x32xi32, #blocked1> loc(#loc)
-    ttg.local_store %cst, %0 : tensor<8x32xi32, #blocked> -> !ttg.memdesc<8x32xi32, #shared, #smem, mutable> loc(#loc)
+    %1 = ttg.local_alloc %cst : (tensor<8x32xi32, #blocked>) -> !ttg.memdesc<8x32xi32, #shared, #smem, mutable> loc(#loc)
+    %2 = ttg.local_load %1 : !ttg.memdesc<8x32xi32, #shared, #smem, mutable> -> tensor<8x32xi32, #blocked1> loc(#loc)
+    ttg.local_store %cst, %1 : tensor<8x32xi32, #blocked> -> !ttg.memdesc<8x32xi32, #shared, #smem, mutable> loc(#loc)
+    ttg.local_dealloc %0 : !ttg.memdesc<8x32xi32, #shared, #smem, mutable> loc(#loc)
     tt.return loc(#loc)
   } loc(#loc)
 } loc(#loc)
diff --git a/python/triton/experimental/gluon/language/_core.py b/python/triton/experimental/gluon/language/_core.py
@@ -200,6 +200,10 @@ def load(self, layout, _builder: GluonOpBuilder) -> tensor:
     def store(self, value, _builder: GluonOpBuilder) -> None:
         return semantic.shared_store(self, value, _builder)
 
+    @builtin
+    def _keep_alive(self, _builder=None) -> None:
+        return semantic.shared_dealloc(self, _builder)
+
 
 for name in _IMPORT_FROM_TRITON:
     fn = getattr(tl_core, name)
diff --git a/python/triton/experimental/gluon/language/_semantic.py b/python/triton/experimental/gluon/language/_semantic.py
@@ -32,7 +32,10 @@ def convert_layout(value, layout, builder: GluonOpBuilder):
 
 def allocate_shared(element_ty, shape, layout, value, builder: GluonOpBuilder):
     ty = ttgl.shared_memory_descriptor_type(element_ty, shape, layout, shape)
-    handle = builder.create_local_alloc(ty.to_ir(builder), value.handle)
+    if value is not None:
+        handle = builder.create_local_alloc(ty.to_ir(builder), value.handle)
+    else:
+        handle = builder.create_local_alloc(ty.to_ir(builder))
     return ttgl.shared_memory_descriptor(handle, element_ty, shape, layout, shape)
 
 
@@ -46,6 +49,10 @@ def shared_store(mem_desc, value, builder: GluonOpBuilder):
     builder.create_local_store(mem_desc.handle, value.handle)
 
 
+def shared_dealloc(mem_desc, builder: GluonOpBuilder):
+    builder.create_local_dealloc(mem_desc.handle)
+
+
 def warp_specialize(args, default_partition, worker_partitions, worker_num_warps: Sequence[int],
                     worker_num_regs: Sequence[int], builder: GluonOpBuilder, generator):
     num_partitions = len(worker_partitions)