[Gluon] Implement tensor memory (#6985)

peterbell10 · web-flow · commit 9f88c7f8ba9b · 2025-05-30T02:44:27.000Z
This implements:
- `ttgl.nvidia.blackwell.allocate_tensor_memory`
- `ttgl.nvidia.blackwell.TensorMemoryLayout`
- `tensor_memory_descriptor.load`
- `tensor_memory_descriptor.store`
- `tensor_memory_descriptor.subslice`
diff --git a/lib/Dialect/TritonNvidiaGPU/Transforms/TensorMemoryAllocation.cpp b/lib/Dialect/TritonNvidiaGPU/Transforms/TensorMemoryAllocation.cpp
@@ -314,6 +314,10 @@ class TritonTensorMemoryAllocationPass
     : public impl::TritonTensorMemoryAllocationPassBase<
           TritonTensorMemoryAllocationPass> {
 public:
+  IntegerAttr getI32Attr(int32_t value) {
+    return Builder(&getContext()).getI32IntegerAttr(value);
+  }
+
   void runOnOperation() override {
     ModuleOp mod = getOperation();
     MLIRContext *ctx = &getContext();
@@ -323,6 +327,9 @@ class TritonTensorMemoryAllocationPass
     int totalMemorySize = allocateTMem(mod, offsets);
 
     std::array<int, 6> possibleAllocations = {0, 32, 64, 128, 256, 512};
+    // NOTE: if totalMemorySize > 512 we exceeded the maximum amount of tensor
+    // memory, but we let the compilation finish so that we can raise an
+    // exception in python for the auto-tuner.
     if (totalMemorySize <= 512) {
       for (int size : possibleAllocations) {
         if (totalMemorySize <= size) {
@@ -331,18 +338,18 @@ class TritonTensorMemoryAllocationPass
         }
       }
     }
-    // if totalMemorySize > 512 we exceeded the maximum amount of tensor memory,
-    // let the compilation finish so that we can raise an exception in python
-    // for auto-tuner.
     if (totalMemorySize > 0) {
-      assert(mod->getAttr("ttg.shared") != nullptr &&
-             cast<IntegerAttr>(mod->getAttr("ttg.shared")).getInt() != 0 &&
-             "Shared memory is required for allocation of Tensor Core memory.");
+      // We use a small smem allocation to get the tensor memory base address
+      // from tcgen05.alloc, so ensure the block has at least 4 bytes of smem.
+      int shared = 0;
+      if (auto sharedAttr = mod->getAttr("ttg.shared")) {
+        shared = cast<IntegerAttr>(sharedAttr).getInt();
+      }
+      if (shared < 4) {
+        mod->setAttr("ttg.shared", getI32Attr(4));
+      }
     }
-
-    mod->setAttr("ttg.tensor_memory_size",
-                 mlir::IntegerAttr::get(mlir::IntegerType::get(ctx, 32),
-                                        totalMemorySize));
+    mod->setAttr("ttg.tensor_memory_size", getI32Attr(totalMemorySize));
   }
 };
 
diff --git a/python/src/gluon_ir.cc b/python/src/gluon_ir.cc
@@ -4,13 +4,14 @@
 
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Types.h"
-#include "triton/Conversion/TritonGPUToLLVM/Utility.h"
 #include "triton/Dialect/TritonGPU/IR/Attributes.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
+#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
 
 using namespace mlir;
 namespace py = pybind11;
 namespace ttg = triton::gpu;
+namespace ttng = triton::nvidia_gpu;
 
 struct GluonOpBuilder : public TritonOpBuilder {};
 
@@ -35,6 +36,16 @@ void init_gluon_ir(py::module &&m) {
                                           /*mutableMemory=*/true,
                                           /*allocShape=*/allocShape);
            })
+      .def("get_tensor_mem_desc_ty",
+           [](GluonOpBuilder &self, Type &elementType,
+              std::vector<int64_t> &shape, Attribute layout,
+              std::vector<int64_t> &allocShape) -> Type {
+             auto ctx = self.getContext();
+             return ttg::MemDescType::get(shape, elementType, layout,
+                                          ttng::TensorMemorySpaceAttr::get(ctx),
+                                          /*mutableMemory=*/true,
+                                          /*allocShape=*/allocShape);
+           })
       .def("get_blocked_layout",
            [](GluonOpBuilder &self, std::vector<unsigned> &sizePerThread,
               std::vector<unsigned> &threadsPerWarp,
@@ -69,6 +80,16 @@ void init_gluon_ir(py::module &&m) {
                  ctx, swizzleByteWidth, transposed, elementBitwidth, fp4Padded,
                  ctaLayout);
            })
+      .def("get_tensor_memory_layout",
+           [](GluonOpBuilder &self, std::vector<unsigned> &block, bool unpacked,
+              std::vector<unsigned> &ctaSplitNum) -> Attribute {
+             auto ctx = self.getContext();
+             assert(block.size() == 2);
+             assert(ctaSplitNum.size() == 2);
+             return ttng::TensorMemoryEncodingAttr::get(
+                 ctx, block[0], block[1], unpacked, ctaSplitNum[0],
+                 ctaSplitNum[1]);
+           })
       .def("create_convert_layout",
            [](GluonOpBuilder &self, Type resultTy, Value value) -> Value {
              return self.create<ttg::ConvertLayoutOp>(resultTy, value);
@@ -85,7 +106,23 @@ void init_gluon_ir(py::module &&m) {
            [](GluonOpBuilder &self, Type resultTy, Value memDesc) -> Value {
              return self.create<ttg::LocalLoadOp>(resultTy, memDesc);
            })
-
+      .def("create_tmem_alloc",
+           [](GluonOpBuilder &self, Type resultTy, Value value) -> Value {
+             return self.create<ttng::TMEMAllocOp>(resultTy, value);
+           })
+      .def("create_tmem_store",
+           [](GluonOpBuilder &self, Value memDesc, Value value, Value pred) {
+             self.create<ttng::TMEMStoreOp>(memDesc, value, pred);
+           })
+      .def("create_tmem_load",
+           [](GluonOpBuilder &self, Type resultTy, Value memDesc) -> Value {
+             return self.create<ttng::TMEMLoadOp>(resultTy, memDesc);
+           })
+      .def("create_tmem_subslice",
+           [](GluonOpBuilder &self, Type resultTy, Value memDesc,
+              int N) -> Value {
+             return self.create<ttng::TMEMSubSliceOp>(resultTy, memDesc, N);
+           })
       .def("create_warp_return",
            [](GluonOpBuilder &self) -> Operation * {
              return self.create<ttg::WarpReturnOp>();
diff --git a/python/test/gluon/test_frontend.py b/python/test/gluon/test_frontend.py
@@ -1,10 +1,13 @@
 import expecttest
+import torch
+import pytest
 
 from triton import knobs
 from triton.experimental import gluon
 from triton.experimental.gluon import language as ttgl
 from triton._filecheck import filecheck_test
 import triton.language as tl
+from triton._internal_testing import is_cuda
 
 
 @gluon.jit
@@ -39,7 +42,7 @@ def test_convert_layout(fresh_knobs):
 def shared_memory_kernel(XBLOCK: ttgl.constexpr, YBLOCK: ttgl.constexpr, layout_a: ttgl.constexpr,
                          layout_b: ttgl.constexpr, smem_layout: ttgl.constexpr):
     a = ttgl.full([XBLOCK, YBLOCK], 0, ttgl.int32, layout_a)
-    mem = ttgl.allocate_shared(ttgl.int32, a.shape, smem_layout, a)
+    mem = ttgl.allocate_shared_memory(ttgl.int32, a.shape, smem_layout, a)
     b = mem.load(layout_b)  # noqa: F841
     mem.store(a)
 
@@ -72,6 +75,47 @@ def test_shared_memory(fresh_knobs):
 """)
 
 
+@gluon.jit
+def tensor_memory_kernel(layout: ttgl.constexpr, tmem_layout: ttgl.constexpr):
+    XBLOCK: ttgl.constexpr = tmem_layout.block[0]
+    YBLOCK: ttgl.constexpr = tmem_layout.block[1]
+    a = ttgl.full([XBLOCK, YBLOCK], 0, ttgl.int32, layout)
+    mem = ttgl.nvidia.blackwell.allocate_tensor_memory(ttgl.int32, a.shape, tmem_layout, a)
+    b = mem.load(layout)  # noqa: F841
+    mem.store(a)
+    slice1 = mem.subslice(0, YBLOCK // 2)  # noqa: F841
+    slice2 = mem.subslice(YBLOCK // 2, YBLOCK // 2)  # noqa: F841
+
+
+@pytest.mark.skipif(not is_cuda() or torch.cuda.get_device_capability()[0] != 10,
+                    reason="Requires blackwell tensor cores")
+def test_tensor_memory(fresh_knobs):
+    knobs.compilation.disable_line_info = True
+
+    layout = ttgl.BlockedLayout(size_per_thread=[1, 64], threads_per_warp=[32, 1], warps_per_cta=[4, 1], order=[0, 1])
+    tmem_layout = ttgl.nvidia.blackwell.TensorMemoryLayout(block=[128, 128], unpacked=True)
+    h = tensor_memory_kernel.warmup(layout, tmem_layout, num_warps=4, grid=(1, ))
+    expecttest.assert_expected_inline(
+        h.asm["ttgir"], """\
+#blocked = #ttg.blocked<{sizePerThread = [1, 64], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
+#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, unpacked = true>
+module attributes {"ttg.num-warps" = 4 : i32} {
+  tt.func public @tensor_memory_kernel() attributes {noinline = false} {
+    %c0_i32 = arith.constant 0 : i32 loc(#loc)
+    %cst = arith.constant dense<0> : tensor<128x128xi32, #blocked> loc(#loc)
+    %result = ttng.tmem_alloc %cst : (tensor<128x128xi32, #blocked>) -> !ttg.memdesc<128x128xi32, #tmem, #ttng.tensor_memory, mutable> loc(#loc)
+    %result_0 = ttng.tmem_load %result : !ttg.memdesc<128x128xi32, #tmem, #ttng.tensor_memory, mutable> -> tensor<128x128xi32, #blocked> loc(#loc)
+    %true = arith.constant true loc(#loc)
+    ttng.tmem_store %cst, %result, %true : tensor<128x128xi32, #blocked> -> !ttg.memdesc<128x128xi32, #tmem, #ttng.tensor_memory, mutable> loc(#loc)
+    %0 = ttng.tmem_subslice %result {N = 0 : i32} : !ttg.memdesc<128x128xi32, #tmem, #ttng.tensor_memory, mutable> -> !ttg.memdesc<128x64xi32, #tmem, #ttng.tensor_memory, mutable, 128x128> loc(#loc)
+    %1 = ttng.tmem_subslice %result {N = 64 : i32} : !ttg.memdesc<128x128xi32, #tmem, #ttng.tensor_memory, mutable> -> !ttg.memdesc<128x64xi32, #tmem, #ttng.tensor_memory, mutable, 128x128> loc(#loc)
+    tt.return loc(#loc)
+  } loc(#loc)
+} loc(#loc)
+#loc = loc(unknown)
+""")
+
+
 @gluon.jit
 def warp_specialize_default(a, b):
     return b, a
diff --git a/python/triton/compiler/code_generator.py b/python/triton/compiler/code_generator.py
@@ -1349,6 +1349,9 @@ def visit_Attribute(self, node):
         lhs = self.visit(node.value)
         if _is_triton_tensor(lhs) and node.attr == "T":
             return semantic.permute(lhs, (1, 0), builder=self.builder)
+        # NOTE: special-case ".value" for backward compatibility (BC)
+        if isinstance(lhs, constexpr) and node.attr != "value":
+            lhs = lhs.value
         attr = getattr(lhs, node.attr)
         if _is_triton_value(lhs) and isinstance(attr, JITFunction):
             return BoundJITMethod(lhs, attr)
diff --git a/python/triton/experimental/gluon/language/__init__.py b/python/triton/experimental/gluon/language/__init__.py
@@ -3,4 +3,10 @@
 from ._layouts import *  # NOQA: F403
 from ._layouts import __all__ as __layouts_all
 
-__all__ = [*__core_all, *__layouts_all]
+from . import nvidia
+
+__all__ = [
+    *__core_all,
+    *__layouts_all,
+    "nvidia",
+]
diff --git a/python/triton/experimental/gluon/language/_core.py b/python/triton/experimental/gluon/language/_core.py
@@ -41,6 +41,13 @@
 )
 from . import _semantic as semantic
 
+_IMPORT_FROM_TRITON: List[str] = [
+    "program_id",  # NOQA: F822
+    "load",  # NOQA: F822
+    "store",  # NOQA: F822
+    "to_tensor",  # NOQA: F822
+]
+
 __all__ = [
     "constexpr",
     "base_value",
@@ -71,15 +78,13 @@
     "float64",
     "_unwrap_if_constexpr",
     "tensor",
-    "program_id",  # NOQA: F822
-    "load",  # NOQA: F822
-    "store",  # NOQA: F822
     "arange",
     "full",
     "convert_layout",
-    "allocate_shared",
+    "allocate_shared_memory",
     "shared_memory_descriptor",
     "warp_specialize",
+    *_IMPORT_FROM_TRITON,
 ]
 
 T = TypeVar("T")
@@ -196,11 +201,7 @@ def store(self, value, _builder: GluonOpBuilder) -> None:
         return semantic.shared_store(self, value, _builder)
 
 
-for name in [
-        "program_id",
-        "load",
-        "store",
-]:
+for name in _IMPORT_FROM_TRITON:
     fn = getattr(tl_core, name)
     globals()[name] = builtin(fn)
 
@@ -229,7 +230,7 @@ def full(shape, value, dtype, layout, _builder=None):
 
 
 @builtin
-def allocate_shared(element_ty, shape, layout, value=None, _builder=None):
+def allocate_shared_memory(element_ty, shape, layout, value=None, _builder=None):
     element_ty = _unwrap_if_constexpr(element_ty)
     shape = _unwrap_if_constexpr(shape)
     layout = _unwrap_if_constexpr(layout)
diff --git a/python/triton/experimental/gluon/language/nvidia/__init__.py b/python/triton/experimental/gluon/language/nvidia/__init__.py
@@ -0,0 +1,3 @@
+from . import blackwell
+
+__all__ = ["blackwell"]
diff --git a/python/triton/experimental/gluon/language/nvidia/blackwell.py b/python/triton/experimental/gluon/language/nvidia/blackwell.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+from . import blackwell`
	`2`	`+`
	`3`	`+__all__ = ["blackwell"]`