
Commit ade3d49

[Gluon][Tutorial] Persistent attention (#7298)
Rewrite the attention kernel to be persistent. This gives better performance at low context lengths. However, fp16 at large context has regressed somewhat due to a ptxas instruction-scheduling issue in the softmax partition. fp8 is ~100 TFLOPS faster when the kernel name has "cutlass" in it.

```
Attention Z=4 H=32 D=64 causal=False:
     N_CTX  triton-fp16  triton-fp8
0   1024.0   359.574448  370.119987
1   2048.0   612.103928  641.204555
2   4096.0   653.868402  682.337948
3   8192.0   692.102228  721.555690
4  16384.0   696.972041  726.190035
5  32768.0   698.723685  727.983456
6  65536.0   699.865817  728.558321

Attention Z=4 H=32 D=64 causal=True:
     N_CTX  triton-fp16  triton-fp8
0   1024.0   181.879039  177.982453
1   2048.0   441.315463  454.310072
2   4096.0   532.170527  539.995252
3   8192.0   633.620646  638.544937
4  16384.0   667.687180  670.681255
5  32768.0   684.276329  688.571907
6  65536.0   692.953202  694.648353

Attention Z=4 H=32 D=128 causal=False:
     N_CTX  triton-fp16   triton-fp8
0   1024.0   718.580015   709.863720
1   2048.0  1133.490258  1222.548477
2   4096.0  1247.605551  1369.800195
3   8192.0  1243.482713  1406.799697
4  16384.0  1125.744367  1514.857403
5  32768.0  1124.116305  1521.267973
6  65536.0  1064.588719  1518.738037

Attention Z=4 H=32 D=128 causal=True:
     N_CTX  triton-fp16   triton-fp8
0   1024.0   355.642522   351.161232
1   2048.0   846.404095   854.547917
2   4096.0  1013.840017  1021.676435
3   8192.0  1176.258395  1152.844234
4  16384.0  1190.290681  1325.786204
5  32768.0  1063.658200  1394.413325
6  65536.0   970.531569  1413.282610
```
1 parent 915f62d commit ade3d49
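For context, "persistent" here means the kernel is launched with roughly one program per SM and each program loops over many tiles, instead of launching one program per tile. Below is a minimal sketch of that pattern in plain Triton, not the Gluon attention kernel from this commit; all names are illustrative.

```python
import torch
import triton
import triton.language as tl


@triton.jit
def persistent_add_kernel(x_ptr, y_ptr, out_ptr, n_elements,
                          NUM_SMS: tl.constexpr, BLOCK: tl.constexpr):
    start_pid = tl.program_id(0)
    num_tiles = tl.cdiv(n_elements, BLOCK)
    # Persistent loop: each program strides over the tile space until every
    # tile has been processed, rather than owning exactly one tile.
    for tile in range(start_pid, num_tiles, NUM_SMS):
        offs = tile * BLOCK + tl.arange(0, BLOCK)
        mask = offs < n_elements
        x = tl.load(x_ptr + offs, mask=mask)
        y = tl.load(y_ptr + offs, mask=mask)
        tl.store(out_ptr + offs, x + y, mask=mask)


def persistent_add(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    out = torch.empty_like(x)
    n = x.numel()
    BLOCK = 1024
    # Launch only as many programs as the GPU has SMs; the loop in the kernel
    # covers the remaining tiles.
    num_sms = torch.cuda.get_device_properties(x.device).multi_processor_count
    grid = (min(num_sms, triton.cdiv(n, BLOCK)),)
    persistent_add_kernel[grid](x, y, out, n, NUM_SMS=num_sms, BLOCK=BLOCK)
    return out
```

Keeping programs resident amortizes launch and prologue costs across tiles, which is presumably where the low-context gains quoted above come from.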

5 files changed: +466, -512 lines

lib/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.cpp

Lines changed: 2 additions & 2 deletions
@@ -128,8 +128,8 @@ SmallVector<int64_t> getTMABlockShape(ArrayRef<int64_t> shapePerCTA,
   if (swizzleBytes != 0) {
     auto contigDimSize = (8 * swizzleBytes) / elementBitWidth;
     if (blockShape[contigDim] < contigDimSize) {
-      llvm::reportFatalUsageError("Block shape is too small for the swizzle "
-                                  "byte size in NVMMA Shared Layout.");
+      llvm::report_fatal_error("Block shape is too small for the swizzle byte "
+                               "size in NVMMA Shared Layout.");
     }
     blockShape[contigDim] = contigDimSize;
   }

python/triton/experimental/gluon/language/_layouts.py

Lines changed: 9 additions & 3 deletions
@@ -1,6 +1,6 @@
 from dataclasses import dataclass
 from typing import List, Optional
-from triton.language.core import _unwrap_if_constexpr, _unwrap_shape
+from triton.language.core import _unwrap_if_constexpr, _unwrap_shape, constexpr_type
 
 __all__ = [
     "BlockedLayout",
@@ -25,7 +25,10 @@ class DistributedLayout:
     """
     Base class for distributed memory layouts in Gluon IR.
     """
-    pass
+
+    @property
+    def type(self):
+        return constexpr_type(self)
 
 
 @dataclass(frozen=True)
@@ -213,7 +216,10 @@ class SharedLayout:
     """
     Base class for shared memory layouts in Gluon IR.
     """
-    pass
+
+    @property
+    def type(self):
+        return constexpr_type(self)
 
 
 @dataclass(frozen=True)
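
The new `type` property lets a layout instance describe itself as a compile-time constant. A minimal usage sketch follows; the public import path and the `BlockedLayout` constructor fields are assumptions based on the Gluon layouts module, not taken from this diff.

```python
# Sketch only: import path and constructor arguments are assumptions.
from triton.experimental.gluon import language as ttgl

layout = ttgl.BlockedLayout(
    size_per_thread=[1, 8],
    threads_per_warp=[4, 8],
    warps_per_cta=[4, 1],
    order=[1, 0],
)

# With this change, `layout.type` evaluates to `constexpr_type(layout)`, so the
# frontend can treat the layout object itself as a constexpr value, e.g. when it
# is passed into or captured by a Gluon kernel.
print(layout.type)
```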

python/triton/experimental/gluon/language/_standard.py

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@
 from . import _core as ttgl
 
 _IMPORT_FROM_TRITON = [
+    "cdiv",
     "sum",
     "max",
     "min",

python/triton/experimental/gluon/language/nvidia/hopper/tma.py

Lines changed: 3 additions & 2 deletions
@@ -1,6 +1,7 @@
 from __future__ import annotations
 from typing import List, Tuple, TYPE_CHECKING
 from dataclasses import dataclass
+from triton.language.core import base_type, base_value
 import triton.experimental.gluon.language._core as ttgl
 from triton.experimental.gluon.language._layouts import NVMMASharedLayout
 from triton.experimental.gluon.language._core import builtin, _unwrap_if_constexpr
@@ -12,7 +13,7 @@
 
 
 @dataclass(eq=True)
-class tensor_descriptor_type:
+class tensor_descriptor_type(base_type):
     block_type: ttgl.block_type
     shape_type: ttgl.tuple_type
     strides_type: ttgl.tuple_type
@@ -44,7 +45,7 @@ def mangle(self) -> str:
         return f"TD{self.block_type.mangle}_{self.layout.mangle()}TD"
 
 
-class tensor_descriptor:
+class tensor_descriptor(base_value):
 
     def __init__(self, handle, shape: List[ttgl.tensor], strides: List[ttgl.tensor], block_type: ttgl.block_type,
                  layout: NVMMASharedLayout):
