[Gluon] Add numel and nbytes properties (#7507)

peterbell10 · web-flow · commit 8e52b2e483eb · 2025-07-15T10:07:22.000+01:00
This adds new properties:
- shared_memory_descriptor.numel
- block_type.numel
- block_type.nbytes

This also updates the attention tutorial to pass
`tensor_descriptor.block_type.nbytes` to `mbarrier.expect`, replacing the
hand-written `get_load_size_bytes` helper.
diff --git a/python/test/gluon/test_frontend.py b/python/test/gluon/test_frontend.py
@@ -113,6 +113,8 @@ def shared_memory_kernel(XBLOCK: ttgl.constexpr, YBLOCK: ttgl.constexpr, layout_
                          layout_b: ttgl.constexpr, smem_layout: ttgl.constexpr):
     unused = ttgl.allocate_shared_memory(ttgl.int32, [XBLOCK, YBLOCK], smem_layout)
     a = ttgl.full([XBLOCK, YBLOCK], 0, ttgl.int32, layout_a)
+    tl.static_assert(a.numel == unused.numel)
+    tl.static_assert(unused.numel == XBLOCK * YBLOCK)
     mem = ttgl.allocate_shared_memory(ttgl.int32, a.shape, smem_layout, a)
     b = mem.load(layout_b)  # noqa: F841
     mem.store(a)
@@ -611,7 +613,8 @@ def async_tma_kernel(input_desc, XBLOCK: ttgl.constexpr):
     mbarrier.init(bar, count=1)
 
     tma.async_copy_global_to_shared(input_desc, [0, 0], bar, smem)
-    mbarrier.expect(bar, XBLOCK * XBLOCK * ttgl.float16.primitive_bitwidth // 8)
+    tl.static_assert(input_desc.block_type.nbytes == XBLOCK * XBLOCK * 2)
+    mbarrier.expect(bar, input_desc.block_type.nbytes)
     mbarrier.wait(bar, 0)
 
     mbarrier.invalidate(bar)
diff --git a/python/triton/experimental/gluon/language/_core.py b/python/triton/experimental/gluon/language/_core.py
@@ -1,4 +1,5 @@
 from __future__ import annotations
+import math
 from typing import TypeVar, List, TYPE_CHECKING, Tuple
 from functools import wraps
 
@@ -216,6 +217,10 @@ def shape(self):
     def rank(self):
         return len(self.shape)
 
+    @property
+    def numel(self) -> int:
+        return math.prod(self.shape)
+
     @property
     def layout(self):
         return self.type.layout
diff --git a/python/triton/language/core.py b/python/triton/language/core.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import math
 from warnings import warn
 from contextlib import contextmanager
 from enum import Enum
@@ -752,6 +753,10 @@ def __eq__(self, other) -> bool:
     def scalar(self):
         return self.element_ty
 
+    @property
+    def nbytes(self):
+        return self.numel * (self.element_ty.primitive_bitwidth // 8)
+
     def mangle(self) -> str:
         elt = self.scalar.mangle()
         shape = '_'.join(map(str, self.shape))
@@ -878,10 +883,7 @@ def __init__(self, handle, type: dtype):
         self.handle = handle
         # Block shape
         self.shape = type.shape if type.is_block() else ()
-        self.numel = 1
-        for s in self.shape:
-            self.numel *= s
-        self.numel = constexpr(self.numel)
+        self.numel = constexpr(math.prod(self.shape))
         self.type = type  # Tensor type (can be block_type)
         # Following the practice in pytorch, dtype is scalar type
         self.dtype = type.scalar
diff --git a/python/tutorials/gluon/01-attention-forward.py b/python/tutorials/gluon/01-attention-forward.py
@@ -233,18 +233,9 @@ def get_desc_channel(desc, num_buffers: gl.constexpr, num_consumers: gl.constexp
     return SharedMemoryChannel.alloc(shape, desc.dtype, layout, num_buffers, num_consumers)
 
 
-@tl.constexpr_function
-def get_load_size_bytes(desc):
-    size = desc.dtype.primitive_bitwidth // 8
-    for dim in desc.block_type.shape:
-        size *= dim
-    return size
-
-
 @gluon.jit
 def issue_async_tma_load(smem, bar, desc, offset):
-    size: gl.constexpr = get_load_size_bytes(desc)
-    mbarrier.expect(bar, size)
+    mbarrier.expect(bar, desc.block_type.nbytes)
     tma.async_copy_global_to_shared(desc, [offset, 0], bar, smem)