Skip to content

Commit 89823c2

Browse files
htyumeta-codesync[bot]
authored and committed
[TLX] Add tlx.size_of (#710)
Summary: Introduces a new tlx.size_of() utility function that returns the size in bytes of a given Triton dtype. This helps unify kernels across different dtypes. Pull Request resolved: #710 Reviewed By: dshi7 Differential Revision: D88204044 Pulled By: htyu fbshipit-source-id: 13b9ba652619956808989aca75ab1d48fa78fb53
1 parent d17a5ab commit 89823c2

File tree

6 files changed

+85
-11
lines changed

6 files changed

+85
-11
lines changed

README.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,24 @@ Examples: how mbarriers are communicated in warp specialization
221221

222222
Returns the id of the current thread instance along the given `axis`.
223223

224+
- `tlx.dtype_of(v)`
225+
226+
Returns the dtype of a tensor or tensor descriptor.
227+
228+
- `tlx.size_of(dtype)`
229+
230+
Returns the size in bytes of a given Triton dtype. This is useful for dynamically computing memory sizes based on dtype, especially in barrier synchronization code.
231+
232+
Example:
233+
```python
234+
# Instead of hardcoding size values
235+
tlx.barrier_expect_bytes(barrier, 2 * BLOCK_M * BLOCK_K) # Assumes float16
236+
237+
# Use size_of for dtype-aware computation
238+
tlx.barrier_expect_bytes(barrier,
239+
tlx.size_of(tlx.dtype_of(desc)) * BLOCK_M * BLOCK_K)
240+
```
241+
224242
- `tlx.clock64()`
225243

226244
Returns the current 64-bit hardware clock value. E.g,

python/test/unit/language/test_tlx.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1977,6 +1977,34 @@ def kernel(y_ptr, BLOCK_SIZE: tl.constexpr):
19771977
assert kerenl_info.asm["ttir"].count("store") == 1
19781978

19791979

1980+
def test_size_of(device):
    """tlx.size_of should return the size in bytes of each supported dtype.

    The kernel stores the constexpr sizes into an output buffer, which is
    then compared on the host against the known byte widths.
    """

    @triton.jit
    def size_of_kernel(output_ptr):
        # Query size_of for a representative set of dtypes.
        size_fp32 = tlx.size_of(tl.float32)
        size_fp16 = tlx.size_of(tl.float16)
        size_int32 = tlx.size_of(tl.int32)
        size_int8 = tlx.size_of(tl.int8)
        size_int64 = tlx.size_of(tl.int64)

        # Store results so the host can verify them.
        tl.store(output_ptr + 0, size_fp32)
        tl.store(output_ptr + 1, size_fp16)
        tl.store(output_ptr + 2, size_int32)
        tl.store(output_ptr + 3, size_int8)
        tl.store(output_ptr + 4, size_int64)

    # Expected sizes in bytes for float32, float16, int32, int8, int64.
    expected_sizes = torch.tensor([4, 2, 4, 1, 8], dtype=torch.int32, device=device)
    output = torch.zeros(5, dtype=torch.int32, device=device)

    # A single program instance suffices: the kernel writes five scalars.
    size_of_kernel[(1, )](output)

    torch.testing.assert_close(output, expected_sizes)
2006+
2007+
19802008
@pytest.mark.skipif(not is_blackwell(), reason="Need Blackwell")
19812009
def test_async_dots_blackwell_tmem(device):
19822010
"""

third_party/tlx/language/tlx/__init__.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,26 @@
1515
CLCPipelineContext,
1616
async_token,
1717
)
18-
from .mem_ops import (local_alloc, local_view, remote_view, local_slice, subslice, async_load, async_load_commit_group,
19-
async_load_wait_group, local_load, local_store, local_trans, local_reinterpret, global_alloc,
20-
async_descriptor_load, async_descriptor_store, async_descriptor_store_wait, fence_async_shared,
21-
make_tensor_descriptor)
18+
from .mem_ops import (
19+
local_alloc,
20+
local_view,
21+
remote_view,
22+
local_slice,
23+
subslice,
24+
async_load,
25+
async_load_commit_group,
26+
async_load_wait_group,
27+
local_load,
28+
local_store,
29+
local_trans,
30+
local_reinterpret,
31+
global_alloc,
32+
async_descriptor_load,
33+
async_descriptor_store,
34+
async_descriptor_store_wait,
35+
fence_async_shared,
36+
make_tensor_descriptor,
37+
)
2238
from .barrier import (
2339
alloc_barriers,
2440
barrier_expect_bytes,
@@ -38,6 +54,7 @@
3854
thread_id,
3955
async_task_replica_id,
4056
dtype_of,
57+
size_of,
4158
clock64,
4259
stoch_round,
4360
)
@@ -107,6 +124,7 @@
107124
"thread_id",
108125
"async_task_replica_id",
109126
"dtype_of",
127+
"size_of",
110128
"clock64",
111129
"stoch_round",
112130
# dynamic launcher ops

third_party/tlx/language/tlx/utility.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
def is_hip():
    """Return True when the active compilation target uses the HIP backend."""
    return driver.active.get_current_target().backend == "hip"
1010

1111

1212
def cuda_parse_arch(arch):
@@ -42,8 +42,9 @@ def thread_id(axis, _semantic=None):
4242
@tl.builtin
def async_task_replica_id(_semantic=None):
    """Return the replica id of the innermost enclosing async region.

    Reads the top of the compiler's region_replica_id_stack; must therefore
    be called from inside an async region (non-empty stack).
    """
    from triton.language.extra.tlx.compiler.code_generator import region_replica_id_stack

    assert region_replica_id_stack, (
        "async_task_replica_id must be called inside an async region where the stack must be non-empty")
    return tl.constexpr(region_replica_id_stack[-1])
4849

4950

@@ -63,6 +64,15 @@ def dtype_of(v, _semantic=None) -> tl.dtype:
6364
raise ValueError(f"dtype_of only works on tensors and tensor descriptors, but got {v}")
6465

6566

67+
@tl.builtin
def size_of(dtype: tl.dtype, _semantic=None) -> tl.constexpr:
    """
    Returns the size in bytes of a given Triton dtype.

    Useful for dtype-aware size computations, e.g. the byte count expected by
    a barrier: ``tlx.size_of(tlx.dtype_of(desc)) * BLOCK_M * BLOCK_K``.
    """
    assert isinstance(dtype, tl.dtype), f"size_of expects a dtype, but got {type(dtype)}"
    # Guard against sub-byte dtypes, for which integer division by 8 would
    # silently floor the result to zero bytes.
    assert dtype.primitive_bitwidth % 8 == 0, (
        f"size_of expects a dtype with a whole-byte width, but got {dtype.primitive_bitwidth} bits")
    return tl.constexpr(dtype.primitive_bitwidth // 8)
74+
75+
6676
@tl.builtin
6777
def clock64(_semantic=None):
6878
"""

third_party/tlx/tutorials/blackwell-grouped-gemm_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -542,7 +542,7 @@ def grouped_matmul_tlx_kernel(
542542
buf, phase = _get_bufidx_phase(accum_cnt, NUM_SMEM_BUFFERS)
543543
tlx.barrier_wait(smem_empty_bars[buf], phase ^ 1)
544544
tlx.barrier_expect_bytes(smem_full_bars[buf],
545-
2 * (BLOCK_SIZE_M + BLOCK_SIZE_N) * BLOCK_SIZE_K) # float16
545+
tlx.size_of(dtype) * (BLOCK_SIZE_M + BLOCK_SIZE_N) * BLOCK_SIZE_K)
546546
tlx.async_descriptor_load(a_desc, buffers_A[buf], [offs_am, kk * BLOCK_SIZE_K],
547547
smem_full_bars[buf])
548548
tlx.async_descriptor_load(b_desc, buffers_B[buf], [kk * BLOCK_SIZE_K, offs_bn],

third_party/tlx/tutorials/hopper-gemm-ws_test.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -119,23 +119,23 @@ def matmul_kernel_tlx_ws(a_desc, b_desc, c_desc, #
119119
empty_a_1st = tlx.local_view(bars_empty_a, buf) # mbar
120120
full_a_1st = tlx.local_view(bars_full_a, buf) # mbar
121121
tlx.barrier_wait(bar=empty_a_1st, phase=p) # EmptyBar A1 wait
122-
tlx.barrier_expect_bytes(full_a_1st, BLOCK_M_SPLIT * BK * 2)
122+
tlx.barrier_expect_bytes(full_a_1st, BLOCK_M_SPLIT * BK * tlx.size_of(tlx.dtype_of(a_desc)))
123123
data_a_1st = tlx.local_view(a, buf) # smem data
124124
tlx.async_descriptor_load(a_desc, data_a_1st, [offset_am, offset_k], full_a_1st)
125125

126126
# Async load to b[buf]
127127
empty_b = tlx.local_view(bars_empty_b, buf)
128128
full_b = tlx.local_view(bars_full_b, buf)
129129
tlx.barrier_wait(bar=empty_b, phase=p)
130-
tlx.barrier_expect_bytes(full_b, BN * BK * 2)
130+
tlx.barrier_expect_bytes(full_b, BN * BK * tlx.size_of(tlx.dtype_of(a_desc)))
131131
data_b = tlx.local_view(b, buf)
132132
tlx.async_descriptor_load(b_desc, data_b, [offset_k, offset_bn], full_b)
133133

134134
# Async load to a[buf+NUM_STAGES]
135135
empty_a_2nd = tlx.local_view(bars_empty_a, buf + NUM_STAGES)
136136
full_a_2nd = tlx.local_view(bars_full_a, buf + NUM_STAGES)
137137
tlx.barrier_wait(bar=empty_a_2nd, phase=p)
138-
tlx.barrier_expect_bytes(bar=full_a_2nd, size=BLOCK_M_SPLIT * BK * 2)
138+
tlx.barrier_expect_bytes(bar=full_a_2nd, size=BLOCK_M_SPLIT * BK * tlx.size_of(tlx.dtype_of(a_desc)))
139139
data_a_2nd = tlx.local_view(a, buf + NUM_STAGES) # smem data
140140
tlx.async_descriptor_load(a_desc, data_a_2nd, [offset_am + BLOCK_M_SPLIT, offset_k], full_a_2nd)
141141

0 commit comments

Comments
 (0)