@@ -109,12 +109,12 @@ def tensor_memory_kernel(layout: ttgl.constexpr, tmem_layout: ttgl.constexpr):
     mem = ttgl.nvidia.blackwell.allocate_tensor_memory(ttgl.int32, a.shape, tmem_layout, a)
     b = mem.load(layout)  # noqa: F841
     mem.store(a)
-    slice1 = mem.split(0, YBLOCK // 2)  # noqa: F841
-    slice2 = mem.split(YBLOCK // 2, YBLOCK // 2)  # noqa: F841
+    slice1 = mem.slice(0, YBLOCK // 2)  # noqa: F841
+    slice2 = mem.slice(YBLOCK // 2, YBLOCK // 2)  # noqa: F841

     buffers = ttgl.nvidia.blackwell.allocate_tensor_memory(ttgl.float32, [2, XBLOCK, YBLOCK], tmem_layout)
     for i in range(2):
-        buffers.subslice(i).load(layout)
+        buffers.index(i).load(layout)


 @pytest.mark.skipif(not is_cuda() or torch.cuda.get_device_capability()[0] != 10,
@@ -165,9 +165,9 @@ def test_tensor_memory(fresh_knobs):
 def shared_memory_subview_kernel(XBLOCK: ttgl.constexpr, layout: ttgl.constexpr, smem_layout: ttgl.constexpr):
     XHALF: ttgl.constexpr = XBLOCK // 2
     smem = ttgl.allocate_shared_memory(ttgl.int32, [XBLOCK, XBLOCK], smem_layout)
-    view = smem.split(XHALF, XHALF, dim=1)
+    view = smem.slice(XHALF, XHALF, dim=1)
     value = view.load(layout)
-    view = smem.split(XHALF, XHALF, dim=0)
+    view = smem.slice(XHALF, XHALF, dim=0)
     view.store(value.trans())


@@ -203,25 +203,25 @@ def test_shared_memory_subview(fresh_knobs):


 @gluon.jit
-def shared_memory_subslice_kernel(XBLOCK: ttgl.constexpr, layout: ttgl.constexpr, smem_layout: ttgl.constexpr):
+def shared_memory_index_kernel(XBLOCK: ttgl.constexpr, layout: ttgl.constexpr, smem_layout: ttgl.constexpr):
     smem = ttgl.allocate_shared_memory(ttgl.int32, [4, XBLOCK], smem_layout)
     for i in range(4):
-        smem.subslice(i).load(layout)
+        smem.index(i).load(layout)


-def test_shared_memory_subslice(fresh_knobs):
+def test_shared_memory_index(fresh_knobs):
     knobs.compilation.disable_line_info = True

     layout = ttgl.BlockedLayout(size_per_thread=[1], threads_per_warp=[32], warps_per_cta=[4], order=[0])
     smem_layout = ttgl.NVMMASharedLayout(swizzle_byte_width=128, element_bitwidth=32, rank=2)
-    h = shared_memory_subslice_kernel.warmup(256, layout, smem_layout, num_warps=4, grid=(1, ))
+    h = shared_memory_index_kernel.warmup(256, layout, smem_layout, num_warps=4, grid=(1, ))
     expecttest.assert_expected_inline(
         anonymize_ir(h.asm["source"]), """\
 #blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
 #shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 32}>
 #smem = #ttg.shared_memory
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 32 : i32} {
-  tt.func public @shared_memory_subslice_kernel() attributes {noinline = false} {
+  tt.func public @shared_memory_index_kernel() attributes {noinline = false} {
     %0 = ttg.local_alloc : () -> !ttg.memdesc<4x256xi32, #shared, #smem, mutable> loc(#loc)
     %c0_i32 = arith.constant 0 : i32 loc(#loc)
     %c4_i32 = arith.constant 4 : i32 loc(#loc)
@@ -250,7 +250,7 @@ def shared_memory_cast_kernel():
                                                           rank=2, ctas_per_cga=[1, 1], cta_split_num=[1,
                                                                                                        1], cta_order=[1, 0])
     smem = ttgl.allocate_shared_memory(ttgl.int8, [2, 256, 128], layout_a)
-    perm = smem.subslice(0).permute((1, 0))
+    perm = smem.index(0).permute((1, 0))
     ttgl.static_assert(perm.type.layout == layout_T)

     layout_b: ttgl.constexpr = ttgl.NVMMASharedLayout(swizzle_byte_width=64, transposed=False, element_bitwidth=16,
@@ -562,18 +562,18 @@ def kernel():


 @gluon.jit
-def tmem_subslice_kernel():
+def tmem_index_kernel():
     layout: ttgl.constexpr = TensorMemoryLayout(block=[128, 128], unpacked=True)
     tmem = ttgl.nvidia.blackwell.allocate_tensor_memory(ttgl.int32, [2, 256, 256], layout)
-    tmem.subslice(0)
+    tmem.index(0)


-def test_tmem_subslice_constexpr():
+def test_tmem_index_constexpr():
     expecttest.assert_expected_inline(
-        anonymize_ir(run_parser(tmem_subslice_kernel).str_nodebug()), """\
+        anonymize_ir(run_parser(tmem_index_kernel).str_nodebug()), """\
 #tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, unpacked = true>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 32 : i32} {
-  tt.func public @tmem_subslice_kernel() attributes {noinline = false} {
+  tt.func public @tmem_index_kernel() attributes {noinline = false} {
     %result = ttng.tmem_alloc : () -> !ttg.memdesc<2x256x256xi32, #tmem, #ttng.tensor_memory, mutable>
     %c0_i32 = arith.constant 0 : i32
     %c0_i32_0 = arith.constant 0 : i32
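
For reference, a minimal usage sketch of the renamed view APIs (not part of this commit; the imports and the kernel name are assumptions modeled on the test module above). It shows slice() taking a contiguous sub-view where split() was used before, and index() selecting an entry along the leading dimension where subslice() was used before:

from triton.experimental import gluon
from triton.experimental.gluon import language as ttgl


@gluon.jit
def renamed_views_sketch_kernel(XBLOCK: ttgl.constexpr, layout: ttgl.constexpr, smem_layout: ttgl.constexpr):
    # Hypothetical kernel mirroring shared_memory_subview_kernel / shared_memory_index_kernel above;
    # `layout` is assumed to be a 2-D blocked layout matching the loaded sub-view.
    XHALF: ttgl.constexpr = XBLOCK // 2
    smem = ttgl.allocate_shared_memory(ttgl.int32, [XBLOCK, XBLOCK], smem_layout)
    # slice(start, length, dim=...) replaces split(): contiguous [XBLOCK, XHALF] sub-view.
    right = smem.slice(XHALF, XHALF, dim=1)
    value = right.load(layout)
    # Write the loaded tile back into the left half of the same buffer.
    smem.slice(0, XHALF, dim=1).store(value)

    bufs = ttgl.allocate_shared_memory(ttgl.int32, [4, XBLOCK], smem_layout)
    # index(i) replaces subslice(i): an [XBLOCK] view of buffer i along the leading dimension.
    row = bufs.index(3)  # noqa: F841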