@@ -109,12 +109,12 @@ def tensor_memory_kernel(layout: ttgl.constexpr, tmem_layout: ttgl.constexpr):
     mem = ttgl.nvidia.blackwell.allocate_tensor_memory(ttgl.int32, a.shape, tmem_layout, a)
     b = mem.load(layout)  # noqa: F841
     mem.store(a)
-    slice1 = mem.split(0, YBLOCK // 2)  # noqa: F841
-    slice2 = mem.split(YBLOCK // 2, YBLOCK // 2)  # noqa: F841
+    slice1 = mem.slice(0, YBLOCK // 2)  # noqa: F841
+    slice2 = mem.slice(YBLOCK // 2, YBLOCK // 2)  # noqa: F841

     buffers = ttgl.nvidia.blackwell.allocate_tensor_memory(ttgl.float32, [2, XBLOCK, YBLOCK], tmem_layout)
     for i in range(2):
-        buffers.subslice(i).load(layout)
+        buffers.index(i).load(layout)


 @pytest.mark.skipif(not is_cuda() or torch.cuda.get_device_capability()[0] != 10,
@@ -165,9 +165,9 @@ def test_tensor_memory(fresh_knobs):
 def shared_memory_subview_kernel(XBLOCK: ttgl.constexpr, layout: ttgl.constexpr, smem_layout: ttgl.constexpr):
     XHALF: ttgl.constexpr = XBLOCK // 2
     smem = ttgl.allocate_shared_memory(ttgl.int32, [XBLOCK, XBLOCK], smem_layout)
-    view = smem.split(XHALF, XHALF, dim=1)
+    view = smem.slice(XHALF, XHALF, dim=1)
     value = view.load(layout)
-    view = smem.split(XHALF, XHALF, dim=0)
+    view = smem.slice(XHALF, XHALF, dim=0)
     view.store(value.trans())


@@ -203,25 +203,25 @@ def test_shared_memory_subview(fresh_knobs):


 @gluon.jit
-def shared_memory_subslice_kernel(XBLOCK: ttgl.constexpr, layout: ttgl.constexpr, smem_layout: ttgl.constexpr):
+def shared_memory_index_kernel(XBLOCK: ttgl.constexpr, layout: ttgl.constexpr, smem_layout: ttgl.constexpr):
     smem = ttgl.allocate_shared_memory(ttgl.int32, [4, XBLOCK], smem_layout)
     for i in range(4):
-        smem.subslice(i).load(layout)
+        smem.index(i).load(layout)


-def test_shared_memory_subslice(fresh_knobs):
+def test_shared_memory_index(fresh_knobs):
     knobs.compilation.disable_line_info = True

     layout = ttgl.BlockedLayout(size_per_thread=[1], threads_per_warp=[32], warps_per_cta=[4], order=[0])
     smem_layout = ttgl.NVMMASharedLayout(swizzle_byte_width=128, element_bitwidth=32, rank=2)
-    h = shared_memory_subslice_kernel.warmup(256, layout, smem_layout, num_warps=4, grid=(1, ))
+    h = shared_memory_index_kernel.warmup(256, layout, smem_layout, num_warps=4, grid=(1, ))
     expecttest.assert_expected_inline(
         anonymize_ir(h.asm["source"]), """\
 #blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
 #shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 32}>
 #smem = #ttg.shared_memory
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 32 : i32} {
-  tt.func public @shared_memory_subslice_kernel() attributes {noinline = false} {
+  tt.func public @shared_memory_index_kernel() attributes {noinline = false} {
     %0 = ttg.local_alloc : () -> !ttg.memdesc<4x256xi32, #shared, #smem, mutable> loc(#loc)
     %c0_i32 = arith.constant 0 : i32 loc(#loc)
     %c4_i32 = arith.constant 4 : i32 loc(#loc)
@@ -250,7 +250,7 @@ def shared_memory_cast_kernel():
                                                           rank=2, ctas_per_cga=[1, 1], cta_split_num=[1,
                                                                                                        1], cta_order=[1, 0])
     smem = ttgl.allocate_shared_memory(ttgl.int8, [2, 256, 128], layout_a)
-    perm = smem.subslice(0).permute((1, 0))
+    perm = smem.index(0).permute((1, 0))
     ttgl.static_assert(perm.type.layout == layout_T)

     layout_b: ttgl.constexpr = ttgl.NVMMASharedLayout(swizzle_byte_width=64, transposed=False, element_bitwidth=16,
@@ -562,18 +562,18 @@ def kernel():


 @gluon.jit
-def tmem_subslice_kernel():
+def tmem_index_kernel():
     layout: ttgl.constexpr = TensorMemoryLayout(block=[128, 128], unpacked=True)
     tmem = ttgl.nvidia.blackwell.allocate_tensor_memory(ttgl.int32, [2, 256, 256], layout)
-    tmem.subslice(0)
+    tmem.index(0)


-def test_tmem_subslice_constexpr():
+def test_tmem_index_constexpr():
     expecttest.assert_expected_inline(
-        anonymize_ir(run_parser(tmem_subslice_kernel).str_nodebug()), """\
+        anonymize_ir(run_parser(tmem_index_kernel).str_nodebug()), """\
 #tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 128, unpacked = true>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "...", "ttg.threads-per-warp" = 32 : i32} {
-  tt.func public @tmem_subslice_kernel() attributes {noinline = false} {
+  tt.func public @tmem_index_kernel() attributes {noinline = false} {
     %result = ttng.tmem_alloc : () -> !ttg.memdesc<2x256x256xi32, #tmem, #ttng.tensor_memory, mutable>
     %c0_i32 = arith.constant 0 : i32
     %c0_i32_0 = arith.constant 0 : i32
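
For reference, a minimal usage sketch of the renamed view APIs (not part of this commit; the imports and the kernel name are assumptions modeled on the test module above). It shows slice() taking a contiguous sub-view where split() was used before, and index() selecting an entry along the leading dimension where subslice() was used before:

from triton.experimental import gluon
from triton.experimental.gluon import language as ttgl


@gluon.jit
def renamed_views_sketch_kernel(XBLOCK: ttgl.constexpr, layout: ttgl.constexpr, smem_layout: ttgl.constexpr):
    # Hypothetical kernel mirroring shared_memory_subview_kernel / shared_memory_index_kernel above;
    # `layout` is assumed to be a 2-D blocked layout matching the loaded sub-view.
    XHALF: ttgl.constexpr = XBLOCK // 2
    smem = ttgl.allocate_shared_memory(ttgl.int32, [XBLOCK, XBLOCK], smem_layout)
    # slice(start, length, dim=...) replaces split(): contiguous [XBLOCK, XHALF] sub-view.
    right = smem.slice(XHALF, XHALF, dim=1)
    value = right.load(layout)
    # Write the loaded tile back into the left half of the same buffer.
    smem.slice(0, XHALF, dim=1).store(value)

    bufs = ttgl.allocate_shared_memory(ttgl.int32, [4, XBLOCK], smem_layout)
    # index(i) replaces subslice(i): an [XBLOCK] view of buffer i along the leading dimension.
    row = bufs.index(3)  # noqa: F841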