@@ -25,80 +25,145 @@ def __str__(self):
2525 return f"#ttig.dpas<{{repeatCount={ self .repeatCount } , systolicDepth={ self .systolic_depth } , executionSize = { self .execution_size } , opsPerChan = { self .ops_per_chan } , threadsPerWarp = { self .threads_per_warp } , warpsPerCTA={ self .warps_per_cta } , repCluster={ self .rep_cluster } }}>"
2626
2727
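+# The helper classes below mirror TritonGPU layout attributes; each __str__ renders the MLIR
+# attribute syntax (#ttg.dot_op / #ttg.slice / #ttg.blocked) that is substituted into the IR template further down.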
+class DotOperandLayout:
+
+    def __init__(self, parent, op_idx, k_width):
+        self.parent = parent
+        self.op_idx = op_idx
+        self.k_width = k_width
+        self.threads_per_warp = parent.threads_per_warp
+
+    def __str__(self):
+        return f"#ttg.dot_op<{{parent={self.parent}, opIdx={self.op_idx}, kWidth={self.k_width}}}>"
+
+
+class SliceLayout:
+
+    def __init__(self, dim, parent):
+        self.dim = dim
+        self.parent = parent
+        self.threads_per_warp = parent.threads_per_warp
+
+    def __str__(self):
+        return f"#ttg.slice<{{dim = {self.dim}, parent = {self.parent}}}>"
+
+
+class BlockedLayout:
+
+    def __init__(self, size_per_thread, threads_per_warp, warps_per_cta, order, ctas_per_cga=[1, 1],
+                 cta_split_num=[1, 1], cta_order=[0, 1]):
+        self.sz_per_thread = size_per_thread
+        self.threads_per_warp = threads_per_warp
+        self.warps_per_cta = warps_per_cta
+        self.order = order
+        self.ctas_per_cga = ctas_per_cga
+        self.cta_split_num = cta_split_num
+        self.cta_order = cta_order
+
+    def __str__(self):
+        return f"#ttg.blocked<{{sizePerThread={self.sz_per_thread}, threadsPerWarp={self.threads_per_warp}, warpsPerCTA={self.warps_per_cta}, order={self.order}, CTAsPerCGA={self.ctas_per_cga}, CTASplitNum={self.cta_split_num}, CTAOrder={self.cta_order}}}>"
+
+
def warps_per_cta(layout):
-    return layout.warps_per_cta
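+    # Slice and dot-operand layouts are wrappers: take the warp count from their parent layout.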
+    if isinstance(layout, (SliceLayout, DotOperandLayout)):
+        return warps_per_cta(layout.parent)
+    else:
+        return layout.warps_per_cta


layouts = [
-    # Layout for Xe
+    BlockedLayout([1, 1], [2, 16], [4, 1], [1, 0], [1, 1], [1, 1], [0, 1]),
+    # DPAS layout
    DpasLayout(repeatCount=8, systolic_depth=8, execution_size=16, ops_per_chan=4, threads_per_warp=16,
               warps_per_cta=[1, 4], rep_cluster=[1, 2]),
    DpasLayout(repeatCount=8, systolic_depth=8, execution_size=16, ops_per_chan=2, threads_per_warp=16,
               warps_per_cta=[8, 4], rep_cluster=[4, 2]),
    DpasLayout(repeatCount=8, systolic_depth=8, execution_size=16, ops_per_chan=1, threads_per_warp=16,
               warps_per_cta=[8, 4], rep_cluster=[1, 1]),
+    DpasLayout(repeatCount=8, systolic_depth=8, execution_size=8, ops_per_chan=1, threads_per_warp=32,
+               warps_per_cta=[4, 1], rep_cluster=[1, 1]),
+    DpasLayout(repeatCount=8, systolic_depth=8, execution_size=16, ops_per_chan=2, threads_per_warp=32,
+               warps_per_cta=[2, 2], rep_cluster=[1, 1]),
+    DpasLayout(repeatCount=8, systolic_depth=8, execution_size=8, ops_per_chan=4, threads_per_warp=32,
+               warps_per_cta=[4, 1], rep_cluster=[1, 1]),
+    # DotOp A
+    DotOperandLayout(
+        parent=DpasLayout(repeatCount=8, systolic_depth=8, execution_size=16, ops_per_chan=2, threads_per_warp=32,
+                          warps_per_cta=[2, 2], rep_cluster=[1, 1]), op_idx=0, k_width=1),
+    DotOperandLayout(
+        parent=DpasLayout(repeatCount=8, systolic_depth=8, execution_size=8, ops_per_chan=1, threads_per_warp=16,
+                          warps_per_cta=[2, 2], rep_cluster=[1, 1]), op_idx=0, k_width=1),
+    # DotOp B
+    DotOperandLayout(
+        parent=DpasLayout(repeatCount=8, systolic_depth=8, execution_size=16, ops_per_chan=1, threads_per_warp=16,
+                          warps_per_cta=[2, 2], rep_cluster=[1, 1]), op_idx=1, k_width=1),
+    DotOperandLayout(
+        parent=DpasLayout(repeatCount=8, systolic_depth=8, execution_size=16, ops_per_chan=2, threads_per_warp=16,
+                          warps_per_cta=[2, 2], rep_cluster=[1, 1]), op_idx=1, k_width=2),
+    DotOperandLayout(
+        parent=DpasLayout(repeatCount=8, systolic_depth=8, execution_size=16, ops_per_chan=4, threads_per_warp=16,
+                          warps_per_cta=[2, 2], rep_cluster=[1, 1]), op_idx=1, k_width=4),
+    DotOperandLayout(
+        parent=DpasLayout(repeatCount=8, systolic_depth=8, execution_size=16, ops_per_chan=1, threads_per_warp=32,
+                          warps_per_cta=[2, 2], rep_cluster=[1, 1]), op_idx=1, k_width=1),
+    # Slice layout
+    SliceLayout(dim=1, parent=BlockedLayout([1, 4, 1], [2, 1, 16], [2, 1, 2], [2, 1, 0], [1, 1, 1], [1, 1, 1],
+                                            [0, 1, 2])),
]


-@pytest.mark.parametrize("M, N", [[M, N] for M, N in itertools.product([32, 64, 128, 256], [32, 64, 128, 256])])
+@pytest.mark.parametrize("M, N", [[M, N] for M, N in itertools.product([32, 64, 128], [64, 128])])
@pytest.mark.parametrize("dtype_str", ["float32", "float16", "int8"])
@pytest.mark.parametrize("layout", layouts)
+@pytest.mark.parametrize("block_ptr", [True, False])
@pytest.mark.skipif(not is_xpu(), reason="Block store tests are specific to the XPU backend")
-def test_tensor_pointer_block_store(M, N, dtype_str, layout, device, tmp_path: pathlib.Path):
+def test_block_store(M, N, dtype_str, layout, block_ptr, device, tmp_path: pathlib.Path):

    warps = warps_per_cta(layout)
    num_warps = int(np.prod(warps))
    threads_per_warp = layout.threads_per_warp
-    ops_per_chan = layout.ops_per_chan
-    A_width = 1 if ops_per_chan == 1 else ops_per_chan // 2
-    B_width = ops_per_chan
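+    # threads_per_warp may be a per-dimension list (e.g. for BlockedLayout), so reduce it to a scalar count.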
+    threads_per_warp = int(np.prod(threads_per_warp))

    ty = {"float32": "f32", "float16": "f16", "bfloat16": "i16", "int8": "i8"}[dtype_str]

    support_block_io = torch.xpu.get_device_capability()['has_subgroup_2d_block_io']

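+    # Build the store either through a block pointer (tt.make_tensor_ptr + tt.store with boundary check)
+    # or through a plain tensor of pointers; both mark the store with ttig.block_io = "row_major".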
+    if block_ptr:
+        store_ops = f"""
+            %M_i64 = arith.constant {M} : i64
+            %N_i64 = arith.constant {N} : i64
+            %c1_i64 = arith.constant 1 : i64
+            %c0_i32 = arith.constant 0 : i32
+
+            %blk_ptr = tt.make_tensor_ptr %dst, [%M_i64, %N_i64], [%N_i64, %c1_i64], [%c0_i32, %c0_i32] {{order = array<i32: 1, 0>}} : <tensor<{M}x{N}x{ty}, #layout>>
+            tt.store %blk_ptr, %store_val {{ttig.block_io = "row_major", boundaryCheck = array<i32: 0, 1>}} : !tt.ptr<tensor<{M}x{N}x{ty}, #layout>>
+            """
+    else:
+        store_ops = f"""
+            %12 = tt.splat %dst : !tt.ptr<{ty}> -> tensor<{M}x{N}x!tt.ptr<{ty}>, #layout>
+            %13 = tt.addptr %12, %8 : tensor<{M}x{N}x!tt.ptr<{ty}>, #layout>, tensor<{M}x{N}xi32, #layout>
+            tt.store %13, %store_val {{ttig.block_io = "row_major"}} : tensor<{M}x{N}x!tt.ptr<{ty}>, #layout>
+            """
+
    ir = f"""
-    #mma = {layout}
-    #dot_a = #ttg.dot_op<{{opIdx = 0, parent = #mma, kWidth = {A_width}}}>
-    #dot_b = #ttg.dot_op<{{opIdx = 1, parent = #mma, kWidth = {B_width}}}>
+    #layout = {layout}
    module attributes {{{"ttig.support_sg_2d_block," if support_block_io else ""} "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = {num_warps} : i32, ttg.target = "xpu", "ttg.threads-per-warp" = {threads_per_warp} : i32}} {{
-        tt.func public @tensor_pointer_block_store(%arg0: !tt.ptr<{ty}> {{tt.divisibility = 16 : i32}}, %arg1: !tt.ptr<{ty}> {{tt.divisibility = 16 : i32}}, %arg2: !tt.ptr<{ty}> {{tt.divisibility = 16: i32}}, %arg3: !tt.ptr<{ty}> {{tt.divisibility = 16: i32}}) {{
-
-            // A matrix
-            %stride_a = arith.constant dense<{N}> : tensor<{M}x1xi32, #dot_a>
-            %1 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #ttg.slice<{{dim = 1, parent = #dot_a}}>>
-            %2 = tt.expand_dims %1 {{axis = 1 : i32}} : tensor<{M}xi32, #ttg.slice<{{dim = 1, parent = #dot_a}}>> -> tensor<{M}x1xi32, #dot_a>
-            %3 = arith.muli %2, %stride_a : tensor<{M}x1xi32, #dot_a>
-            %4 = tt.make_range {{end = {N} : i32, start = 0 : i32}} : tensor<{N}xi32, #ttg.slice<{{dim = 0, parent = #dot_a}}>>
-            %5 = tt.expand_dims %4 {{axis = 0 : i32}} : tensor<{N}xi32, #ttg.slice<{{dim = 0, parent = #dot_a}}>> -> tensor<1x{N}xi32, #dot_a>
-            %6 = tt.broadcast %3 : tensor<{M}x1xi32, #dot_a> -> tensor<{M}x{N}xi32, #dot_a>
-            %7 = tt.broadcast %5 : tensor<1x{N}xi32, #dot_a> -> tensor<{M}x{N}xi32, #dot_a>
-            %8 = arith.addi %6, %7 : tensor<{M}x{N}xi32, #dot_a>
-
-            %9 = tt.splat %arg0 : !tt.ptr<{ty}> -> tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_a>
-            %10 = tt.addptr %9, %8 : tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_a>, tensor<{M}x{N}xi32, #dot_a>
-            %11 = tt.load %10 : tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_a>
-            %12 = tt.splat %arg1 : !tt.ptr<{ty}> -> tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_a>
-            %13 = tt.addptr %12, %8 : tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_a>, tensor<{M}x{N}xi32, #dot_a>
-            tt.store %13, %11 {{ttig.block_io = "row_major"}} : tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_a>
-
-            // B matrix
-            %stride_b = arith.constant dense<{N}> : tensor<{M}x1xi32, #dot_b>
-            %21 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #ttg.slice<{{dim = 1, parent = #dot_b}}>>
-            %22 = tt.expand_dims %21 {{axis = 1 : i32}} : tensor<{M}xi32, #ttg.slice<{{dim = 1, parent = #dot_b}}>> -> tensor<{M}x1xi32, #dot_b>
-            %23 = arith.muli %22, %stride_b : tensor<{M}x1xi32, #dot_b>
-            %24 = tt.make_range {{end = {N} : i32, start = 0 : i32}} : tensor<{N}xi32, #ttg.slice<{{dim = 0, parent = #dot_b}}>>
-            %25 = tt.expand_dims %24 {{axis = 0 : i32}} : tensor<{N}xi32, #ttg.slice<{{dim = 0, parent = #dot_b}}>> -> tensor<1x{N}xi32, #dot_b>
-            %26 = tt.broadcast %23 : tensor<{M}x1xi32, #dot_b> -> tensor<{M}x{N}xi32, #dot_b>
-            %27 = tt.broadcast %25 : tensor<1x{N}xi32, #dot_b> -> tensor<{M}x{N}xi32, #dot_b>
-            %28 = arith.addi %26, %27 : tensor<{M}x{N}xi32, #dot_b>
-
-            %29 = tt.splat %arg2 : !tt.ptr<{ty}> -> tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_b>
-            %30 = tt.addptr %29, %28 : tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_b>, tensor<{M}x{N}xi32, #dot_b>
-            %31 = tt.load %30 : tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_b>
-            %32 = tt.splat %arg3 : !tt.ptr<{ty}> -> tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_b>
-            %33 = tt.addptr %32, %28 : tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_b>, tensor<{M}x{N}xi32, #dot_b>
-            tt.store %33, %31 {{ttig.block_io = "row_major"}} : tensor<{M}x{N}x!tt.ptr<{ty}>, #dot_b>
+        tt.func public @block_store(%src: !tt.ptr<{ty}> {{tt.divisibility = 16 : i32}}, %dst: !tt.ptr<{ty}> {{tt.divisibility = 16 : i32}}) {{
+
+            %stride = arith.constant dense<{N}> : tensor<{M}x1xi32, #layout>
+            %1 = tt.make_range {{end = {M} : i32, start = 0 : i32}} : tensor<{M}xi32, #ttg.slice<{{dim = 1, parent = #layout}}>>
+            %2 = tt.expand_dims %1 {{axis = 1 : i32}} : tensor<{M}xi32, #ttg.slice<{{dim = 1, parent = #layout}}>> -> tensor<{M}x1xi32, #layout>
+            %3 = arith.muli %2, %stride : tensor<{M}x1xi32, #layout>
+            %4 = tt.make_range {{end = {N} : i32, start = 0 : i32}} : tensor<{N}xi32, #ttg.slice<{{dim = 0, parent = #layout}}>>
+            %5 = tt.expand_dims %4 {{axis = 0 : i32}} : tensor<{N}xi32, #ttg.slice<{{dim = 0, parent = #layout}}>> -> tensor<1x{N}xi32, #layout>
+            %6 = tt.broadcast %3 : tensor<{M}x1xi32, #layout> -> tensor<{M}x{N}xi32, #layout>
+            %7 = tt.broadcast %5 : tensor<1x{N}xi32, #layout> -> tensor<{M}x{N}xi32, #layout>
+            %8 = arith.addi %6, %7 : tensor<{M}x{N}xi32, #layout>
+            %9 = tt.splat %src : !tt.ptr<{ty}> -> tensor<{M}x{N}x!tt.ptr<{ty}>, #layout>
+            %10 = tt.addptr %9, %8 : tensor<{M}x{N}x!tt.ptr<{ty}>, #layout>, tensor<{M}x{N}xi32, #layout>
+            %store_val = tt.load %10 : tensor<{M}x{N}x!tt.ptr<{ty}>, #layout>
+
+            {store_ops}

            tt.return
        }}
@@ -112,11 +177,10 @@ def test_tensor_pointer_block_store(M, N, dtype_str, layout, device, tmp_path: p
    a = torch.randint(low=-127, high=128, size=(M, N), dtype=torch_dtype, device=device)

    x = torch.empty_like(a)
-    y = torch.empty_like(a)

-    temp_file = tmp_path / "test_tensor_pointer_block_store.ttgir"
+    temp_file = tmp_path / "test_block_store.ttgir"
    temp_file.write_text(ir)
    kernel = triton.compile(str(temp_file))

-    kernel[(1, 1, 1)](a, x, a, y)
-    assert torch.equal(a, x) and torch.equal(a, y)
+    kernel[(1, 1, 1)](a, x)
+    assert torch.equal(a, x)