
Commit 67ddd4f

further reduction
Signed-off-by: Anatoly Myachev <[email protected]>
1 parent 06d5701


python/test/unit/intel/test_regressions.py

Lines changed: 6 additions & 16 deletions
@@ -60,44 +60,34 @@ def test_kernel_from_09_tutorial(device, tmp_path: pathlib.Path):
     ir = """
 #blocked = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
 #blocked1 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}>
-#blocked2 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
 #shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
 #shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0, 1]}>
 #smem = #ttg.shared_memory
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 32 : i32, ttig.min_sg_size = 8 : i32, ttig.support_bf16_conversion, ttig.support_dpas, ttig.target_arch = "spir64"} {
-  tt.func public @matmul_kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
+  tt.func public @matmul_kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}) {
     %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #blocked>
-    %c63_i32 = arith.constant 63 : i32
-    %c127_i32 = arith.constant 127 : i32
-    %c1_i32 = arith.constant 1 : i32
-    %c0_i32 = arith.constant 0 : i32
-    %c64_i32 = arith.constant 64 : i32
     %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x128xf32, #blocked1>
-    %c8_i32 = arith.constant 8 : i32
-    %c128_i32 = arith.constant 128 : i32
     %18 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>
-    %42 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<128x64x!tt.ptr<f32>, #blocked2>
+    %42 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<128x64x!tt.ptr<f32>, #blocked1>
     %44 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>>
     %45 = tt.expand_dims %44 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1>
     %46 = tt.expand_dims %18 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x128xi32, #blocked1>
     %50 = tt.broadcast %46 : tensor<1x128xi32, #blocked1> -> tensor<64x128xi32, #blocked1>
     %52 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<64x128x!tt.ptr<f32>, #blocked1>
     %53 = tt.addptr %52, %50 : tensor<64x128x!tt.ptr<f32>, #blocked1>, tensor<64x128xi32, #blocked1>

-    %81 = arith.subi %arg5, %c64_i32 : i32
-    %85 = tt.load %42: tensor<128x64x!tt.ptr<f32>, #blocked2>
-    %86 = tt.splat %81 : i32 -> tensor<64x1xi32, #blocked1>
+    %85 = tt.load %42: tensor<128x64x!tt.ptr<f32>, #blocked1>
+    %86 = tt.splat %arg5 : i32 -> tensor<64x1xi32, #blocked1>
     %87 = arith.cmpi slt, %45, %86 : tensor<64x1xi32, #blocked1>
     %88 = tt.broadcast %87 : tensor<64x1xi1, #blocked1> -> tensor<64x128xi1, #blocked1>
     %89 = tt.load %53, %88, %cst_0 : tensor<64x128x!tt.ptr<f32>, #blocked1>
-    %91 = ttg.local_alloc %85 : (tensor<128x64xf32, #blocked2>) -> !ttg.memdesc<128x64xf32, #shared, #smem>
+    %91 = ttg.local_alloc %85 : (tensor<128x64xf32, #blocked1>) -> !ttg.memdesc<128x64xf32, #shared, #smem>
     %92 = ttg.local_load %91 : !ttg.memdesc<128x64xf32, #shared, #smem> -> tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>
     %94 = ttg.local_alloc %89 : (tensor<64x128xf32, #blocked1>) -> !ttg.memdesc<64x128xf32, #shared1, #smem>
-    %cst_test = arith.constant dense<1.11111116> : tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>
     %cst_test2 = arith.constant dense<1.11111116> : tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>
     %96 = tt.dot %92, %cst_test2, %cst, inputPrecision = tf32 : tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x128xf32, #blocked>

-    %78 = ttg.convert_layout %96 : tensor<128x128xf32, #blocked> -> tensor<128x128xf32, #blocked2>
+    %78 = ttg.convert_layout %96 : tensor<128x128xf32, #blocked> -> tensor<128x128xf32, #blocked1>
     tt.return
   }
 }
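
For context: regression tests in this file hand a TTGIR string like the one above directly to the compiler, without running the kernel. Below is a minimal sketch of that pattern, assuming the common Triton test idiom of writing the IR to tmp_path and calling triton.compile on the resulting .ttgir file; the actual body of test_kernel_from_09_tutorial may differ.

import pathlib

import triton


def test_kernel_from_09_tutorial(device, tmp_path: pathlib.Path):
    # `device` comes from the suite's test fixture and is unused in this sketch.
    ir = "..."  # the TTGIR reproducer shown in the diff above, elided here
    # triton.compile consumes TTGIR directly when given a path to a .ttgir file.
    temp_file = tmp_path / "kernel_from_09_tutorial.ttgir"
    temp_file.write_text(ir)
    # The regression is exercised at compile time: the test passes if this call
    # completes without error (assumption: no runtime check is needed).
    triton.compile(str(temp_file))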
