Skip to content

Commit 06d5701

Browse files
committed
further reduction
Signed-off-by: Anatoly Myachev <[email protected]>
1 parent a92234a commit 06d5701

File tree

1 file changed: 4 additions and 32 deletions

python/test/unit/intel/test_regressions.py

Lines changed: 4 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -65,55 +65,27 @@ def test_kernel_from_09_tutorial(device, tmp_path: pathlib.Path):
6565
#shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0, 1]}>
6666
#smem = #ttg.shared_memory
6767
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 32 : i32, ttig.min_sg_size = 8 : i32, ttig.support_bf16_conversion, ttig.support_dpas, ttig.target_arch = "spir64"} {
68-
tt.func public @matmul_kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
68+
tt.func public @matmul_kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
6969
%cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #blocked>
7070
%c63_i32 = arith.constant 63 : i32
7171
%c127_i32 = arith.constant 127 : i32
7272
%c1_i32 = arith.constant 1 : i32
7373
%c0_i32 = arith.constant 0 : i32
7474
%c64_i32 = arith.constant 64 : i32
7575
%cst_0 = arith.constant dense<0.000000e+00> : tensor<64x128xf32, #blocked1>
76-
%cst_1 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #blocked2>
7776
%c8_i32 = arith.constant 8 : i32
7877
%c128_i32 = arith.constant 128 : i32
79-
%cst_2 = arith.constant dense<0> : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>>
80-
%cst_3 = arith.constant dense<0> : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>
81-
%0 = tt.get_program_id x : i32
82-
%1 = arith.addi %arg3, %c127_i32 : i32
83-
%2 = arith.divsi %1, %c128_i32 : i32
84-
%5 = arith.muli %2, %c8_i32 : i32
85-
%6 = arith.divsi %0, %5 : i32
86-
%7 = arith.muli %6, %c8_i32 : i32
87-
%8 = arith.subi %2, %7 : i32
88-
%9 = arith.minsi %8, %c8_i32 : i32
89-
%12 = arith.remsi %0, %5 : i32
90-
%13 = arith.divsi %12, %9 : i32
91-
%15 = arith.muli %13, %c128_i32 : i32
9278
%18 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>
93-
%20 = tt.splat %c128_i32 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>>
94-
%24 = tt.splat %15 : i32 -> tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>
95-
%26 = arith.addi %24, %18 : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>
96-
%28 = tt.splat %arg3 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>>
97-
%29 = arith.cmpi slt, %20, %28 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>>
98-
%31 = tt.splat %arg3 : i32 -> tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>
99-
%32 = arith.cmpi slt, %26, %31 : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>
100-
%33 = arith.select %32, %26, %cst_3 {tt.contiguity = dense<128> : tensor<1xi32>, tt.divisibility = dense<128> : tensor<1xi32>} : tensor<128xi1, #ttg.slice<{dim = 0, parent = #blocked1}>>, tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>
101-
%37 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked2}>>
102-
%38 = tt.expand_dims %37 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked2}>> -> tensor<1x64xi32, #blocked2>
10379
%42 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<128x64x!tt.ptr<f32>, #blocked2>
10480
%44 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>>
10581
%45 = tt.expand_dims %44 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1>
106-
%46 = tt.expand_dims %33 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x128xi32, #blocked1>
82+
%46 = tt.expand_dims %18 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x128xi32, #blocked1>
10783
%50 = tt.broadcast %46 : tensor<1x128xi32, #blocked1> -> tensor<64x128xi32, #blocked1>
10884
%52 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<64x128x!tt.ptr<f32>, #blocked1>
10985
%53 = tt.addptr %52, %50 : tensor<64x128x!tt.ptr<f32>, #blocked1>, tensor<64x128xi32, #blocked1>
11086
111-
%80 = arith.muli %c0_i32, %c64_i32 : i32
112-
%81 = arith.subi %arg5, %80 : i32
113-
%82 = tt.splat %81 : i32 -> tensor<1x64xi32, #blocked2>
114-
%83 = arith.cmpi slt, %38, %82 : tensor<1x64xi32, #blocked2>
115-
%84 = tt.broadcast %83 : tensor<1x64xi1, #blocked2> -> tensor<128x64xi1, #blocked2>
116-
%85 = tt.load %42, %84, %cst_1 : tensor<128x64x!tt.ptr<f32>, #blocked2>
87+
%81 = arith.subi %arg5, %c64_i32 : i32
88+
%85 = tt.load %42: tensor<128x64x!tt.ptr<f32>, #blocked2>
11789
%86 = tt.splat %81 : i32 -> tensor<64x1xi32, #blocked1>
11890
%87 = arith.cmpi slt, %45, %86 : tensor<64x1xi32, #blocked1>
11991
%88 = tt.broadcast %87 : tensor<64x1xi1, #blocked1> -> tensor<64x128xi1, #blocked1>

0 commit comments

Comments
 (0)