Skip to content

Commit 55a8540

Browse files
committed
further reduction
Signed-off-by: Anatoly Myachev <[email protected]>
1 parent b008c9b commit 55a8540

File tree

1 file changed

+27
-73
lines changed

1 file changed

+27
-73
lines changed

python/test/unit/intel/test_regressions.py

Lines changed: 27 additions & 73 deletions
Original file line number | Diff line number | Diff line change
@@ -66,15 +66,15 @@ def test_kernel_from_09_tutorial(device, tmp_path: pathlib.Path):
6666
#shared1 = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0, 1]}>
6767
#smem = #ttg.shared_memory
6868
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "xpu", "ttg.threads-per-warp" = 32 : i32, ttig.min_sg_size = 8 : i32, ttig.support_bf16_conversion, ttig.support_dpas, ttig.target_arch = "spir64"} {
69-
tt.func public @matmul_kernel(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32} ) attributes {noinline = false} {
69+
tt.func public @matmul_kernel(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32} ) attributes {noinline = false} {
7070
%cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #blocked>
7171
%c63_i32 = arith.constant 63 : i32
7272
%c127_i32 = arith.constant 127 : i32
7373
%c1_i32 = arith.constant 1 : i32
7474
%c0_i32 = arith.constant 0 : i32
7575
%c64_i32 = arith.constant 64 : i32
76-
%cst_0 = arith.constant dense<0.000000e+00> : tensor<64x128xf16, #blocked1>
77-
%cst_1 = arith.constant dense<0.000000e+00> : tensor<128x64xf16, #blocked2>
76+
%cst_0 = arith.constant dense<0.000000e+00> : tensor<64x128xf32, #blocked1>
77+
%cst_1 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #blocked2>
7878
%c8_i32 = arith.constant 8 : i32
7979
%c128_i32 = arith.constant 128 : i32
8080
%cst_2 = arith.constant dense<0> : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>>
@@ -98,17 +98,11 @@ def test_kernel_from_09_tutorial(device, tmp_path: pathlib.Path):
9898
%14 = arith.muli %11, %c128_i32 : i32
9999
%15 = arith.muli %13, %c128_i32 : i32
100100
%16 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>>
101-
%17 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked3}>>
102101
%18 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>
103-
%19 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked3}>>
104102
%20 = tt.splat %14 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>>
105-
%21 = tt.splat %14 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked3}>>
106103
%22 = arith.addi %20, %16 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>>
107-
%23 = arith.addi %21, %17 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked3}>>
108104
%24 = tt.splat %15 : i32 -> tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>
109-
%25 = tt.splat %15 : i32 -> tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked3}>>
110105
%26 = arith.addi %24, %18 : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>>
111-
%27 = arith.addi %25, %19 : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked3}>>
112106
%28 = tt.splat %arg3 : i32 -> tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>>
113107
%29 = arith.cmpi slt, %22, %28 : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>>
114108
%30 = arith.select %29, %22, %cst_2 {tt.contiguity = dense<128> : tensor<1xi32>, tt.divisibility = dense<128> : tensor<1xi32>} : tensor<128xi1, #ttg.slice<{dim = 1, parent = #blocked2}>>, tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked2}>>
@@ -123,8 +117,8 @@ def test_kernel_from_09_tutorial(device, tmp_path: pathlib.Path):
123117
%39 = tt.broadcast %36 : tensor<128x1xi32, #blocked2> -> tensor<128x64xi32, #blocked2>
124118
%40 = tt.broadcast %38 : tensor<1x64xi32, #blocked2> -> tensor<128x64xi32, #blocked2>
125119
%41 = arith.addi %39, %40 : tensor<128x64xi32, #blocked2>
126-
%42 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<128x64x!tt.ptr<f16>, #blocked2>
127-
%43 = tt.addptr %42, %41 : tensor<128x64x!tt.ptr<f16>, #blocked2>, tensor<128x64xi32, #blocked2>
120+
%42 = tt.splat %arg0 : !tt.ptr<f32> -> tensor<128x64x!tt.ptr<f32>, #blocked2>
121+
%43 = tt.addptr %42, %41 : tensor<128x64x!tt.ptr<f32>, #blocked2>, tensor<128x64xi32, #blocked2>
128122
%44 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>>
129123
%45 = tt.expand_dims %44 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked1}>> -> tensor<64x1xi32, #blocked1>
130124
%46 = tt.expand_dims %33 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked1}>> -> tensor<1x128xi32, #blocked1>
@@ -133,74 +127,34 @@ def test_kernel_from_09_tutorial(device, tmp_path: pathlib.Path):
133127
%49 = tt.broadcast %45 : tensor<64x1xi32, #blocked1> -> tensor<64x128xi32, #blocked1>
134128
%50 = tt.broadcast %48 : tensor<1x128xi32, #blocked1> -> tensor<64x128xi32, #blocked1>
135129
%51 = arith.addi %49, %50 : tensor<64x128xi32, #blocked1>
136-
%52 = tt.splat %arg1 : !tt.ptr<f16> -> tensor<64x128x!tt.ptr<f16>, #blocked1>
137-
%53 = tt.addptr %52, %51 : tensor<64x128x!tt.ptr<f16>, #blocked1>, tensor<64x128xi32, #blocked1>
130+
%52 = tt.splat %arg1 : !tt.ptr<f32> -> tensor<64x128x!tt.ptr<f32>, #blocked1>
131+
%53 = tt.addptr %52, %51 : tensor<64x128x!tt.ptr<f32>, #blocked1>, tensor<64x128xi32, #blocked1>
138132
%54 = arith.addi %arg5, %c63_i32 : i32
139133
%55 = arith.divsi %54, %c64_i32 : i32
140134
%56 = arith.remsi %arg5, %c64_i32 : i32
141135
%57 = arith.cmpi eq, %56, %c0_i32 : i32
142136
%58 = arith.cmpi sgt, %arg5, %c64_i32 : i32
143137
%59 = arith.andi %57, %58 : i1
144-
%60 = scf.if %59 -> (tensor<128x128xf32, #blocked>) {
145-
%79:3 = scf.for %arg9 = %c0_i32 to %55 step %c1_i32 iter_args(%arg10 = %cst, %arg11 = %43, %arg12 = %53) -> (tensor<128x128xf32, #blocked>, tensor<128x64x!tt.ptr<f16>, #blocked2>, tensor<64x128x!tt.ptr<f16>, #blocked1>) : i32 {
146-
%80 = tt.load %arg11 : tensor<128x64x!tt.ptr<f16>, #blocked2>
147-
%81 = tt.load %arg12 : tensor<64x128x!tt.ptr<f16>, #blocked1>
148-
%82 = tt.fp_to_fp %80 : tensor<128x64xf16, #blocked2> -> tensor<128x64xf32, #blocked2>
149-
%83 = ttg.local_alloc %82 : (tensor<128x64xf32, #blocked2>) -> !ttg.memdesc<128x64xf32, #shared, #smem>
150-
%84 = ttg.local_load %83 : !ttg.memdesc<128x64xf32, #shared, #smem> -> tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>
151-
%85 = tt.fp_to_fp %81 : tensor<64x128xf16, #blocked1> -> tensor<64x128xf32, #blocked1>
152-
%86 = ttg.local_alloc %85 : (tensor<64x128xf32, #blocked1>) -> !ttg.memdesc<64x128xf32, #shared1, #smem>
153-
%87 = ttg.local_load %86 : !ttg.memdesc<64x128xf32, #shared1, #smem> -> tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>
154-
%88 = tt.dot %84, %87, %arg10, inputPrecision = tf32 : tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x128xf32, #blocked>
155-
%89 = tt.addptr %arg11, %cst_4 : tensor<128x64x!tt.ptr<f16>, #blocked2>, tensor<128x64xi32, #blocked2>
156-
%90 = tt.addptr %arg12, %cst_5 : tensor<64x128x!tt.ptr<f16>, #blocked1>, tensor<64x128xi32, #blocked1>
157-
scf.yield %88, %89, %90 : tensor<128x128xf32, #blocked>, tensor<128x64x!tt.ptr<f16>, #blocked2>, tensor<64x128x!tt.ptr<f16>, #blocked1>
158-
}
159-
scf.yield %79#0 : tensor<128x128xf32, #blocked>
160-
} else {
161-
%79:3 = scf.for %arg9 = %c0_i32 to %55 step %c1_i32 iter_args(%arg10 = %cst, %arg11 = %43, %arg12 = %53) -> (tensor<128x128xf32, #blocked>, tensor<128x64x!tt.ptr<f16>, #blocked2>, tensor<64x128x!tt.ptr<f16>, #blocked1>) : i32 {
162-
%80 = arith.muli %arg9, %c64_i32 : i32
163-
%81 = arith.subi %arg5, %80 : i32
164-
%82 = tt.splat %81 : i32 -> tensor<1x64xi32, #blocked2>
165-
%83 = arith.cmpi slt, %38, %82 : tensor<1x64xi32, #blocked2>
166-
%84 = tt.broadcast %83 : tensor<1x64xi1, #blocked2> -> tensor<128x64xi1, #blocked2>
167-
%85 = tt.load %arg11, %84, %cst_1 : tensor<128x64x!tt.ptr<f16>, #blocked2>
168-
%86 = tt.splat %81 : i32 -> tensor<64x1xi32, #blocked1>
169-
%87 = arith.cmpi slt, %45, %86 : tensor<64x1xi32, #blocked1>
170-
%88 = tt.broadcast %87 : tensor<64x1xi1, #blocked1> -> tensor<64x128xi1, #blocked1>
171-
%89 = tt.load %arg12, %88, %cst_0 : tensor<64x128x!tt.ptr<f16>, #blocked1>
172-
%90 = tt.fp_to_fp %85 : tensor<128x64xf16, #blocked2> -> tensor<128x64xf32, #blocked2>
173-
%91 = ttg.local_alloc %90 : (tensor<128x64xf32, #blocked2>) -> !ttg.memdesc<128x64xf32, #shared, #smem>
174-
%92 = ttg.local_load %91 : !ttg.memdesc<128x64xf32, #shared, #smem> -> tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>
175-
%93 = tt.fp_to_fp %89 : tensor<64x128xf16, #blocked1> -> tensor<64x128xf32, #blocked1>
176-
%94 = ttg.local_alloc %93 : (tensor<64x128xf32, #blocked1>) -> !ttg.memdesc<64x128xf32, #shared1, #smem>
177-
%95 = ttg.local_load %94 : !ttg.memdesc<64x128xf32, #shared1, #smem> -> tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>
178-
%96 = tt.dot %92, %95, %arg10, inputPrecision = tf32 : tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x128xf32, #blocked>
179-
%97 = tt.addptr %arg11, %cst_4 : tensor<128x64x!tt.ptr<f16>, #blocked2>, tensor<128x64xi32, #blocked2>
180-
%98 = tt.addptr %arg12, %cst_5 : tensor<64x128x!tt.ptr<f16>, #blocked1>, tensor<64x128xi32, #blocked1>
181-
scf.yield %96, %97, %98 : tensor<128x128xf32, #blocked>, tensor<128x64x!tt.ptr<f16>, #blocked2>, tensor<64x128x!tt.ptr<f16>, #blocked1>
182-
}
183-
scf.yield %79#0 : tensor<128x128xf32, #blocked>
184-
}
185-
%61 = arith.truncf %60 : tensor<128x128xf32, #blocked> to tensor<128x128xf16, #blocked>
186-
%62 = tt.expand_dims %23 {axis = 1 : i32} : tensor<128xi32, #ttg.slice<{dim = 1, parent = #blocked3}>> -> tensor<128x1xi32, #blocked3>
187-
%63 = tt.splat %arg8 : i32 -> tensor<128x1xi32, #blocked3>
188-
%64 = arith.muli %63, %62 : tensor<128x1xi32, #blocked3>
189-
%65 = tt.splat %arg2 : !tt.ptr<f16> -> tensor<128x1x!tt.ptr<f16>, #blocked3>
190-
%66 = tt.addptr %65, %64 : tensor<128x1x!tt.ptr<f16>, #blocked3>, tensor<128x1xi32, #blocked3>
191-
%67 = tt.expand_dims %27 {axis = 0 : i32} : tensor<128xi32, #ttg.slice<{dim = 0, parent = #blocked3}>> -> tensor<1x128xi32, #blocked3>
192-
%68 = tt.broadcast %66 : tensor<128x1x!tt.ptr<f16>, #blocked3> -> tensor<128x128x!tt.ptr<f16>, #blocked3>
193-
%69 = tt.broadcast %67 : tensor<1x128xi32, #blocked3> -> tensor<128x128xi32, #blocked3>
194-
%70 = tt.addptr %68, %69 : tensor<128x128x!tt.ptr<f16>, #blocked3>, tensor<128x128xi32, #blocked3>
195-
%71 = tt.splat %arg3 : i32 -> tensor<128x1xi32, #blocked3>
196-
%72 = arith.cmpi slt, %62, %71 : tensor<128x1xi32, #blocked3>
197-
%73 = tt.splat %arg4 : i32 -> tensor<1x128xi32, #blocked3>
198-
%74 = arith.cmpi slt, %67, %73 : tensor<1x128xi32, #blocked3>
199-
%75 = tt.broadcast %72 : tensor<128x1xi1, #blocked3> -> tensor<128x128xi1, #blocked3>
200-
%76 = tt.broadcast %74 : tensor<1x128xi1, #blocked3> -> tensor<128x128xi1, #blocked3>
201-
%77 = arith.andi %75, %76 : tensor<128x128xi1, #blocked3>
202-
%78 = ttg.convert_layout %61 : tensor<128x128xf16, #blocked> -> tensor<128x128xf16, #blocked3>
203-
tt.store %70, %78, %77 : tensor<128x128x!tt.ptr<f16>, #blocked3>
138+
139+
%80 = arith.muli %c0_i32, %c64_i32 : i32
140+
%81 = arith.subi %arg5, %80 : i32
141+
%82 = tt.splat %81 : i32 -> tensor<1x64xi32, #blocked2>
142+
%83 = arith.cmpi slt, %38, %82 : tensor<1x64xi32, #blocked2>
143+
%84 = tt.broadcast %83 : tensor<1x64xi1, #blocked2> -> tensor<128x64xi1, #blocked2>
144+
%85 = tt.load %43, %84, %cst_1 : tensor<128x64x!tt.ptr<f32>, #blocked2>
145+
%86 = tt.splat %81 : i32 -> tensor<64x1xi32, #blocked1>
146+
%87 = arith.cmpi slt, %45, %86 : tensor<64x1xi32, #blocked1>
147+
%88 = tt.broadcast %87 : tensor<64x1xi1, #blocked1> -> tensor<64x128xi1, #blocked1>
148+
%89 = tt.load %53, %88, %cst_0 : tensor<64x128x!tt.ptr<f32>, #blocked1>
149+
%91 = ttg.local_alloc %85 : (tensor<128x64xf32, #blocked2>) -> !ttg.memdesc<128x64xf32, #shared, #smem>
150+
%92 = ttg.local_load %91 : !ttg.memdesc<128x64xf32, #shared, #smem> -> tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>
151+
%94 = ttg.local_alloc %89 : (tensor<64x128xf32, #blocked1>) -> !ttg.memdesc<64x128xf32, #shared1, #smem>
152+
%95 = ttg.local_load %94 : !ttg.memdesc<64x128xf32, #shared1, #smem> -> tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>>
153+
%96 = tt.dot %92, %95, %cst, inputPrecision = tf32 : tensor<128x64xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> * tensor<64x128xf32, #ttg.dot_op<{opIdx = 1, parent = #blocked}>> -> tensor<128x128xf32, #blocked>
154+
%97 = tt.addptr %43, %cst_4 : tensor<128x64x!tt.ptr<f32>, #blocked2>, tensor<128x64xi32, #blocked2>
155+
%98 = tt.addptr %53, %cst_5 : tensor<64x128x!tt.ptr<f32>, #blocked1>, tensor<64x128xi32, #blocked1>
156+
157+
%78 = ttg.convert_layout %96 : tensor<128x128xf32, #blocked> -> tensor<128x128xf32, #blocked3>
204158
tt.return
205159
}
206160
}

0 commit comments

Comments
 (0)