
Commit fa53ced

Fix pre_commit
Signed-off-by: Tiotto, Ettore <[email protected]>
1 parent 4dc1cf1 commit fa53ced

File tree: 3 files changed (+44, -43 lines)


test/TritonIntelGPU/coalesce.mlir

Lines changed: 39 additions & 39 deletions
@@ -146,49 +146,49 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
   // CHECK: [[BLOCKED_LAYOUT2:#.*]] = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [4, 4], warpsPerCTA = [1, 4], order = [0, 1]}>
   // CHECK: @test_block_ptrs
   tt.func public @test_block_ptrs(%arg0: !tt.ptr<f8E5M2> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f8E5M2> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f8E5M2> {tt.divisibility = 16 : i32}, %arg3: f32, %arg4: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg5: !tt.ptr<f8E5M2> {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32}, %arg9: i32, %arg10: i32, %arg11: i32 {tt.divisibility = 16 : i32}, %arg12: i32, %arg13: i32, %arg14: i32, %arg15: i32, %arg16: i32, %arg17: i32 {tt.divisibility = 16 : i32}, %arg18: i32, %arg19: i32, %arg20: i32) {
-    %cst = arith.constant dense<0.000000e+00> : tensor<8x16xf32, #dpas>
-    %cst_0 = arith.constant dense<0.000000e+00> : tensor<8xf32, #blocked>
-    %cst_1 = arith.constant dense<0xFF800000> : tensor<8xf32, #blocked>
-    %c1_i32 = arith.constant 1 : i32
-    %c16_i32 = arith.constant 16 : i32
-    %cst_2 = arith.constant dense<0.000000e+00> : tensor<8x64xf32, #blocked1>
-    %c0_i32 = arith.constant 0 : i32
-    %c1_i64 = arith.constant 1 : i64
-    %c64_i64 = arith.constant 64 : i64
-    %c8_i32 = arith.constant 8 : i32
-    %0 = tt.get_program_id x : i32
-    %1 = tt.get_program_id y : i32
-    %2 = arith.divsi %1, %arg19 : i32
-    %3 = arith.remsi %1, %arg19 : i32
-    %4 = arith.extsi %2 : i32 to i64
-    %5 = arith.extsi %arg6 : i32 to i64
-    %6 = arith.muli %4, %5 : i64
-    %7 = arith.extsi %3 : i32 to i64
-    %8 = arith.extsi %arg7 : i32 to i64
-    %9 = arith.muli %7, %8 : i64
-    %10 = arith.addi %6, %9 : i64
-    %11 = tt.addptr %arg0, %10 : !tt.ptr<f8E5M2>, i64
-    %12 = arith.muli %0, %c8_i32 : i32
-    %13 = arith.extsi %arg20 : i32 to i64
-    %14 = arith.extsi %arg8 : i32 to i64
+    %cst = arith.constant dense<0.000000e+00> : tensor<8x16xf32, #dpas>
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<8xf32, #blocked>
+    %cst_1 = arith.constant dense<0xFF800000> : tensor<8xf32, #blocked>
+    %c1_i32 = arith.constant 1 : i32
+    %c16_i32 = arith.constant 16 : i32
+    %cst_2 = arith.constant dense<0.000000e+00> : tensor<8x64xf32, #blocked1>
+    %c0_i32 = arith.constant 0 : i32
+    %c1_i64 = arith.constant 1 : i64
+    %c64_i64 = arith.constant 64 : i64
+    %c8_i32 = arith.constant 8 : i32
+    %0 = tt.get_program_id x : i32
+    %1 = tt.get_program_id y : i32
+    %2 = arith.divsi %1, %arg19 : i32
+    %3 = arith.remsi %1, %arg19 : i32
+    %4 = arith.extsi %2 : i32 to i64
+    %5 = arith.extsi %arg6 : i32 to i64
+    %6 = arith.muli %4, %5 : i64
+    %7 = arith.extsi %3 : i32 to i64
+    %8 = arith.extsi %arg7 : i32 to i64
+    %9 = arith.muli %7, %8 : i64
+    %10 = arith.addi %6, %9 : i64
+    %11 = tt.addptr %arg0, %10 : !tt.ptr<f8E5M2>, i64
+    %12 = arith.muli %0, %c8_i32 : i32
+    %13 = arith.extsi %arg20 : i32 to i64
+    %14 = arith.extsi %arg8 : i32 to i64
     // CHECK: [[PTR1:%.*]] = tt.make_tensor_ptr {{.*}} : <tensor<8x64xf8E5M2, [[BLOCKED_LAYOUT1]]>
     %15 = tt.make_tensor_ptr %11, [%13, %c64_i64], [%14, %c1_i64], [%12, %c0_i32] {order = array<i32: 1, 0>} : <tensor<8x64xf8E5M2, #dot1>>
-    %16 = tt.addptr %arg1, %10 : !tt.ptr<f8E5M2>, i64
-    %17 = arith.extsi %arg11 : i32 to i64
+    %16 = tt.addptr %arg1, %10 : !tt.ptr<f8E5M2>, i64
+    %17 = arith.extsi %arg11 : i32 to i64
     // CHECK: [[PTR2:%.*]] = tt.make_tensor_ptr {{.*}} : <tensor<64x16xf8E5M2, [[BLOCKED_LAYOUT2]]>
     %18 = tt.make_tensor_ptr %16, [%c64_i64, %13], [%c1_i64, %17], [%c0_i32, %c0_i32] {order = array<i32: 0, 1>} : <tensor<64x16xf8E5M2, #dot2>>
-    %19 = tt.addptr %arg5, %10 : !tt.ptr<f8E5M2>, i64
-    %20 = arith.extsi %arg17 : i32 to i64
+    %19 = tt.addptr %arg5, %10 : !tt.ptr<f8E5M2>, i64
+    %20 = arith.extsi %arg17 : i32 to i64
     // CHECK: [[PTR3:%.*]] = tt.make_tensor_ptr {{.*}} : <tensor<8x64xf8E5M2, [[BLOCKED_LAYOUT1]]>
     %21 = tt.make_tensor_ptr %19, [%13, %c64_i64], [%20, %c1_i64], [%12, %c0_i32] {order = array<i32: 1, 0>} : <tensor<8x64xf8E5M2, #blocked1>>
     %22 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #blocked>
-    %23 = tt.splat %12 : i32 -> tensor<8xi32, #blocked>
-    %24 = arith.addi %23, %22 : tensor<8xi32, #blocked>
+    %23 = tt.splat %12 : i32 -> tensor<8xi32, #blocked>
+    %24 = arith.addi %23, %22 : tensor<8xi32, #blocked>
     // CHECK: [[LOAD1:%.*]] = tt.load [[PTR1]] : !tt.ptr<tensor<8x64xf8E5M2, [[BLOCKED_LAYOUT1]]>
     // CHECK-NEXT: triton_gpu.convert_layout [[LOAD1]] : tensor<8x64xf8E5M2, [[BLOCKED_LAYOUT1]]> -> tensor<8x64xf8E5M2, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>
     %25 = tt.load %15 : !tt.ptr<tensor<8x64xf8E5M2, #dot1>>
-    %26 = arith.addi %0, %c1_i32 : i32
-    %27 = arith.muli %26, %c8_i32 : i32
+    %26 = arith.addi %0, %c1_i32 : i32
+    %27 = arith.muli %26, %c8_i32 : i32
     // CHECK: [[ADVANCE1:%.*]] = tt.advance [[PTR2]], {{.*}} : <tensor<64x16xf8E5M2, [[BLOCKED_LAYOUT2]]>>
     %28 = tt.advance %18, [%c0_i32, %12] : <tensor<64x16xf8E5M2, #dot2>>
     // CHECK: [[RES:%.*:2]] = scf.for {{.*}} iter_args(%arg22 = %cst_1, %arg23 = [[ADVANCE1]]) -> (tensor<8xf32, #blocked>, !tt.ptr<tensor<64x16xf8E5M2, [[BLOCKED_LAYOUT2]]>>)
@@ -202,8 +202,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
       %40 = triton_gpu.convert_layout %39 : tensor<8x16xf32, #dpas> -> tensor<8x16xf32, #blocked2>
       %41 = "tt.reduce"(%40) <{axis = 1 : i32}> ({
       ^bb0(%arg24: f32, %arg25: f32):
-        %44 = arith.maxnumf %arg24, %arg25 : f32
-        tt.reduce.return %44 : f32
+        %44 = arith.maxnumf %arg24, %arg25 : f32
+        tt.reduce.return %44 : f32
       }) : (tensor<8x16xf32, #blocked2>) -> tensor<8xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>
       %42 = triton_gpu.convert_layout %41 : tensor<8xf32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> -> tensor<8xf32, #blocked>
       // CHECK: [[ADVANCE2:%.*]] = tt.advance %arg23, {{.*}} : <tensor<64x16xf8E5M2, [[BLOCKED_LAYOUT2]]>>
@@ -219,7 +219,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
     tt.store %34, %30 : tensor<8x!tt.ptr<f32>, #blocked>
     %35 = tt.fp_to_fp %cst_2, rounding = rtne : tensor<8x64xf32, #blocked1> -> tensor<8x64xf8E5M2, #blocked1>
     tt.store %21, %35 : !tt.ptr<tensor<8x64xf8E5M2, #blocked1>>
-    tt.return
+    tt.return
   }
 }
 
@@ -254,19 +254,19 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
     %13 = arith.extsi %arg19 : i32 to i64
     %19 = tt.addptr %arg1, %10 : !tt.ptr<f8E5M2>, i64
     %20 = arith.extsi %arg11 : i32 to i64
-    // CHECK: [[PTR1:%.*]] = tt.make_tensor_ptr {{.*}} : <tensor<64x32xf8E5M2, [[BLOCKED_LAYOUT]]>
+    // CHECK: [[PTR1:%.*]] = tt.make_tensor_ptr {{.*}} : <tensor<64x32xf8E5M2, [[BLOCKED_LAYOUT]]>
     %21 = tt.make_tensor_ptr %19, [%c64_i64, %13], [%c1_i64, %20], [%c0_i32, %c0_i32] {order = array<i32: 0, 1>} : <tensor<64x32xf8E5M2, #dot2>>
     // CHECK: [[RES:%.*]]:2 = scf.for {{.*}} iter_args(%arg6 = %cst, %arg7 = [[PTR1]]) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, !tt.ptr<tensor<64x32xf8E5M2, [[BLOCKED_LAYOUT]]>>)
     %33:2 = scf.for %arg21 = %c0_i32 to %12 step %c32_i32 iter_args(%arg22 = %cst_1, %arg23 = %21) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #dpas}>>, !tt.ptr<tensor<64x32xf8E5M2, #dot2>>) : i32 {
       // CHECK: [[LOAD:%.*]] = tt.load %arg7 : !tt.ptr<tensor<64x32xf8E5M2, [[BLOCKED_LAYOUT]]>>
-      // CHECK-NEXT: triton_gpu.convert_layout [[LOAD]] : tensor<64x32xf8E5M2, [[BLOCKED_LAYOUT]]> -> tensor<64x32xf8E5M2, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>
+      // CHECK-NEXT: triton_gpu.convert_layout [[LOAD]] : tensor<64x32xf8E5M2, [[BLOCKED_LAYOUT]]> -> tensor<64x32xf8E5M2, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>
       // CHECK-NEXT: scf.yield %arg6, %arg7 : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, !tt.ptr<tensor<64x32xf8E5M2, #blocked>>
       %load = tt.load %arg23 : !tt.ptr<tensor<64x32xf8E5M2, #dot2>>
       scf.yield %arg22, %arg23 : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #dpas}>>, !tt.ptr<tensor<64x32xf8E5M2, #dot2>>
     }
     // CHECK: scf.for {{.*}} iter_args(%arg6 = [[RES]]#0, %arg7 = [[RES]]#1) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, !tt.ptr<tensor<64x32xf8E5M2, [[BLOCKED_LAYOUT]]>>)
     %34:2 = scf.for %arg21 = %c0_i32 to %12 step %c32_i32 iter_args(%arg22 = %33#0, %arg23 = %33#1) -> (tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #dpas}>>, !tt.ptr<tensor<64x32xf8E5M2, #dot2>>) : i32 {
-      // CHECK: scf.yield %arg6, %arg7 : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, !tt.ptr<tensor<64x32xf8E5M2, [[BLOCKED_LAYOUT]]>>
+      // CHECK: scf.yield %arg6, %arg7 : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, !tt.ptr<tensor<64x32xf8E5M2, [[BLOCKED_LAYOUT]]>>
       scf.yield %arg22, %arg23 : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #dpas}>>, !tt.ptr<tensor<64x32xf8E5M2, #dot2>>
     }
     tt.return

third_party/intel/backend/compiler.py

Lines changed: 1 addition & 1 deletion
@@ -217,7 +217,7 @@ def make_ttgir(mod, metadata, opt, properties):
     intel.passes.ttgpuir.add_accelerate_matmul(pm)
     intel.passes.ttgpuir.add_remove_layout_conversions(pm)
     intel.passes.ttgpuir.add_materialize_block_pointer(pm)
-    # intel.passes.ttgpuir.add_rewrite_tensor_pointer(pm)
+    # intel.passes.ttgpuir.add_rewrite_tensor_pointer(pm)
     intel.passes.ttgpuir.add_pipeline(pm, opt.num_stages, False)
 
     intel.passes.ttgpuir.add_coalesce(pm)

third_party/intel/triton_xpu.cc

Lines changed: 4 additions & 3 deletions
@@ -45,9 +45,10 @@ using ret = py::return_value_policy;
     pm.addPass(builder({val0, val1})); \
   })
 #define ADD_PASS_WRAPPER_OPT_5(name, builder, ty0, ty1, ty2, ty3, ty4) \
-  m.def(name, \
-        [](mlir::PassManager &pm, ty0 val0, ty1 val1, ty2 val2, ty3 val3, \
-           ty4 val4) { pm.addPass(builder({val0, val1, val2, val3, val4})); })
+  m.def(name, [](mlir::PassManager &pm, ty0 val0, ty1 val1, ty2 val2, \
+                 ty3 val3, ty4 val4) { \
+    pm.addPass(builder({val0, val1, val2, val3, val4})); \
+  })
 
 static uint32_t findKernels(llvm::Module &M,
                             std::set<llvm::Function *> &functions) {
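For context, the reflowed ADD_PASS_WRAPPER_OPT_5 macro registers a pybind11 binding whose lambda forwards five option values into a pass builder and adds the resulting pass to the pass manager. A minimal usage sketch follows; the pass name "add_my_pass", the builder createMyPass, and its option types are hypothetical illustrations, not part of this commit:

// Hypothetical sketch (not from this commit): inside the module-init
// function where `m` is the py::module_, bind a pass whose options
// struct is brace-initialized from five values. createMyPass and its
// option types are assumptions for illustration only.
ADD_PASS_WRAPPER_OPT_5("add_my_pass", mlir::triton::createMyPass,
                       int, int, bool, bool, unsigned);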
