Skip to content

Commit 16b2057

Browse files
[TritonGEN] Do not generate GenISA intrinsic (#2616)
New OpenCL C 2D block read variants are supported in Agama 1032, so the GenISA intrinsic fallback is no longer generated. --------- Signed-off-by: Whitney Tsang <[email protected]>
1 parent 9cf9c63 commit 16b2057

File tree

3 files changed

+55
-131
lines changed

3 files changed

+55
-131
lines changed

test/TritonGEN/tritongen-2Dblockload-to-llvm.mlir

Lines changed: 47 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,34 @@ llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_
3737

3838
// -----
3939

40+
llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
41+
// CHECK: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r16x1cPU3AS1viiiDv2_iPh(%arg0, %arg1, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> ()
42+
// CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<8xi8>
43+
%0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=16, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi8>
44+
llvm.return
45+
}
46+
47+
// -----
48+
49+
llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
50+
// CHECK: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x1cPU3AS1viiiDv2_iPh(%arg0, %arg1, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> ()
51+
// CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi8>
52+
%0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi8>
53+
llvm.return
54+
}
55+
56+
// -----
57+
58+
// COM: This case comes from the 06 tutorial of FP8 flash attention.
59+
llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
60+
// CHECK: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r16x4cPU3AS1viiiDv2_iPh(%arg0, %arg1, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> ()
61+
// CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<32xi8>
62+
%0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=16, tile_height=8, v_blocks=4, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<32xi8>
63+
llvm.return
64+
}
65+
66+
// -----
67+
4068
llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
4169
// CHECK: llvm.call spir_funccc @_Z41intel_sub_group_2d_block_read_16b_8r16x1cPU3AS1viiiDv2_iPt(%arg0, %arg1, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> ()
4270
// CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<8xi16>
@@ -64,6 +92,15 @@ llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_
6492

6593
// -----
6694

95+
llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
96+
// CHECK: llvm.call spir_funccc @_Z41intel_sub_group_2d_block_read_16b_8r32x1cPU3AS1viiiDv2_iPt(%arg0, %arg1, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> ()
97+
// CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi16>
98+
%0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=32, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16>
99+
llvm.return
100+
}
101+
102+
// -----
103+
67104
llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
68105
// CHECK: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_32b_8r8x1cPU3AS1viiiDv2_iPj(%arg0, %arg1, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> ()
69106
// CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<4xi32>
@@ -101,12 +138,22 @@ llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_
101138

102139
llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
103140
// CHECK: llvm.call spir_funccc @_Z41intel_sub_group_2d_block_read_32b_32r8x1cPU3AS1viiiDv2_iPj(%arg0, %arg1, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> ()
141+
// CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi32>
104142
%0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=8, tile_height=32, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi32>
105143
llvm.return
106144
}
107145

108146
// -----
109147

148+
llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
149+
// CHECK: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_32b_8r2x1cPU3AS1viiiDv2_iPj(%arg0, %arg1, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> ()
150+
// CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<1xi32>
151+
%0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=2, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<1xi32>
152+
llvm.return
153+
}
154+
155+
// -----
156+
110157
llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
111158
// CHECK: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt(%arg0, %arg1, %arg2, %arg3, {{.*}}, [[DEST:%.*]]) {{.*}} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> ()
112159
// CHECK-NEXT: llvm.load [[DEST]] : !llvm.ptr -> vector<16xi16>
@@ -333,84 +380,3 @@ llvm.func @triton_gen.2Dblockload(%ptr : !llvm.ptr<1>, %base_width : i32, %base_
333380
%0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16>
334381
llvm.return
335382
}
336-
337-
// -----
338-
339-
// CHECK: llvm.func spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v8i8
340-
// CHECK-LABEL: llvm.func @matrix_2Dblockload
341-
llvm.func @matrix_2Dblockload(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
342-
// CHECK: [[ELEM_SIZE_IN_BITS:%.*]] = llvm.mlir.constant(8 : i32) : i32
343-
// CHECK: [[TILE_WIDTH:%.*]] = llvm.mlir.constant(16 : i32) : i32
344-
// CHECK: [[TILE_HEIGHT:%.*]] = llvm.mlir.constant(8 : i32) : i32
345-
// CHECK: [[VBLOCKS:%.*]] = llvm.mlir.constant(1 : i32) : i32
346-
// CHECK: [[TRANSPOSE:%.*]] = llvm.mlir.constant(false) : i1
347-
// CHECK: [[VNNI:%.*]] = llvm.mlir.constant(false) : i1
348-
// CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v8i8({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, [[ELEM_SIZE_IN_BITS]], [[TILE_WIDTH]], [[TILE_HEIGHT]], [[VBLOCKS]], [[TRANSPOSE]], [[VNNI]], {{.*}})
349-
%0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=16, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<8xi8>
350-
llvm.return
351-
}
352-
353-
// -----
354-
355-
// COM: This case comes from the 06 tutorial of FP8 flash attention.
356-
// CHECK: llvm.func spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v32i8
357-
// CHECK-LABEL: llvm.func @matrix_2Dblockload
358-
llvm.func @matrix_2Dblockload(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
359-
// CHECK: [[ELEM_SIZE_IN_BITS:%.*]] = llvm.mlir.constant(8 : i32) : i32
360-
// CHECK: [[TILE_WIDTH:%.*]] = llvm.mlir.constant(16 : i32) : i32
361-
// CHECK: [[TILE_HEIGHT:%.*]] = llvm.mlir.constant(8 : i32) : i32
362-
// CHECK: [[VBLOCKS:%.*]] = llvm.mlir.constant(4 : i32) : i32
363-
// CHECK: [[TRANSPOSE:%.*]] = llvm.mlir.constant(false) : i1
364-
// CHECK: [[VNNI:%.*]] = llvm.mlir.constant(false) : i1
365-
// CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v32i8({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, [[ELEM_SIZE_IN_BITS]], [[TILE_WIDTH]], [[TILE_HEIGHT]], [[VBLOCKS]], [[TRANSPOSE]], [[VNNI]], {{.*}})
366-
%0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=16, tile_height=8, v_blocks=4, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<32xi8>
367-
llvm.return
368-
}
369-
370-
// -----
371-
372-
// CHECK: llvm.func spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v16i8
373-
// CHECK-LABEL: llvm.func @matrix_2Dblockload
374-
llvm.func @matrix_2Dblockload(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
375-
// CHECK: [[ELEM_SIZE_IN_BITS:%.*]] = llvm.mlir.constant(8 : i32) : i32
376-
// CHECK: [[TILE_WIDTH:%.*]] = llvm.mlir.constant(32 : i32) : i32
377-
// CHECK: [[TILE_HEIGHT:%.*]] = llvm.mlir.constant(8 : i32) : i32
378-
// CHECK: [[VBLOCKS:%.*]] = llvm.mlir.constant(1 : i32) : i32
379-
// CHECK: [[TRANSPOSE:%.*]] = llvm.mlir.constant(false) : i1
380-
// CHECK: [[VNNI:%.*]] = llvm.mlir.constant(false) : i1
381-
// CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v16i8({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, [[ELEM_SIZE_IN_BITS]], [[TILE_WIDTH]], [[TILE_HEIGHT]], [[VBLOCKS]], [[TRANSPOSE]], [[VNNI]], {{.*}})
382-
%0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<16xi8>
383-
llvm.return
384-
}
385-
386-
// -----
387-
388-
// CHECK: llvm.func spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v16i16
389-
// CHECK-LABEL: llvm.func @matrix_2Dblockload
390-
llvm.func @matrix_2Dblockload(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
391-
// CHECK: [[ELEM_SIZE_IN_BITS:%.*]] = llvm.mlir.constant(16 : i32) : i32
392-
// CHECK: [[TILE_WIDTH:%.*]] = llvm.mlir.constant(32 : i32) : i32
393-
// CHECK: [[TILE_HEIGHT:%.*]] = llvm.mlir.constant(8 : i32) : i32
394-
// CHECK: [[VBLOCKS:%.*]] = llvm.mlir.constant(1 : i32) : i32
395-
// CHECK: [[TRANSPOSE:%.*]] = llvm.mlir.constant(false) : i1
396-
// CHECK: [[VNNI:%.*]] = llvm.mlir.constant(false) : i1
397-
// CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v16i16({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, [[ELEM_SIZE_IN_BITS]], [[TILE_WIDTH]], [[TILE_HEIGHT]], [[VBLOCKS]], [[TRANSPOSE]], [[VNNI]], {{.*}})
398-
%0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=32, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<16xi16>
399-
llvm.return
400-
}
401-
402-
// -----
403-
404-
// CHECK: llvm.func spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v1i32
405-
// CHECK-LABEL: llvm.func @matrix_2Dblockload
406-
llvm.func @matrix_2Dblockload(%ptr : !llvm.ptr, %base_width : i32, %base_height : i32, %base_pitch : i32, %x : i32, %y : i32) {
407-
// CHECK: [[ELEM_SIZE_IN_BITS:%.*]] = llvm.mlir.constant(32 : i32) : i32
408-
// CHECK: [[TILE_WIDTH:%.*]] = llvm.mlir.constant(2 : i32) : i32
409-
// CHECK: [[TILE_HEIGHT:%.*]] = llvm.mlir.constant(8 : i32) : i32
410-
// CHECK: [[VBLOCKS:%.*]] = llvm.mlir.constant(1 : i32) : i32
411-
// CHECK: [[TRANSPOSE:%.*]] = llvm.mlir.constant(false) : i1
412-
// CHECK: [[VNNI:%.*]] = llvm.mlir.constant(false) : i1
413-
// CHECK: llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v1i32({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, [[ELEM_SIZE_IN_BITS]], [[TILE_WIDTH]], [[TILE_HEIGHT]], [[VBLOCKS]], [[TRANSPOSE]], [[VNNI]], {{.*}})
414-
%0 = triton_gen.2Dblockload %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=2, tile_height=8, v_blocks=1, transpose=false, vnni_transform=false, cache_control=Default} : (!llvm.ptr, i32, i32, i32, i32, i32) -> vector<1xi32>
415-
llvm.return
416-
}

test/TritonIntelGPU/blockptr_load.mlir

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,6 @@ module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war
193193

194194
// -----
195195

196-
// CHECK: llvm.func spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v16i32
197196
#dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [1, 2], A = [8, 16], B = [16, 32], C = [8, 32]}>
198197
#dot_b = #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>
199198
module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
@@ -205,23 +204,20 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war
205204
%c0_i32 = arith.constant 0 : i32
206205
%c32_i64 = arith.constant 32 : i64
207206
%21 = tt.make_tensor_ptr %arg0, [%c64_i64, %c64_i64], [%c1_i64, %col_stride], [%c0_i32, %c0_i32] {order = array<i32: 0, 1>} : <tensor<64x32xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>>>
208-
// CHECK: llvm.ptrtoint
209-
// CHECK: %[[ELEM_BITS:.*]] = llvm.mlir.constant(32 : i32) : i32
210-
// CHECK: %[[TILE_WIDTH:.*]] = llvm.mlir.constant(8 : i32) : i32
211-
// CHECK: %[[TILE_HEIGHT:.*]] = llvm.mlir.constant(32 : i32) : i32
212-
// CHECK: %[[VBLOCKS:.*]] = llvm.mlir.constant(1 : i32) : i32
213-
// CHECK: %[[TRANSPOSE:.*]] = llvm.mlir.constant(true) : i1
214-
// CHECK: %[[VNNI:.*]] = llvm.mlir.constant(false) : i1
215-
// CHECK: %[[VAL_68:.*]] = llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v16i32({{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, {{.*}}, %[[ELEM_BITS]], %[[TILE_WIDTH]], %[[TILE_HEIGHT]], %[[VBLOCKS]], %[[TRANSPOSE]], %[[VNNI]], {{.*}})
207+
// CHECK: llvm.call spir_funccc @_Z51intel_sub_group_2d_block_read_transpose_32b_32r8x1cPU3AS1viiiDv2_iPj({{.*}}, [[DEST:%.*]]) {{.*}} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> ()
208+
// CHECK: %[[VAL_68:.*]] = llvm.load [[DEST]] : !llvm.ptr -> vector<16xi32>
216209
// CHECK: %[[VAL_69:.*]] = llvm.shufflevector %[[VAL_68]], %[[VAL_68]] [0, 2, 4, 6, 8, 10, 12, 14] : vector<16xi32>
217210
// CHECK: %[[VAL_71:.*]] = llvm.shufflevector %[[VAL_68]], %[[VAL_68]] [1, 3, 5, 7, 9, 11, 13, 15] : vector<16xi32>
218-
// CHECK: %[[VAL_103:.*]] = llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v16i32
211+
// CHECK: llvm.call spir_funccc @_Z51intel_sub_group_2d_block_read_transpose_32b_32r8x1cPU3AS1viiiDv2_iPj({{.*}}, [[DEST:%.*]]) {{.*}} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> ()
212+
// CHECK: %[[VAL_103:.*]] = llvm.load [[DEST]] : !llvm.ptr -> vector<16xi32>
219213
// CHECK: %[[VAL_104:.*]] = llvm.shufflevector %[[VAL_103]], %[[VAL_103]] [0, 2, 4, 6, 8, 10, 12, 14] : vector<16xi32>
220214
// CHECK: %[[VAL_106:.*]] = llvm.shufflevector %[[VAL_103]], %[[VAL_103]] [1, 3, 5, 7, 9, 11, 13, 15] : vector<16xi32>
221-
// CHECK: %[[VAL_138:.*]] = llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v16i32
215+
// CHECK: llvm.call spir_funccc @_Z51intel_sub_group_2d_block_read_transpose_32b_32r8x1cPU3AS1viiiDv2_iPj({{.*}}, [[DEST:%.*]]) {{.*}} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> ()
216+
// CHECK: %[[VAL_138:.*]] = llvm.load [[DEST]] : !llvm.ptr -> vector<16xi32>
222217
// CHECK: %[[VAL_139:.*]] = llvm.shufflevector %[[VAL_138]], %[[VAL_138]] [0, 2, 4, 6, 8, 10, 12, 14] : vector<16xi32>
223218
// CHECK: %[[VAL_141:.*]] = llvm.shufflevector %[[VAL_138]], %[[VAL_138]] [1, 3, 5, 7, 9, 11, 13, 15] : vector<16xi32>
224-
// CHECK: %[[VAL_173:.*]] = llvm.call spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v16i32
219+
// CHECK: llvm.call spir_funccc @_Z51intel_sub_group_2d_block_read_transpose_32b_32r8x1cPU3AS1viiiDv2_iPj({{.*}}, [[DEST:%.*]]) {{.*}} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> ()
220+
// CHECK: %[[VAL_173:.*]] = llvm.load [[DEST]] : !llvm.ptr -> vector<16xi32>
225221
// CHECK: %[[VAL_174:.*]] = llvm.shufflevector %[[VAL_173]], %[[VAL_173]] [0, 2, 4, 6, 8, 10, 12, 14] : vector<16xi32>
226222
// CHECK: %[[VAL_176:.*]] = llvm.shufflevector %[[VAL_173]], %[[VAL_173]] [1, 3, 5, 7, 9, 11, 13, 15] : vector<16xi32>
227223
%45 = tt.load %21 {triton_intel_gpu.block_io = "column_major"} : !tt.ptr<tensor<64x32xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>>>

third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp

Lines changed: 0 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -175,37 +175,6 @@ loadCacheControlToCacheControls(Builder &builder,
175175
return builder.getAttr<TritonGEN::DecorationCacheControlAttr>(decorations);
176176
}
177177

178-
static bool isOCLBuiltinAvailable(TritonGEN::Matrix2DBlockLoadOp op) {
179-
VectorType resTy = op.getRes().getType();
180-
unsigned resElemTySize = resTy.getElementType().getIntOrFloatBitWidth();
181-
bool needsResElemSizeEqualTo32 =
182-
op.getElemSizeInBits() == 32 || op.getVnniTransform();
183-
assert((!needsResElemSizeEqualTo32 || resElemTySize == 32) &&
184-
"Expecting 32-bit element type");
185-
if (!needsResElemSizeEqualTo32 && resElemTySize != 16)
186-
return false;
187-
188-
if (op.getVnniTransform())
189-
return true;
190-
191-
if (op.getTranspose() && op.getTileHeight() != 16)
192-
return false;
193-
194-
uint32_t tileWidth = op.getTileWidth();
195-
switch (op.getElemSizeInBits()) {
196-
case 8:
197-
return (tileWidth == 32);
198-
case 16:
199-
return (tileWidth == 16);
200-
case 32:
201-
return (tileWidth == 8 || tileWidth == 16);
202-
default:
203-
llvm_unreachable("unexpected element size");
204-
}
205-
206-
return false;
207-
}
208-
209178
[[maybe_unused]] static Value
210179
createGenISA2DBlockRead(TritonGEN::Matrix2DBlockLoadOp op,
211180
ConversionPatternRewriter &rewriter) {
@@ -822,13 +791,6 @@ struct TritonMatrix2DBlockLoadLowering
822791
return success();
823792
}
824793

825-
if (!isOCLBuiltinAvailable(op)) {
826-
op.emitWarning() << "OpenCL API not available for this operation. Got "
827-
<< *op;
828-
rewriter.replaceOp(op, createGenISA2DBlockRead(op, rewriter));
829-
return success();
830-
}
831-
832794
MLIRContext *ctx = rewriter.getContext();
833795
Location loc = op->getLoc();
834796
VectorType resType = op.getRes().getType();

0 commit comments

Comments
 (0)