Skip to content

Commit b7ab0b8

Browse files
Revert "Revert "[XPU][TritonGPUToLLVM] Use reqd_work_group_size (#2845)" (#4426)" (#4758)
Fixes #4427

Inductor CI: https://github.com/intel/intel-xpu-backend-for-triton/actions/runs/16427714380 (pass)

Suite | Model | CI
--|--|--
timm_models_amp_fp16_training | xcit_large_24_p8_224 | [pass_due_to_skip](https://github.com/intel/intel-xpu-backend-for-triton/actions/runs/16426850835)
timm_models_float16_training | xcit_large_24_p8_224 | [pass_due_to_skip](https://github.com/intel/intel-xpu-backend-for-triton/actions/runs/16427556738)
timm_models_amp_bf16_training | xcit_large_24_p8_224 | [pass_due_to_skip](https://github.com/intel/intel-xpu-backend-for-triton/actions/runs/16427569407)
timm_models_bfloat16_training | xcit_large_24_p8_224 | [pass_due_to_skip](https://github.com/intel/intel-xpu-backend-for-triton/actions/runs/16427587295)
torchbench_float32_inference | vision_maskrcnn | [pass](https://github.com/intel/intel-xpu-backend-for-triton/actions/runs/16428117853)
torchbench_float32_training | vision_maskrcnn | [pass](https://github.com/intel/intel-xpu-backend-for-triton/actions/runs/16428131087)
torchbench_amp_bf16_inference | doctr_reco_predictor | [pass](https://github.com/intel/intel-xpu-backend-for-triton/actions/runs/16427605553)

This reverts commit 0887d5a.
1 parent a1d1057 commit b7ab0b8

File tree

14 files changed

+41
-90
lines changed

14 files changed

+41
-90
lines changed

python/test/unit/tools/test_aot.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -669,5 +669,6 @@ def test_ttgir_to_spv():
669669
fp.write(src)
670670
k = triton.compile(kernel_path, target=triton.runtime.driver.active.get_current_target())
671671
spv = k.asm['spvdis']
672-
assert "OpCapability KernelAttributesINTEL" in spv
672+
assert "OpCapability Kernel" in spv
673+
assert "LocalSize 128 1 1" in spv
673674
assert "SubgroupSize 32" in spv

test/Conversion/intel/dpas_to_block_layout_convert.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.shared = 67584 : i32, "ttg.threads-per-warp" = 16 : i32} {
77
// CHECK-LABEL: llvm.func spir_kernelcc @convert_dpas(
88
// CHECK-SAME: %[[VAL_0:.*]]: !llvm.ptr<1>)
9-
// CHECK-SAME: attributes {intel_reqd_sub_group_size = 16 : i32, noinline = false, triton_gen.max_work_group_size = array<i32: 512, 1, 1>} {
9+
// CHECK-SAME: attributes {intel_reqd_sub_group_size = 16 : i32, noinline = false, reqd_work_group_size = array<i32: 512, 1, 1>} {
1010
tt.func public @convert_dpas(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
1111
%cst = arith.constant dense<0.000000e+00> : tensor<128x256xf16, #mma>
1212

@@ -100,7 +100,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.sha
100100
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.shared = 67584 : i32, "ttg.threads-per-warp" = 16 : i32} {
101101
// CHECK-LABEL: llvm.func spir_kernelcc @convert_dpas(
102102
// CHECK-SAME: %[[VAL_0:.*]]: !llvm.ptr<1>)
103-
// CHECK-SAME: attributes {intel_reqd_sub_group_size = 16 : i32, noinline = false, triton_gen.max_work_group_size = array<i32: 512, 1, 1>} {
103+
// CHECK-SAME: attributes {intel_reqd_sub_group_size = 16 : i32, noinline = false, reqd_work_group_size = array<i32: 512, 1, 1>} {
104104
tt.func public @convert_dpas(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
105105
%cst = arith.constant dense<0.000000e+00> : tensor<128x256xf16, #mma>
106106

test/Conversion/intel/tritongpu_to_gen.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
44
// CHECK: llvm.func spir_kernelcc @test_empty_kernel(%arg0: i64, %arg1: !llvm.ptr<1>, %arg2: !llvm.ptr<1>)
55
// Here the 128 comes from the 4 warps in the module attribute multiplied by the sub-group size of 32
6-
// CHECK-SAME: attributes {intel_reqd_sub_group_size = 32 : i32, triton_gen.max_work_group_size = array<i32: 128, 1, 1>} {
6+
// CHECK-SAME: attributes {intel_reqd_sub_group_size = 32 : i32, reqd_work_group_size = array<i32: 128, 1, 1>} {
77
tt.func @test_empty_kernel(%lb : index, %A : !tt.ptr<f16>) {
88
// CHECK: llvm.return
99
tt.return

test/Conversion/intel/tritongpu_to_gen_dot.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
7373

7474
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32} {
7575
// CHECK-LABEL: llvm.func spir_kernelcc @dot_f32_tf32_tf32_f32_1(
76-
// CHECK-SAME: %[[A:.*]]: !llvm.struct<(f32, f32, f32, f32)>, %[[B:.*]]: !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>, %[[C:.*]]: !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>, %[[PTR_1:.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array<i32: 16, 1, 1>} {
76+
// CHECK-SAME: %[[A:.*]]: !llvm.struct<(f32, f32, f32, f32)>, %[[B:.*]]: !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>, %[[C:.*]]: !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>, %[[PTR_1:.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array<i32: 16, 1, 1>} {
7777
tt.func @dot_f32_tf32_tf32_f32_1(%a: tensor<8x8xf32, #dot_operand_a>, %b: tensor<8x16xf32, #dot_operand_b>, %c: tensor<8x16xf32, #dpas>) {
7878
// COM: To simplify, only check RTNE and its usage for the last element of A, B, C
7979
// CHECK: %[[A_LAST_VAL:.*]] = llvm.extractvalue %[[A]][3]
@@ -116,7 +116,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
116116
// CHECK: llvm.func spir_funccc @_Z45__spirv_SubgroupMatrixMultiplyAccumulateINTELiDv8_sDv8_iDv8_fi(i32, vector<8xi16>, vector<8xi32>, vector<8xf32>, i32) -> vector<8xf32> attributes {convergent, memory_effects = #llvm.memory_effects<other = none, argMem = none, inaccessibleMem = none>, no_unwind, will_return}
117117
// CHECK-LABEL: llvm.func spir_kernelcc @dot_rep_cluster_4_2(
118118
// CHECK-SAME: %[[A:.*]]: !llvm.struct<(f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16)>, %[[B:.*]]: !llvm.struct<(f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16)>,
119-
// CHECK-SAME: %[[C:.*]]: !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, %[[PTR_1:.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array<i32: 16, 1, 1>} {
119+
// CHECK-SAME: %[[C:.*]]: !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, %[[PTR_1:.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array<i32: 16, 1, 1>} {
120120
tt.func @dot_rep_cluster_4_2(%a: tensor<32x32xf16, #dot_operand_a>, %b: tensor<32x32xf16, #dot_operand_b>, %c: tensor<32x32xf32, #dpas>) {
121121
// CHECK: %[[VAL_3:.*]] = llvm.mlir.undef : vector<8xf32>
122122
// CHECK: %[[CST_15:.*]] = llvm.mlir.constant(15 : i32) : i32

test/Conversion/intel/tritongpu_to_llvm_intel_advanced_path.mlir

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ module attributes {"ttig.support_sg_2d_block", "ttig.support_dpas", "ttg.num-war
114114

115115
module attributes {"ttig.support_sg_2d_block", "ttig.support_dpas", "ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32} {
116116
// CHECK-LABEL: llvm.func spir_kernelcc @matmul_kernel_with_block_pointers_tf32(
117-
// CHECK-SAME: [[VAL_0:%.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array<i32: 512, 1, 1>} {
117+
// CHECK-SAME: [[VAL_0:%.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array<i32: 512, 1, 1>} {
118118
tt.func public @matmul_kernel_with_block_pointers_tf32(%arg0: !tt.ptr<f32>) {
119119
%c0_i64 = arith.constant 0 : i64
120120
%c0_i32 = arith.constant 0 : i32
@@ -134,7 +134,7 @@ module attributes {"ttig.support_sg_2d_block", "ttig.support_dpas", "ttg.num-war
134134

135135
module attributes {"ttig.support_sg_2d_block", "ttig.support_dpas", "ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32} {
136136
// CHECK-LABEL: llvm.func spir_kernelcc @matmul_kernel_with_block_pointers_f16accu(
137-
// CHECK-SAME: [[VAL_0:%.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array<i32: 512, 1, 1>} {
137+
// CHECK-SAME: [[VAL_0:%.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array<i32: 512, 1, 1>} {
138138
tt.func public @matmul_kernel_with_block_pointers_f16accu(%arg0: !tt.ptr<f16>) {
139139
%c0_i64 = arith.constant 0 : i64
140140
%c0_i32 = arith.constant 0 : i32
@@ -154,7 +154,7 @@ module attributes {"ttig.support_sg_2d_block", "ttig.support_dpas", "ttg.num-war
154154
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32, ttig.min_sg_size = 16 : i32, ttig.support_dpas, ttig.support_sg_2d_block} {
155155

156156
// CHECK-LABEL: llvm.func spir_kernelcc @reduce_sum(
157-
// CHECK-SAME: [[VAL_0:%.*]]: vector<8xf32>, [[PTR_1:%.*]]: !llvm.ptr<1>) -> f32 attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array<i32: 128, 1, 1>}
157+
// CHECK-SAME: [[VAL_0:%.*]]: vector<8xf32>, [[PTR_1:%.*]]: !llvm.ptr<1>) -> f32 attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array<i32: 128, 1, 1>}
158158
tt.func public @reduce_sum(%arg0: tensor<8x16xf32>) -> f32 {
159159
// CHECK: [[VAL_1:%.*]] = llvm.mlir.constant(0 : i32) : i32
160160
// CHECK: [[VAL_2:%.*]] = llvm.extractelement [[VAL_0]][[[VAL_1]] : i32] : vector<8xf32>
@@ -169,7 +169,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.thr
169169
}
170170

171171
// CHECK-LABEL: llvm.func spir_kernelcc @reduce_max(
172-
// CHECK-SAME: [[VAL_0:%.*]]: vector<8xf32>, [[PTR_1:%.*]]: !llvm.ptr<1>) -> f32 attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array<i32: 128, 1, 1>}
172+
// CHECK-SAME: [[VAL_0:%.*]]: vector<8xf32>, [[PTR_1:%.*]]: !llvm.ptr<1>) -> f32 attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array<i32: 128, 1, 1>}
173173
tt.func public @reduce_max(%arg0: tensor<8x16xf32>) -> f32 {
174174
// CHECK: [[VAL_1:%.*]] = llvm.mlir.constant(0 : i32) : i32
175175
// CHECK: [[VAL_2:%.*]] = llvm.extractelement [[VAL_0]][[[VAL_1]] : i32] : vector<8xf32>
@@ -227,7 +227,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.thr
227227
}
228228

229229
// CHECK-LABEL: llvm.func spir_kernelcc @addptr(
230-
// CHECK-SAME: [[VAL_0:%.*]]: !llvm.ptr<1>, [[PTR_1:%.*]]: !llvm.ptr<1>) -> !llvm.ptr<1> attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array<i32: 128, 1, 1>}
230+
// CHECK-SAME: [[VAL_0:%.*]]: !llvm.ptr<1>, [[PTR_1:%.*]]: !llvm.ptr<1>) -> !llvm.ptr<1> attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array<i32: 128, 1, 1>}
231231
tt.func public @addptr(%arg0: !tt.ptr<f16>) -> !tt.ptr<f16> {
232232
// CHECK: [[VAL_1:%.*]] = llvm.mlir.constant(0 : i32) : i32
233233
// CHECK: [[VAL_2:%.*]] = llvm.call spir_funccc @_Z12get_group_idj([[VAL_1]]) {{.*}} : (i32) -> i64
@@ -367,7 +367,7 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32,
367367
#warp = #ttig.warp<{sizePerThread = [16, 64], threadsPerWarp = [1, 1], order = [1, 0]}>
368368

369369
// CHECK-LABEL: llvm.func spir_kernelcc @test(
370-
// CHECK-SAME: %[[VAL_0:.*]]: f32, %[[PTR_1:.*]]: !llvm.ptr<1>) -> vector<16xf32> attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array<i32: 64, 1, 1>} {
370+
// CHECK-SAME: %[[VAL_0:.*]]: f32, %[[PTR_1:.*]]: !llvm.ptr<1>) -> vector<16xf32> attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array<i32: 64, 1, 1>} {
371371
// CHECK: %[[VAL_2:.*]] = llvm.mlir.poison : vector<16xf32>
372372
// CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(0 : i32) : i32
373373
// CHECK: %[[VAL_4:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_0]], %[[VAL_3]])

test/Conversion/intel/tritongpu_transposed_reduction.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.thr
1818
// CHECK: }
1919

2020
// CHECK: llvm.func spir_kernelcc @reduce_sum(
21-
// CHECK-SAME: %[[VAL_0:.*]]: vector<16xf32>, %[[PTR_1:.*]]: !llvm.ptr<1>) -> f32 attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array<i32: 128, 1, 1>} {
21+
// CHECK-SAME: %[[VAL_0:.*]]: vector<16xf32>, %[[PTR_1:.*]]: !llvm.ptr<1>) -> f32 attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array<i32: 128, 1, 1>} {
2222
// CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(0 : i32) : i32
2323
// CHECK: %[[VAL_3:.*]] = llvm.extractelement %[[VAL_0]]{{\[}}%[[VAL_2]] : i32] : vector<16xf32>
2424
// CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(1 : i32) : i32
@@ -78,7 +78,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.thr
7878
}
7979

8080
// CHECK: llvm.func spir_kernelcc @reduce_max(
81-
// CHECK-SAME: %[[VAL_0:.*]]: vector<16xf32>, %[[PTR_1:.*]]: !llvm.ptr<1>) -> f32 attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array<i32: 128, 1, 1>} {
81+
// CHECK-SAME: %[[VAL_0:.*]]: vector<16xf32>, %[[PTR_1:.*]]: !llvm.ptr<1>) -> f32 attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array<i32: 128, 1, 1>} {
8282
// CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(0 : i32) : i32
8383
// CHECK: %[[VAL_3:.*]] = llvm.extractelement %[[VAL_0]]{{\[}}%[[VAL_2]] : i32] : vector<16xf32>
8484
// CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(1 : i32) : i32

test/Target/LLVMIR/triton-gen.mlir

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,5 @@
11
// RUN: triton-translate -triton-to-llvmir -split-input-file %s | FileCheck %s
22

3-
// CHECK: define spir_kernel void @test_max_work_group_size() !max_work_group_size ![[MAX_WORK_GROUP_SIZE:.*]] {
4-
llvm.func spir_kernelcc @test_max_work_group_size() attributes {triton_gen.max_work_group_size = array<i32: 128, 1, 1>} {
5-
llvm.return
6-
}
7-
8-
// CHECK-DAG: ![[MAX_WORK_GROUP_SIZE]] = !{i64 128, i64 1, i64 1}
9-
10-
// -----
11-
123
llvm.func @foo(%arg0: !llvm.ptr, %arg1: !llvm.ptr)
134

145
// CHECK-LABEL: define void @triton_gen.cache_controls(

test/TritonIntelGPU/blockptr_load.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32
108108
module attributes {"ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32, "ttig.support_sg_2d_block"} {
109109
// CHECK-LABEL: llvm.func spir_kernelcc @dot_op_a_2d_load(
110110
// CHECK-SAME: %[[VAL_0:.*]]: !llvm.ptr<1>,
111-
// CHECK-SAME: %[[VAL_1:.*]]: i64, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i64, %[[VAL_4:.*]]: i64, %[[PTR_1:.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array<i32: 16, 1, 1>} {
111+
// CHECK-SAME: %[[VAL_1:.*]]: i64, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i64, %[[VAL_4:.*]]: i64, %[[PTR_1:.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array<i32: 16, 1, 1>} {
112112
tt.func public @dot_op_a_2d_load(%arg0: !tt.ptr<f16>, %arg2: i64, %arg4: i64, %arg5: i64, %arg7: i64) {
113113
%c0_i32 = arith.constant 0 : i32
114114
%c1_i64 = arith.constant 1 : i64
@@ -171,7 +171,7 @@ module attributes {"ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32,
171171
module attributes {"ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32, "ttig.support_sg_2d_block"} {
172172
// CHECK-LABEL: llvm.func spir_kernelcc @dot_op_b_2d_load(
173173
// CHECK-SAME: %[[VAL_0:.*]]: !llvm.ptr<1>,
174-
// CHECK-SAME: %[[VAL_1:.*]]: i64, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i64, %[[PTR_1:.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array<i32: 16, 1, 1>} {
174+
// CHECK-SAME: %[[VAL_1:.*]]: i64, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i64, %[[PTR_1:.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array<i32: 16, 1, 1>} {
175175
tt.func public @dot_op_b_2d_load(%arg1: !tt.ptr<f16>, %arg3: i64, %arg4: i64, %arg7: i64) {
176176
%c0_i32 = arith.constant 0 : i32
177177
%c1_i64 = arith.constant 1 : i64

test/TritonIntelGPU/blockptr_store.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32,
242242
module attributes {"ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32, "ttig.support_sg_2d_block"} {
243243
// CHECK-LABEL: llvm.func spir_kernelcc @dpas_layout_2d_store_rep_cluster_4_2(
244244
// CHECK-SAME: %[[base:.*]]: !llvm.ptr<1>,
245-
// CHECK-SAME: %[[width:.*]]: i64, %[[height:.*]]: i64, %[[rowStride:.*]]: i64, %[[PTR_1:.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array<i32: 16, 1, 1>} {
245+
// CHECK-SAME: %[[width:.*]]: i64, %[[height:.*]]: i64, %[[rowStride:.*]]: i64, %[[PTR_1:.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array<i32: 16, 1, 1>} {
246246
tt.func public @dpas_layout_2d_store_rep_cluster_4_2(%base: !tt.ptr<f16>, %width: i64, %height: i64, %rowStride: i64) {
247247
%cst = arith.constant dense<0.000000e+00> : tensor<32x32xf16, #dpas>
248248
%c0_i32 = arith.constant 0 : i32

test/TritonIntelGPU/prefetch-to-llvm.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32}
66
// CHECK-SAME: %[[BASE_HEIGHT:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: i64,
77
// CHECK-SAME: %[[BASE_WIDTH:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: i64,
88
// CHECK-SAME: %[[ROW_STRIDE:[0-9]+|[a-zA-Z$._-][a-zA-Z0-9$._-]*]]: i64,
9-
// CHECK-SAME: %[[PTR_1:.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array<i32: 128, 1, 1>} {
9+
// CHECK-SAME: %[[PTR_1:.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array<i32: 128, 1, 1>} {
1010
tt.func public @prefetch_block_ptr(%arg0: !tt.ptr<f16>, %arg2: i64, %arg4: i64, %arg5: i64) {
1111
%c0_i32 = arith.constant 0 : i32
1212
%c1_i64 = arith.constant 1 : i64

0 commit comments

Comments
 (0)