From f800e9fee6800139e69e2760b5deb86b933bd086 Mon Sep 17 00:00:00 2001 From: victor-eds Date: Wed, 20 Nov 2024 11:34:44 +0000 Subject: [PATCH 1/6] [XPU][TritonGPUToLLVM] Use `llvm.func` attributes to express kernels ND-ranges Use `llvm.func` `reqd_work_group_size` and `intel_reqd_sub_group_size` to express ND-range dimensions instead of `triton_gen` attributes that are later translated. Signed-off-by: victor-eds --- .../intel/dpas_to_block_layout_convert.mlir | 4 +- .../intel/shared_to_dot_layout_convert.mlir | 6 +- test/Conversion/intel/tritongpu_to_gen.mlir | 2 +- .../intel/tritongpu_to_gen_dot.mlir | 2 +- ...tritongpu_to_llvm_intel_advanced_path.mlir | 12 ++-- .../intel/tritongpu_transposed_reduction.mlir | 4 +- test/TritonIntelGPU/blockptr_load.mlir | 4 +- test/TritonIntelGPU/blockptr_store.mlir | 2 +- ...ritonintelgpu-convert-layout-shortcut.mlir | 6 +- .../Dialect/TritonGEN/IR/TritonGENDialect.td | 18 ----- .../TritonGENToLLVMIRTranslation.cpp | 69 ++++--------------- .../TritonIntelGPUToLLVM/PipelineManager.h | 9 +-- 12 files changed, 38 insertions(+), 100 deletions(-) diff --git a/test/Conversion/intel/dpas_to_block_layout_convert.mlir b/test/Conversion/intel/dpas_to_block_layout_convert.mlir index ae4bd99f7a..396e544f9a 100644 --- a/test/Conversion/intel/dpas_to_block_layout_convert.mlir +++ b/test/Conversion/intel/dpas_to_block_layout_convert.mlir @@ -6,7 +6,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32 : i32, triton_gpu.shared = 67584 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @convert_dpas( // CHECK-SAME: %[[VAL_0:.*]]: !llvm.ptr<1>, -// CHECK-SAME: %[[SCRATCH_SLM:.*]]: !llvm.ptr<3>) attributes {noinline = false, triton_gen.intel_reqd_sub_group_size = [16 : i32], triton_gen.max_work_group_size = [512 : i32, 1 : i32, 1 : i32]} { +// CHECK-SAME: %[[SCRATCH_SLM:.*]]: !llvm.ptr<3>) attributes {intel_reqd_sub_group_size = 16 : i32, noinline = false, reqd_work_group_size = array} { tt.func public @convert_dpas(%arg0: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf16, #mma> @@ -69,7 +69,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32 module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32 : i32, triton_gpu.shared = 67584 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @convert_dpas( // CHECK-SAME: %[[VAL_0:.*]]: !llvm.ptr<1>, -// CHECK-SAME: %[[SCRATCH_SLM:.*]]: !llvm.ptr<3>) attributes {noinline = false, triton_gen.intel_reqd_sub_group_size = [16 : i32], triton_gen.max_work_group_size = [512 : i32, 1 : i32, 1 : i32]} { +// CHECK-SAME: %[[SCRATCH_SLM:.*]]: !llvm.ptr<3>) attributes {intel_reqd_sub_group_size = 16 : i32, noinline = false, reqd_work_group_size = array} { tt.func public @convert_dpas(%arg0: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf16, #mma> diff --git a/test/Conversion/intel/shared_to_dot_layout_convert.mlir b/test/Conversion/intel/shared_to_dot_layout_convert.mlir index d3d38e985f..6a03fd5511 100644 --- a/test/Conversion/intel/shared_to_dot_layout_convert.mlir +++ b/test/Conversion/intel/shared_to_dot_layout_convert.mlir @@ -9,7 +9,7 @@ module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @convert_dot( // CHECK-SAME: %[[VAL_0:.*]]: 
!llvm.struct<(f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16)>, - // CHECK-SAME: %[[SCRATCH_SLM:.*]]: !llvm.ptr<3>) attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], {{.*}}} { + // CHECK-SAME: %[[SCRATCH_SLM:.*]]: !llvm.ptr<3>) attributes {intel_reqd_sub_group_size = 16 : i32, {{.*}}} { tt.func @convert_dot(%A: tensor<128x64xf16, #blocked0>) { // CHECK-DAG: %[[CST_128:.*]] = llvm.mlir.constant(128 : i32) : i32 // CHECK-DAG: %[[CST_4:.*]] = llvm.mlir.constant(4 : i32) : i32 @@ -91,7 +91,7 @@ module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-wa module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @convert_dot( // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16)>, - // CHECK-SAME: %[[SCRATCH_SLM:.*]]: !llvm.ptr<3>) attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], {{.*}}} { + // CHECK-SAME: %[[SCRATCH_SLM:.*]]: !llvm.ptr<3>) attributes {intel_reqd_sub_group_size = 16 : i32, {{.*}}} { tt.func @convert_dot(%A: tensor<128x64xf16, #blocked0>) { // CHECK-DAG: %[[CST_128:.*]] = llvm.mlir.constant(128 : i32) : i32 // CHECK-DAG: %[[CST_32:.*]] = llvm.mlir.constant(32 : i32) : i32 @@ -174,7 +174,7 @@ module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-wa module attributes {"triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @convert_dot( // CHECK-SAME: %[[VAL_1:.*]]: !llvm.struct<(f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16)>, - // CHECK-SAME: %[[SCRATCH_SLM:.*]]: !llvm.ptr<3>) attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], {{.*}}} { + // CHECK-SAME: %[[SCRATCH_SLM:.*]]: !llvm.ptr<3>) attributes {intel_reqd_sub_group_size = 16 : i32, {{.*}}} { tt.func @convert_dot(%B: tensor<64x256xf16, #blocked1>) { // CHECK-DAG: %[[CST_128:.*]] = llvm.mlir.constant(128 : i32) : i32 // CHECK-DAG: %[[CST_256:.*]] = llvm.mlir.constant(256 : i32) : i32 diff --git a/test/Conversion/intel/tritongpu_to_gen.mlir b/test/Conversion/intel/tritongpu_to_gen.mlir index f6ddb63183..3659a8d6a1 100644 --- a/test/Conversion/intel/tritongpu_to_gen.mlir +++ b/test/Conversion/intel/tritongpu_to_gen.mlir @@ -3,7 +3,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { // CHECK: llvm.func spir_kernelcc @test_empty_kernel(%arg0: i64, %arg1: !llvm.ptr<1>) // Here the 128 comes from the 4 in module attribute multiples 32 - // CHECK-SAME: attributes {triton_gen.intel_reqd_sub_group_size = [32 : i32], triton_gen.max_work_group_size = [128 : i32, 1 : i32, 1 : i32]} { + // CHECK-SAME: attributes {intel_reqd_sub_group_size = 32 : i32, reqd_work_group_size = array} { tt.func @test_empty_kernel(%lb : index, %A : !tt.ptr) { // CHECK: llvm.return tt.return diff --git a/test/Conversion/intel/tritongpu_to_gen_dot.mlir b/test/Conversion/intel/tritongpu_to_gen_dot.mlir index 7c489ff5b5..0e1dd40ce3 100644 --- a/test/Conversion/intel/tritongpu_to_gen_dot.mlir +++ b/test/Conversion/intel/tritongpu_to_gen_dot.mlir @@ -106,7 +106,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : // CHECK: llvm.func spir_funccc @_Z38intel_sub_group_f16_f16_matrix_mad_k16Dv8_sDv8_iDv8_f(vector<8xi16>, vector<8xi32>, vector<8xf32>) -> 
vector<8xf32> attributes {convergent, memory_effects = #llvm.memory_effects, no_unwind, will_return} // CHECK-LABEL: llvm.func spir_kernelcc @dot_rep_cluster_4_2( // CHECK-SAME: %[[A:.*]]: !llvm.struct<(f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16)>, %[[B:.*]]: !llvm.struct<(f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16)>, - // CHECK-SAME: %[[C:.*]]: !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>) attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], triton_gen.max_work_group_size = [16 : i32, 1 : i32, 1 : i32]} { + // CHECK-SAME: %[[C:.*]]: !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} { tt.func @dot_rep_cluster_4_2(%a: tensor<32x32xf16, #dot_operand_a>, %b: tensor<32x32xf16, #dot_operand_b>, %c: tensor<32x32xf32, #dpas>) { // CHECK: %[[VAL_3:.*]] = llvm.mlir.undef : vector<8xf32> // CHECK: %[[CST_15:.*]] = llvm.mlir.constant(15 : i32) : i32 diff --git a/test/Conversion/intel/tritongpu_to_llvm_intel_advanced_path.mlir b/test/Conversion/intel/tritongpu_to_llvm_intel_advanced_path.mlir index aedee51005..d2bd45dba9 100644 --- a/test/Conversion/intel/tritongpu_to_llvm_intel_advanced_path.mlir +++ b/test/Conversion/intel/tritongpu_to_llvm_intel_advanced_path.mlir @@ -114,7 +114,7 @@ module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.sup module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.support_dpas", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @matmul_kernel_with_block_pointers_tf32( - // CHECK-SAME: [[VAL_0:%.*]]: !llvm.ptr<1>) attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], triton_gen.max_work_group_size = [512 : i32, 1 : i32, 1 : i32]} { + // CHECK-SAME: [[VAL_0:%.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} { tt.func public @matmul_kernel_with_block_pointers_tf32(%arg0: !tt.ptr) { %c0_i64 = arith.constant 0 : i64 %c0_i32 = arith.constant 0 : i32 @@ -134,7 +134,7 @@ module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.sup module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.support_dpas", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @matmul_kernel_with_block_pointers_f16accu( - // 
CHECK-SAME: [[VAL_0:%.*]]: !llvm.ptr<1>) attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], triton_gen.max_work_group_size = [512 : i32, 1 : i32, 1 : i32]} { + // CHECK-SAME: [[VAL_0:%.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} { tt.func public @matmul_kernel_with_block_pointers_f16accu(%arg0: !tt.ptr) { %c0_i64 = arith.constant 0 : i64 %c0_i32 = arith.constant 0 : i32 @@ -157,7 +157,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : // CHECK-DAG: llvm.func spir_funccc @_Z32sub_group_non_uniform_reduce_addf(f32) -> f32 // CHECK-LABEL: llvm.func spir_kernelcc @reduce_sum( - // CHECK-SAME: [[VAL_0:%.*]]: vector<8xf32>) -> f32 attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], triton_gen.max_work_group_size = [128 : i32, 1 : i32, 1 : i32]} + // CHECK-SAME: [[VAL_0:%.*]]: vector<8xf32>) -> f32 attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} tt.func public @reduce_sum(%arg0: tensor<8x16xf32>) -> f32 { // CHECK: [[VAL_1:%.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK: [[VAL_2:%.*]] = llvm.extractelement [[VAL_0]][[[VAL_1]] : i32] : vector<8xf32> @@ -172,7 +172,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : } // CHECK-LABEL: llvm.func spir_kernelcc @reduce_max( - // CHECK-SAME: [[VAL_0:%.*]]: vector<8xf32>) -> f32 attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], triton_gen.max_work_group_size = [128 : i32, 1 : i32, 1 : i32]} + // CHECK-SAME: [[VAL_0:%.*]]: vector<8xf32>) -> f32 attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} tt.func public @reduce_max(%arg0: tensor<8x16xf32>) -> f32 { // CHECK: [[VAL_1:%.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK: [[VAL_2:%.*]] = llvm.extractelement [[VAL_0]][[[VAL_1]] : i32] : vector<8xf32> @@ -229,7 +229,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : } // CHECK-LABEL: llvm.func spir_kernelcc @addptr( - // CHECK-SAME: [[VAL_0:%.*]]: !llvm.ptr<1>) -> !llvm.ptr<1> attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], triton_gen.max_work_group_size = [128 : i32, 1 : i32, 1 : i32]} + // CHECK-SAME: [[VAL_0:%.*]]: !llvm.ptr<1>) -> !llvm.ptr<1> attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} tt.func public @addptr(%arg0: !tt.ptr) -> !tt.ptr { // CHECK: [[VAL_1:%.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK: [[VAL_2:%.*]] = llvm.call spir_funccc @_Z12get_group_idj([[VAL_1]]) {{.*}} : (i32) -> i64 @@ -368,7 +368,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war #warp = #triton_intel_gpu.warp<{sizePerThread = [16, 64], threadsPerWarp = [1, 1], order = [1, 0]}> // CHECK-LABEL: llvm.func spir_kernelcc @test( -// CHECK-SAME: %[[VAL_0:.*]]: f32) -> vector<16xf32> attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], triton_gen.max_work_group_size = [64 : i32, 1 : i32, 1 : i32]} { +// CHECK-SAME: %[[VAL_0:.*]]: f32) -> vector<16xf32> attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} { // CHECK: %[[VAL_2:.*]] = llvm.mlir.poison : vector<16xf32> // CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK: %[[VAL_4:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_0]], %[[VAL_3]]) diff --git a/test/Conversion/intel/tritongpu_transposed_reduction.mlir b/test/Conversion/intel/tritongpu_transposed_reduction.mlir index c59df996c2..b0b63d1bcc 100644 --- 
a/test/Conversion/intel/tritongpu_transposed_reduction.mlir +++ b/test/Conversion/intel/tritongpu_transposed_reduction.mlir @@ -18,7 +18,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : // CHECK: } // CHECK: llvm.func spir_kernelcc @reduce_sum( -// CHECK-SAME: %[[VAL_0:.*]]: vector<16xf32>) -> f32 attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], triton_gen.max_work_group_size = [128 : i32, 1 : i32, 1 : i32]} { +// CHECK-SAME: %[[VAL_0:.*]]: vector<16xf32>) -> f32 attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} { // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK: %[[VAL_3:.*]] = llvm.extractelement %[[VAL_0]]{{\[}}%[[VAL_2]] : i32] : vector<16xf32> // CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(1 : i32) : i32 @@ -78,7 +78,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : } // CHECK: llvm.func spir_kernelcc @reduce_max( -// CHECK-SAME: %[[VAL_0:.*]]: vector<16xf32>) -> f32 attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], triton_gen.max_work_group_size = [128 : i32, 1 : i32, 1 : i32]} { +// CHECK-SAME: %[[VAL_0:.*]]: vector<16xf32>) -> f32 attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} { // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK: %[[VAL_3:.*]] = llvm.extractelement %[[VAL_0]]{{\[}}%[[VAL_2]] : i32] : vector<16xf32> // CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(1 : i32) : i32 diff --git a/test/TritonIntelGPU/blockptr_load.mlir b/test/TritonIntelGPU/blockptr_load.mlir index ff99bbf77f..0085c2e4c3 100644 --- a/test/TritonIntelGPU/blockptr_load.mlir +++ b/test/TritonIntelGPU/blockptr_load.mlir @@ -57,7 +57,7 @@ module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @dot_op_a_2d_load( // CHECK-SAME: %[[VAL_0:.*]]: !llvm.ptr<1>, -// CHECK-SAME: %[[VAL_1:.*]]: i64, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i64, %[[VAL_4:.*]]: i64) attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], triton_gen.max_work_group_size = [128 : i32, 1 : i32, 1 : i32]} { +// CHECK-SAME: %[[VAL_1:.*]]: i64, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i64, %[[VAL_4:.*]]: i64) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} { tt.func public @dot_op_a_2d_load(%arg0: !tt.ptr, %arg2: i64, %arg4: i64, %arg5: i64, %arg7: i64) { %c0_i32 = arith.constant 0 : i32 %c1_i64 = arith.constant 1 : i64 @@ -129,7 +129,7 @@ module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @dot_op_b_2d_load( // CHECK-SAME: %[[VAL_0:.*]]: !llvm.ptr<1>, -// CHECK-SAME: %[[VAL_1:.*]]: i64, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i64) attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], triton_gen.max_work_group_size = [128 : i32, 1 : i32, 1 : i32]} { +// CHECK-SAME: %[[VAL_1:.*]]: i64, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i64) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} { tt.func public @dot_op_b_2d_load(%arg1: !tt.ptr, %arg3: i64, %arg4: i64, %arg7: i64) { %c0_i32 = arith.constant 0 : i32 %c1_i64 = arith.constant 1 : i64 diff --git a/test/TritonIntelGPU/blockptr_store.mlir b/test/TritonIntelGPU/blockptr_store.mlir index 443743fa76..1ff2213a2d 100644 --- 
a/test/TritonIntelGPU/blockptr_store.mlir +++ b/test/TritonIntelGPU/blockptr_store.mlir @@ -61,7 +61,7 @@ module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @dpas_layout_2d_store_rep_cluster_4_2( // CHECK-SAME: %[[base:.*]]: !llvm.ptr<1>, -// CHECK-SAME: %[[width:.*]]: i64, %[[height:.*]]: i64, %[[rowStride:.*]]: i64) attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], triton_gen.max_work_group_size = [128 : i32, 1 : i32, 1 : i32]} { +// CHECK-SAME: %[[width:.*]]: i64, %[[height:.*]]: i64, %[[rowStride:.*]]: i64) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} { tt.func public @dpas_layout_2d_store_rep_cluster_4_2(%base: !tt.ptr, %width: i64, %height: i64, %rowStride: i64) { %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf16, #dpas> %c0_i32 = arith.constant 0 : i32 diff --git a/test/TritonIntelGPU/tritonintelgpu-convert-layout-shortcut.mlir b/test/TritonIntelGPU/tritonintelgpu-convert-layout-shortcut.mlir index 7bfff4fc36..e4d2800a0b 100644 --- a/test/TritonIntelGPU/tritonintelgpu-convert-layout-shortcut.mlir +++ b/test/TritonIntelGPU/tritonintelgpu-convert-layout-shortcut.mlir @@ -3,7 +3,7 @@ #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [32, 1], repCluster = [1, 2], A = [8, 16], B = [16, 32], C = [8, 32]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32, triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block} { // CHECK-LABEL: convert_dpas_to_dot_rep_cluster_1_2 - // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<({{.*}})>) attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], triton_gen.max_work_group_size = [512 : i32, 1 : i32, 1 : i32]} { + // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<({{.*}})>) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} { tt.func public @convert_dpas_to_dot_rep_cluster_1_2(%arg: tensor<1024x32xf16, #dpas>) { // COM: The repetitions order of dot layout and dpas layout are same when the GEMM tiling is clustered as repCluster [1, 2]. 
// CHECK: %[[VAL_81:.*]] = llvm.mlir.constant(7 : i32) : i32 @@ -56,7 +56,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32 #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [32, 1], repCluster = [2, 2], A = [8, 16], B = [16, 32], C = [8, 32]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32, triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block} { // CHECK-LABEL: convert_dpas_to_dot_rep_cluster_2_2 - // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<({{.*}})>) attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], triton_gen.max_work_group_size = [512 : i32, 1 : i32, 1 : i32]} { + // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<({{.*}})>) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} { tt.func public @convert_dpas_to_dot_rep_cluster_2_2(%arg: tensor<1024x32xf16, #dpas>) { // COM: The repetitions order of dpas layout when the GEMM tiling is clustered as repCluster [2, 2]: // COM: - 0, 1, 2, 3, 4, 5, 6, 7. @@ -112,7 +112,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32 #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [32, 1], repCluster = [4, 2], A = [8, 16], B = [16, 32], C = [8, 32]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32, triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block} { // CHECK-LABEL: convert_dpas_to_dot_rep_cluster_4_2 - // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<({{.*}})>) attributes {triton_gen.intel_reqd_sub_group_size = [16 : i32], triton_gen.max_work_group_size = [512 : i32, 1 : i32, 1 : i32]} { + // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<({{.*}})>) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} { tt.func public @convert_dpas_to_dot_rep_cluster_4_2(%arg: tensor<1024x32xf16, #dpas>) { // COM: The repetitions order of dpas layout when the GEMM tiling is clustered as repCluster [4, 2]: // COM: - 0, 1, 2, 3, 4, 5, 6, 7. diff --git a/third_party/intel/include/Dialect/TritonGEN/IR/TritonGENDialect.td b/third_party/intel/include/Dialect/TritonGEN/IR/TritonGENDialect.td index c7ade3af3d..d6fe3300b6 100644 --- a/third_party/intel/include/Dialect/TritonGEN/IR/TritonGENDialect.td +++ b/third_party/intel/include/Dialect/TritonGEN/IR/TritonGENDialect.td @@ -24,24 +24,6 @@ def TritonGEN_Dialect : Dialect { let dependentDialects = ["mlir::LLVM::LLVMDialect"]; let extraClassDeclaration = [{ - /// Get the name of the attribute used to annotate max work group size - /// required for kernels. - static constexpr ::llvm::StringLiteral getMaxWorkGroupSizeAttrName() { - return ::llvm::StringLiteral("triton_gen.max_work_group_size"); - } - - /// Get the name of the attribute used to annotate exact work group size - /// required for kernels. - static constexpr ::llvm::StringLiteral getReqdWorkGroupSizeAttrName() { - return ::llvm::StringLiteral("triton_gen.reqd_work_group_size"); - } - - /// Get the name for the attribute used to annotate the exact sub group - /// size required for kernels. 
- static constexpr ::llvm::StringLiteral getReqdSubGroupSizeAttrName() { - return ::llvm::StringLiteral("triton_gen.intel_reqd_sub_group_size"); - } - /// Get the name for the attribute used to specify cache control /// decorations. static constexpr ::llvm::StringRef getCacheControlsAttrName() { diff --git a/third_party/intel/lib/Target/LLVMIR/Dialect/TritonGEN/TritonGENToLLVMIRTranslation.cpp b/third_party/intel/lib/Target/LLVMIR/Dialect/TritonGEN/TritonGENToLLVMIRTranslation.cpp index cce8f6c720..f1c4ab1bd9 100644 --- a/third_party/intel/lib/Target/LLVMIR/Dialect/TritonGEN/TritonGENToLLVMIRTranslation.cpp +++ b/third_party/intel/lib/Target/LLVMIR/Dialect/TritonGEN/TritonGENToLLVMIRTranslation.cpp @@ -36,22 +36,20 @@ class TritonGENDialectLLVMIRTranslationInterface NamedAttribute attribute, LLVM::ModuleTranslation &moduleTranslation) const final { StringRef attrName = attribute.getName().getValue(); - if (attrName == - triton::TritonGEN::TritonGENDialect::getCacheControlsAttrName()) { - auto decorationAttr = - dyn_cast( - attribute.getValue()); - if (!decorationAttr) - return op->emitOpError( - "Expecting triton_gen.decoration_cache_control attribute"); - if (instructions.size() != 1) - return op->emitOpError("Expecting a single instruction"); - return handleDecorationCacheControl(op, instructions.front(), - decorationAttr); - } - if (attrName.starts_with("triton_gen")) - return handleTritonGenAttr(op, attribute, moduleTranslation); - return success(); + assert( + attrName == + triton::TritonGEN::TritonGENDialect::getCacheControlsAttrName() && + "Only supported attribute"); + auto decorationAttr = + dyn_cast( + attribute.getValue()); + if (!decorationAttr) + return op->emitOpError( + "Expecting triton_gen.decoration_cache_control attribute"); + if (instructions.size() != 1) + return op->emitOpError("Expecting a single instruction"); + return handleDecorationCacheControl(op, instructions.front(), + decorationAttr); } private: @@ -102,45 +100,6 @@ class TritonGENDialectLLVMIRTranslationInterface llvm::MDNode::get(ctx, decorations)); return success(); } - - LogicalResult - handleTritonGenAttr(Operation *op, NamedAttribute attribute, - LLVM::ModuleTranslation &moduleTranslation) const { - llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext(); - llvm::Function *llvmFunc = - moduleTranslation.lookupFunction(cast(op).getName()); - if (isKernel(op)) - amendKernel(llvmContext, llvmFunc, attribute); - return success(); - } - - // Checks if the given operation is a kernel function. - bool isKernel(Operation *op) const { - auto fn = dyn_cast(op); - return fn && fn.getCConv() == LLVM::CConv::SPIR_KERNEL; - } - - // The attribute is converted into metadata and added to the function. 
- void amendKernel(llvm::LLVMContext &llvmContext, llvm::Function *llvmFunc, - NamedAttribute attribute) const { - StringRef name = attribute.getName().getValue(); - assert((name == triton::TritonGEN::TritonGENDialect:: - getMaxWorkGroupSizeAttrName() || - name == triton::TritonGEN::TritonGENDialect:: - getReqdWorkGroupSizeAttrName() || - name == triton::TritonGEN::TritonGENDialect:: - getReqdSubGroupSizeAttrName()) && - "Unexpected attribute"); - SmallVector metadata; - llvm::Type *i64 = llvm::IntegerType::get(llvmContext, 64); - for (int64_t i : - extractFromIntegerArrayAttr(attribute.getValue())) { - llvm::Constant *constant = llvm::ConstantInt::get(i64, i); - metadata.push_back(llvm::ConstantAsMetadata::get(constant)); - } - llvm::MDNode *node = llvm::MDNode::get(llvmContext, metadata); - llvmFunc->setMetadata(name.drop_front(11), node); - } }; } // namespace diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h b/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h index 0593ca63f1..3610f6e021 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h @@ -116,12 +116,9 @@ struct FuncOpConversion : public ConvertOpToLLVMPattern { newFuncOp.setLinkage(LLVM::Linkage::External); } - NamedAttrList attrs; - attrs.append(TritonGEN::TritonGENDialect::getMaxWorkGroupSizeAttrName(), - rewriter.getI32ArrayAttr({threadsPerWarp * numWarps, 1, 1})); - attrs.append(TritonGEN::TritonGENDialect::getReqdSubGroupSizeAttrName(), - rewriter.getI32ArrayAttr({threadsPerWarp})); - newFuncOp->setDialectAttrs(attrs); + newFuncOp.setReqdWorkGroupSize( + ArrayRef{threadsPerWarp * numWarps, 1, 1}); + newFuncOp.setIntelReqdSubGroupSize(threadsPerWarp); if (!LLVM::isKernel(funcOp)) { newFuncOp.setPassthroughAttr( From 824fbc13f17f34ccedb1d2c3ceb910b3ae9db2a0 Mon Sep 17 00:00:00 2001 From: victor-eds Date: Wed, 20 Nov 2024 12:44:57 +0000 Subject: [PATCH 2/6] Drop old tests --- test/Target/LLVMIR/triton-gen.mlir | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/test/Target/LLVMIR/triton-gen.mlir b/test/Target/LLVMIR/triton-gen.mlir index a4774ac7a2..6074ccc05d 100644 --- a/test/Target/LLVMIR/triton-gen.mlir +++ b/test/Target/LLVMIR/triton-gen.mlir @@ -1,24 +1,5 @@ // RUN: triton-translate -triton-to-llvmir -split-input-file %s | FileCheck %s -// CHECK: define spir_kernel void @test_intel_reqd_sub_group_size() !intel_reqd_sub_group_size ![[REQD_SUB_GROUP_SIZE:.*]] { -llvm.func spir_kernelcc @test_intel_reqd_sub_group_size() attributes {triton_gen.intel_reqd_sub_group_size = [32 : i32]} { - llvm.return -} -// CHECK: define spir_kernel void @test_max_work_group_size() !max_work_group_size ![[MAX_WORK_GROUP_SIZE:.*]] { -llvm.func spir_kernelcc @test_max_work_group_size() attributes {triton_gen.max_work_group_size = [128 : i32, 1 : i32, 1 : i32]} { - llvm.return -} -// CHECK: define spir_kernel void @test_reqd_work_group_size() !reqd_work_group_size ![[REQD_WORK_GROUP_SIZE:.*]] { -llvm.func spir_kernelcc @test_reqd_work_group_size() attributes {triton_gen.reqd_work_group_size = [128 : i32, 1 : i32, 2 : i32]} { - llvm.return -} - -// CHECK-DAG: ![[REQD_SUB_GROUP_SIZE]] = !{i64 32} -// CHECK-DAG: ![[MAX_WORK_GROUP_SIZE]] = !{i64 128, i64 1, i64 1} -// CHECK-DAG: ![[REQD_WORK_GROUP_SIZE]] = !{i64 128, i64 1, i64 2} - -// ----- - llvm.func @foo(%arg0: !llvm.ptr, %arg1: !llvm.ptr) // CHECK-LABEL: define void @triton_gen.cache_controls( From 7c192f62745dfd82a701d558a2818ab2fe17bb37 Mon Sep 17 
00:00:00 2001 From: victor-eds Date: Wed, 20 Nov 2024 12:46:47 +0000 Subject: [PATCH 3/6] Do not assert --- .../Dialect/TritonGEN/TritonGENToLLVMIRTranslation.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/third_party/intel/lib/Target/LLVMIR/Dialect/TritonGEN/TritonGENToLLVMIRTranslation.cpp b/third_party/intel/lib/Target/LLVMIR/Dialect/TritonGEN/TritonGENToLLVMIRTranslation.cpp index f1c4ab1bd9..203bd2ad19 100644 --- a/third_party/intel/lib/Target/LLVMIR/Dialect/TritonGEN/TritonGENToLLVMIRTranslation.cpp +++ b/third_party/intel/lib/Target/LLVMIR/Dialect/TritonGEN/TritonGENToLLVMIRTranslation.cpp @@ -36,10 +36,10 @@ class TritonGENDialectLLVMIRTranslationInterface NamedAttribute attribute, LLVM::ModuleTranslation &moduleTranslation) const final { StringRef attrName = attribute.getName().getValue(); - assert( - attrName == - triton::TritonGEN::TritonGENDialect::getCacheControlsAttrName() && - "Only supported attribute"); + // Unsupported attribute name: skip. + if (attrName != + triton::TritonGEN::TritonGENDialect::getCacheControlsAttrName()) + return success(); auto decorationAttr = dyn_cast( attribute.getValue()); From f0a2fcc5d64ffb44b399d467b4825586ed1d848c Mon Sep 17 00:00:00 2001 From: victor-eds Date: Fri, 22 Nov 2024 09:53:57 +0000 Subject: [PATCH 4/6] Add back attribute --- .../intel/dpas_to_block_layout_convert.mlir | 4 +- test/Conversion/intel/tritongpu_to_gen.mlir | 2 +- .../intel/tritongpu_to_gen_dot.mlir | 2 +- ...tritongpu_to_llvm_intel_advanced_path.mlir | 12 ++-- .../intel/tritongpu_transposed_reduction.mlir | 4 +- test/Target/LLVMIR/triton-gen.mlir | 7 ++ test/TritonIntelGPU/blockptr_load.mlir | 4 +- test/TritonIntelGPU/blockptr_store.mlir | 2 +- ...ritonintelgpu-convert-layout-shortcut.mlir | 6 +- .../Dialect/TritonGEN/IR/TritonGENDialect.td | 6 ++ .../TritonGENToLLVMIRTranslation.cpp | 64 +++++++++++++++---- .../TritonIntelGPUToLLVM/PipelineManager.h | 5 +- 12 files changed, 85 insertions(+), 33 deletions(-) diff --git a/test/Conversion/intel/dpas_to_block_layout_convert.mlir b/test/Conversion/intel/dpas_to_block_layout_convert.mlir index 396e544f9a..60d127f52b 100644 --- a/test/Conversion/intel/dpas_to_block_layout_convert.mlir +++ b/test/Conversion/intel/dpas_to_block_layout_convert.mlir @@ -6,7 +6,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32 : i32, triton_gpu.shared = 67584 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @convert_dpas( // CHECK-SAME: %[[VAL_0:.*]]: !llvm.ptr<1>, -// CHECK-SAME: %[[SCRATCH_SLM:.*]]: !llvm.ptr<3>) attributes {intel_reqd_sub_group_size = 16 : i32, noinline = false, reqd_work_group_size = array} { +// CHECK-SAME: %[[SCRATCH_SLM:.*]]: !llvm.ptr<3>) attributes {intel_reqd_sub_group_size = 16 : i32, noinline = false, triton_gen.max_work_group_size = array} { tt.func public @convert_dpas(%arg0: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf16, #mma> @@ -69,7 +69,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32 module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32 : i32, triton_gpu.shared = 67584 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @convert_dpas( // CHECK-SAME: %[[VAL_0:.*]]: !llvm.ptr<1>, -// CHECK-SAME: %[[SCRATCH_SLM:.*]]: !llvm.ptr<3>) attributes {intel_reqd_sub_group_size = 16 : i32, noinline = false, reqd_work_group_size = 
array} { +// CHECK-SAME: %[[SCRATCH_SLM:.*]]: !llvm.ptr<3>) attributes {intel_reqd_sub_group_size = 16 : i32, noinline = false, triton_gen.max_work_group_size = array} { tt.func public @convert_dpas(%arg0: !tt.ptr {tt.divisibility = 16 : i32}) attributes {noinline = false} { %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf16, #mma> diff --git a/test/Conversion/intel/tritongpu_to_gen.mlir b/test/Conversion/intel/tritongpu_to_gen.mlir index 3659a8d6a1..d6696c15af 100644 --- a/test/Conversion/intel/tritongpu_to_gen.mlir +++ b/test/Conversion/intel/tritongpu_to_gen.mlir @@ -3,7 +3,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { // CHECK: llvm.func spir_kernelcc @test_empty_kernel(%arg0: i64, %arg1: !llvm.ptr<1>) // Here the 128 comes from the 4 in module attribute multiples 32 - // CHECK-SAME: attributes {intel_reqd_sub_group_size = 32 : i32, reqd_work_group_size = array} { + // CHECK-SAME: attributes {intel_reqd_sub_group_size = 32 : i32, triton_gen.max_work_group_size = array} { tt.func @test_empty_kernel(%lb : index, %A : !tt.ptr) { // CHECK: llvm.return tt.return diff --git a/test/Conversion/intel/tritongpu_to_gen_dot.mlir b/test/Conversion/intel/tritongpu_to_gen_dot.mlir index 0e1dd40ce3..341b899e64 100644 --- a/test/Conversion/intel/tritongpu_to_gen_dot.mlir +++ b/test/Conversion/intel/tritongpu_to_gen_dot.mlir @@ -106,7 +106,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : // CHECK: llvm.func spir_funccc @_Z38intel_sub_group_f16_f16_matrix_mad_k16Dv8_sDv8_iDv8_f(vector<8xi16>, vector<8xi32>, vector<8xf32>) -> vector<8xf32> attributes {convergent, memory_effects = #llvm.memory_effects, no_unwind, will_return} // CHECK-LABEL: llvm.func spir_kernelcc @dot_rep_cluster_4_2( // CHECK-SAME: %[[A:.*]]: !llvm.struct<(f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16)>, %[[B:.*]]: !llvm.struct<(f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16)>, - // CHECK-SAME: %[[C:.*]]: !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} { + // CHECK-SAME: %[[C:.*]]: !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>) attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array} { tt.func @dot_rep_cluster_4_2(%a: tensor<32x32xf16, #dot_operand_a>, %b: tensor<32x32xf16, #dot_operand_b>, %c: tensor<32x32xf32, #dpas>) { // CHECK: 
%[[VAL_3:.*]] = llvm.mlir.undef : vector<8xf32> // CHECK: %[[CST_15:.*]] = llvm.mlir.constant(15 : i32) : i32 diff --git a/test/Conversion/intel/tritongpu_to_llvm_intel_advanced_path.mlir b/test/Conversion/intel/tritongpu_to_llvm_intel_advanced_path.mlir index d2bd45dba9..8747deb905 100644 --- a/test/Conversion/intel/tritongpu_to_llvm_intel_advanced_path.mlir +++ b/test/Conversion/intel/tritongpu_to_llvm_intel_advanced_path.mlir @@ -114,7 +114,7 @@ module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.sup module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.support_dpas", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @matmul_kernel_with_block_pointers_tf32( - // CHECK-SAME: [[VAL_0:%.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} { + // CHECK-SAME: [[VAL_0:%.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array} { tt.func public @matmul_kernel_with_block_pointers_tf32(%arg0: !tt.ptr) { %c0_i64 = arith.constant 0 : i64 %c0_i32 = arith.constant 0 : i32 @@ -134,7 +134,7 @@ module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.sup module attributes {"triton_intel_gpu.support_sg_2d_block", "triton_intel_gpu.support_dpas", "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @matmul_kernel_with_block_pointers_f16accu( - // CHECK-SAME: [[VAL_0:%.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} { + // CHECK-SAME: [[VAL_0:%.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array} { tt.func public @matmul_kernel_with_block_pointers_f16accu(%arg0: !tt.ptr) { %c0_i64 = arith.constant 0 : i64 %c0_i32 = arith.constant 0 : i32 @@ -157,7 +157,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : // CHECK-DAG: llvm.func spir_funccc @_Z32sub_group_non_uniform_reduce_addf(f32) -> f32 // CHECK-LABEL: llvm.func spir_kernelcc @reduce_sum( - // CHECK-SAME: [[VAL_0:%.*]]: vector<8xf32>) -> f32 attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} + // CHECK-SAME: [[VAL_0:%.*]]: vector<8xf32>) -> f32 attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array} tt.func public @reduce_sum(%arg0: tensor<8x16xf32>) -> f32 { // CHECK: [[VAL_1:%.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK: [[VAL_2:%.*]] = llvm.extractelement [[VAL_0]][[[VAL_1]] : i32] : vector<8xf32> @@ -172,7 +172,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : } // CHECK-LABEL: llvm.func spir_kernelcc @reduce_max( - // CHECK-SAME: [[VAL_0:%.*]]: vector<8xf32>) -> f32 attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} + // CHECK-SAME: [[VAL_0:%.*]]: vector<8xf32>) -> f32 attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array} tt.func public @reduce_max(%arg0: tensor<8x16xf32>) -> f32 { // CHECK: [[VAL_1:%.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK: [[VAL_2:%.*]] = llvm.extractelement [[VAL_0]][[[VAL_1]] : i32] : vector<8xf32> @@ -229,7 +229,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : } // CHECK-LABEL: llvm.func spir_kernelcc @addptr( - // CHECK-SAME: [[VAL_0:%.*]]: !llvm.ptr<1>) -> !llvm.ptr<1> attributes 
{intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} + // CHECK-SAME: [[VAL_0:%.*]]: !llvm.ptr<1>) -> !llvm.ptr<1> attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array} tt.func public @addptr(%arg0: !tt.ptr) -> !tt.ptr { // CHECK: [[VAL_1:%.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK: [[VAL_2:%.*]] = llvm.call spir_funccc @_Z12get_group_idj([[VAL_1]]) {{.*}} : (i32) -> i64 @@ -368,7 +368,7 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war #warp = #triton_intel_gpu.warp<{sizePerThread = [16, 64], threadsPerWarp = [1, 1], order = [1, 0]}> // CHECK-LABEL: llvm.func spir_kernelcc @test( -// CHECK-SAME: %[[VAL_0:.*]]: f32) -> vector<16xf32> attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} { +// CHECK-SAME: %[[VAL_0:.*]]: f32) -> vector<16xf32> attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array} { // CHECK: %[[VAL_2:.*]] = llvm.mlir.poison : vector<16xf32> // CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK: %[[VAL_4:.*]] = llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_0]], %[[VAL_3]]) diff --git a/test/Conversion/intel/tritongpu_transposed_reduction.mlir b/test/Conversion/intel/tritongpu_transposed_reduction.mlir index b0b63d1bcc..b6e0428574 100644 --- a/test/Conversion/intel/tritongpu_transposed_reduction.mlir +++ b/test/Conversion/intel/tritongpu_transposed_reduction.mlir @@ -18,7 +18,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : // CHECK: } // CHECK: llvm.func spir_kernelcc @reduce_sum( -// CHECK-SAME: %[[VAL_0:.*]]: vector<16xf32>) -> f32 attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} { +// CHECK-SAME: %[[VAL_0:.*]]: vector<16xf32>) -> f32 attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array} { // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK: %[[VAL_3:.*]] = llvm.extractelement %[[VAL_0]]{{\[}}%[[VAL_2]] : i32] : vector<16xf32> // CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(1 : i32) : i32 @@ -78,7 +78,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : } // CHECK: llvm.func spir_kernelcc @reduce_max( -// CHECK-SAME: %[[VAL_0:.*]]: vector<16xf32>) -> f32 attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} { +// CHECK-SAME: %[[VAL_0:.*]]: vector<16xf32>) -> f32 attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array} { // CHECK: %[[VAL_2:.*]] = llvm.mlir.constant(0 : i32) : i32 // CHECK: %[[VAL_3:.*]] = llvm.extractelement %[[VAL_0]]{{\[}}%[[VAL_2]] : i32] : vector<16xf32> // CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(1 : i32) : i32 diff --git a/test/Target/LLVMIR/triton-gen.mlir b/test/Target/LLVMIR/triton-gen.mlir index 6074ccc05d..54176b5e94 100644 --- a/test/Target/LLVMIR/triton-gen.mlir +++ b/test/Target/LLVMIR/triton-gen.mlir @@ -1,5 +1,12 @@ // RUN: triton-translate -triton-to-llvmir -split-input-file %s | FileCheck %s +// CHECK: define spir_kernel void @test_max_work_group_size() !max_work_group_size ![[MAX_WORK_GROUP_SIZE:.*]] { +llvm.func spir_kernelcc @test_max_work_group_size() attributes {triton_gen.max_work_group_size = [128 : i32, 1 : i32, 1 : i32]} { + llvm.return +} + +// ----- + llvm.func @foo(%arg0: !llvm.ptr, %arg1: !llvm.ptr) // CHECK-LABEL: define void @triton_gen.cache_controls( diff --git a/test/TritonIntelGPU/blockptr_load.mlir b/test/TritonIntelGPU/blockptr_load.mlir 
index 0085c2e4c3..f5004f5735 100644 --- a/test/TritonIntelGPU/blockptr_load.mlir +++ b/test/TritonIntelGPU/blockptr_load.mlir @@ -57,7 +57,7 @@ module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @dot_op_a_2d_load( // CHECK-SAME: %[[VAL_0:.*]]: !llvm.ptr<1>, -// CHECK-SAME: %[[VAL_1:.*]]: i64, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i64, %[[VAL_4:.*]]: i64) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} { +// CHECK-SAME: %[[VAL_1:.*]]: i64, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i64, %[[VAL_4:.*]]: i64) attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array} { tt.func public @dot_op_a_2d_load(%arg0: !tt.ptr, %arg2: i64, %arg4: i64, %arg5: i64, %arg7: i64) { %c0_i32 = arith.constant 0 : i32 %c1_i64 = arith.constant 1 : i64 @@ -129,7 +129,7 @@ module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @dot_op_b_2d_load( // CHECK-SAME: %[[VAL_0:.*]]: !llvm.ptr<1>, -// CHECK-SAME: %[[VAL_1:.*]]: i64, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i64) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} { +// CHECK-SAME: %[[VAL_1:.*]]: i64, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i64) attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array} { tt.func public @dot_op_b_2d_load(%arg1: !tt.ptr, %arg3: i64, %arg4: i64, %arg7: i64) { %c0_i32 = arith.constant 0 : i32 %c1_i64 = arith.constant 1 : i64 diff --git a/test/TritonIntelGPU/blockptr_store.mlir b/test/TritonIntelGPU/blockptr_store.mlir index 1ff2213a2d..f59f37034a 100644 --- a/test/TritonIntelGPU/blockptr_store.mlir +++ b/test/TritonIntelGPU/blockptr_store.mlir @@ -61,7 +61,7 @@ module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-war module attributes {"triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { // CHECK-LABEL: llvm.func spir_kernelcc @dpas_layout_2d_store_rep_cluster_4_2( // CHECK-SAME: %[[base:.*]]: !llvm.ptr<1>, -// CHECK-SAME: %[[width:.*]]: i64, %[[height:.*]]: i64, %[[rowStride:.*]]: i64) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} { +// CHECK-SAME: %[[width:.*]]: i64, %[[height:.*]]: i64, %[[rowStride:.*]]: i64) attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array} { tt.func public @dpas_layout_2d_store_rep_cluster_4_2(%base: !tt.ptr, %width: i64, %height: i64, %rowStride: i64) { %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf16, #dpas> %c0_i32 = arith.constant 0 : i32 diff --git a/test/TritonIntelGPU/tritonintelgpu-convert-layout-shortcut.mlir b/test/TritonIntelGPU/tritonintelgpu-convert-layout-shortcut.mlir index e4d2800a0b..e485db4e55 100644 --- a/test/TritonIntelGPU/tritonintelgpu-convert-layout-shortcut.mlir +++ b/test/TritonIntelGPU/tritonintelgpu-convert-layout-shortcut.mlir @@ -3,7 +3,7 @@ #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [32, 1], repCluster = [1, 2], A = [8, 16], B = [16, 32], C = [8, 32]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32, triton_intel_gpu.min_sg_size = 16 : i32, 
triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block} { // CHECK-LABEL: convert_dpas_to_dot_rep_cluster_1_2 - // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<({{.*}})>) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} { + // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<({{.*}})>) attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array} { tt.func public @convert_dpas_to_dot_rep_cluster_1_2(%arg: tensor<1024x32xf16, #dpas>) { // COM: The repetitions order of dot layout and dpas layout are same when the GEMM tiling is clustered as repCluster [1, 2]. // CHECK: %[[VAL_81:.*]] = llvm.mlir.constant(7 : i32) : i32 @@ -56,7 +56,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32 #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [32, 1], repCluster = [2, 2], A = [8, 16], B = [16, 32], C = [8, 32]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32, triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block} { // CHECK-LABEL: convert_dpas_to_dot_rep_cluster_2_2 - // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<({{.*}})>) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} { + // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<({{.*}})>) attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array} { tt.func public @convert_dpas_to_dot_rep_cluster_2_2(%arg: tensor<1024x32xf16, #dpas>) { // COM: The repetitions order of dpas layout when the GEMM tiling is clustered as repCluster [2, 2]: // COM: - 0, 1, 2, 3, 4, 5, 6, 7. @@ -112,7 +112,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32 #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [32, 1], repCluster = [4, 2], A = [8, 16], B = [16, 32], C = [8, 32]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32 : i32, "triton_gpu.threads-per-warp" = 16 : i32, triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block} { // CHECK-LABEL: convert_dpas_to_dot_rep_cluster_4_2 - // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<({{.*}})>) attributes {intel_reqd_sub_group_size = 16 : i32, reqd_work_group_size = array} { + // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<({{.*}})>) attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array} { tt.func public @convert_dpas_to_dot_rep_cluster_4_2(%arg: tensor<1024x32xf16, #dpas>) { // COM: The repetitions order of dpas layout when the GEMM tiling is clustered as repCluster [4, 2]: // COM: - 0, 1, 2, 3, 4, 5, 6, 7. diff --git a/third_party/intel/include/Dialect/TritonGEN/IR/TritonGENDialect.td b/third_party/intel/include/Dialect/TritonGEN/IR/TritonGENDialect.td index d6fe3300b6..2036b9f2e1 100644 --- a/third_party/intel/include/Dialect/TritonGEN/IR/TritonGENDialect.td +++ b/third_party/intel/include/Dialect/TritonGEN/IR/TritonGENDialect.td @@ -24,6 +24,12 @@ def TritonGEN_Dialect : Dialect { let dependentDialects = ["mlir::LLVM::LLVMDialect"]; let extraClassDeclaration = [{ + /// Get the name of the attribute used to annotate max work group size + /// required for kernels. 
+ static constexpr ::llvm::StringLiteral getMaxWorkGroupSizeAttrName() { + return ::llvm::StringLiteral("triton_gen.max_work_group_size"); + } + /// Get the name for the attribute used to specify cache control /// decorations. static constexpr ::llvm::StringRef getCacheControlsAttrName() { diff --git a/third_party/intel/lib/Target/LLVMIR/Dialect/TritonGEN/TritonGENToLLVMIRTranslation.cpp b/third_party/intel/lib/Target/LLVMIR/Dialect/TritonGEN/TritonGENToLLVMIRTranslation.cpp index 203bd2ad19..36cb0a4ad3 100644 --- a/third_party/intel/lib/Target/LLVMIR/Dialect/TritonGEN/TritonGENToLLVMIRTranslation.cpp +++ b/third_party/intel/lib/Target/LLVMIR/Dialect/TritonGEN/TritonGENToLLVMIRTranslation.cpp @@ -37,19 +37,22 @@ class TritonGENDialectLLVMIRTranslationInterface LLVM::ModuleTranslation &moduleTranslation) const final { StringRef attrName = attribute.getName().getValue(); // Unsupported attribute name: skip. - if (attrName != - triton::TritonGEN::TritonGENDialect::getCacheControlsAttrName()) - return success(); - auto decorationAttr = - dyn_cast( - attribute.getValue()); - if (!decorationAttr) - return op->emitOpError( - "Expecting triton_gen.decoration_cache_control attribute"); - if (instructions.size() != 1) - return op->emitOpError("Expecting a single instruction"); - return handleDecorationCacheControl(op, instructions.front(), - decorationAttr); + if (attrName == + triton::TritonGEN::TritonGENDialect::getCacheControlsAttrName()) { + auto decorationAttr = + dyn_cast( + attribute.getValue()); + if (!decorationAttr) + return op->emitOpError( + "Expecting triton_gen.decoration_cache_control attribute"); + if (instructions.size() != 1) + return op->emitOpError("Expecting a single instruction"); + return handleDecorationCacheControl(op, instructions.front(), + decorationAttr); + } + if (attrName.starts_with("triton_gen")) + return handleTritonGenAttr(op, attribute, moduleTranslation); + return success(); } private: @@ -100,6 +103,41 @@ class TritonGENDialectLLVMIRTranslationInterface llvm::MDNode::get(ctx, decorations)); return success(); } + + LogicalResult + handleTritonGenAttr(Operation *op, NamedAttribute attribute, + LLVM::ModuleTranslation &moduleTranslation) const { + llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext(); + llvm::Function *llvmFunc = + moduleTranslation.lookupFunction(cast(op).getName()); + if (isKernel(op)) + amendKernel(llvmContext, llvmFunc, attribute); + return success(); + } + + // Checks if the given operation is a kernel function. + bool isKernel(Operation *op) const { + auto fn = dyn_cast(op); + return fn && fn.getCConv() == LLVM::CConv::SPIR_KERNEL; + } + + // The attribute is converted into metadata and added to the function. 
+ void amendKernel(llvm::LLVMContext &llvmContext, llvm::Function *llvmFunc, + NamedAttribute attribute) const { + StringRef name = attribute.getName().getValue(); + assert(name == triton::TritonGEN::TritonGENDialect:: + getMaxWorkGroupSizeAttrName() && + "Unexpected attribute"); + SmallVector metadata; + llvm::Type *i64 = llvm::IntegerType::get(llvmContext, 64); + for (int64_t i : + extractFromIntegerArrayAttr(attribute.getValue())) { + llvm::Constant *constant = llvm::ConstantInt::get(i64, i); + metadata.push_back(llvm::ConstantAsMetadata::get(constant)); + } + llvm::MDNode *node = llvm::MDNode::get(llvmContext, metadata); + llvmFunc->setMetadata(name.drop_front(11), node); + } }; } // namespace diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h b/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h index 3610f6e021..0d020b9d5c 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h @@ -116,8 +116,9 @@ struct FuncOpConversion : public ConvertOpToLLVMPattern { newFuncOp.setLinkage(LLVM::Linkage::External); } - newFuncOp.setReqdWorkGroupSize( - ArrayRef{threadsPerWarp * numWarps, 1, 1}); + newFuncOp->setAttr( + TritonGEN::TritonGENDialect::getMaxWorkGroupSizeAttrName(), + rewriter.getDenseI32ArrayAttr({threadsPerWarp * numWarps, 1, 1})); newFuncOp.setIntelReqdSubGroupSize(threadsPerWarp); if (!LLVM::isKernel(funcOp)) { From 790723498861bb568b14c882bd2631a9fb24a4f8 Mon Sep 17 00:00:00 2001 From: victor-eds Date: Fri, 22 Nov 2024 10:04:39 +0000 Subject: [PATCH 5/6] Drop comment --- .../LLVMIR/Dialect/TritonGEN/TritonGENToLLVMIRTranslation.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/third_party/intel/lib/Target/LLVMIR/Dialect/TritonGEN/TritonGENToLLVMIRTranslation.cpp b/third_party/intel/lib/Target/LLVMIR/Dialect/TritonGEN/TritonGENToLLVMIRTranslation.cpp index 36cb0a4ad3..ffc9e64d33 100644 --- a/third_party/intel/lib/Target/LLVMIR/Dialect/TritonGEN/TritonGENToLLVMIRTranslation.cpp +++ b/third_party/intel/lib/Target/LLVMIR/Dialect/TritonGEN/TritonGENToLLVMIRTranslation.cpp @@ -36,7 +36,6 @@ class TritonGENDialectLLVMIRTranslationInterface NamedAttribute attribute, LLVM::ModuleTranslation &moduleTranslation) const final { StringRef attrName = attribute.getName().getValue(); - // Unsupported attribute name: skip. 
if (attrName == triton::TritonGEN::TritonGENDialect::getCacheControlsAttrName()) { auto decorationAttr = From bf8fc32149beb67d4594677aeacab2dfe0ef6358 Mon Sep 17 00:00:00 2001 From: victor-eds Date: Fri, 22 Nov 2024 10:15:51 +0000 Subject: [PATCH 6/6] Fix attribute type --- test/Target/LLVMIR/triton-gen.mlir | 4 +++- .../LLVMIR/Dialect/TritonGEN/TritonGENToLLVMIRTranslation.cpp | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/test/Target/LLVMIR/triton-gen.mlir b/test/Target/LLVMIR/triton-gen.mlir index 54176b5e94..cfb9e81c44 100644 --- a/test/Target/LLVMIR/triton-gen.mlir +++ b/test/Target/LLVMIR/triton-gen.mlir @@ -1,10 +1,12 @@ // RUN: triton-translate -triton-to-llvmir -split-input-file %s | FileCheck %s // CHECK: define spir_kernel void @test_max_work_group_size() !max_work_group_size ![[MAX_WORK_GROUP_SIZE:.*]] { -llvm.func spir_kernelcc @test_max_work_group_size() attributes {triton_gen.max_work_group_size = [128 : i32, 1 : i32, 1 : i32]} { +llvm.func spir_kernelcc @test_max_work_group_size() attributes {triton_gen.max_work_group_size = array} { llvm.return } +// CHECK-DAG: ![[MAX_WORK_GROUP_SIZE]] = !{i64 128, i64 1, i64 1} + // ----- llvm.func @foo(%arg0: !llvm.ptr, %arg1: !llvm.ptr) diff --git a/third_party/intel/lib/Target/LLVMIR/Dialect/TritonGEN/TritonGENToLLVMIRTranslation.cpp b/third_party/intel/lib/Target/LLVMIR/Dialect/TritonGEN/TritonGENToLLVMIRTranslation.cpp index ffc9e64d33..2760ae674f 100644 --- a/third_party/intel/lib/Target/LLVMIR/Dialect/TritonGEN/TritonGENToLLVMIRTranslation.cpp +++ b/third_party/intel/lib/Target/LLVMIR/Dialect/TritonGEN/TritonGENToLLVMIRTranslation.cpp @@ -129,8 +129,8 @@ class TritonGENDialectLLVMIRTranslationInterface "Unexpected attribute"); SmallVector metadata; llvm::Type *i64 = llvm::IntegerType::get(llvmContext, 64); - for (int64_t i : - extractFromIntegerArrayAttr(attribute.getValue())) { + for (int32_t i : + cast(attribute.getValue()).asArrayRef()) { llvm::Constant *constant = llvm::ConstantInt::get(i64, i); metadata.push_back(llvm::ConstantAsMetadata::get(constant)); }
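
Summary of the end state of this series: after PATCH 6/6, a converted kernel carries the upstream `llvm.func` attribute `intel_reqd_sub_group_size` plus the dialect attribute `triton_gen.max_work_group_size` as a dense i32 array, and the TritonGEN translation interface lowers only the latter to function metadata. A minimal sketch (the kernel name is illustrative; the sub-group size 16 and work-group size 128x1x1 are taken from the tests above):

  llvm.func spir_kernelcc @kernel() attributes {
      intel_reqd_sub_group_size = 16 : i32,
      triton_gen.max_work_group_size = array<i32: 128, 1, 1>} {
    llvm.return
  }

Per test/Target/LLVMIR/triton-gen.mlir, `triton-translate -triton-to-llvmir` is expected to attach `!max_work_group_size !{i64 128, i64 1, i64 1}` to the resulting `spir_kernel` definition, while `intel_reqd_sub_group_size` (set via `setIntelReqdSubGroupSize` in PipelineManager.h) is assumed to be exported by the upstream LLVM dialect translation rather than by this dialect interface.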