55#dpas = #ttig.dpas <{repeatCount = 8 , systolicDepth = 8 , executionSize = 16 , opsPerChan = 2 , threadsPerWarp = 16 , warpsPerCTA = [4 , 2 ], repCluster = [1 , 1 ], A = [8 , 16 ], B = [16 , 16 ], C = [8 , 16 ]}>
66#dot0 = #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth =1 }>
77#dot1 = #ttg.dot_op <{opIdx = 1 , parent = #dpas , kWidth =2 }>
8- module attributes {" ttg.num-warps" = 8 : i32 , " ttg.threads-per-warp" = 16 : i32 } {
8+ module attributes {" ttg.num-warps" = 8 : i32 , " ttg.threads-per-warp" = 16 : i32 , " ttig.support_sg_2d_block " } {
99 tt.func public @matmul_no_scf_with_advance_kernel (%arg0: !tt.ptr <f16 >, %arg1: !tt.ptr <f16 >, %arg2: i64 , %arg3: i64 , %arg4: i64 , %arg5: i64 , %arg7: i64 ) {
1010 %C = arith.constant dense <0.000000e+00 > : tensor <64 x64 xf32 , #dpas >
1111 %c0_i32 = arith.constant 0 : i32
@@ -29,7 +29,7 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32}
2929#dpas = #ttig.dpas <{repeatCount = 8 , systolicDepth = 8 , executionSize = 16 , opsPerChan = 2 , threadsPerWarp = 16 , warpsPerCTA = [4 , 2 ], repCluster = [1 , 1 ], A = [8 , 16 ], B = [16 , 16 ], C = [8 , 16 ]}>
3030#dot0 = #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth =1 }>
3131#dot1 = #ttg.dot_op <{opIdx = 1 , parent = #dpas , kWidth =2 }>
32- module attributes {" ttg.num-warps" = 8 : i32 , " ttg.threads-per-warp" = 16 : i32 } {
32+ module attributes {" ttg.num-warps" = 8 : i32 , " ttg.threads-per-warp" = 16 : i32 , " ttig.support_sg_2d_block " } {
3333 tt.func public @matmul_no_scf_with_add_kernel (%arg0: !tt.ptr <f16 >, %arg1: !tt.ptr <f16 >, %arg2: !tt.ptr <f32 >, %arg3: i64 , %arg4: i64 , %arg5: i64 , %arg6: i64 , %arg8: i64 ) {
3434 %C = arith.constant dense <0.000000e+00 > : tensor <64 x64 xf32 , #dpas >
3535 %c0_i32 = arith.constant 0 : i32
@@ -57,7 +57,7 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32}
5757#dpas = #ttig.dpas <{repeatCount = 8 , systolicDepth = 8 , executionSize = 16 , opsPerChan = 2 , threadsPerWarp = 16 , warpsPerCTA = [4 , 2 ], repCluster = [1 , 1 ], A = [8 , 16 ], B = [16 , 16 ], C = [8 , 16 ]}>
5858#dot0 = #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth =1 }>
5959#dot1 = #ttg.dot_op <{opIdx = 1 , parent = #dpas , kWidth =2 }>
60- module attributes {" ttg.num-warps" = 8 : i32 , " ttg.threads-per-warp" = 16 : i32 } {
60+ module attributes {" ttg.num-warps" = 8 : i32 , " ttg.threads-per-warp" = 16 : i32 , " ttig.support_sg_2d_block " } {
6161 tt.func public @matmul_no_scf_with_add_transpose_kernel (%arg0: !tt.ptr <f16 >, %arg1: !tt.ptr <f16 >, %arg2: !tt.ptr <f32 >, %arg3: i64 , %arg4: i64 , %arg5: i64 , %arg6: i64 , %arg8: i64 ) {
6262 %C = arith.constant dense <0.000000e+00 > : tensor <64 x64 xf32 , #dpas >
6363 %c0_i32 = arith.constant 0 : i32
@@ -83,7 +83,7 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32}
8383#dpas = #ttig.dpas <{repeatCount = 8 , systolicDepth = 8 , executionSize = 16 , opsPerChan = 1 , threadsPerWarp = 16 , warpsPerCTA = [8 , 4 ], repCluster = [1 , 1 ], A = [8 , 8 ], B = [8 , 16 ], C = [8 , 16 ]}>
8484#dot0 = #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth =1 }>
8585#dot1 = #ttg.dot_op <{opIdx = 1 , parent = #dpas , kWidth =1 }>
86- module attributes {" ttg.num-warps" = 8 : i32 , " ttg.threads-per-warp" = 16 : i32 } {
86+ module attributes {" ttg.num-warps" = 8 : i32 , " ttg.threads-per-warp" = 16 : i32 , " ttig.support_sg_2d_block " } {
8787 tt.func public @matmul_no_scf_with_advance_kernel (%arg0: !tt.ptr <f32 >, %arg1: !tt.ptr <f32 >, %arg2: i64 , %arg3: i64 , %arg4: i64 , %arg5: i64 , %arg7: i64 ) {
8888 %C = arith.constant dense <0.000000e+00 > : tensor <64 x64 xf32 , #dpas >
8989 %c0_i32 = arith.constant 0 : i32
@@ -105,7 +105,7 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32}
105105#dpas = #ttig.dpas <{repeatCount = 8 , systolicDepth = 8 , executionSize = 16 , opsPerChan = 2 , threadsPerWarp = 16 , warpsPerCTA = [1 , 1 ], repCluster = [4 , 2 ], A = [32 , 16 ], B = [16 , 32 ], C = [32 , 32 ]}>
106106#dot0 = #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth =1 }>
107107#dot1 = #ttg.dot_op <{opIdx = 1 , parent = #dpas , kWidth =2 }>
108- module attributes {" ttg.num-warps" = 8 : i32 , " ttg.threads-per-warp" = 16 : i32 } {
108+ module attributes {" ttg.num-warps" = 8 : i32 , " ttg.threads-per-warp" = 16 : i32 , " ttig.support_sg_2d_block " } {
109109// CHECK-LABEL: llvm.func spir_kernelcc @dot_op_a_2d_load(
110110// CHECK-SAME: %[[VAL_0:.*]]: !llvm.ptr<1>,
111111// CHECK-SAME: %[[VAL_1:.*]]: i64, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i64, %[[VAL_4:.*]]: i64, %[[PTR_1:.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array<i32: 128, 1, 1>} {
@@ -168,7 +168,7 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32}
168168#dpas = #ttig.dpas <{repeatCount = 8 , systolicDepth = 8 , executionSize = 16 , opsPerChan = 2 , threadsPerWarp = 16 , warpsPerCTA = [1 , 1 ], repCluster = [4 , 2 ], A = [32 , 16 ], B = [16 , 32 ], C = [32 , 32 ]}>
169169#dot0 = #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth =1 }>
170170#dot1 = #ttg.dot_op <{opIdx = 1 , parent = #dpas , kWidth =2 }>
171- module attributes {" ttg.num-warps" = 8 : i32 , " ttg.threads-per-warp" = 16 : i32 } {
171+ module attributes {" ttg.num-warps" = 8 : i32 , " ttg.threads-per-warp" = 16 : i32 , " ttig.support_sg_2d_block " } {
172172// CHECK-LABEL: llvm.func spir_kernelcc @dot_op_b_2d_load(
173173// CHECK-SAME: %[[VAL_0:.*]]: !llvm.ptr<1>,
174174// CHECK-SAME: %[[VAL_1:.*]]: i64, %[[VAL_2:.*]]: i64, %[[VAL_3:.*]]: i64, %[[PTR_1:.*]]: !llvm.ptr<1>) attributes {intel_reqd_sub_group_size = 16 : i32, triton_gen.max_work_group_size = array<i32: 128, 1, 1>} {
@@ -230,7 +230,7 @@ module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32}
230230
231231#dpas = #ttig.dpas <{repeatCount = 8 , systolicDepth = 8 , executionSize = 16 , opsPerChan = 2 , threadsPerWarp = 16 , warpsPerCTA = [1 , 1 ], repCluster = [1 , 2 ], A = [8 , 16 ], B = [16 , 32 ], C = [8 , 32 ]}>
232232#dot_b = #ttg.dot_op <{opIdx = 1 , parent = #dpas , kWidth = 2 }>
233- module attributes {" ttg.num-warps" = 1 : i32 , " ttg.threads-per-warp" = 16 : i32 } {
233+ module attributes {" ttg.num-warps" = 1 : i32 , " ttg.threads-per-warp" = 16 : i32 , " ttig.support_sg_2d_block " } {
234234 // CHECK-LABEL: llvm.func spir_kernelcc @column_major_dot_b
235235 tt.func public @column_major_dot_b (%arg0: !tt.ptr <f16 >, %col_stride: i64 ) {
236236 %c64_i32 = arith.constant 64 : i32
@@ -263,7 +263,7 @@ module attributes {"ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32}
263263
264264#dpas = #ttig.dpas <{repeatCount = 8 , systolicDepth = 8 , executionSize = 16 , opsPerChan = 2 , threadsPerWarp = 16 , warpsPerCTA = [1 , 1 ], repCluster = [1 , 1 ], A = [8 , 16 ], B = [16 , 16 ], C = [8 , 16 ]}>
265265#dot_b = #ttg.dot_op <{opIdx = 1 , parent = #dpas , kWidth = 2 }>
266- module attributes {" ttg.num-warps" = 1 : i32 , " ttg.threads-per-warp" = 16 : i32 } {
266+ module attributes {" ttg.num-warps" = 1 : i32 , " ttg.threads-per-warp" = 16 : i32 , " ttig.support_sg_2d_block " } {
267267 // CHECK-LABEL: llvm.func spir_kernelcc @column_major_dot_b
268268 tt.func public @column_major_dot_b (%arg0: !tt.ptr <f16 >, %col_stride: i64 ) {
269269 %c64_i64 = arith.constant 64 : i64
0 commit comments