@@ -832,9 +832,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
832
832
833
833
// -----
834
834
835
- #blocked0 = #ttg.blocked <{sizePerThread = [1 , 4 ], threadsPerWarp = [32 , 1 ], warpsPerCTA = [1 , 4 ], order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [1 , 0 ]}>
835
+ #blocked0 = #ttg.blocked <{sizePerThread = [1 , 4 ], threadsPerWarp = [16 , 1 ], warpsPerCTA = [1 , 4 ], order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [1 , 0 ]}>
836
836
#dpas = #ttig.dpas <{repeatCount = 8 , systolicDepth = 8 , executionSize = 16 , opsPerChan = 2 , threadsPerWarp = 16 , warpsPerCTA = [2 , 2 ], repCluster = [1 , 1 ]}>
837
- module attributes {" ttg.num-ctas" = 1 : i32 , " ttg.num-warps" = 4 : i32 } {
837
+ module attributes {" ttg.num-ctas" = 1 : i32 , " ttg.num-warps" = 4 : i32 , " ttg.threads-per-warp " = 16 : i32 } {
838
838
// CHECK: llvm.func spir_funccc @_Z7barrierj(i32) attributes {convergent, no_unwind, will_return}
839
839
// CHECK-LABEL: convert_layout_dpas_block
840
840
tt.func @convert_layout_dpas_blocked (%arg0: tensor <32 x16 xf32 , #dpas >) {
@@ -852,9 +852,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
852
852
853
853
// -----
854
854
855
- #blocked = #ttg.blocked <{sizePerThread = [1 , 4 ], threadsPerWarp = [2 , 16 ], warpsPerCTA = [1 , 4 ], order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [1 , 0 ]}>
855
+ #blocked = #ttg.blocked <{sizePerThread = [1 , 4 ], threadsPerWarp = [1 , 16 ], warpsPerCTA = [1 , 4 ], order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [1 , 0 ]}>
856
856
#dpas = #ttig.dpas <{repeatCount = 8 , systolicDepth = 8 , executionSize = 16 , opsPerChan = 2 , threadsPerWarp = 16 , warpsPerCTA = [2 , 2 ], repCluster = [1 , 1 ]}>
857
- module attributes {" ttg.num-ctas" = 1 : i32 , " ttg.num-warps" = 4 : i32 } {
857
+ module attributes {" ttg.num-ctas" = 1 : i32 , " ttg.num-warps" = 4 : i32 , " ttg.threads-per-warp " = 16 : i32 } {
858
858
// CHECK: llvm.func spir_funccc @_Z7barrierj(i32) attributes {convergent, no_unwind, will_return}
859
859
// CHECK-LABEL: convert_layout_dpas_block
860
860
tt.func @convert_layout_dpas_blocked (%arg0: tensor <32 x64 xf32 , #dpas >) {
@@ -876,9 +876,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
876
876
877
877
// -----
878
878
879
- #blocked = #ttg.blocked <{sizePerThread = [16 , 1 ], threadsPerWarp = [8 , 4 ], warpsPerCTA = [1 , 8 ], order = [0 , 1 ]}>
879
+ #blocked = #ttg.blocked <{sizePerThread = [16 , 1 ], threadsPerWarp = [4 , 4 ], warpsPerCTA = [1 , 8 ], order = [0 , 1 ]}>
880
880
#dpas = #ttig.dpas <{repeatCount = 8 , systolicDepth = 8 , executionSize = 16 , opsPerChan = 2 , threadsPerWarp = 16 , warpsPerCTA = [8 , 1 ], repCluster = [1 , 1 ]}>
881
- module attributes {" ttg.num-ctas" = 1 : i32 , " ttg.num-warps" = 8 : i32 } {
881
+ module attributes {" ttg.num-ctas" = 1 : i32 , " ttg.num-warps" = 8 : i32 , " ttg.threads-per-warp " = 16 : i32 } {
882
882
// CHECK-LABEL: convert_layout_dpas_transpose
883
883
tt.func @convert_layout_dpas_transpose (%arg0: tensor <128 x256 xf8 E5 M2 , #dpas >) {
884
884
// CHECK-COUNT-128: llvm.store %{{.*}} : vector<1xi8>, !llvm.ptr<3>
@@ -951,13 +951,13 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
951
951
952
952
// -----
953
953
954
- #blocked = #ttg.blocked <{sizePerThread = [1 , 4 ], threadsPerWarp = [2 , 16 ], warpsPerCTA = [1 , 4 ], order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [1 , 0 ]}>
954
+ #blocked = #ttg.blocked <{sizePerThread = [1 , 4 ], threadsPerWarp = [1 , 16 ], warpsPerCTA = [1 , 4 ], order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [1 , 0 ]}>
955
955
#shared = #ttg.swizzled_shared <{vec = 1 , perPhase = 1 , maxPhase = 1 , order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [1 , 0 ]}>
956
956
#dpas = #ttig.dpas <{repeatCount = 8 , systolicDepth = 8 , executionSize = 16 , opsPerChan = 2 , threadsPerWarp = 16 , warpsPerCTA = [2 , 2 ], repCluster = [1 , 1 ]}>
957
957
#dot_operand_a = #ttg.dot_op <{opIdx =0 , parent =#dpas , kWidth =1 }>
958
958
#dot_operand_b = #ttg.dot_op <{opIdx =1 , parent =#dpas , kWidth =2 }>
959
959
#smem = #ttg.shared_memory
960
- module attributes {" ttg.num-ctas" = 1 : i32 , " ttg.num-warps" = 4 : i32 } {
960
+ module attributes {" ttg.num-ctas" = 1 : i32 , " ttg.num-warps" = 4 : i32 , " ttg.threads-per-warp " = 16 : i32 } {
961
961
tt.func @matmul_kernel_dot_operand_layout (%ptr: !tt.ptr <f32 > {tt.divisibility = 16 : i32 },
962
962
%a: !ttg.memdesc <128 x32 xf16 , #shared , #smem >, %b: !ttg.memdesc <32 x256 xf16 , #shared , #smem >) {
963
963
%cst = arith.constant dense <0.000000e+00 > : tensor <128 x256 xf32 , #dpas >
@@ -977,14 +977,14 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
977
977
978
978
// -----
979
979
980
- #blocked = #ttg.blocked <{sizePerThread = [1 , 4 ], threadsPerWarp = [2 , 16 ], warpsPerCTA = [1 , 4 ], order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [1 , 0 ]}>
980
+ #blocked = #ttg.blocked <{sizePerThread = [1 , 4 ], threadsPerWarp = [1 , 16 ], warpsPerCTA = [1 , 4 ], order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [1 , 0 ]}>
981
981
#shared0 = #ttg.swizzled_shared <{vec = 4 , perPhase = 1 , maxPhase = 8 , order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [1 , 0 ]}>
982
982
#shared1 = #ttg.swizzled_shared <{vec = 8 , perPhase = 1 , maxPhase = 4 , order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [1 , 0 ]}>
983
983
#dpas = #ttig.dpas <{repeatCount = 8 , systolicDepth = 8 , executionSize = 16 , opsPerChan = 2 , threadsPerWarp = 16 , warpsPerCTA = [2 , 2 ], repCluster = [1 , 1 ]}>
984
984
#dot_operand_a = #ttg.dot_op <{opIdx =0 , parent =#dpas , kWidth =1 }>
985
985
#dot_operand_b = #ttg.dot_op <{opIdx =1 , parent =#dpas , kWidth =2 }>
986
986
#smem = #ttg.shared_memory
987
- module attributes {" ttg.num-ctas" = 1 : i32 , " ttg.num-warps" = 4 : i32 } {
987
+ module attributes {" ttg.num-ctas" = 1 : i32 , " ttg.num-warps" = 4 : i32 , " ttg.threads-per-warp " = 16 : i32 } {
988
988
tt.func @matmul884_kernel_dot_operand_layout (%ptr: !tt.ptr <f32 > {tt.divisibility = 16 : i32 },
989
989
%a: !ttg.memdesc <32 x64 xf16 , #shared0 , #smem >, %b: !ttg.memdesc <64 x64 xf16 , #shared1 , #smem >) {
990
990
%cst = arith.constant dense <0.000000e+00 > : tensor <32 x64 xf32 , #dpas >
@@ -1028,11 +1028,11 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
1028
1028
1029
1029
#dpas = #ttig.dpas <{repeatCount = 8 , systolicDepth = 8 , executionSize = 16 , opsPerChan = 1 , threadsPerWarp = 16 , warpsPerCTA = [2 , 2 ], repCluster = [1 , 1 ]}>
1030
1030
#shared = #ttg.swizzled_shared <{vec = 1 , perPhase = 1 , maxPhase = 1 , order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [1 , 0 ]}>
1031
- #blocked = #ttg.blocked <{sizePerThread = [1 , 4 ], threadsPerWarp = [2 , 16 ], warpsPerCTA = [1 , 4 ], order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [1 , 0 ]}>
1031
+ #blocked = #ttg.blocked <{sizePerThread = [1 , 4 ], threadsPerWarp = [1 , 16 ], warpsPerCTA = [1 , 4 ], order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [1 , 0 ]}>
1032
1032
#dot_operand_a = #ttg.dot_op <{opIdx =0 , parent =#dpas , kWidth =1 }>
1033
1033
#dot_operand_b = #ttg.dot_op <{opIdx =1 , parent =#dpas , kWidth =1 }>
1034
1034
#smem = #ttg.shared_memory
1035
- module attributes {" ttg.num-ctas" = 1 : i32 , " ttg.num-warps" = 4 : i32 } {
1035
+ module attributes {" ttg.num-ctas" = 1 : i32 , " ttg.num-warps" = 4 : i32 , " ttg.threads-per-warp " = 16 : i32 } {
1036
1036
// CHECK-LABEL: matmul_tf32dot
1037
1037
tt.func @matmul_tf32dot (%ptr: !tt.ptr <f32 > {tt.divisibility = 16 : i32 },
1038
1038
%a: !ttg.memdesc <32 x16 xf32 , #shared , #smem >, %b: !ttg.memdesc <16 x32 xf32 , #shared , #smem >) {
@@ -1487,10 +1487,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
1487
1487
1488
1488
#dpas = #ttig.dpas <{repeatCount = 8 , systolicDepth = 8 , executionSize = 16 , opsPerChan = 1 , threadsPerWarp = 16 , warpsPerCTA = [2 , 2 ], repCluster = [1 , 1 ]}>
1489
1489
#shared = #ttg.swizzled_shared <{vec = 1 , perPhase = 1 , maxPhase = 1 , order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [1 , 0 ]}>
1490
- #blocked = #ttg.blocked <{sizePerThread = [1 , 4 ], threadsPerWarp = [2 , 16 ], warpsPerCTA = [1 , 4 ], order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [1 , 0 ]}>
1490
+ #blocked = #ttg.blocked <{sizePerThread = [1 , 4 ], threadsPerWarp = [1 , 16 ], warpsPerCTA = [1 , 4 ], order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [1 , 0 ]}>
1491
1491
#dot_operand_a = #ttg.dot_op <{opIdx =0 , parent =#dpas , kWidth =1 }>
1492
1492
#dot_operand_b = #ttg.dot_op <{opIdx =1 , parent =#dpas , kWidth =1 }>
1493
- module attributes {" ttg.num-ctas" = 1 : i32 , " ttg.num-warps" = 4 : i32 } {
1493
+ module attributes {" ttg.num-ctas" = 1 : i32 , " ttg.num-warps" = 4 : i32 , " ttg.threads-per-warp " = 16 : i32 } {
1494
1494
// CHECK-LABEL: matmul_tf32_cst_b
1495
1495
tt.func @matmul_tf32_cst_b (%ptr: !tt.ptr <f32 > {tt.divisibility = 16 : i32 },
1496
1496
%a: tensor <32 x16 xf32 , #dot_operand_a >, %c: tensor <32 x32 xf32 , #dpas >) {
@@ -1510,9 +1510,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
1510
1510
1511
1511
// -----
1512
1512
1513
- #blocked = #ttg.blocked <{sizePerThread = [1 , 8 ], threadsPerWarp = [8 , 4 ], warpsPerCTA = [4 , 1 ], order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [1 , 0 ]}>
1513
+ #blocked = #ttg.blocked <{sizePerThread = [1 , 8 ], threadsPerWarp = [4 , 4 ], warpsPerCTA = [4 , 1 ], order = [1 , 0 ], CTAsPerCGA = [1 , 1 ], CTASplitNum = [1 , 1 ], CTAOrder = [1 , 0 ]}>
1514
1514
#dpas = #ttig.dpas <{repeatCount = 8 , systolicDepth = 8 , executionSize = 16 , opsPerChan = 2 , threadsPerWarp = 16 , warpsPerCTA = [2 , 2 ], repCluster = [1 , 1 ]}>
1515
- module attributes {" ttg.num-ctas" = 1 : i32 , " ttg.num-warps" = 4 : i32 , " ttg.threads-per-warp" = 32 : i32 } {
1515
+ module attributes {" ttg.num-ctas" = 1 : i32 , " ttg.num-warps" = 4 : i32 , " ttg.threads-per-warp" = 16 : i32 } {
1516
1516
// CHECK-LABEL: matmul_f16_cst_operands
1517
1517
tt.func public @matmul_f16_cst_operands (%arg0: !tt.ptr <f16 > {tt.divisibility = 16 : i32 }) {
1518
1518
%cst = arith.constant dense <0.000000e+00 > : tensor <32 x32 xf32 , #dpas >
@@ -1556,7 +1556,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
1556
1556
// -----
1557
1557
#dpas = #ttig.dpas <{repeatCount = 8 , systolicDepth = 8 , executionSize = 16 , opsPerChan = 4 , threadsPerWarp = 16 , warpsPerCTA = [1 , 1 ], repCluster = [1 , 1 ]}>
1558
1558
#dot = #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth =2 }>
1559
- module attributes {" ttg.num-ctas" = 1 : i32 , " ttg.num-warps" = 1 : i32 , " ttig.support_bf16_conversion" } {
1559
+ module attributes {" ttg.num-ctas" = 1 : i32 , " ttg.num-warps" = 1 : i32 , " ttg.threads-per-warp " = 16 : i32 , " ttig.support_bf16_conversion" } {
1560
1560
// CHECK-LABEL: test_s8_to_bf16_vectorized_conversion
1561
1561
tt.func @test_s8_to_bf16_vectorized_conversion (%in: tensor <16 x16 xi8 , #dpas >) {
1562
1562
// CHECK: %[[F32:.+]] = llvm.sitofp %{{.*}} : i8 to f32
@@ -1856,7 +1856,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
1856
1856
1857
1857
// CHECK-LABEL: volta_dot
1858
1858
#dpas = #ttig.dpas <{repeatCount = 8 , systolicDepth = 8 , executionSize = 16 , opsPerChan = 2 , threadsPerWarp = 16 , warpsPerCTA = [1 , 1 ], repCluster = [1 , 1 ]}>
1859
- module attributes {" ttg.num-ctas" = 1 : i32 , " ttg.num-warps" = 4 : i32 } {
1859
+ module attributes {" ttg.num-ctas" = 1 : i32 , " ttg.num-warps" = 1 : i32 , " ttg.threads-per-warp " = 16 : i32 } {
1860
1860
tt.func @volta_dot () {
1861
1861
%cst = arith.constant dense <0.000000e+00 > : tensor <32 x32 xf32 , #dpas >
1862
1862
%a = arith.constant dense <0.000000e+00 > : tensor <32 x64 xf16 , #ttg.dot_op <{opIdx = 0 , parent = #dpas , kWidth =1 }>>
0 commit comments