@@ -832,9 +832,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
 
 // -----
 
-#blocked0 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#blocked0 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [16, 1], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32} {
   // CHECK: llvm.func spir_funccc @_Z7barrierj(i32) attributes {convergent, no_unwind, will_return}
   // CHECK-LABEL: convert_layout_dpas_block
   tt.func @convert_layout_dpas_blocked(%arg0: tensor<32x16xf32, #dpas>) {
@@ -852,9 +852,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 
 // -----
 
-#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 16], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32} {
   // CHECK: llvm.func spir_funccc @_Z7barrierj(i32) attributes {convergent, no_unwind, will_return}
   // CHECK-LABEL: convert_layout_dpas_block
   tt.func @convert_layout_dpas_blocked(%arg0: tensor<32x64xf32, #dpas>) {
@@ -876,9 +876,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 
 // -----
 
-#blocked = #ttg.blocked<{sizePerThread = [16, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 8], order = [0, 1]}>
+#blocked = #ttg.blocked<{sizePerThread = [16, 1], threadsPerWarp = [4, 4], warpsPerCTA = [1, 8], order = [0, 1]}>
 #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 1], repCluster = [1, 1]}>
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 16 : i32} {
   // CHECK-LABEL: convert_layout_dpas_transpose
   tt.func @convert_layout_dpas_transpose(%arg0: tensor<128x256xf8E5M2, #dpas>) {
     // CHECK-COUNT-128: llvm.store %{{.*}} : vector<1xi8>, !llvm.ptr<3>
@@ -951,13 +951,13 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
 
 // -----
 
-#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 16], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 #shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
 #dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>
 #dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#dpas, kWidth=2}>
 #smem = #ttg.shared_memory
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32} {
   tt.func @matmul_kernel_dot_operand_layout(%ptr: !tt.ptr<f32> {tt.divisibility = 16 : i32},
   %a: !ttg.memdesc<128x32xf16, #shared, #smem>, %b: !ttg.memdesc<32x256xf16, #shared, #smem>) {
     %cst = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #dpas>
@@ -977,14 +977,14 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 
 // -----
 
-#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 16], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 #shared0 = #ttg.swizzled_shared<{vec = 4, perPhase = 1, maxPhase = 8, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 #shared1 = #ttg.swizzled_shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
 #dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>
 #dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#dpas, kWidth=2}>
 #smem = #ttg.shared_memory
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32} {
   tt.func @matmul884_kernel_dot_operand_layout(%ptr: !tt.ptr<f32> {tt.divisibility = 16 : i32},
   %a: !ttg.memdesc<32x64xf16, #shared0, #smem>, %b: !ttg.memdesc<64x64xf16, #shared1, #smem>) {
     %cst = arith.constant dense<0.000000e+00> : tensor<32x64xf32, #dpas>
@@ -1028,11 +1028,11 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 
 #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
 #shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
-#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 16], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 #dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>
 #dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#dpas, kWidth=1}>
 #smem = #ttg.shared_memory
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32} {
   // CHECK-LABEL: matmul_tf32dot
   tt.func @matmul_tf32dot(%ptr: !tt.ptr<f32> {tt.divisibility = 16 : i32},
   %a: !ttg.memdesc<32x16xf32, #shared, #smem>, %b: !ttg.memdesc<16x32xf32, #shared, #smem>) {
@@ -1487,10 +1487,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
 
 #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
 #shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
-#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 16], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 #dot_operand_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>
 #dot_operand_b = #ttg.dot_op<{opIdx=1, parent=#dpas, kWidth=1}>
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32} {
   // CHECK-LABEL: matmul_tf32_cst_b
   tt.func @matmul_tf32_cst_b(%ptr: !tt.ptr<f32> {tt.divisibility = 16 : i32},
   %a: tensor<32x16xf32, #dot_operand_a>, %c: tensor<32x32xf32, #dpas>) {
@@ -1510,9 +1510,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 
 // -----
 
-#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 4], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 4], warpsPerCTA = [4, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32} {
   // CHECK-LABEL: matmul_f16_cst_operands
   tt.func public @matmul_f16_cst_operands(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}) {
     %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #dpas>
@@ -1556,7 +1556,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
 // -----
 #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 4, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [1, 1]}>
 #dot = #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth=2}>
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttig.support_bf16_conversion"} {
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32, "ttig.support_bf16_conversion"} {
   // CHECK-LABEL: test_s8_to_bf16_vectorized_conversion
   tt.func @test_s8_to_bf16_vectorized_conversion(%in: tensor<16x16xi8, #dpas>) {
     // CHECK: %[[F32:.+]] = llvm.sitofp %{{.*}} : i8 to f32
@@ -1856,7 +1856,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
 
 // CHECK-LABEL: volta_dot
 #dpas = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [1, 1]}>
-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32} {
   tt.func @volta_dot() {
     %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #dpas>
     %a = arith.constant dense<0.000000e+00> : tensor<32x64xf16, #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth=1}>>