@@ -801,7 +801,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
 #blocked0 = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 2], warpsPerCTA = [1, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 #shared0 = #ttg.shared<{vec = 1, perPhase = 2, maxPhase = 8, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 #dpas0 = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [1, 1]}>
-#dot_operand_a = #ttg.dot_op<{opIdx = 0, parent = #dpas0, kWidth = 2}>
+#dot_operand_a = #ttg.dot_op<{opIdx = 0, parent = #dpas0, kWidth = 1}>
 #dot_operand_b = #ttg.dot_op<{opIdx = 1, parent = #dpas0, kWidth = 2}>
 #smem = #ttg.shared_memory
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32} {
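
Note on the pattern in this and the following hunks (my reading of the diff, not stated in it): for the Intel DPAS layout, the A operand (opIdx = 0) now uses kWidth = opsPerChan / 2, while the B operand (opIdx = 1) keeps kWidth = opsPerChan. A minimal sketch of that rule, with hypothetical attribute names #a and #b:

    // opsPerChan = 2: A packs half as many elements per channel slot as B.
    #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [1, 1]}>
    #a = #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 1}> // kWidth = opsPerChan / 2
    #b = #ttg.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}> // kWidth = opsPerChan
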
@@ -953,7 +953,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
 #blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 #shared = #ttg.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
-#dot_operand_a = #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>
+#dot_operand_a = #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 1}>
 #dot_operand_b = #ttg.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>
 #smem = #ttg.shared_memory
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
@@ -980,7 +980,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 #shared0 = #ttg.shared<{vec = 4, perPhase = 1, maxPhase = 8, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 #shared1 = #ttg.shared<{vec = 8, perPhase = 1, maxPhase = 4, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
-#dot_operand_a = #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>
+#dot_operand_a = #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 1}>
 #dot_operand_b = #ttg.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>
 #smem = #ttg.shared_memory
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
@@ -1028,7 +1028,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
 #shared = #ttg.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 #blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
-#dot_operand_a = #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>
+#dot_operand_a = #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 1}>
 #dot_operand_b = #ttg.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>
 #smem = #ttg.shared_memory
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
@@ -1422,7 +1422,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32} {
 #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
 #shared = #ttg.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 #blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [1, 4], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
-#dot_operand_a = #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>
+#dot_operand_a = #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 1}>
 #dot_operand_b = #ttg.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK-LABEL: matmul_tf32_cst_b
@@ -1454,10 +1454,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
     // CHECK: %[[Cf16:.+]] = llvm.bitcast %[[C1f]] : f16 to f16
     // CHECK: %[[U:.+]] = llvm.mlir.undef : !llvm.struct<(f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16, f16)>
     // CHECK: llvm.insertvalue %[[Cf16]], %[[U]][0]
-    %cst_0 = arith.constant dense<1.000000e+00> : tensor<32x32xf16, #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>>
+    %cst_0 = arith.constant dense<1.000000e+00> : tensor<32x32xf16, #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 1}>>
     %cst_1 = arith.constant dense<1.000000e+00> : tensor<32x32xf16, #ttg.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>>
     %cst_2 = arith.constant dense<32> : tensor<32x1xi32, #blocked>
-    %0 = tt.dot %cst_0, %cst_1, %cst : tensor<32x32xf16, #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>> * tensor<32x32xf16, #ttg.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>> -> tensor<32x32xf32, #dpas>
+    %0 = tt.dot %cst_0, %cst_1, %cst : tensor<32x32xf16, #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 1}>> * tensor<32x32xf16, #ttg.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>> -> tensor<32x32xf32, #dpas>
     %1 = ttg.convert_layout %0 : tensor<32x32xf32, #dpas> -> tensor<32x32xf32, #blocked>
     %2 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
     %3 = tt.expand_dims %2 {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked>
@@ -1489,7 +1489,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
 
 // -----
 #dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 4, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [1, 1]}>
-#dot = #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 4}>
+#dot = #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "triton_intel_gpu.support_bf16_conversion"} {
   // CHECK-LABEL: test_s8_to_bf16_vectorized_conversion
   tt.func @test_s8_to_bf16_vectorized_conversion(%in: tensor<16x16xi8, #dpas>) {
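
This is the only opsPerChan = 4 hunk in the section, and the A-operand kWidth drops from 4 to 2, consistent with the opsPerChan / 2 reading above. A sketch under that assumption, with hypothetical names #dpas4 and #a4:

    // opsPerChan = 4: the A operand again uses half the channel width.
    #dpas4 = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 4, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [1, 1]}>
    #a4 = #ttg.dot_op<{opIdx = 0, parent = #dpas4, kWidth = 2}> // kWidth = opsPerChan / 2
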
@@ -1793,10 +1793,10 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   tt.func @volta_dot() {
     %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #dpas>
-    %a = arith.constant dense<0.000000e+00> : tensor<32x64xf16, #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>>
+    %a = arith.constant dense<0.000000e+00> : tensor<32x64xf16, #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 1}>>
     %b = arith.constant dense<0.000000e+00> : tensor<64x32xf16, #ttg.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>>
 
-    %87 = tt.dot %a, %b, %cst : tensor<32x64xf16, #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>> * tensor<64x32xf16, #ttg.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>> -> tensor<32x32xf32, #dpas>
+    %87 = tt.dot %a, %b, %cst : tensor<32x64xf16, #ttg.dot_op<{opIdx = 0, parent = #dpas, kWidth = 1}>> * tensor<64x32xf16, #ttg.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>> -> tensor<32x32xf32, #dpas>
     tt.return
   }
 }