@@ -47,16 +47,16 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-war
4747 // CHECK: %[[VAL_40:.*]] = tt.make_tensor_ptr %{{.*}}, {{\[}}%{{.*}}, %{{.*}}], {{\[}}%{{.*}}, %{{.*}}], {{\[}}%{{.*}}, %{{.*}}] {order = array<i32: 1, 0>} : <tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
4848 %22 = tt.make_tensor_ptr %arg1 , [%16 , %20 ], [%21 , %c1_i64 ], [%c0_i32 , %19 ] {order = array<i32 : 1 , 0 >} : <tensor <32 x256 xf16 , #blocked1 >>
4949 // CHECK: %[[VAL_41:.*]]:3 = scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%{{.*}} = %{{.*}}, %{{.*}} = %[[VAL_36]], %{{.*}} = %[[VAL_40]]) -> (tensor<64x256xf32, #[[DPAS]]>, !tt.ptr<tensor<64x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>>>, !tt.ptr<tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>) : i32 {
50- // CHECK: %[[VAL_46:.*]] = tt.load %{{.*}} {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<64x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>>>
51- // CHECK: %[[VAL_47:.*]] = tt.load %{{.*}} {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
50+ // CHECK: %[[VAL_46:.*]] = tt.load %{{.*}} {boundaryCheck = array<i32: 0, 1>, triton_intel_gpu.block_io = "row_major" } : !tt.ptr<tensor<64x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>>>
51+ // CHECK: %[[VAL_47:.*]] = tt.load %{{.*}} {boundaryCheck = array<i32: 0, 1>, triton_intel_gpu.block_io = "row_major" } : !tt.ptr<tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
5252 // CHECK-NOT: triton_gpu.convert_layout
5353 // CHECK-NEXT: %[[VAL_48:.*]] = tt.dot %[[VAL_46]], %[[VAL_47]], %{{.*}}, inputPrecision = tf32 : tensor<64x32xf16, #{{.*}}<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>> * tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>> -> tensor<64x256xf32, #[[DPAS]]>
5454 // CHECK: %[[VAL_49:.*]] = tt.advance %{{.*}}, {{\[}}%{{.*}}, %{{.*}}] : <tensor<64x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>>>
5555 // CHECK: %[[VAL_50:.*]] = tt.advance %{{.*}}, {{\[}}%{{.*}}, %{{.*}}] : <tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
5656 // CHECK: scf.yield %{{.*}}, %{{.*}}, %{{.*}} : tensor<64x256xf32, #[[DPAS]]>, !tt.ptr<tensor<64x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>>>, !tt.ptr<tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
5757 %23:3 = scf.for %arg9 = %c0_i32 to %arg5 step %c32_i32 iter_args (%arg10 = %cst , %arg11 = %18 , %arg12 = %22 ) -> (tensor <64 x256 xf32 , #dpas >, !tt.ptr <tensor <64 x32 xf16 , #blocked >>, !tt.ptr <tensor <32 x256 xf16 , #blocked1 >>) : i32 {
58- %28 = tt.load %arg11 {boundaryCheck = array<i32 : 0 , 1 >} : !tt.ptr <tensor <64 x32 xf16 , #blocked >>
59- %29 = tt.load %arg12 {boundaryCheck = array<i32 : 0 , 1 >} : !tt.ptr <tensor <32 x256 xf16 , #blocked1 >>
58+ %28 = tt.load %arg11 {boundaryCheck = array<i32 : 0 , 1 >, triton_intel_gpu.block_io = " row_major " } : !tt.ptr <tensor <64 x32 xf16 , #blocked >>
59+ %29 = tt.load %arg12 {boundaryCheck = array<i32 : 0 , 1 >, triton_intel_gpu.block_io = " row_major " } : !tt.ptr <tensor <32 x256 xf16 , #blocked1 >>
6060 %30 = triton_gpu.convert_layout %28 : tensor <64 x32 xf16 , #blocked > -> tensor <64 x32 xf16 , #dot0 >
6161 %31 = triton_gpu.convert_layout %29 : tensor <32 x256 xf16 , #blocked1 > -> tensor <32 x256 xf16 , #dot1 >
6262 %32 = tt.dot %30 , %31 , %arg10 , inputPrecision = tf32 : tensor <64 x32 xf16 , #dot0 > * tensor <32 x256 xf16 , #dot1 > -> tensor <64 x256 xf32 , #dpas >
@@ -130,7 +130,6 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
130130 scf.yield %32 , %33 , %34 : tensor <64 x256 xf32 , #dpas >, !tt.ptr <tensor <64 x32 xf16 , #blocked >>, !tt.ptr <tensor <32 x256 xf16 , #blocked1 >>
131131 }
132132 %24 = arith.truncf %23#0 : tensor <64 x256 xf32 , #dpas > to tensor <64 x256 xf16 , #dpas >
133- // CHECK-NOT: triton_gpu.convert_layout
134133 %25 = triton_gpu.convert_layout %24 : tensor <64 x256 xf16 , #dpas > -> tensor <64 x256 xf16 , #blocked1 >
135134 %26 = arith.extsi %arg8 : i32 to i64
136135 // CHECK: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array<i32: 1, 0>} : <tensor<64x256xf16, #[[DPAS]]>>
@@ -147,6 +146,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
147146// COM: Checks that DPAS encoding has been forwarded to the store op
148147// COM: The `tt.make_tensor_ptr` has multiple users (the storeOp + another OP)
149148// COM: The initial `tt.make_tensor_ptr` with non-DPAS encoding must be kept.
149+ // CHECK: #[[BLOCKED:.+]] = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 4], order = [1, 0]}>
150150// CHECK: #[[DPAS:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 1], A = [8, 16], B = [16, 16], C = [8, 16]}>
151151#blocked = #triton_gpu.blocked <{sizePerThread = [1 , 1 ], threadsPerWarp = [1 , 16 ], warpsPerCTA = [2 , 2 ], order = [1 , 0 ]}>
152152#blocked1 = #triton_gpu.blocked <{sizePerThread = [1 , 1 ], threadsPerWarp = [1 , 16 ], warpsPerCTA = [1 , 4 ], order = [1 , 0 ]}>
@@ -188,8 +188,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
188188 %21 = arith.extsi %arg7 : i32 to i64
189189 %22 = tt.make_tensor_ptr %arg1 , [%16 , %20 ], [%21 , %c1_i64 ], [%c0_i32 , %19 ] {order = array<i32 : 1 , 0 >} : <tensor <32 x256 xf16 , #blocked1 >>
190190 %23:3 = scf.for %arg9 = %c0_i32 to %arg5 step %c32_i32 iter_args (%arg10 = %cst , %arg11 = %18 , %arg12 = %22 ) -> (tensor <64 x256 xf32 , #dpas >, !tt.ptr <tensor <64 x32 xf16 , #blocked >>, !tt.ptr <tensor <32 x256 xf16 , #blocked1 >>) : i32 {
191- %28 = tt.load %arg11 {boundaryCheck = array<i32 : 0 , 1 >} : !tt.ptr <tensor <64 x32 xf16 , #blocked >>
192- %29 = tt.load %arg12 {boundaryCheck = array<i32 : 0 , 1 >} : !tt.ptr <tensor <32 x256 xf16 , #blocked1 >>
191+ %28 = tt.load %arg11 {boundaryCheck = array<i32 : 0 , 1 >, triton_intel_gpu.block_io = " row_major " } : !tt.ptr <tensor <64 x32 xf16 , #blocked >>
192+ %29 = tt.load %arg12 {boundaryCheck = array<i32 : 0 , 1 >, triton_intel_gpu.block_io = " row_major " } : !tt.ptr <tensor <32 x256 xf16 , #blocked1 >>
193193 %30 = triton_gpu.convert_layout %28 : tensor <64 x32 xf16 , #blocked > -> tensor <64 x32 xf16 , #dot0 >
194194 %31 = triton_gpu.convert_layout %29 : tensor <32 x256 xf16 , #blocked1 > -> tensor <32 x256 xf16 , #dot1 >
195195 %32 = tt.dot %30 , %31 , %arg10 , inputPrecision = tf32 : tensor <64 x32 xf16 , #dot0 > * tensor <32 x256 xf16 , #dot1 > -> tensor <64 x256 xf32 , #dpas >
@@ -198,11 +198,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
198198 scf.yield %32 , %33 , %34 : tensor <64 x256 xf32 , #dpas >, !tt.ptr <tensor <64 x32 xf16 , #blocked >>, !tt.ptr <tensor <32 x256 xf16 , #blocked1 >>
199199 }
200200 %24 = arith.truncf %23#0 : tensor <64 x256 xf32 , #dpas > to tensor <64 x256 xf16 , #dpas >
201- // CHECK-NOT: triton_gpu.convert_layout
202201 %25 = triton_gpu.convert_layout %24 : tensor <64 x256 xf16 , #dpas > -> tensor <64 x256 xf16 , #blocked1 >
203202 %26 = arith.extsi %arg8 : i32 to i64
204203 // CHECK: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array<i32: 1, 0>} : <tensor<64x256xf16, #[[DPAS]]>>
205- // CHECK: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array<i32: 1, 0>} : <tensor<64x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}> >>
204+ // CHECK: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array<i32: 1, 0>} : <tensor<64x256xf16, #[[BLOCKED]] >>
206205 %27 = tt.make_tensor_ptr %arg2 , [%15 , %20 ], [%26 , %c1_i64 ], [%14 , %19 ] {order = array<i32 : 1 , 0 >} : <tensor <64 x256 xf16 , #blocked1 >>
207206 // CHECK: tt.store {{.*}}, {{.*}} {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<64x256xf16, #[[DPAS]]>>
208207 tt.store %27 , %25 {boundaryCheck = array<i32 : 0 , 1 >} : !tt.ptr <tensor <64 x256 xf16 , #blocked1 >>
@@ -243,8 +242,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
243242 %18 = tt.make_tensor_ptr %arg0 , [%c0_i64 , %c0_i64 ], [%c0_i64 , %c1_i64 ], [%c0_i32 , %c0_i32 ] {order = array<i32 : 1 , 0 >} : <tensor <64 x32 xf16 , #blocked >>
244243 %22 = tt.make_tensor_ptr %arg1 , [%c0_i64 , %c0_i64 ], [%c0_i64 , %c1_i64 ], [%c0_i32 , %c0_i32 ] {order = array<i32 : 1 , 0 >} : <tensor <32 x256 xf16 , #blocked1 >>
245244 %23:3 = scf.for %arg9 = %c0_i32 to %arg5 step %c32_i32 iter_args (%arg10 = %cst , %arg11 = %18 , %arg12 = %22 ) -> (tensor <64 x256 xf32 , #blocked1 >, !tt.ptr <tensor <64 x32 xf16 , #blocked >>, !tt.ptr <tensor <32 x256 xf16 , #blocked1 >>) : i32 {
246- %28 = tt.load %arg11 {boundaryCheck = array<i32 : 0 , 1 >} : !tt.ptr <tensor <64 x32 xf16 , #blocked >>
247- %29 = tt.load %arg12 {boundaryCheck = array<i32 : 0 , 1 >} : !tt.ptr <tensor <32 x256 xf16 , #blocked1 >>
245+ %28 = tt.load %arg11 {boundaryCheck = array<i32 : 0 , 1 >, triton_intel_gpu.block_io = " row_major " } : !tt.ptr <tensor <64 x32 xf16 , #blocked >>
246+ %29 = tt.load %arg12 {boundaryCheck = array<i32 : 0 , 1 >, triton_intel_gpu.block_io = " row_major " } : !tt.ptr <tensor <32 x256 xf16 , #blocked1 >>
248247 %36 = triton_gpu.convert_layout %arg10 : tensor <64 x256 xf32 , #blocked1 > -> tensor <64 x256 xf32 , #dpas >
249248 %30 = triton_gpu.convert_layout %28 : tensor <64 x32 xf16 , #blocked > -> tensor <64 x32 xf16 , #dot0 >
250249 %31 = triton_gpu.convert_layout %29 : tensor <32 x256 xf16 , #blocked1 > -> tensor <32 x256 xf16 , #dot1 >