Commit bae3356

[RemoveLayout] Remove convert layout op for any layout if the user is tt.store with block pointer (#4751)
It is always cheaper to store the value directly to the memory referenced by the block pointer than to convert its layout first.

Signed-off-by: Lu,Chengjun <[email protected]>
1 parent ff0309c commit bae3356
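
The rewrite targets a ttg.convert_layout whose result feeds a tt.store through a block pointer built by tt.make_tensor_ptr. A condensed sketch of the input pattern, distilled from the new test case below (%x, %base, and the elided operand list are hypothetical placeholders):

    %v = ttg.convert_layout %x : tensor<64x256xf16, #blocked> -> tensor<64x256xf16, #blocked1>
    %p = tt.make_tensor_ptr %base, ... {order = array<i32: 1, 0>} : <tensor<64x256xf16, #blocked1>>
    tt.store %p, %v {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<64x256xf16, #blocked1>>

Because the block-pointer store accepts any source layout, the pass can retype %p to #blocked and store %x directly, so the conversion never materializes.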

2 files changed: +25 -11 lines changed

test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir

Lines changed: 25 additions & 0 deletions
@@ -261,3 +261,28 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
     tt.return
   }
 }
+
+// -----
+
+// COM: Case 5:
+// COM: Checks that block encoding has been forwarded to the store op
+// COM: and the ttg.convert_layout operation has been removed
+// CHECK: #[[BLOCKED:.+]] = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 2], order = [1, 0]}>
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 2], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 4], order = [1, 0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32, "ttig.support_sg_2d_block"} {
+  tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr<f16>) {
+    %c8_i32 = arith.constant 8 : i32
+    %c64_i64 = arith.constant 64 : i64
+    %c1_i64 = arith.constant 1 : i64
+    %c256_i64 = arith.constant 256 : i64
+    %cst = arith.constant dense<0.000000e+00> : tensor<64x256xf16, #blocked>
+    // CHECK-NOT: ttg.convert_layout
+    %25 = ttg.convert_layout %cst : tensor<64x256xf16, #blocked> -> tensor<64x256xf16, #blocked1>
+    // CHECK: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array<i32: 1, 0>} : <tensor<64x256xf16, #[[BLOCKED]]>>
+    %27 = tt.make_tensor_ptr %arg0, [%c256_i64, %c256_i64], [%c64_i64, %c1_i64], [%c8_i32, %c8_i32] {order = array<i32: 1, 0>} : <tensor<64x256xf16, #blocked1>>
+    // CHECK: tt.store {{.*}}, {{.*}} {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<64x256xf16, #[[BLOCKED]]>>
+    tt.store %27, %25 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<64x256xf16, #blocked1>>
+    tt.return
+  }
+}
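
Read together, the three CHECK directives pin down the result: no ttg.convert_layout survives, and both the tensor pointer and the store are retyped to the value's original #blocked layout. The expected output is roughly (a sketch assembled from the CHECK lines, not verbatim pass output; %ptr is a hypothetical name):

    %ptr = tt.make_tensor_ptr %arg0, [%c256_i64, %c256_i64], [%c64_i64, %c1_i64], [%c8_i32, %c8_i32] {order = array<i32: 1, 0>} : <tensor<64x256xf16, #blocked>>
    tt.store %ptr, %cst {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<64x256xf16, #blocked>>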

third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp

Lines changed: 0 additions & 11 deletions
@@ -825,20 +825,9 @@ bool LayoutPropagation::rewriteTensorPtrStoreOp(StoreOp storeOp) {
         tensorType.getElementType(), encoding);
     newPtrType = PointerType::get(tmpType, ptrType.getAddressSpace());
   } else {
-    Attribute convertOpDstEncoding = convertOp.getType().getEncoding();
     RankedTensorType convertOpSrcType = convertOp.getSrc().getType();
-    if (((!convertOpDstEncoding) ||
-         isa<ttgi::DpasEncodingAttr>(convertOpDstEncoding)) ||
-        (!convertOpSrcType ||
-         !isa<ttgi::DpasEncodingAttr>(convertOpSrcType.getEncoding())))
-      return false;
 
     auto ptrType = cast<PointerType>(makeTensorPtrOp.getType());
-    auto tensorType = cast<RankedTensorType>(ptrType.getPointeeType());
-    // If the output type of the MakeTensorPtrOp already has a
-    // DPAS encoding, we do not forward the previous DPAS encoding.
-    if (isa<ttgi::DpasEncodingAttr>(tensorType.getEncoding()))
-      return false;
 
     newPtrType = PointerType::get(convertOpSrcType, ptrType.getAddressSpace());
 