Commit bae3356

[RemoveLayout] Remove convert layout op for any layout if the user is tt.store with block pointer (#4751)
It is always cheaper to store the value directly to the memory referenced by the block pointer than to convert its layout first.

Signed-off-by: Lu,Chengjun <[email protected]>
1 parent ff0309c commit bae3356
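
The rewrite targets a ttg.convert_layout whose result feeds a tt.store through a block pointer built by tt.make_tensor_ptr. A condensed sketch of the input pattern, distilled from the new test case below (%x, %base, and the elided operand list are hypothetical placeholders):

    %v = ttg.convert_layout %x : tensor<64x256xf16, #blocked> -> tensor<64x256xf16, #blocked1>
    %p = tt.make_tensor_ptr %base, ... {order = array<i32: 1, 0>} : <tensor<64x256xf16, #blocked1>>
    tt.store %p, %v {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<64x256xf16, #blocked1>>

Because the block-pointer store accepts any source layout, the pass can retype %p to #blocked and store %x directly, so the conversion never materializes.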

2 files changed: +25 -11 lines changed

test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir

Lines changed: 25 additions & 0 deletions
@@ -261,3 +261,28 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
     tt.return
   }
 }
+
+// -----
+
+// COM: Case 5:
+// COM: Checks that block encoding has been forwarded to the store op
+// COM: and the ttg.convert_layout operation has been removed
+// CHECK: #[[BLOCKED:.+]] = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 2], order = [1, 0]}>
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 2], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 4], order = [1, 0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32, "ttig.support_sg_2d_block"} {
+  tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr<f16>) {
+    %c8_i32 = arith.constant 8 : i32
+    %c64_i64 = arith.constant 64 : i64
+    %c1_i64 = arith.constant 1 : i64
+    %c256_i64 = arith.constant 256 : i64
+    %cst = arith.constant dense<0.000000e+00> : tensor<64x256xf16, #blocked>
+    // CHECK-NOT: ttg.convert_layout
+    %25 = ttg.convert_layout %cst : tensor<64x256xf16, #blocked> -> tensor<64x256xf16, #blocked1>
+    // CHECK: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array<i32: 1, 0>} : <tensor<64x256xf16, #[[BLOCKED]]>>
+    %27 = tt.make_tensor_ptr %arg0, [%c256_i64, %c256_i64], [%c64_i64, %c1_i64], [%c8_i32, %c8_i32] {order = array<i32: 1, 0>} : <tensor<64x256xf16, #blocked1>>
+    // CHECK: tt.store {{.*}}, {{.*}} {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<64x256xf16, #[[BLOCKED]]>>
+    tt.store %27, %25 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<64x256xf16, #blocked1>>
+    tt.return
+  }
+}
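
Read together, the three CHECK directives pin down the result: no ttg.convert_layout survives, and both the tensor pointer and the store are retyped to the value's original #blocked layout. The expected output is roughly (a sketch assembled from the CHECK lines, not verbatim pass output; %ptr is a hypothetical name):

    %ptr = tt.make_tensor_ptr %arg0, [%c256_i64, %c256_i64], [%c64_i64, %c1_i64], [%c8_i32, %c8_i32] {order = array<i32: 1, 0>} : <tensor<64x256xf16, #blocked>>
    tt.store %ptr, %cst {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<64x256xf16, #blocked>>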

third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp

Lines changed: 0 additions & 11 deletions
@@ -825,20 +825,9 @@ bool LayoutPropagation::rewriteTensorPtrStoreOp(StoreOp storeOp) {
         tensorType.getElementType(), encoding);
     newPtrType = PointerType::get(tmpType, ptrType.getAddressSpace());
   } else {
-    Attribute convertOpDstEncoding = convertOp.getType().getEncoding();
     RankedTensorType convertOpSrcType = convertOp.getSrc().getType();
-    if (((!convertOpDstEncoding) ||
-         isa<ttgi::DpasEncodingAttr>(convertOpDstEncoding)) ||
-        (!convertOpSrcType ||
-         !isa<ttgi::DpasEncodingAttr>(convertOpSrcType.getEncoding())))
-      return false;
 
     auto ptrType = cast<PointerType>(makeTensorPtrOp.getType());
-    auto tensorType = cast<RankedTensorType>(ptrType.getPointeeType());
-    // If the output type of the MakeTensorPtrOp already has a
-    // DPAS encoding, we do not forward the previous DPAS encoding.
-    if (isa<ttgi::DpasEncodingAttr>(tensorType.getEncoding()))
-      return false;
 
     newPtrType = PointerType::get(convertOpSrcType, ptrType.getAddressSpace());
 