Commit 7c9a0f9

MaterializeBlockPointer fix for GEMM with 1st operand transposed
Signed-off-by: Tiotto, Ettore <[email protected]>
1 parent eeda8e9 commit 7c9a0f9

1 file changed
third_party/intel/lib/TritonIntelGPUTransforms/MaterializeBlockPointer.cpp

Lines changed: 47 additions & 7 deletions
@@ -12,6 +12,7 @@
 
 using namespace mlir;
 namespace tt = mlir::triton;
+namespace ttg = mlir::triton::gpu;
 namespace ttgi = mlir::triton::gpu::intel;
 
 namespace mlir::triton::gpu::intel {
@@ -37,7 +38,7 @@ struct TritonIntelGPUMaterializeBlockPointerPass
       return;
 
     MLIRContext *context = &getContext();
-    mod.walk([context](tt::LoadOp loadOp) {
+    mod.walk([context, this](tt::LoadOp loadOp) {
       LDBG("Considering op: " << loadOp);
 
       Value ptr = loadOp.getPtr();
@@ -51,7 +52,6 @@ struct TritonIntelGPUMaterializeBlockPointerPass
       LDBG("Found make tensor ptr op: " << makeTensorPtrOp);
       auto ptrType = cast<tt::PointerType>(makeTensorPtrOp.getType());
       auto tensorType = cast<RankedTensorType>(ptrType.getPointeeType());
-      auto dotLayout = ttgi::getDotEncoding(tensorType);
 
       Operation::operand_range shape = makeTensorPtrOp.getShape();
       unsigned rank = shape.size();
@@ -100,11 +100,11 @@ struct TritonIntelGPUMaterializeBlockPointerPass
         return;
 
       const bool isRowMajor = fastChangeDim == rank - 1;
-      if (dotLayout) {
-        // Check if the load is being used in a dot layout, and if so is this
-        // the first op and is it a transposed row major matrix. If so, skip
-        // the block ptr attribute as performance is worse than if we remove
-        // the tensor pointer
+      if (auto dotLayout = getDotLayout(loadOp)) {
+        // Check if the load is being used by a tt.dot operation, and if so is
+        // this the first operand and is it a transposed row major matrix. If
+        // so, skip the block ptr attribute as performance is worse than if we
+        // remove the tensor pointer.
         LDBG("dotLayout: " << *dotLayout);
         const unsigned opIdx = dotLayout->getOpIdx();
         auto dotOrder = dotLayout->getThreadOrder();
@@ -122,6 +122,46 @@ struct TritonIntelGPUMaterializeBlockPointerPass
       }
     });
   }
+
+private:
+  // Return the load layout if it is a dot layout. If it is not, check if the
+  // load result is converted to a dot layout. If so, return the dot layout,
+  // otherwise return nullopt.
+  std::optional<ttg::DotOperandEncodingAttr>
+  getDotLayout(tt::LoadOp loadOp) const {
+    Value ptr = loadOp.getPtr();
+    if (!tt::isTensorPointerType(ptr.getType()))
+      return nullptr;
+
+    RankedTensorType tensorType = ttgi::getRankedTensorType(ptr.getType());
+    auto dotLayout = ttgi::getDotEncoding(tensorType);
+    if (dotLayout)
+      return dotLayout;
+
+    auto allUsersAreConvertOps = [](Operation::user_range users) {
+      return llvm::all_of(users, [](Operation *user) {
+        return isa<ttg::ConvertLayoutOp>(user);
+      });
+    };
+
+    auto allUserHaveIdenticalLayout = [](Operation::user_range users) {
+      Attribute firstUserLayout =
+          cast<ttg::ConvertLayoutOp>(*users.begin()).getType().getEncoding();
+      return llvm::all_of(users, [&firstUserLayout](Operation *user) {
+        return firstUserLayout ==
+               cast<ttg::ConvertLayoutOp>(user).getType().getEncoding();
+      });
+    };
+
+    Operation::user_range users = loadOp->getUsers();
+    if (allUsersAreConvertOps(users) && allUserHaveIdenticalLayout(users)) {
+      Attribute firstUserLayout =
+          cast<ttg::ConvertLayoutOp>(*users.begin()).getType().getEncoding();
+      return dyn_cast<ttg::DotOperandEncodingAttr>(firstUserLayout);
+    }
+
+    return nullptr;
+  }
 };
 
 } // anonymous namespace
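
For readers skimming the diff: the new getDotLayout helper accepts a load when its result type already carries a dot-operand encoding, or when every user of the load is a layout-conversion op and all of those conversions target one identical dot-operand encoding. Below is a minimal, self-contained C++ sketch of that "all users convert to the same layout" check on plain data; the User struct, isConvert flag, and layout strings are illustrative stand-ins for the MLIR types used in the pass, not part of the commit.

#include <algorithm>
#include <iostream>
#include <optional>
#include <string>
#include <vector>

// Illustrative stand-in for one use of the load result: either a layout
// conversion (with its target encoding) or some other operation.
struct User {
  bool isConvert;
  std::string targetLayout; // meaningful only when isConvert is true
};

// Mirrors the decision in getDotLayout: every user must be a conversion and
// all conversions must agree on a single target layout; otherwise bail out.
std::optional<std::string>
commonConvertLayout(const std::vector<User> &users) {
  if (users.empty())
    return std::nullopt;
  bool allConverts = std::all_of(users.begin(), users.end(),
                                 [](const User &u) { return u.isConvert; });
  if (!allConverts)
    return std::nullopt;
  const std::string &first = users.front().targetLayout;
  bool identical =
      std::all_of(users.begin(), users.end(),
                  [&](const User &u) { return u.targetLayout == first; });
  return identical ? std::optional<std::string>(first) : std::nullopt;
}

int main() {
  // Both users convert the load result to the same dot-operand layout.
  std::vector<User> uniform = {{true, "dot_op<opIdx = 0>"},
                               {true, "dot_op<opIdx = 0>"}};
  // One user is not a layout conversion, so no common layout is reported.
  std::vector<User> mixed = {{true, "dot_op<opIdx = 0>"}, {false, ""}};

  std::cout << commonConvertLayout(uniform).value_or("<none>") << "\n";
  std::cout << commonConvertLayout(mixed).value_or("<none>") << "\n";
  return 0;
}

In the pass itself this user-based check is only consulted after the tensor type's own encoding fails to yield a dot layout, so the existing fast path is unchanged.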
