
Commit 89868e2

Do not set block load attribute for transposed A matrices (#2443)
We cannot lower a transposed A matrix to a transposed 2D block load. Instead, the load is lowered via the LLVM path introduced in #2181. That path appears to have a performance regression: it is slower than materializing the block in SLM, reading it into registers, and computing the dot product from there. Using the work in #2420, I am able to drop the block load attribute for this case and go down the non-block-pointer path.

Performance on main:

```
Compute A x B
✅ Triton and Torch match
Time for torch: 0.32444801926612854 ms
Time for triton: 0.44371041655540466 ms
Compute A x B.T
✅ Triton and Torch match
Time for torch: 0.32708799839019775 ms
Time for triton: 0.634996771812439 ms
Compute A.T x B
✅ Triton and Torch match
Time for torch: 0.31204161047935486 ms
Time for triton: 3.4140689373016357 ms
Compute A.T x B.T
✅ Triton and Torch match
Time for torch: 0.45701122283935547 ms
Time for triton: 3.7463345527648926 ms
```

Performance on this PR:

```
Compute A x B
✅ Triton and Torch match
Time for torch: 0.3081200122833252 ms
Time for triton: 0.44333598017692566 ms
Compute A x B.T
✅ Triton and Torch match
Time for torch: 0.33799198269844055 ms
Time for triton: 0.6391856074333191 ms
Compute A.T x B
✅ Triton and Torch match
Time for torch: 0.31700319051742554 ms
Time for triton: 1.5733630657196045 ms
Compute A.T x B.T
✅ Triton and Torch match
Time for torch: 0.45083683729171753 ms
Time for triton: 1.8271965980529785 ms
```

Note that the important commit is `31386ef1132c3f6cf9cb5f1063ecfab705f4c2a1`. Once #2420 is merged I will rebase this. Depends on #2420. Links to #1795.
1 parent d66d424 commit 89868e2
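For illustration, here is a minimal standalone sketch of the check this commit adds. It is not the actual pass code: `keepBlockIOAttr` and its plain-C++ parameters are hypothetical stand-ins for the MLIR layout queries used in `MaterializeBlockPointer.cpp`.

```cpp
// Hedged sketch of the new decision: drop the block_io attribute when the A
// operand (opIdx == 0) is transposed relative to its in-register (value) layout.
#include <array>
#include <cstdio>

// Returns true when the load may keep the triton_intel_gpu.block_io attribute.
bool keepBlockIOAttr(bool hasDotLayout, unsigned opIdx,
                     std::array<unsigned, 2> threadOrder, bool isRowMajor) {
  if (!hasDotLayout)
    return true;
  // The value layout is row major when the thread order is {1, 0}.
  const bool valueRowMajor = threadOrder[0] == 1 && threadOrder[1] == 0;
  // A transposed A matrix (memory order disagrees with value order) cannot be
  // lowered to a transposed 2D block load, so skip the attribute and let the
  // load go down the non-block-pointer path instead.
  if (opIdx == 0 && (valueRowMajor ^ isRowMajor))
    return false;
  return true;
}

int main() {
  // A x B  : A is row major in memory and in value -> keep the attribute (1).
  std::printf("A   x B: %d\n", keepBlockIOAttr(true, 0, {1, 0}, /*isRowMajor=*/true));
  // A.T x B: A is column major in memory but row major in value -> drop it (0).
  std::printf("A.T x B: %d\n", keepBlockIOAttr(true, 0, {1, 0}, /*isRowMajor=*/false));
  return 0;
}
```

With a row-major value layout (thread order `{1, 0}`), the attribute is kept for `A x B` but dropped for `A.T x B`, the case whose Triton timing improves in the numbers above.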

2 files changed: +22, -4 lines


test/TritonIntelGPU/materialize-block-pointer.mlir

Lines changed: 1 addition & 1 deletion
```diff
@@ -17,7 +17,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, triton_gpu.target = "xpu", t
     %5 = tt.load %3 {boundaryCheck = array<i32: 1>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<64x32xf16, #dot_a>>
     %6 = tt.load %4 {boundaryCheck = array<i32: 0>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<32x64xf16, #dot_b>>
 
-    // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 1>, padding = 1 : i32, triton_intel_gpu.block_io = "column_major"}
+    // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 1>, padding = 1 : i32}
     // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 0>, padding = 1 : i32, triton_intel_gpu.block_io = "column_major"}
     %7 = tt.make_tensor_ptr %arg0, [%c0_i64, %c0_i64], [%c1_i64, %pitch], [%c0_i32, %c0_i32] {order = array<i32: 0, 1>} : <tensor<64x32xf16, #dot_a>>
     %8 = tt.make_tensor_ptr %arg0, [%c0_i64, %c0_i64], [%c1_i64, %pitch], [%c0_i32, %c0_i32] {order = array<i32: 0, 1>} : <tensor<32x64xf16, #dot_b>>
```

third_party/intel/lib/TritonIntelGPUTransforms/MaterializeBlockPointer.cpp

Lines changed: 21 additions & 3 deletions
```diff
@@ -51,6 +51,8 @@ struct TritonIntelGPUMaterializeBlockPointerPass
       LDBG("Found make tensor ptr op: " << makeTensorPtrOp);
       auto ptrType = cast<tt::PointerType>(makeTensorPtrOp.getType());
       auto tensorType = cast<RankedTensorType>(ptrType.getPointeeType());
+      auto dotLayout = ttgi::getDotEncoding(tensorType);
+
       Operation::operand_range shape = makeTensorPtrOp.getShape();
       unsigned rank = shape.size();
       LDBG("Rank: " << rank);
@@ -97,10 +99,26 @@ struct TritonIntelGPUMaterializeBlockPointerPass
                          128 / tensorType.getElementTypeBitWidth()))
             return;
 
+          const bool isRowMajor = fastChangeDim == rank - 1;
+          if (dotLayout) {
+            // Check if the load is being used in a dot layout, and if so is this
+            // the first op and is it a transposed row major matrix. If so, skip
+            // the block ptr attribute as performance is worse than if we remove
+            // the tensor pointer
+            LDBG("dotLayout: " << *dotLayout);
+            const unsigned opIdx = dotLayout->getOpIdx();
+            auto dotOrder = dotLayout->getThreadOrder();
+            const bool valueRowMajor = (dotOrder[0] == 1 && dotOrder[1] == 0);
+            if (opIdx == 0 && valueRowMajor ^ isRowMajor) {
+              LDBG("Skipping block pointer attribute for transposed A matrix in "
+                   "dot operation");
+              return;
+            }
+          }
+
           loadOp->setAttr(ttgi::TritonIntelGPUDialect::getBlockIOAttrName(),
-                          StringAttr::get(context, fastChangeDim == rank - 1
-                                                       ? "row_major"
-                                                       : "column_major"));
+                          StringAttr::get(context, isRowMajor ? "row_major"
+                                                              : "column_major"));
         }
       });
 }
```
