Commit 979301f

Use stride instead of order to determine block attr (#2349)
Per the Triton Slack, `order` is unused on architectures below Hopper. More importantly, `order` provides information that the strides already carry. In fact, `order` can disagree with the strides entirely (i.e. be wrong) and we still generate correct code. I think it is better to rely on the strides, assuming the logic added here makes sense. Note this depends on #2348; I'd like to land the debug logging separately so we keep it even if we decide to modify this approach. It was very useful in debugging this problem. cc #2347
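As a standalone illustration of why the strides suffice: the fast-changing dimension is simply the dimension with unit stride, so a correct `order` is redundant and a wrong one is harmless. The sketch below is hypothetical, not the pass code; plain integers stand in for the MLIR stride `Value`s that the pass checks with `mlir::triton::gpu::intel::isConstant(strides[i], 1)`.

```cpp
#include <cstdint>
#include <vector>

// Hypothetical helper mirroring the commit's logic: scan the strides and
// return the index of the first unit-stride (contiguous) dimension, or -1
// if none is statically known to be 1.
int findFastChangeDim(const std::vector<int64_t> &strides) {
  for (size_t i = 0; i < strides.size(); ++i)
    if (strides[i] == 1)
      return static_cast<int>(i);
  return -1;
}

int main() {
  // Row-major 2D tensor: strides {N, 1} -> dim 1 is fast-changing,
  // no matter what an `order` attribute claims.
  std::vector<int64_t> rowMajor = {1024, 1};
  // Column-major 2D tensor: strides {1, M} -> dim 0 is fast-changing.
  std::vector<int64_t> colMajor = {1, 512};
  bool ok = findFastChangeDim(rowMajor) == 1 &&
            findFastChangeDim(colMajor) == 0;
  return ok ? 0 : 1;
}
```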
1 parent: 19527ac

File tree

1 file changed: +17 −6 lines changed


third_party/intel/lib/TritonIntelGPUTransforms/MaterializeBlockPointer.cpp

Lines changed: 17 additions & 6 deletions
@@ -51,17 +51,27 @@ struct TritonIntelGPUMaterializeBlockPointerPass
     LDBG("Found make tensor ptr op: " << makeTensorPtrOp);
     auto ptrType = cast<tt::PointerType>(makeTensorPtrOp.getType());
     auto tensorType = cast<RankedTensorType>(ptrType.getPointeeType());
-    ArrayRef<int32_t> order = makeTensorPtrOp.getOrder();
-    unsigned rank = order.size();
+    Operation::operand_range shape = makeTensorPtrOp.getShape();
+    unsigned rank = shape.size();
     LDBG("Rank: " << rank);
     if (rank == 1)
       return;
 
-    unsigned fastChangeDim = order[0];
+    Operation::operand_range strides = makeTensorPtrOp.getStrides();
+    int fastChangeDim = -1;
+    for (size_t i = 0; i < strides.size(); ++i) {
+      if (mlir::triton::gpu::intel::isConstant(strides[i], 1)) {
+        fastChangeDim = i;
+        break;
+      }
+    }
+
     LDBG("Fast change dim: " << fastChangeDim);
-    if (fastChangeDim >= (rank - 2)) {
-      Operation::operand_range strides = makeTensorPtrOp.getStrides();
+    if (fastChangeDim < 0) {
+      return;
+    }
 
+    if (fastChangeDim >= (rank - 2)) {
       // HW 2D block read instruction only supports contiguous access.
       Value fastChangeStride = strides[fastChangeDim];
       LLVM_DEBUG({
@@ -77,7 +87,8 @@ struct TritonIntelGPUMaterializeBlockPointerPass
       Value pitch =
           strides[(fastChangeDim == rank - 1) ? rank - 2 : rank - 1];
       LDBG("Pitch: " << pitch);
-      if (!ttgi::isDivisible(pitch, 64 / tensorType.getElementTypeBitWidth()))
+      if (!ttgi::isDivisible(pitch,
+                             128 / tensorType.getElementTypeBitWidth()))
         return;
 
       loadOp->setAttr(ttgi::TritonIntelGPUDialect::getBlockIOAttrName(),
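Besides the stride scan, the second hunk doubles the pitch-alignment requirement from 64 to 128 bits. Below is a minimal sketch of the arithmetic, assuming (as the code suggests) that `isDivisible`'s divisor is expressed in elements, so the pitch must span a multiple of 16 bytes:

```cpp
#include <iostream>

// Elements the pitch must be a multiple of: 128 bits' worth of the
// tensor's element type (64 bits before this commit).
constexpr unsigned requiredPitchMultiple(unsigned elementBitWidth) {
  return 128 / elementBitWidth;
}

int main() {
  // fp16 (16-bit): pitch must be a multiple of 8 elements (16 bytes);
  // the previous 64-bit rule only required 4 elements.
  std::cout << requiredPitchMultiple(16) << "\n"; // 8
  // fp32 (32-bit): multiple of 4 elements, previously 2.
  std::cout << requiredPitchMultiple(32) << "\n"; // 4
}
```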
