diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/MaterializeBlockPointer.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/MaterializeBlockPointer.cpp index f281764c63..8361675b55 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/MaterializeBlockPointer.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/MaterializeBlockPointer.cpp @@ -51,17 +51,27 @@ struct TritonIntelGPUMaterializeBlockPointerPass LDBG("Found make tensor ptr op: " << makeTensorPtrOp); auto ptrType = cast(makeTensorPtrOp.getType()); auto tensorType = cast(ptrType.getPointeeType()); - ArrayRef order = makeTensorPtrOp.getOrder(); - unsigned rank = order.size(); + Operation::operand_range shape = makeTensorPtrOp.getShape(); + unsigned rank = shape.size(); LDBG("Rank: " << rank); if (rank == 1) return; - unsigned fastChangeDim = order[0]; + Operation::operand_range strides = makeTensorPtrOp.getStrides(); + int fastChangeDim = -1; + for (size_t i = 0; i < strides.size(); ++i) { + if (mlir::triton::gpu::intel::isConstant(strides[i], 1)) { + fastChangeDim = i; + break; + } + } + LDBG("Fast change dim: " << fastChangeDim); - if (fastChangeDim >= (rank - 2)) { - Operation::operand_range strides = makeTensorPtrOp.getStrides(); + if (fastChangeDim < 0) { + return; + } + if (fastChangeDim >= (rank - 2)) { // HW 2D block read instruction only supports contiguous access. Value fastChangeStride = strides[fastChangeDim]; LLVM_DEBUG({ @@ -77,7 +87,8 @@ struct TritonIntelGPUMaterializeBlockPointerPass Value pitch = strides[(fastChangeDim == rank - 1) ? rank - 2 : rank - 1]; LDBG("Pitch: " << pitch); - if (!ttgi::isDivisible(pitch, 64 / tensorType.getElementTypeBitWidth())) + if (!ttgi::isDivisible(pitch, + 128 / tensorType.getElementTypeBitWidth())) return; loadOp->setAttr(ttgi::TritonIntelGPUDialect::getBlockIOAttrName(),