2 changes: 1 addition & 1 deletion test/TritonIntelGPU/materialize-block-pointer.mlir
@@ -17,7 +17,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, triton_gpu.target = "xpu", t
%5 = tt.load %3 {boundaryCheck = array<i32: 1>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<64x32xf16, #dot_a>>
%6 = tt.load %4 {boundaryCheck = array<i32: 0>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<32x64xf16, #dot_b>>

-// CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 1>, padding = 1 : i32, triton_intel_gpu.block_io = "column_major"}
+// CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 1>, padding = 1 : i32}
// CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 0>, padding = 1 : i32, triton_intel_gpu.block_io = "column_major"}
%7 = tt.make_tensor_ptr %arg0, [%c0_i64, %c0_i64], [%c1_i64, %pitch], [%c0_i32, %c0_i32] {order = array<i32: 0, 1>} : <tensor<64x32xf16, #dot_a>>
%8 = tt.make_tensor_ptr %arg0, [%c0_i64, %c0_i64], [%c1_i64, %pitch], [%c0_i32, %c0_i32] {order = array<i32: 0, 1>} : <tensor<32x64xf16, #dot_b>>
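For reference, the load that the updated CHECK describes is the A-operand (#dot_a) tensor pointer shown above, condensed below for illustration (the SSA name %9 and the direct use of %7 are illustrative, not lines from the test file):

// Condensed illustration, not part of the test file: the #dot_a tensor
// pointer has unit stride in its first dimension (column major in memory),
// so the pass now leaves its load without a triton_intel_gpu.block_io
// attribute.
%7 = tt.make_tensor_ptr %arg0, [%c0_i64, %c0_i64], [%c1_i64, %pitch], [%c0_i32, %c0_i32] {order = array<i32: 0, 1>} : <tensor<64x32xf16, #dot_a>>
%9 = tt.load %7 {boundaryCheck = array<i32: 1>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<64x32xf16, #dot_a>>
// Expected after the pass: {boundaryCheck = array<i32: 1>, padding = 1 : i32}, with no block_io attribute.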
@@ -51,6 +51,8 @@ struct TritonIntelGPUMaterializeBlockPointerPass
LDBG("Found make tensor ptr op: " << makeTensorPtrOp);
auto ptrType = cast<tt::PointerType>(makeTensorPtrOp.getType());
auto tensorType = cast<RankedTensorType>(ptrType.getPointeeType());
+auto dotLayout = ttgi::getDotEncoding(tensorType);
+
Operation::operand_range shape = makeTensorPtrOp.getShape();
unsigned rank = shape.size();
LDBG("Rank: " << rank);
@@ -97,10 +99,26 @@ struct TritonIntelGPUMaterializeBlockPointerPass
128 / tensorType.getElementTypeBitWidth()))
return;

+const bool isRowMajor = fastChangeDim == rank - 1;
+if (dotLayout) {
+// If the load is used in a dot layout, check whether it is the first
+// operand (opIdx 0) and whether it is a transposed row-major matrix.
+// If so, skip the block pointer attribute: performance is worse than
+// if the tensor pointer is removed.
+LDBG("dotLayout: " << *dotLayout);
+const unsigned opIdx = dotLayout->getOpIdx();
+auto dotOrder = dotLayout->getThreadOrder();
+const bool valueRowMajor = (dotOrder[0] == 1 && dotOrder[1] == 0);
+if (opIdx == 0 && valueRowMajor ^ isRowMajor) {
+LDBG("Skipping block pointer attribute for transposed A matrix in "
+"dot operation");
+return;
+}
+}
+
loadOp->setAttr(ttgi::TritonIntelGPUDialect::getBlockIOAttrName(),
-StringAttr::get(context, fastChangeDim == rank - 1
-? "row_major"
-: "column_major"));
+StringAttr::get(context, isRowMajor ? "row_major"
+: "column_major"));
}
});
}
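Taken on its own, the new early return reduces to a small predicate over the operand index and the two orders. The following standalone C++ sketch (helper names, signatures, and the 2-D assumption are illustrative, not part of the pass) reproduces that logic:

#include <array>
#include <cstdio>

// threadOrder == {1, 0} means the value layout is row major.
static bool isValueRowMajor(const std::array<unsigned, 2> &threadOrder) {
  return threadOrder[0] == 1 && threadOrder[1] == 0;
}

// fastChangeDim == rank - 1 means the block pointer is row major in memory.
static bool shouldSkipBlockIO(unsigned opIdx,
                              const std::array<unsigned, 2> &threadOrder,
                              unsigned fastChangeDim, unsigned rank) {
  const bool valueRowMajor = isValueRowMajor(threadOrder);
  const bool isRowMajor = fastChangeDim == rank - 1;
  // Only the A operand (opIdx == 0) is skipped, and only when the value
  // order and the memory order disagree, i.e. the load is transposed.
  return opIdx == 0 && (valueRowMajor ^ isRowMajor);
}

int main() {
  // Transposed A operand: value layout row major, pointer column major -> skip (prints 1).
  std::printf("%d\n", shouldSkipBlockIO(0, {1, 0}, /*fastChangeDim=*/0, /*rank=*/2));
  // The same mismatch on the B operand (opIdx 1) keeps the attribute (prints 0).
  std::printf("%d\n", shouldSkipBlockIO(1, {1, 0}, 0, 2));
  return 0;
}

The explicit parentheses around the XOR are only for readability; they match how the original expression groups, since == binds tighter than ^, which binds tighter than &&.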