Commit 61ef8a7

Do not set block load attribute for transposed A matrices
1 parent 734e33c commit 61ef8a7


2 files changed: +22 -4

test/TritonIntelGPU/materialize-block-pointer.mlir

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, triton_gpu.target = "xpu", t
   %5 = tt.load %3 {boundaryCheck = array<i32: 1>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<64x32xf16, #dot_a>>
   %6 = tt.load %4 {boundaryCheck = array<i32: 0>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<32x64xf16, #dot_b>>

-  // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 1>, padding = 1 : i32, triton_intel_gpu.block_io = "column_major"}
+  // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 1>, padding = 1 : i32}
   // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 0>, padding = 1 : i32, triton_intel_gpu.block_io = "column_major"}
   %7 = tt.make_tensor_ptr %arg0, [%c0_i64, %c0_i64], [%c1_i64, %pitch], [%c0_i32, %c0_i32] {order = array<i32: 0, 1>} : <tensor<64x32xf16, #dot_a>>
   %8 = tt.make_tensor_ptr %arg0, [%c0_i64, %c0_i64], [%c1_i64, %pitch], [%c0_i32, %c0_i32] {order = array<i32: 0, 1>} : <tensor<32x64xf16, #dot_b>>

third_party/intel/lib/TritonIntelGPUTransforms/MaterializeBlockPointer.cpp

Lines changed: 21 additions & 3 deletions
@@ -51,6 +51,8 @@ struct TritonIntelGPUMaterializeBlockPointerPass
       LDBG("Found make tensor ptr op: " << makeTensorPtrOp);
       auto ptrType = cast<tt::PointerType>(makeTensorPtrOp.getType());
       auto tensorType = cast<RankedTensorType>(ptrType.getPointeeType());
+      auto dotLayout = ttgi::getDotEncoding(tensorType);
+
       Operation::operand_range shape = makeTensorPtrOp.getShape();
       unsigned rank = shape.size();
       LDBG("Rank: " << rank);
@@ -97,10 +99,26 @@ struct TritonIntelGPUMaterializeBlockPointerPass
               128 / tensorType.getElementTypeBitWidth()))
         return;

+      const bool isRowMajor = fastChangeDim == rank - 1;
+      if (dotLayout) {
+        // If the load feeds a dot operation, check whether it is the first
+        // operand (A) and whether the matrix is transposed. If so, skip the
+        // block pointer attribute: performance is worse than removing the
+        // tensor pointer.
+        LDBG("dotLayout: " << *dotLayout);
+        const unsigned opIdx = dotLayout->getOpIdx();
+        auto dotOrder = dotLayout->getThreadOrder();
+        const bool valueRowMajor = (dotOrder[0] == 1 && dotOrder[1] == 0);
+        if (opIdx == 0 && (valueRowMajor ^ isRowMajor)) {
+          LDBG("Skipping block pointer attribute for transposed A matrix in "
+               "dot operation");
+          return;
+        }
+      }
+
       loadOp->setAttr(ttgi::TritonIntelGPUDialect::getBlockIOAttrName(),
-                      StringAttr::get(context, fastChangeDim == rank - 1
-                                                   ? "row_major"
-                                                   : "column_major"));
+                      StringAttr::get(context, isRowMajor ? "row_major"
+                                                          : "column_major"));
       }
     });
   }
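
For readers skimming the diff: the new guard reduces to a small predicate on the dot-operand index and the two layout orders. The sketch below is a minimal standalone illustration, not code from this repository; skipBlockIO and its parameters are hypothetical stand-ins for the values the pass computes (opIdx, getThreadOrder(), and fastChangeDim == rank - 1).

#include <array>
#include <cstdio>

// Hypothetical distillation of the pass's new check: skip the block_io
// attribute when the load feeds the A operand (opIdx == 0) and the
// in-register (value) order disagrees with the in-memory order, i.e. the
// A matrix is transposed.
bool skipBlockIO(unsigned opIdx, std::array<unsigned, 2> threadOrder,
                 bool memoryRowMajor) {
  const bool valueRowMajor = threadOrder[0] == 1 && threadOrder[1] == 0;
  return opIdx == 0 && (valueRowMajor ^ memoryRowMajor);
}

int main() {
  // A operand, row-major value layout, column-major memory: transposed, skip.
  std::printf("%d\n", skipBlockIO(0, {1, 0}, false)); // prints 1
  // A operand, orders agree: keep the attribute.
  std::printf("%d\n", skipBlockIO(0, {1, 0}, true));  // prints 0
  // B operand (opIdx == 1) is never skipped by this check.
  std::printf("%d\n", skipBlockIO(1, {1, 0}, false)); // prints 0
}

This mirrors the updated test above: the #dot_a load built with order = array<i32: 0, 1> loses the block_io attribute, while the #dot_b load keeps column_major.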
