
Commit 89868e2

Do not set block load attribute for transposed A matrices (#2443)
We cannot lower a transposed A matrix to a transposed 2D block load. Instead, the load is lowered via the LLVM path introduced in #2181. That path appears to have a performance regression: it is slower than materializing the block in SLM, reading it into registers, and computing the dot product from there. Using the work in #2420, I am able to drop the block load attribute for this case and go down the non-block-pointer path.

Performance on main:

```
Compute A x B
✅ Triton and Torch match
Time for torch: 0.32444801926612854 ms
Time for triton: 0.44371041655540466 ms
Compute A x B.T
✅ Triton and Torch match
Time for torch: 0.32708799839019775 ms
Time for triton: 0.634996771812439 ms
Compute A.T x B
✅ Triton and Torch match
Time for torch: 0.31204161047935486 ms
Time for triton: 3.4140689373016357 ms
Compute A.T x B.T
✅ Triton and Torch match
Time for torch: 0.45701122283935547 ms
Time for triton: 3.7463345527648926 ms
```

Performance on this PR:

```
Compute A x B
✅ Triton and Torch match
Time for torch: 0.3081200122833252 ms
Time for triton: 0.44333598017692566 ms
Compute A x B.T
✅ Triton and Torch match
Time for torch: 0.33799198269844055 ms
Time for triton: 0.6391856074333191 ms
Compute A.T x B
✅ Triton and Torch match
Time for torch: 0.31700319051742554 ms
Time for triton: 1.5733630657196045 ms
Compute A.T x B.T
✅ Triton and Torch match
Time for torch: 0.45083683729171753 ms
Time for triton: 1.8271965980529785 ms
```

Note that the important commit is `31386ef1132c3f6cf9cb5f1063ecfab705f4c2a1`. Once #2420 is merged I will rebase this. Depends on #2420. Links to #1795.
1 parent d66d424 commit 89868e2
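For illustration, here is a minimal standalone sketch of the check this commit adds. It is not the actual pass code: `keepBlockIOAttr` and its plain-C++ parameters are hypothetical stand-ins for the MLIR layout queries used in `MaterializeBlockPointer.cpp`.

```cpp
// Hedged sketch of the new decision: drop the block_io attribute when the A
// operand (opIdx == 0) is transposed relative to its in-register (value) layout.
#include <array>
#include <cstdio>

// Returns true when the load may keep the triton_intel_gpu.block_io attribute.
bool keepBlockIOAttr(bool hasDotLayout, unsigned opIdx,
                     std::array<unsigned, 2> threadOrder, bool isRowMajor) {
  if (!hasDotLayout)
    return true;
  // The value layout is row major when the thread order is {1, 0}.
  const bool valueRowMajor = threadOrder[0] == 1 && threadOrder[1] == 0;
  // A transposed A matrix (memory order disagrees with value order) cannot be
  // lowered to a transposed 2D block load, so skip the attribute and let the
  // load go down the non-block-pointer path instead.
  if (opIdx == 0 && (valueRowMajor ^ isRowMajor))
    return false;
  return true;
}

int main() {
  // A x B  : A is row major in memory and in value -> keep the attribute (1).
  std::printf("A   x B: %d\n", keepBlockIOAttr(true, 0, {1, 0}, /*isRowMajor=*/true));
  // A.T x B: A is column major in memory but row major in value -> drop it (0).
  std::printf("A.T x B: %d\n", keepBlockIOAttr(true, 0, {1, 0}, /*isRowMajor=*/false));
  return 0;
}
```

With a row-major value layout (thread order `{1, 0}`), the attribute is kept for `A x B` but dropped for `A.T x B`, the case whose Triton timing improves in the numbers above.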

2 files changed: +22, -4 lines


test/TritonIntelGPU/materialize-block-pointer.mlir

Lines changed: 1 addition & 1 deletion
```diff
@@ -17,7 +17,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, triton_gpu.target = "xpu", t
     %5 = tt.load %3 {boundaryCheck = array<i32: 1>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<64x32xf16, #dot_a>>
     %6 = tt.load %4 {boundaryCheck = array<i32: 0>, cache = 1 : i32, evict = 1 : i32, isVolatile = false, padding = 1 : i32} : !tt.ptr<tensor<32x64xf16, #dot_b>>
 
-    // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 1>, padding = 1 : i32, triton_intel_gpu.block_io = "column_major"}
+    // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 1>, padding = 1 : i32}
     // CHECK: tt.load {{.*}} {boundaryCheck = array<i32: 0>, padding = 1 : i32, triton_intel_gpu.block_io = "column_major"}
     %7 = tt.make_tensor_ptr %arg0, [%c0_i64, %c0_i64], [%c1_i64, %pitch], [%c0_i32, %c0_i32] {order = array<i32: 0, 1>} : <tensor<64x32xf16, #dot_a>>
     %8 = tt.make_tensor_ptr %arg0, [%c0_i64, %c0_i64], [%c1_i64, %pitch], [%c0_i32, %c0_i32] {order = array<i32: 0, 1>} : <tensor<32x64xf16, #dot_b>>
```

third_party/intel/lib/TritonIntelGPUTransforms/MaterializeBlockPointer.cpp

Lines changed: 21 additions & 3 deletions
```diff
@@ -51,6 +51,8 @@ struct TritonIntelGPUMaterializeBlockPointerPass
       LDBG("Found make tensor ptr op: " << makeTensorPtrOp);
       auto ptrType = cast<tt::PointerType>(makeTensorPtrOp.getType());
       auto tensorType = cast<RankedTensorType>(ptrType.getPointeeType());
+      auto dotLayout = ttgi::getDotEncoding(tensorType);
+
       Operation::operand_range shape = makeTensorPtrOp.getShape();
       unsigned rank = shape.size();
       LDBG("Rank: " << rank);
@@ -97,10 +99,26 @@ struct TritonIntelGPUMaterializeBlockPointerPass
                          128 / tensorType.getElementTypeBitWidth()))
             return;
 
+          const bool isRowMajor = fastChangeDim == rank - 1;
+          if (dotLayout) {
+            // Check if the load is being used in a dot layout, and if so is this
+            // the first op and is it a transposed row major matrix. If so, skip
+            // the block ptr attribute as performance is worse than if we remove
+            // the tensor pointer
+            LDBG("dotLayout: " << *dotLayout);
+            const unsigned opIdx = dotLayout->getOpIdx();
+            auto dotOrder = dotLayout->getThreadOrder();
+            const bool valueRowMajor = (dotOrder[0] == 1 && dotOrder[1] == 0);
+            if (opIdx == 0 && valueRowMajor ^ isRowMajor) {
+              LDBG("Skipping block pointer attribute for transposed A matrix in "
+                   "dot operation");
+              return;
+            }
+          }
+
           loadOp->setAttr(ttgi::TritonIntelGPUDialect::getBlockIOAttrName(),
-                          StringAttr::get(context, fastChangeDim == rank - 1
-                                                       ? "row_major"
-                                                       : "column_major"));
+                          StringAttr::get(context, isRowMajor ? "row_major"
+                                                              : "column_major"));
         }
       });
 }
```
