Commit 03f3094

[Codegen][GPU] Sort intrinsic according to k alignment - Step 2 of 2 - Creating intrinsic sort routine (iree-org#21128)
This is a follow-up to iree-org#21103. The new routine sorts the MMA intrinsics by the following precedence rules:

1) K-alignment. We prefer intrinsics that can evenly divide the K dimension of the problem.
2) M/N-alignment. We prefer intrinsics that can evenly divide the M and N dimensions of the problem.
3) Intrinsic with larger gemm size.
4) Intrinsic with larger K size.

The scope of the impact is igemm and matmul problems; other pipelines can adopt the sorting if it turns out to be useful.

### Motivation

Consider the following convolution configuration:

> BOO_CACHE_ON=0 python boo_driver.py convbfp16 -n 16 -c 40 -H 192 -W 128 -k 40 -y 3 -x 3 -p 1 -q 1 -u 2 -v 2 -l 1 -j 1 -m conv -g 1 -F 1 -t 1 --in_layout NHWC --out_layout NHWC --fil_layout NHWC --iter 50

Since the gemmK size is 360, the best intrinsic is 32x32x8xbf16, not 16x16x16xbf16, as only the former divides K evenly. Before this PR: `362.06 us`. After this PR: `44.74 us`.

Note that due to limitations of the tuner, this combination would not be covered by a typical tuning run of 2k trials, so it is essential to bridge the gap with a heuristic first, until the tuner is capable of searching across different heuristics.

---------

Signed-off-by: jerryyin <[email protected]>
1 parent 994a716 commit 03f3094
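To see rule 1 on the motivating shape: gemmK = 360 = 8 × 45, while 360 / 16 = 22.5, so only the 32x32x8xbf16 intrinsic tiles K without a remainder. Below is a minimal standalone sketch of that check (illustrative only, not code from this patch; the candidate K sizes 8 and 16 are taken from the two intrinsics named in the commit message):

```cpp
#include <iostream>

int main() {
  const int gemmK = 360; // K dimension of the motivating convolution's gemm.
  // Candidate intrinsic K sizes: 8 (32x32x8xbf16) and 16 (16x16x16xbf16).
  for (int intrinsicK : {8, 16}) {
    if (gemmK % intrinsicK == 0)
      std::cout << "K=" << intrinsicK << ": aligned\n";
    else
      std::cout << "K=" << intrinsicK << ": unaligned, remainder "
                << gemmK % intrinsicK << "\n";
  }
  // Prints "K=8: aligned" and "K=16: unaligned, remainder 8", which is why
  // the sort ranks the 32x32x8 intrinsic first for this problem.
  return 0;
}
```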

File tree: 2 files changed (+86, -0 lines)

compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp

Lines changed: 49 additions & 0 deletions
@@ -123,6 +123,54 @@ LogicalResult setDataTiledMultiMmaLoweringConfig(
       workgroupSize, targetSubgroupSize, pipelineConfig);
 }
 
+/// Sort the MMA intrinsics by following precedence rules:
+/// 1) K-alignment. We prefer intrinsics that can evenly divide the K
+/// dimension of the problem.
+/// 2) M/N-alignment. We prefer intrinsics that can evenly divide the M and N
+/// dimensions of the problem.
+/// 3) Intrinsic with larger gemm size.
+/// 4) Intrinsic with larger K size.
+static void sortMMAIntrinsics(GPUMatmulShapeType problem,
+                              SmallVector<GPUIntrinsicType> &intrinsics) {
+  llvm::sort(intrinsics, [&](const GPUMatmulShapeType &lhs,
+                             const GPUMatmulShapeType &rhs) {
+    // Prefer K-aligned intrinsics.
+    int lhsKAligned = problem.kSizes.back() % lhs.kSizes.back() == 0 ? 1 : 0;
+    int rhsKAligned = problem.kSizes.back() % rhs.kSizes.back() == 0 ? 1 : 0;
+    if (lhsKAligned != rhsKAligned) {
+      return lhsKAligned > rhsKAligned;
+    }
+
+    // If K alignment is the same, prefer the intrinsic that aligns M and N.
+    int lhsMNAligned = (problem.mSizes.back() % lhs.mSizes.back() == 0 &&
+                        problem.nSizes.back() % lhs.nSizes.back() == 0)
+                           ? 1
+                           : 0;
+    int rhsMNAligned = (problem.mSizes.back() % rhs.mSizes.back() == 0 &&
+                        problem.nSizes.back() % rhs.nSizes.back() == 0)
+                           ? 1
+                           : 0;
+    if (lhsMNAligned != rhsMNAligned) {
+      return lhsMNAligned > rhsMNAligned;
+    }
+
+    auto intrinsicArea = [&](const GPUMatmulShapeType &intrinsic) {
+      return (ShapedType::getNumElements(intrinsic.mSizes) +
+              ShapedType::getNumElements(intrinsic.nSizes)) *
+             ShapedType::getNumElements(intrinsic.kSizes);
+    };
+    int64_t lhsArea = intrinsicArea(lhs);
+    int64_t rhsArea = intrinsicArea(rhs);
+    if (lhsArea != rhsArea) {
+      return lhsArea > rhsArea;
+    }
+
+    // Finally, if everything else is the same, prefer large K size.
+    return ShapedType::getNumElements(lhs.kSizes) >
+           ShapedType::getNumElements(rhs.kSizes);
+  });
+}
+
 /// Given a target and a matmul problem, try to find an MMA schedule for the
 /// problem based on the available mma intrinsics.
 static std::optional<GPUMMASchedule> getMmaScheduleFromProblemAndTarget(
@@ -144,6 +192,7 @@ static std::optional<GPUMMASchedule> getMmaScheduleFromProblemAndTarget(
   }
   if (intrinsics.empty())
     return std::nullopt;
+  sortMMAIntrinsics(problem, intrinsics);
 
   GPUMMAHeuristicSeeds seeds;
   assert(problem.aType == problem.bType &&
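For readers who want to experiment with the precedence rules outside the compiler, here is a self-contained sketch that mirrors the comparator above. It is a simplification under stated assumptions: the real GPUMatmulShapeType carries vectors of M/N/K sizes (hence the `.back()` and `ShapedType::getNumElements` calls in the patch), while this sketch uses one scalar size per dimension; the `Shape` struct and `sortIntrinsics` names are hypothetical stand-ins.

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Simplified stand-in for GPUMatmulShapeType: one M/N/K size each.
struct Shape {
  int64_t m, n, k;
};

// Mirrors sortMMAIntrinsics: K-alignment, then M/N-alignment, then larger
// (m + n) * k "area", then larger K.
void sortIntrinsics(const Shape &problem, std::vector<Shape> &intrinsics) {
  std::sort(intrinsics.begin(), intrinsics.end(),
            [&](const Shape &lhs, const Shape &rhs) {
              bool lhsK = problem.k % lhs.k == 0;
              bool rhsK = problem.k % rhs.k == 0;
              if (lhsK != rhsK)
                return lhsK;
              bool lhsMN = problem.m % lhs.m == 0 && problem.n % lhs.n == 0;
              bool rhsMN = problem.m % rhs.m == 0 && problem.n % rhs.n == 0;
              if (lhsMN != rhsMN)
                return lhsMN;
              int64_t lhsArea = (lhs.m + lhs.n) * lhs.k;
              int64_t rhsArea = (rhs.m + rhs.n) * rhs.k;
              if (lhsArea != rhsArea)
                return lhsArea > rhsArea;
              return lhs.k > rhs.k;
            });
}

int main() {
  Shape problem{1024, 1024, 360}; // The motivating gemm shape.
  std::vector<Shape> intrinsics = {{16, 16, 16}, {32, 32, 8}, {16, 16, 4}};
  sortIntrinsics(problem, intrinsics);
  for (const Shape &s : intrinsics)
    std::cout << s.m << "x" << s.n << "x" << s.k << "\n";
  return 0;
}
```

On the 1024x1024x360 problem this prints 32x32x8, then 16x16x4, then 16x16x16: the K-unaligned intrinsic sinks to the bottom even though it has the largest K, and among the K-aligned pair the larger "area" wins. (The third candidate, 16x16x4, is included only to exercise the area tie-break.)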

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,43 @@ func.func @mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor<
157157

158158
// -----
159159

160+
// This tests the mfma lowering intrinsic K alignment. Since gemmK = 360, and will
161+
// be aligned to 8 but not 16, we expect the 32x32x8xf16 intrinsic to be used.
162+
func.func @mfma_matmul_k_aligned_intrinsic(%lhs: tensor<1024x360xf16>, %rhs: tensor<360x1024xf16>) -> tensor<1024x1024xf32> {
163+
%cst = arith.constant 0.000000e+00 : f32
164+
%5 = tensor.empty() : tensor<1024x1024xf32>
165+
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
166+
%7 = linalg.matmul ins(%lhs, %rhs : tensor<1024x360xf16>, tensor<360x1024xf16>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
167+
return %7 : tensor<1024x1024xf32>
168+
}
169+
170+
// CHECK-LABEL: func.func @mfma_matmul_k_aligned_intrinsic
171+
// CHECK: pipeline = LLVMGPUTileAndFuse
172+
// CHECK: linalg.matmul {{.*}}lowering_config = #iree_gpu.lowering_config
173+
// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16>
174+
175+
// LATE: LLVMGPUVectorDistribute
176+
177+
// -----
178+
179+
// This tests the mfma lowering intrinsic M alignment. Since gemmM = 176, and will
180+
// be aligned to 16 but not 32, we expect the 16x16x32xi8 intrinsic to be used.
181+
func.func @mfma_matmul_m_aligned_intrinsic(%lhs: tensor<176x1024xi8>, %rhs: tensor<1024x1024xi8>) -> tensor<176x1024xi32> {
182+
%cst = arith.constant 0 : i32
183+
%5 = tensor.empty() : tensor<176x1024xi32>
184+
%6 = linalg.fill ins(%cst : i32) outs(%5 : tensor<176x1024xi32>) -> tensor<176x1024xi32>
185+
%7 = linalg.matmul ins(%lhs, %rhs : tensor<176x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<176x1024xi32>) -> tensor<176x1024xi32>
186+
return %7 : tensor<176x1024xi32>
187+
}
188+
189+
// CHECK-LABEL: func.func @mfma_matmul_m_aligned_intrinsic
190+
// CHECK: linalg.matmul {{.*}}lowering_config = #iree_gpu.lowering_config
191+
// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_I32_16x16x32_I8>
192+
193+
// LATE: LLVMGPUVectorDistribute
194+
195+
// -----
196+
160197
module {
161198
func.func @conv_nhwc(%3: tensor<2x258x514x768xf16>, %4: tensor<3x3x768x256xf16>) -> tensor<2x256x512x256xf32> {
162199
%c0 = arith.constant 0 : index
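A quick arithmetic sanity check on the second test's expectation (illustrative; it assumes the competing i8 candidate is a 32x32x16 intrinsic, which is not shown in this diff): both shapes divide gemmK = 1024 evenly, so rule 1 ties and rule 2 (M/N-alignment) selects the 16x16x32 shape for gemmM = 176.

```cpp
// Compile-time check of the alignment arithmetic behind
// @mfma_matmul_m_aligned_intrinsic. The 32x32x16 competitor is an assumption.
static_assert(1024 % 32 == 0 && 1024 % 16 == 0,
              "both candidates divide gemmK = 1024: rule 1 ties");
static_assert(176 % 16 == 0, "gemmM = 176 aligns to an M tile of 16");
static_assert(176 % 32 != 0, "but not to an M tile of 32: rule 2 decides");

int main() { return 0; } // the checks above run at compile time
```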
