Commit 03f3094

[Codegen][GPU] Sort intrinsic according to k alignment - Step 2 of 2 - Creating intrinsic sort routine (iree-org#21128)
This is a follow-up to iree-org#21103. The new routine sorts the MMA intrinsics by the following precedence rules:

1) K-alignment. We prefer intrinsics that can evenly divide the K dimension of the problem.
2) M/N-alignment. We prefer intrinsics that can evenly divide the M and N dimensions of the problem.
3) Intrinsic with larger gemm size.
4) Intrinsic with larger K size.

The scope of the impact is igemm and matmul problems; other pipelines can adopt the sorting if it turns out to be useful.

### Motivation

Consider the following convolution configuration:

> BOO_CACHE_ON=0 python boo_driver.py convbfp16 -n 16 -c 40 -H 192 -W 128 -k 40 -y 3 -x 3 -p 1 -q 1 -u 2 -v 2 -l 1 -j 1 -m conv -g 1 -F 1 -t 1 --in_layout NHWC --out_layout NHWC --fil_layout NHWC --iter 50

Since the gemmK size is 360, the best intrinsic is 32x32x8xbf16, not 16x16x16xbf16, as only the former divides K evenly. Before this PR: `362.06 us`. After this PR: `44.74 us`.

Note that due to limitations of the tuner, this combination would not be covered by a typical tuning run of 2k trials, so it is essential to bridge the gap with a heuristic first, until the tuner is capable of searching across different heuristics.

---------

Signed-off-by: jerryyin <[email protected]>
1 parent 994a716 commit 03f3094
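To see rule 1 on the motivating shape: gemmK = 360 = 8 × 45, while 360 / 16 = 22.5, so only the 32x32x8xbf16 intrinsic tiles K without a remainder. Below is a minimal standalone sketch of that check (illustrative only, not code from this patch; the candidate K sizes 8 and 16 are taken from the two intrinsics named in the commit message):

```cpp
#include <iostream>

int main() {
  const int gemmK = 360; // K dimension of the motivating convolution's gemm.
  // Candidate intrinsic K sizes: 8 (32x32x8xbf16) and 16 (16x16x16xbf16).
  for (int intrinsicK : {8, 16}) {
    if (gemmK % intrinsicK == 0)
      std::cout << "K=" << intrinsicK << ": aligned\n";
    else
      std::cout << "K=" << intrinsicK << ": unaligned, remainder "
                << gemmK % intrinsicK << "\n";
  }
  // Prints "K=8: aligned" and "K=16: unaligned, remainder 8", which is why
  // the sort ranks the 32x32x8 intrinsic first for this problem.
  return 0;
}
```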

File tree: 2 files changed (+86, -0 lines)

compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp

Lines changed: 49 additions & 0 deletions
@@ -123,6 +123,54 @@ LogicalResult setDataTiledMultiMmaLoweringConfig(
       workgroupSize, targetSubgroupSize, pipelineConfig);
 }
 
+/// Sort the MMA intrinsics by following precedence rules:
+/// 1) K-alignment. We prefer intrinsics that can evenly divide the K
+/// dimension of the problem.
+/// 2) M/N-alignment. We prefer intrinsics that can evenly divide the M and N
+/// dimensions of the problem.
+/// 3) Intrinsic with larger gemm size.
+/// 4) Intrinsic with larger K size.
+static void sortMMAIntrinsics(GPUMatmulShapeType problem,
+                              SmallVector<GPUIntrinsicType> &intrinsics) {
+  llvm::sort(intrinsics, [&](const GPUMatmulShapeType &lhs,
+                             const GPUMatmulShapeType &rhs) {
+    // Prefer K-aligned intrinsics.
+    int lhsKAligned = problem.kSizes.back() % lhs.kSizes.back() == 0 ? 1 : 0;
+    int rhsKAligned = problem.kSizes.back() % rhs.kSizes.back() == 0 ? 1 : 0;
+    if (lhsKAligned != rhsKAligned) {
+      return lhsKAligned > rhsKAligned;
+    }
+
+    // If K alignment is the same, prefer the intrinsic that aligns M and N.
+    int lhsMNAligned = (problem.mSizes.back() % lhs.mSizes.back() == 0 &&
+                        problem.nSizes.back() % lhs.nSizes.back() == 0)
+                           ? 1
+                           : 0;
+    int rhsMNAligned = (problem.mSizes.back() % rhs.mSizes.back() == 0 &&
+                        problem.nSizes.back() % rhs.nSizes.back() == 0)
+                           ? 1
+                           : 0;
+    if (lhsMNAligned != rhsMNAligned) {
+      return lhsMNAligned > rhsMNAligned;
+    }
+
+    auto intrinsicArea = [&](const GPUMatmulShapeType &intrinsic) {
+      return (ShapedType::getNumElements(intrinsic.mSizes) +
+              ShapedType::getNumElements(intrinsic.nSizes)) *
+             ShapedType::getNumElements(intrinsic.kSizes);
+    };
+    int64_t lhsArea = intrinsicArea(lhs);
+    int64_t rhsArea = intrinsicArea(rhs);
+    if (lhsArea != rhsArea) {
+      return lhsArea > rhsArea;
+    }
+
+    // Finally, if everything else is the same, prefer large K size.
+    return ShapedType::getNumElements(lhs.kSizes) >
+           ShapedType::getNumElements(rhs.kSizes);
+  });
+}
+
 /// Given a target and a matmul problem, try to find an MMA schedule for the
 /// problem based on the available mma intrinsics.
 static std::optional<GPUMMASchedule> getMmaScheduleFromProblemAndTarget(
@@ -144,6 +192,7 @@ static std::optional<GPUMMASchedule> getMmaScheduleFromProblemAndTarget(
   }
   if (intrinsics.empty())
     return std::nullopt;
+  sortMMAIntrinsics(problem, intrinsics);
 
   GPUMMAHeuristicSeeds seeds;
   assert(problem.aType == problem.bType &&
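For readers who want to experiment with the precedence rules outside the compiler, here is a self-contained sketch that mirrors the comparator above. It is a simplification under stated assumptions: the real GPUMatmulShapeType carries vectors of M/N/K sizes (hence the `.back()` and `ShapedType::getNumElements` calls in the patch), while this sketch uses one scalar size per dimension; the `Shape` struct and `sortIntrinsics` names are hypothetical stand-ins.

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Simplified stand-in for GPUMatmulShapeType: one M/N/K size each.
struct Shape {
  int64_t m, n, k;
};

// Mirrors sortMMAIntrinsics: K-alignment, then M/N-alignment, then larger
// (m + n) * k "area", then larger K.
void sortIntrinsics(const Shape &problem, std::vector<Shape> &intrinsics) {
  std::sort(intrinsics.begin(), intrinsics.end(),
            [&](const Shape &lhs, const Shape &rhs) {
              bool lhsK = problem.k % lhs.k == 0;
              bool rhsK = problem.k % rhs.k == 0;
              if (lhsK != rhsK)
                return lhsK;
              bool lhsMN = problem.m % lhs.m == 0 && problem.n % lhs.n == 0;
              bool rhsMN = problem.m % rhs.m == 0 && problem.n % rhs.n == 0;
              if (lhsMN != rhsMN)
                return lhsMN;
              int64_t lhsArea = (lhs.m + lhs.n) * lhs.k;
              int64_t rhsArea = (rhs.m + rhs.n) * rhs.k;
              if (lhsArea != rhsArea)
                return lhsArea > rhsArea;
              return lhs.k > rhs.k;
            });
}

int main() {
  Shape problem{1024, 1024, 360}; // The motivating gemm shape.
  std::vector<Shape> intrinsics = {{16, 16, 16}, {32, 32, 8}, {16, 16, 4}};
  sortIntrinsics(problem, intrinsics);
  for (const Shape &s : intrinsics)
    std::cout << s.m << "x" << s.n << "x" << s.k << "\n";
  return 0;
}
```

On the 1024x1024x360 problem this prints 32x32x8, then 16x16x4, then 16x16x16: the K-unaligned intrinsic sinks to the bottom even though it has the largest K, and among the K-aligned pair the larger "area" wins. (The third candidate, 16x16x4, is included only to exercise the area tie-break.)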

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,43 @@ func.func @mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor<
157157

158158
// -----
159159

160+
// This tests the mfma lowering intrinsic K alignment. Since gemmK = 360, and will
161+
// be aligned to 8 but not 16, we expect the 32x32x8xf16 intrinsic to be used.
162+
func.func @mfma_matmul_k_aligned_intrinsic(%lhs: tensor<1024x360xf16>, %rhs: tensor<360x1024xf16>) -> tensor<1024x1024xf32> {
163+
%cst = arith.constant 0.000000e+00 : f32
164+
%5 = tensor.empty() : tensor<1024x1024xf32>
165+
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
166+
%7 = linalg.matmul ins(%lhs, %rhs : tensor<1024x360xf16>, tensor<360x1024xf16>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
167+
return %7 : tensor<1024x1024xf32>
168+
}
169+
170+
// CHECK-LABEL: func.func @mfma_matmul_k_aligned_intrinsic
171+
// CHECK: pipeline = LLVMGPUTileAndFuse
172+
// CHECK: linalg.matmul {{.*}}lowering_config = #iree_gpu.lowering_config
173+
// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16>
174+
175+
// LATE: LLVMGPUVectorDistribute
176+
177+
// -----
178+
179+
// This tests the mfma lowering intrinsic M alignment. Since gemmM = 176, and will
180+
// be aligned to 16 but not 32, we expect the 16x16x32xi8 intrinsic to be used.
181+
func.func @mfma_matmul_m_aligned_intrinsic(%lhs: tensor<176x1024xi8>, %rhs: tensor<1024x1024xi8>) -> tensor<176x1024xi32> {
182+
%cst = arith.constant 0 : i32
183+
%5 = tensor.empty() : tensor<176x1024xi32>
184+
%6 = linalg.fill ins(%cst : i32) outs(%5 : tensor<176x1024xi32>) -> tensor<176x1024xi32>
185+
%7 = linalg.matmul ins(%lhs, %rhs : tensor<176x1024xi8>, tensor<1024x1024xi8>) outs(%6 : tensor<176x1024xi32>) -> tensor<176x1024xi32>
186+
return %7 : tensor<176x1024xi32>
187+
}
188+
189+
// CHECK-LABEL: func.func @mfma_matmul_m_aligned_intrinsic
190+
// CHECK: linalg.matmul {{.*}}lowering_config = #iree_gpu.lowering_config
191+
// CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_I32_16x16x32_I8>
192+
193+
// LATE: LLVMGPUVectorDistribute
194+
195+
// -----
196+
160197
module {
161198
func.func @conv_nhwc(%3: tensor<2x258x514x768xf16>, %4: tensor<3x3x768x256xf16>) -> tensor<2x256x512x256xf32> {
162199
%c0 = arith.constant 0 : index
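A quick arithmetic sanity check on the second test's expectation (illustrative; it assumes the competing i8 candidate is a 32x32x16 intrinsic, which is not shown in this diff): both shapes divide gemmK = 1024 evenly, so rule 1 ties and rule 2 (M/N-alignment) selects the 16x16x32 shape for gemmM = 176.

```cpp
// Compile-time check of the alignment arithmetic behind
// @mfma_matmul_m_aligned_intrinsic. The 32x32x16 competitor is an assumption.
static_assert(1024 % 32 == 0 && 1024 % 16 == 0,
              "both candidates divide gemmK = 1024: rule 1 ties");
static_assert(176 % 16 == 0, "gemmM = 176 aligns to an M tile of 16");
static_assert(176 % 32 != 0, "but not to an M tile of 32: rule 2 decides");

int main() { return 0; } // the checks above run at compile time
```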
