Commit 9303360

[Codegen][GPU] Use arithmetic intensity to guide gemm size categorization - step 3 (#21826)
This PR updates the heuristic for the tile-and-fuse pipeline so that GEMMs and convolutions use different seeds. The change is backed by experimental data on MI300X:
- 478 convolutions: performance stays flat, 103.7807 (before) -> 103.1491 (after).
- 259 GEMMs: performance improves by 6.3403%, measured as the geomean over individual perf uplifts (192.2177 (before) -> 186.197 (after)).

Llama3 perf regression results will follow in the comments while the code review is in progress.

Signed-off-by: jerryyin <[email protected]>
1 parent 6633605 commit 9303360
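
For readers unfamiliar with the aggregation quoted above, here is a minimal, hypothetical sketch of a "geomean over individual perf uplift" computation (this helper is illustrative only and is not part of the commit; the actual benchmark harness may aggregate differently):

#include <cmath>
#include <cstddef>
#include <vector>

// Percentage improvement implied by the geometric mean of per-kernel speedups,
// where each speedup is one kernel's before/after latency ratio.
double geomeanUpliftPercent(const std::vector<double> &beforeMs,
                            const std::vector<double> &afterMs) {
  double logSum = 0.0;
  for (size_t i = 0; i < beforeMs.size(); ++i)
    logSum += std::log(beforeMs[i] / afterMs[i]);
  double geomean = std::exp(logSum / static_cast<double>(beforeMs.size()));
  return (geomean - 1.0) * 100.0;
}

Under this reading, the 6.3403% figure describes the typical per-kernel speedup across the 259 GEMMs, which can legitimately differ from the ratio of the summed totals (192.2177 vs. 186.197).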

File tree: 3 files changed, +59 -35 lines changed


compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp

Lines changed: 41 additions & 17 deletions
@@ -240,8 +240,8 @@ static GemmCutoff computeGemmCutoffsForAI(IREE::GPU::TargetAttr target,
 /// problem based on the available mma intrinsics.
 static std::optional<GPUMMASchedule> getMmaScheduleFromProblemAndTarget(
     IREE::GPU::TargetAttr target, GPUMatmulShapeType problem,
-    bool transposedLhs, bool transposedRhs, bool mustBeAligned = true,
-    bool doCPromotion = false, bool scaled = false) {
+    bool transposedLhs, bool transposedRhs, bool isGemm,
+    bool mustBeAligned = true, bool doCPromotion = false, bool scaled = false) {
   const int64_t targetSubgroupSize = target.getPreferredSubgroupSize();
   SmallVector<GPUIntrinsicType> intrinsics;
   if (scaled) {
@@ -307,19 +307,40 @@ static std::optional<GPUMMASchedule> getMmaScheduleFromProblemAndTarget(
     // bestMNTileCountPerSubgroup and small bestKTileCountPerSubgroup to
     // amortize launch/memory costs and maximize throughput.
     problem.gemmSize = GemmSize::LargeGemm;
-    seeds = {/*bestSubgroupCountPerWorkgroup=*/8,
-             /*bestMNTileCountPerSubgroup=*/8,
-             /*bestKTileCountPerSubgroup=*/2,
-             /*bestKElementCountPerSubgroup*/ kCacheLineSizeBits / 2 /
-                 inBitWidth};
+    if (isGemm) {
+      seeds = {/*bestSubgroupCountPerWorkgroup=*/4,
+               /*bestMNTileCountPerSubgroup=*/16,
+               /*bestKTileCountPerSubgroup=*/2,
+               /*bestKElementCountPerSubgroup*/ kCacheLineSizeBits / 2 /
+                   inBitWidth};
+    } else {
+      // Favor more subgroups for convolution to help latency hiding from global
+      // loads.
+      seeds = {/*bestSubgroupCountPerWorkgroup=*/8,
+               /*bestMNTileCountPerSubgroup=*/8,
+               /*bestKTileCountPerSubgroup=*/2,
+               /*bestKElementCountPerSubgroup*/ kCacheLineSizeBits / 2 /
+                   inBitWidth};
+    }
   } else {
     // Choose balanced tile shapes. Empirically, medium-AI workloads can favor
     // either small or large tiles depending on kernel details.
     problem.gemmSize = GemmSize::MediumGemm;
-    seeds = {/*bestSubgroupCountPerWorkgroup=*/8,
-             /*bestMNTileCountPerSubgroup=*/4,
-             /*bestKTileCountPerSubgroup=*/4,
-             /*bestKElementCountPerSubgroup*/ kCacheLineSizeBits / inBitWidth};
+    if (isGemm) {
+      seeds = {/*bestSubgroupCountPerWorkgroup=*/4,
+               /*bestMNTileCountPerSubgroup=*/8,
+               /*bestKTileCountPerSubgroup=*/4,
+               /*bestKElementCountPerSubgroup*/ kCacheLineSizeBits /
+                   inBitWidth};
+    } else {
+      // Favor more subgroups for convolution to help latency hiding from global
+      // loads.
+      seeds = {/*bestSubgroupCountPerWorkgroup=*/8,
+               /*bestMNTileCountPerSubgroup=*/4,
+               /*bestKTileCountPerSubgroup=*/4,
+               /*bestKElementCountPerSubgroup*/ kCacheLineSizeBits /
+                   inBitWidth};
+    }
   }
   int64_t maxSharedMemoryBytes = target.getWgp().getMaxWorkgroupMemoryBytes();

@@ -407,7 +428,7 @@ static FailureOr<std::pair<LoweringConfigAttr, int64_t>>
 getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
     SmallVector<int64_t> bounds, ArrayRef<AffineMap> maps,
     ArrayRef<Value> operands, IREE::GPU::TargetAttr target, bool useDirectLoad,
-    bool scaled,
+    bool isGemm, bool scaled,
     std::optional<DenseMap<int64_t, AffineExpr>> convToIgemmDimMap =
         std::nullopt,
     std::optional<linalg::ConvolutionDimensions> convDims = std::nullopt) {
@@ -537,7 +558,8 @@ getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
   bool mustBeAligned = true;
   bool doCPromotion = false;
   std::optional<GPUMMASchedule> schedule = getMmaScheduleFromProblemAndTarget(
-      target, problem, transposedLhs, transposedRhs, /*mustBeAligned*/ true,
+      target, problem, transposedLhs, transposedRhs, isGemm,
+      /*mustBeAligned*/ true,
       /*doCPromotion*/ false, scaled);

   // TODO (nirvedhmeshram, qedawkins): The performance with this will be bad if
@@ -549,7 +571,7 @@ getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
     mustBeAligned = false;
     doCPromotion = true;
     schedule = getMmaScheduleFromProblemAndTarget(
-        target, problem, transposedLhs, transposedRhs, mustBeAligned,
+        target, problem, transposedLhs, transposedRhs, isGemm, mustBeAligned,
         doCPromotion, scaled);
   }

@@ -710,7 +732,7 @@ LogicalResult setIGEMMConvolutionLoweringConfig(
   FailureOr<std::pair<LoweringConfigAttr, int64_t>> configAndWgSize =
       getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
          bounds, igemmContractionMaps, igemmOperands, target, useDirectLoad,
-          /*scaled*/ false, convToIgemmDimMap, convDims);
+          /*isGemm=*/false, /*scaled*/ false, convToIgemmDimMap, convDims);
   if (failed(configAndWgSize)) {
     return failure();
   }
@@ -756,7 +778,8 @@ LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target,

   FailureOr<std::pair<LoweringConfigAttr, int64_t>> configAndWgSize =
       getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
-          bounds, maps, operands, target, useDirectLoad, /*scaled*/ false);
+          bounds, maps, operands, target, useDirectLoad, /*isGemm=*/true,
+          /*scaled*/ false);

   // TODO (muzasyed) : add generalization for scaled and nonscaled versions of
   // matmul lowering.
@@ -765,7 +788,8 @@ LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target,
     // conflicts when dealing with scaled matmuls. For now it is disabled.
     useDirectLoad = true;
     configAndWgSize = getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
-        bounds, maps, operands, target, useDirectLoad, /*scaled*/ true);
+        bounds, maps, operands, target, useDirectLoad, /*isGemm=*/true,
+        /*scaled*/ true);
   }

   if (failed(configAndWgSize)) {
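
How the seed change above surfaces in the test updates that follow: the GEMM branch now asks for 4 subgroups per workgroup instead of 8, and with the 64-lane subgroups used in the ROCDL tests that roughly halves the flat workgroup size. Below is a minimal sketch of that arithmetic, using a simplified stand-in struct rather than IREE's actual seeds type, and assuming the deduced schedule launches exactly the seeded subgroup count; the real deduction also rebalances against intrinsic shapes, alignment, and shared-memory limits, which is presumably why a few tests move from [256, 1, 1] to [128, 1, 1] rather than from [512, 1, 1].

#include <cstdint>

// Simplified stand-in for the heuristic seeds set above; the field name mirrors
// the /*bestSubgroupCountPerWorkgroup=*/ comment in the diff, but this is not
// the IREE definition.
struct SeedsSketch {
  int64_t bestSubgroupCountPerWorkgroup;
};

// Flat workgroup size if every seeded subgroup is launched.
int64_t flatWorkgroupSize(const SeedsSketch &seeds, int64_t subgroupSize) {
  return seeds.bestSubgroupCountPerWorkgroup * subgroupSize;
}

// With subgroup_size = 64, as in the tests below:
//   new GEMM seed:    4 subgroups -> 4 * 64 = 256 threads, i.e. workgroup_size = [256, 1, 1]
//   convolution seed: 8 subgroups -> 8 * 64 = 512 threads, unchanged from before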

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir

Lines changed: 12 additions & 12 deletions
@@ -37,7 +37,7 @@ func.func @expanded_matmul_transpose_b(%lhs: tensor<2x64x2048xf16>, %rhs: tensor
 }

 // CHECK-LABEL: func.func @expanded_matmul_transpose_b
-// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [512, 1, 1] subgroup_size = 64
+// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64
 // CHECK-SAME: #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>

 // Verify that the fill does not have the lowering config propagated to it.
@@ -47,7 +47,7 @@ func.func @expanded_matmul_transpose_b(%lhs: tensor<2x64x2048xf16>, %rhs: tensor
 // CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
 // CHECK-SAME: promote_operands = [0, 1]
 // CHECK-SAME: reduction = [0, 0, 0, 0, 4]
-// CHECK-SAME: subgroup = [1, 2, 2, 1, 0]
+// CHECK-SAME: subgroup = [1, 2, 2, 2, 0]
 // CHECK-SAME: workgroup = [1, 2, 64, 64, 0]

 // LATE: LLVMGPUVectorDistribute
@@ -77,14 +77,14 @@ func.func @multi_dim_mma_schedule(%lhs: tensor<10x32x128x16xf16>, %rhs: tensor<4
 }

 // CHECK-LABEL: func.func @multi_dim_mma_schedule
-// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [512, 1, 1] subgroup_size = 64
+// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64
 // CHECK-SAME: #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>

 // CHECK: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
 // CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
 // CHECK-SAME: promote_operands = [0, 1]
 // CHECK-SAME: reduction = [0, 0, 0, 0, 4, 1]
-// CHECK-SAME: subgroup = [2, 1, 2, 1, 0, 0]
+// CHECK-SAME: subgroup = [2, 4, 1, 1, 0, 0]
 // CHECK-SAME: workgroup = [2, 4, 32, 32, 0, 0]

 // LATE: LLVMGPUVectorDistribute
@@ -140,7 +140,7 @@ func.func @mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor<
 }

 // CHECK-LABEL: func.func @mfma_matmul_1024x1024x1024
-// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [512, 1, 1] subgroup_size = 64
+// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64
 // CHECK-SAME: #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>

 // Verify that the fill does not have the lowering config propagated to it.
@@ -150,8 +150,8 @@ func.func @mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor<
 // CHECK-SAME: mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>
 // CHECK-SAME: promote_operands = [0, 1]
 // CHECK-SAME: reduction = [0, 0, 4]
-// CHECK-SAME: subgroup = [2, 2, 0]
-// CHECK-SAME: workgroup = [128, 64, 0]
+// CHECK-SAME: subgroup = [2, 4, 0]
+// CHECK-SAME: workgroup = [64, 128, 0]

 // LATE: LLVMGPUVectorDistribute

@@ -380,12 +380,12 @@ func.func @aligned_dynamic_matmul_with_two_reduce_dim(%arg0: tensor<192x?x16xf32
 }

 // CHECK-LABEL: func.func @aligned_dynamic_matmul_with_two_reduce_dim
-// CHECK-SAME: {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64
+// CHECK-SAME: {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [128, 1, 1] subgroup_size = 64
 // CHECK: linalg.generic
 // CHECK-SAME: {lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x4_F32>
 // CHECK-SAME: promote_operands = [0, 1]
 // CHECK-SAME: reduction = [0, 1, 0, 4],
-// CHECK-SAME: subgroup = [1, 0, 1, 0],
+// CHECK-SAME: subgroup = [2, 0, 1, 0],
 // CHECK-SAME: workgroup = [64, 0, 16, 0]}

 // -----
@@ -433,14 +433,14 @@ func.func @unaligned_to_intrinsic_batched_matmul_tiling_check(%lhs : tensor<12x5
 // schedule with nTileSize of 16 while in reality it should be 8.

 // LATE-LABEL: func.func @unaligned_to_intrinsic_batched_matmul_tiling_check
-// LATE-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64
+// LATE-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [128, 1, 1] subgroup_size = 64
 // LATE-SAME: {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}
 // LATE: linalg.batch_matmul {{.*}}lowering_config = #iree_gpu.lowering_config
-// LATE-SAME: padding = [1, 16, 128, 4]
+// LATE-SAME: padding = [1, 16, 64, 4]
 // LATE-SAME: promote_operands = [0, 1, 2]
 // LATE-SAME: reduction = [0, 0, 0, 1]
 // LATE-SAME: subgroup = [0, 1, 2, 0]
-// LATE-SAME: workgroup = [1, 16, 128, 0]
+// LATE-SAME: workgroup = [1, 16, 64, 0]

 // -----

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse_gfx950.mlir

Lines changed: 6 additions & 6 deletions
@@ -25,14 +25,14 @@ func.func @scaled_matmul(
 }

 // CHECK-LABEL: func.func @scaled_matmul
-// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [512, 1, 1] subgroup_size = 64
+// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64
 // CHECK-SAME: #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = true, use_igemm_convolution = false>
 // CHECK: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
 // CHECK-SAME: mma_kind = #iree_gpu.scaled_mma_layout<intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>
 // CHECK-SAME: promote_operands = [0, 1]
 // CHECK-SAME: reduction = [0, 0, 8, 1]
-// CHECK-SAME: subgroup = [2, 2, 0, 0]
-// CHECK-SAME: workgroup = [128, 64, 0, 0]
+// CHECK-SAME: subgroup = [2, 4, 0, 0]
+// CHECK-SAME: workgroup = [64, 128, 0, 0]

 // -----

@@ -58,14 +58,14 @@ func.func @scaled_matmul_with_batch(
 }

 // CHECK-LABEL: func.func @scaled_matmul_with_batch
-// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [512, 1, 1] subgroup_size = 64
+// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [256, 1, 1] subgroup_size = 64
 // CHECK-SAME: #iree_gpu.pipeline_options<prefetch_shared_memory = true, no_reduce_shared_memory_bank_conflicts = true, use_igemm_convolution = false>
 // CHECK: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
 // CHECK-SAME: mma_kind = #iree_gpu.scaled_mma_layout<intrinsic = MFMA_SCALE_F32_16x16x128_B32, lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, acc_elem_type = f32>
 // CHECK-SAME: promote_operands = [0, 1]
 // CHECK-SAME: reduction = [0, 0, 0, 8, 1]
-// CHECK-SAME: subgroup = [0, 2, 2, 0, 0]
-// CHECK-SAME: workgroup = [1, 128, 64, 0, 0]
+// CHECK-SAME: subgroup = [0, 2, 4, 0, 0]
+// CHECK-SAME: workgroup = [1, 64, 128, 0, 0]

 // -----

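The "arithmetic intensity" in the PR title is the FLOPs-per-byte ratio that computeGemmCutoffsForAI (whose signature appears in the first hunk above) uses to place a problem into a GemmSize bucket. Below is a rough, hypothetical illustration of that quantity for a plain GEMM; the exact formula and the target-dependent cutoffs live in ConfigUtils.cpp and are not reproduced here.

#include <cstdint>

// One common definition of arithmetic intensity for an MxNxK GEMM:
// 2*M*N*K FLOPs (a multiply and an add per MAC) divided by the bytes of
// operand and result traffic, assuming each matrix is read or written once.
double gemmArithmeticIntensity(int64_t m, int64_t n, int64_t k,
                               int64_t inBytes, int64_t outBytes) {
  double flops = 2.0 * static_cast<double>(m) * n * k;
  double bytes =
      static_cast<double>((m * k + k * n) * inBytes + m * n * outBytes);
  return flops / bytes;
}

// Example: gemmArithmeticIntensity(1024, 1024, 1024, /*inBytes=*/2, /*outBytes=*/4)
// gives 2 * 1024^3 FLOPs over 8 MiB of traffic, i.e. 256 FLOPs/byte; the cutoffs
// then decide whether such a problem is treated as a small, medium, or large GEMM.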