
Commit f9737f9

jtuyls authored and pstarkcdpr committed
[Codegen] Remove batch size in target intrinsic checks (iree-org#22289)
Conceptually, we shouldn't depend on batch size information when checking whether to target an MFMA intrinsic, as the batch size doesn't change whether a matmul-like operation is compute or memory bound.

Signed-off-by: Jorn Tuyls <[email protected]>
1 parent 84c77f8 · commit f9737f9
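
For intuition only (not part of the commit): a minimal C++ sketch, assuming a dense batched matmul of shape BxMxNxK whose operands and result each move through global memory exactly once. Under that assumption the arithmetic intensity (FLOPs per byte) is independent of the batch size B, which is the reasoning behind dropping batch size from the intrinsic check. The helper name and the memory model are illustrative assumptions, not IREE code.

#include <cstdint>
#include <cstdio>

// Illustrative helper (not from the IREE codebase): estimates the arithmetic
// intensity of a dense batched matmul of shape BxMxNxK, assuming each operand
// and the result move through global memory exactly once.
static double arithmeticIntensity(int64_t b, int64_t m, int64_t n, int64_t k,
                                  int64_t elemBytes) {
  double flops = 2.0 * b * m * n * k;  // 2 flops per multiply-accumulate
  double bytes =
      static_cast<double>(b) * elemBytes * (m * k + k * n + m * n);
  return flops / bytes;
}

int main() {
  // The batch dimension scales FLOPs and bytes by the same factor, so both
  // calls print the same intensity (shape borrowed from the updated test below).
  std::printf("B=1:  %.3f flops/byte\n", arithmeticIntensity(1, 8, 577, 577, 4));
  std::printf("B=12: %.3f flops/byte\n", arithmeticIntensity(12, 8, 577, 577, 4));
  return 0;
}

Both calls print roughly 3.9 flops/byte, so whether the operation counts as compute or memory bound shouldn't hinge on B.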

File tree: 3 files changed, +6 −11 lines

compiler/src/iree/compiler/Codegen/Common/GPU/GPUHeuristics.cpp

Lines changed: 0 additions & 5 deletions
@@ -260,11 +260,6 @@ static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem,
   // remove this todo.
   const int64_t mSize = llvm::product_of(problem.mSizes);
   const int64_t nSize = llvm::product_of(problem.nSizes);
-  // TODO(jornt): Remove this check as batch size doesn't make a computation
-  // more compute bound, so it shouldn't be considered.
-  if (!problem.batchSizes.empty()) {
-    return success();
-  }
   if ((mSize <= kVerySkinnyDimThreshold && (nSize > preferredSubgroupSize)) ||
       (nSize <= kVerySkinnyDimThreshold && (mSize > preferredSubgroupSize))) {
     return failure();
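
With the early return gone, batched matmuls now fall through to the very-skinny-dimension check kept above. A standalone C++ sketch of the effect (the constant values below are assumptions for illustration; the actual ones live in GPUHeuristics.cpp and may differ): with these values, the old 12x2x577x577 test shape (M = 2) is rejected for an MFMA intrinsic while M = 8 is accepted, which is consistent with the test below bumping its M dimension from 2 to 8.

#include <cstdint>
#include <cstdio>

// Assumed values for illustration only; the actual constants are defined in
// GPUHeuristics.cpp and may differ.
constexpr int64_t kVerySkinnyDimThreshold = 4;
constexpr int64_t preferredSubgroupSize = 64;

// Mirrors the check kept above: a matmul with one very skinny dimension and
// one large dimension is not worth targeting with an MFMA intrinsic.
static bool rejectedAsSkinny(int64_t mSize, int64_t nSize) {
  return (mSize <= kVerySkinnyDimThreshold && nSize > preferredSubgroupSize) ||
         (nSize <= kVerySkinnyDimThreshold && mSize > preferredSubgroupSize);
}

int main() {
  std::printf("M=2, N=577 rejected: %d\n", rejectedAsSkinny(2, 577));  // prints 1
  std::printf("M=8, N=577 rejected: %d\n", rejectedAsSkinny(8, 577));  // prints 0
  return 0;
}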

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir

Lines changed: 5 additions & 5 deletions
@@ -293,12 +293,12 @@ module {
 // -----
 
 module {
-  func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x2x577xf32>, %rhs : tensor<12x577x577xf32>) -> tensor<12x2x577xf32> {
+  func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x8x577xf32>, %rhs : tensor<12x577x577xf32>) -> tensor<12x8x577xf32> {
     %c0 = arith.constant 0.0 : f32
-    %empty = tensor.empty() : tensor<12x2x577xf32>
-    %fill = linalg.fill ins(%c0 : f32) outs(%empty : tensor<12x2x577xf32>) -> tensor<12x2x577xf32>
-    %mm = linalg.batch_matmul ins(%lhs, %rhs : tensor<12x2x577xf32>, tensor<12x577x577xf32>) outs(%fill : tensor<12x2x577xf32>) -> tensor<12x2x577xf32>
-    return %mm : tensor<12x2x577xf32>
+    %empty = tensor.empty() : tensor<12x8x577xf32>
+    %fill = linalg.fill ins(%c0 : f32) outs(%empty : tensor<12x8x577xf32>) -> tensor<12x8x577xf32>
+    %mm = linalg.batch_matmul ins(%lhs, %rhs : tensor<12x8x577xf32>, tensor<12x577x577xf32>) outs(%fill : tensor<12x8x577xf32>) -> tensor<12x8x577xf32>
+    return %mm : tensor<12x8x577xf32>
   }
 }
 
compiler/src/iree/compiler/Codegen/LLVMGPU/test/config_matvec.mlir

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ func.func @static_batch_matvec() {
 
 
 // CHECK: LLVMGPUVectorDistribute
-// CDNA3: LLVMGPUTileAndFuse
+// CDNA3: LLVMGPUVectorDistribute
 
 // -----
 