 
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUEnums.h"
 #include "llvm/ADT/APInt.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Sequence.h"
 #include "llvm/Support/DebugLog.h"
 #include "llvm/Support/InterleavedRange.h"
@@ -48,20 +49,14 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &os,
   return os;
 }
 
-// Shortened helper to compute the product of `values`.
-static int64_t prod(ArrayRef<int64_t> values) {
-  return ShapedType::getNumElements(values);
-}
-
 static int64_t calculateOperandsSharedMemoryUsedInBytes(
     const GPUMMASchedule &schedule, int64_t lhsBitwidth, int64_t rhsBitwidth,
     int64_t numRhs = 1) {
-
-  int64_t tileM = schedule.mSize * prod(schedule.mTileSizes) *
-                  prod(schedule.mSubgroupCounts);
-  int64_t tileN = schedule.nSize * prod(schedule.nTileSizes) *
-                  prod(schedule.nSubgroupCounts);
-  int64_t tileK = schedule.kSize * prod(schedule.kTileSizes);
+  int64_t tileM = schedule.mSize * llvm::product_of(schedule.mTileSizes) *
+                  llvm::product_of(schedule.mSubgroupCounts);
+  int64_t tileN = schedule.nSize * llvm::product_of(schedule.nTileSizes) *
+                  llvm::product_of(schedule.nSubgroupCounts);
+  int64_t tileK = schedule.kSize * llvm::product_of(schedule.kTileSizes);
   return (tileM * tileK * lhsBitwidth + numRhs * tileN * tileK * rhsBitwidth) /
          8;
 }
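
For context on the replacement: `llvm::product_of` (from `llvm/ADT/STLExtras.h`, newly included above) folds a range of values with multiplication, which is what the removed `prod` helper obtained indirectly through `ShapedType::getNumElements`. A minimal, self-contained sketch of the expected semantics — `productOf` here is a hypothetical stand-in, not the LLVM implementation:

```cpp
// Hypothetical stand-in illustrating the reduction llvm::product_of performs.
#include <cassert>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

static int64_t productOf(const std::vector<int64_t> &values) {
  // Multiplicative fold with identity 1, so an empty range yields 1 --
  // matching ShapedType::getNumElements on an empty dimension list.
  return std::accumulate(values.begin(), values.end(), int64_t{1},
                         std::multiplies<int64_t>());
}

int main() {
  assert(productOf({2, 4, 8}) == 64); // e.g. a [2, 4, 8] tile-size list
  assert(productOf({}) == 1);         // empty list contributes a neutral factor
  return 0;
}
```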
@@ -70,11 +65,10 @@ static int64_t
 calculateResultSharedMemoryUsedInBytes(const GPUMMASchedule &schedule,
                                        int64_t resultBitwidth,
                                        int64_t numRes = 1) {
-
-  int64_t tileM = schedule.mSize * prod(schedule.mTileSizes) *
-                  prod(schedule.mSubgroupCounts);
-  int64_t tileN = schedule.nSize * prod(schedule.nTileSizes) *
-                  prod(schedule.nSubgroupCounts);
+  int64_t tileM = schedule.mSize * llvm::product_of(schedule.mTileSizes) *
+                  llvm::product_of(schedule.mSubgroupCounts);
+  int64_t tileN = schedule.nSize * llvm::product_of(schedule.nTileSizes) *
+                  llvm::product_of(schedule.nSubgroupCounts);
   return (numRes * tileM * tileN * resultBitwidth) / 8;
 }
 
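Both helpers accumulate bit counts and divide by 8 at the end to report bytes. A worked example with made-up tile sizes (none of these numbers appear in the patch):

```cpp
// Worked example with hypothetical tile sizes: operands at f16 (16-bit),
// result at f32 (32-bit), one RHS and one result buffer.
#include <cstdint>
#include <cstdio>

int main() {
  int64_t tileM = 128, tileN = 128, tileK = 32;
  // LHS tile (M x K) plus one RHS tile (K x N), converted bits -> bytes:
  int64_t operandBytes = (tileM * tileK * 16 + tileN * tileK * 16) / 8;
  // One result tile (M x N), converted bits -> bytes:
  int64_t resultBytes = (tileM * tileN * 32) / 8;
  printf("operands: %lld B, result: %lld B\n", (long long)operandBytes,
         (long long)resultBytes);
  // Prints: operands: 16384 B, result: 65536 B
  return 0;
}
```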
@@ -150,13 +144,14 @@ static bool isValidMMASchedule(const GPUMatmulShapeType &problem,
   const int64_t kMaxVectorLoadBitWidth = 128;
   int64_t elemsPerThread =
       kMaxVectorLoadBitWidth / problem.bType.getIntOrFloatBitWidth();
-  int64_t wgThreads = subgroupSize * prod(schedule.mSubgroupCounts) *
-                      prod(schedule.nSubgroupCounts);
-  int64_t mWgSize = schedule.mSize * prod(schedule.mTileSizes) *
-                    prod(schedule.mSubgroupCounts);
-  int64_t nWgSize = schedule.nSize * prod(schedule.nTileSizes) *
-                    prod(schedule.nSubgroupCounts);
-  int64_t kWgSize = schedule.kSize * prod(schedule.kTileSizes);
+  int64_t wgThreads = subgroupSize *
+                      llvm::product_of(schedule.mSubgroupCounts) *
+                      llvm::product_of(schedule.nSubgroupCounts);
+  int64_t mWgSize = schedule.mSize * llvm::product_of(schedule.mTileSizes) *
+                    llvm::product_of(schedule.mSubgroupCounts);
+  int64_t nWgSize = schedule.nSize * llvm::product_of(schedule.nTileSizes) *
+                    llvm::product_of(schedule.nSubgroupCounts);
+  int64_t kWgSize = schedule.kSize * llvm::product_of(schedule.kTileSizes);
   int64_t innerLhsDimSize = transposedLhs ? mWgSize : kWgSize;
   int64_t innerRhsDimSize = transposedRhs ? kWgSize : nWgSize;
 
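The `kMaxVectorLoadBitWidth` cap above bounds how many B-operand elements one thread may fetch in a single 128-bit vectorized load. A quick sketch of that arithmetic for a few common bitwidths (illustrative only):

```cpp
// Hypothetical arithmetic for the 128-bit vector-load cap:
// each thread loads at most 128 bits per vectorized access.
#include <cassert>
#include <cstdint>

int main() {
  const int64_t kMaxVectorLoadBitWidth = 128;
  assert(kMaxVectorLoadBitWidth / 16 == 8);  // f16 B operand: 8 elems/thread
  assert(kMaxVectorLoadBitWidth / 8 == 16);  // i8 B operand: 16 elems/thread
  assert(kMaxVectorLoadBitWidth / 32 == 4);  // f32 B operand: 4 elems/thread
  return 0;
}
```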
@@ -263,12 +258,8 @@ static LogicalResult canTargetIntrinsic(const GPUMatmulShapeType &problem,
   // established after we sweep the different tile sizes for a problem config.
   // Once a precise threshold is established, replace 4 with the threshold and
   // remove this todo.
-  const int64_t mSize =
-      std::accumulate(problem.mSizes.begin(), problem.mSizes.end(), 1,
-                      std::multiplies<int64_t>());
-  const int64_t nSize =
-      std::accumulate(problem.nSizes.begin(), problem.nSizes.end(), 1,
-                      std::multiplies<int64_t>());
+  const int64_t mSize = llvm::product_of(problem.mSizes);
+  const int64_t nSize = llvm::product_of(problem.nSizes);
   // TODO(jornt): Remove this check as batch size doesn't make a computation
   // more compute bound, so it shouldn't be considered.
   if (!problem.batchSizes.empty()) {
@@ -383,7 +374,6 @@ static void distributeGCDForDim(bool isMDim, int64_t &mTotalTileToDistribute,
                                 int64_t &nTileSizeDistributed,
                                 int64_t &remainingSubgroups,
                                 int64_t &remainingTiles) {
-
   int64_t &totalTilesToDistribute =
       isMDim ? mTotalTileToDistribute : nTotalTileToDistribute;
   int64_t &subgroupDistributed =
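The function above mutates either the M or the N accounting depending on `isMDim`, binding a reference through a ternary so the rest of the body is dimension-agnostic. A minimal sketch of that idiom, with hypothetical names:

```cpp
// Hypothetical sketch of the select-by-reference idiom used above.
#include <cassert>
#include <cstdint>

static void consumeFromDim(bool isMDim, int64_t &mBudget, int64_t &nBudget) {
  // Both ternary operands are lvalues of the same type, so the result is an
  // lvalue and the reference binds to whichever budget should be mutated.
  int64_t &budget = isMDim ? mBudget : nBudget;
  budget /= 2;
}

int main() {
  int64_t m = 8, n = 8;
  consumeFromDim(/*isMDim=*/true, m, n);
  assert(m == 4 && n == 8); // only the selected dimension was updated
  return 0;
}
```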
@@ -418,8 +408,8 @@ static GPUMMASchedule getOptimalMMASchedule(const GPUMatmulShapeType &problem,
       llvm::divideCeil(problem.mSizes.back(), intrinsic.mSizes[0]);
   nTotalTileCounts.back() =
       llvm::divideCeil(problem.nSizes.back(), intrinsic.nSizes[0]);
-  int64_t mTotalTileToDistribute = prod(mTotalTileCounts);
-  int64_t nTotalTileToDistribute = prod(nTotalTileCounts);
+  int64_t mTotalTileToDistribute = llvm::product_of(mTotalTileCounts);
+  int64_t nTotalTileToDistribute = llvm::product_of(nTotalTileCounts);
 
   int64_t remainingSubgroups = seeds.bestSubgroupCountPerWorkgroup;
   int64_t remainingTiles = seeds.bestMNTileCountPerSubgroup;
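`llvm::divideCeil` (from `llvm/Support/MathExtras.h`) rounds the quotient up, so a problem dimension that is not an exact multiple of the intrinsic size still receives a tile for the remainder. A small sketch with made-up sizes; `divideCeil` below reimplements the helper's semantics rather than calling LLVM:

```cpp
// Hypothetical numbers showing why the tile count is rounded up.
#include <cassert>
#include <cstdint>

static int64_t divideCeil(int64_t numerator, int64_t denominator) {
  return (numerator + denominator - 1) / denominator;
}

int main() {
  // A problem M dim of 100 with a 16-wide intrinsic needs 7 tiles, not 6:
  // 6 tiles cover only 96 rows and would drop the last 4.
  assert(divideCeil(100, 16) == 7);
  assert(divideCeil(96, 16) == 6); // exact multiples are unaffected
  return 0;
}
```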
@@ -529,9 +519,9 @@ static GPUMMASchedule getOptimalMMASchedule(const GPUMatmulShapeType &problem,
       getBestKTileSizes(problem, intrinsic, seeds);
 
   return GPUMMASchedule{intrinsic.mmaKind,
-                        prod(intrinsic.mSizes),
-                        prod(intrinsic.nSizes),
-                        prod(intrinsic.kSizes),
+                        llvm::product_of(intrinsic.mSizes),
+                        llvm::product_of(intrinsic.nSizes),
+                        llvm::product_of(intrinsic.kSizes),
                         mSubgroupCounts,
                         nSubgroupCounts,
                         mTileSizes,
@@ -741,8 +731,8 @@ getOptimalAttentionPVSchedule(const GPUMatmulShapeType &problem,
   // subgroups on N leaves room to distribute subgroups on K1 and how that
   // effects the softmax computation hasn't been experimented with yet.
   //
-  // Distribute tile sizes on N as much as we can as it's completly unrolled and
-  // then distribute remaining tiles and subgroups on M.
+  // Distribute tile sizes on N as much as we can as it's completely unrolled
+  // and then distribute remaining tiles and subgroups on M.
   for (int nDim = problem.nSizes.size() - 1; nDim >= 0; --nDim) {
     // Do not distribute N on subgroups.
     nSubgroupCounts[nDim] = 1;
@@ -793,7 +783,6 @@ FailureOr<std::pair<GPUMMASchedule, GPUMMASchedule>> deduceAttentionSchedule(
     const GPUMMAHeuristicSeeds &pvMatmulSeeds, int64_t sharedMemLimitInBytes,
     int64_t subgroupSize, bool transposedQ, bool transposedK, bool transposedV,
     bool canUpcastAcc, bool mustBeAligned) {
-
   SmallVector<uint64_t> qkViableIntrinsicIndices;
   SmallVector<uint64_t> pvViableIntrinsicIndices;
   for (const auto &[index, intrinsic] : llvm::enumerate(intrinsics)) {