iree-org
diff --git a/‎compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel‎
Lines changed: 2 additions & 0 deletions b/‎compiler/src/iree/compiler/Codegen/Common/GPU/BUILD.bazel‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions b/‎compiler/src/iree/compiler/Codegen/Common/GPU/CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎compiler/src/iree/compiler/Codegen/Common/GPU/DecomposeHorizontallyFusedGemms.cpp‎
Lines changed: 216 additions & 0 deletions b/‎compiler/src/iree/compiler/Codegen/Common/GPU/DecomposeHorizontallyFusedGemms.cpp‎
Lines changed: 216 additions & 0 deletions
diff --git a/‎compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td‎
Lines changed: 12 additions & 0 deletions b/‎compiler/src/iree/compiler/Codegen/Common/GPU/Passes.td‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel‎
Lines changed: 1 addition & 0 deletions b/‎compiler/src/iree/compiler/Codegen/Common/GPU/test/BUILD.bazel‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎compiler/src/iree/compiler/Codegen/Common/GPU/test/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
@@ -50,6 +50,7 @@ iree_compiler_cc_library(
     name = "CommonGPUPasses",
     srcs = [
         "AMDGPUDistributeContract.cpp",
+        "DecomposeHorizontallyFusedGemms.cpp",
         "ExpandGPUOps.cpp",
         "GPUApplyTilingLevel.cpp",
         "GPUCheckResourceUsage.cpp",
@@ -107,6 +108,7 @@ iree_compiler_cc_library(
         "//compiler/src/iree/compiler/Codegen/Utils",
         "//compiler/src/iree/compiler/Codegen/Utils:VectorOpUtils",
         "//compiler/src/iree/compiler/Dialect/HAL/IR",
+        "//compiler/src/iree/compiler/Dialect/LinalgExt/Utils",
         "//compiler/src/iree/compiler/Utils",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:AMDGPUDialect",
 
@@ -48,6 +48,7 @@ iree_cc_library(
     "Passes.h"
   SRCS
     "AMDGPUDistributeContract.cpp"
+    "DecomposeHorizontallyFusedGemms.cpp"
     "ExpandGPUOps.cpp"
     "GPUApplyTilingLevel.cpp"
     "GPUCheckResourceUsage.cpp"
@@ -141,6 +142,7 @@ iree_cc_library(
     iree::compiler::Codegen::Utils
     iree::compiler::Codegen::Utils::VectorOpUtils
     iree::compiler::Dialect::HAL::IR
+    iree::compiler::Dialect::LinalgExt::Utils
     iree::compiler::Utils
   PUBLIC
 )
 
@@ -0,0 +1,216 @@
+// Copyright 2025 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree/compiler/Codegen/Common/GPU/Passes.h"
+#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/GPULoweringConfigUtils.h"
+#include "iree/compiler/Dialect/LinalgExt/Utils/Utils.h"
+#include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Pass/Pass.h"
+
+namespace mlir::iree_compiler {
+
+#define GEN_PASS_DEF_DECOMPOSEHORIZONTALLYFUSEDGEMMSPASS
+#include "iree/compiler/Codegen/Common/GPU/Passes.h.inc"
+
+namespace {
+// Pass built on the base class generated into Passes.h.inc (GEN_PASS_DEF).
+struct DecomposeHorizontallyFusedGemmsPass final
+    : impl::DecomposeHorizontallyFusedGemmsPassBase<
+          DecomposeHorizontallyFusedGemmsPass> {
+  void runOnOperation() override; // Defined out of line, later in this file.
+};
+} // namespace
+
+//===---------------------------------------------------------------------===//
+// Decompose horizontally fused gemm operations
+// TODO: Eventually drop this if we end up creating an operation for the
+// horizontally fused contractions.
+//===---------------------------------------------------------------------===//
+// Collects the body ops and input block-argument indices feeding one result.
+static LogicalResult captureUsedOperationsAndBlockArguements(
+    linalg::LinalgOp linalgOp, SetVector<int64_t> &usedInputs,
+    SetVector<Operation *> &usedOperations, int64_t resultNumber) {
+  BackwardSliceOptions options;
+  options.inclusive = true; // Presumably keeps the op defining the root value.
+  options.filter = [&](Operation *op) -> bool {
+    return op->getBlock() == linalgOp.getBlock(); // Stay inside the body block.
+  };
+
+  auto yieldOp = cast<linalg::YieldOp>(linalgOp.getBlock()->getTerminator());
+  Value result = yieldOp.getOperand(resultNumber); // Root of the slice.
+
+  getBackwardSlice(result, &usedOperations, options);
+
+  // Record each input block argument the slice reads in `usedInputs`. Fail if
+  // the slice reads the block argument of an init other than `resultNumber`.
+  for (Operation *op : usedOperations) {
+    for (Value operand : op->getOperands()) {
+      if (auto blockArg = dyn_cast<BlockArgument>(operand)) {
+        if (blockArg.getOwner() != linalgOp.getBlock()) {
+          continue; // Argument of some other block; not tracked here.
+        }
+
+        int64_t argNumber = blockArg.getArgNumber();
+        if (argNumber >= linalgOp.getNumDpsInputs() &&
+            argNumber - linalgOp.getNumDpsInputs() != resultNumber) {
+          return failure(); // Reads the init argument of a different result.
+        }
+
+        if (argNumber < linalgOp.getNumDpsInputs()) {
+          usedInputs.insert(argNumber); // Init arguments are not recorded.
+        }
+      }
+    }
+  }
+
+  return success();
+}
+
+// Returns `origAttr` with its promoted-operand list renumbered to positions in
+// `keptOperands`; `origAttr` is returned unchanged when it has no such list.
+static IREE::GPU::LoweringConfigAttr
+getModifiedLoweringConfigForDecomposedGemmOp(
+    RewriterBase &rewriter, IREE::GPU::LoweringConfigAttr origAttr,
+    ArrayRef<unsigned> keptOperands) {
+  std::optional<SmallVector<int64_t>> promotedOperandsList =
+      IREE::GPU::getPromotedOperandList(origAttr);
+  if (!promotedOperandsList) {
+    return origAttr; // Nothing to renumber.
+  }
+
+  llvm::SmallDenseSet<int64_t> promotedOperandsSet(
+      promotedOperandsList->begin(), promotedOperandsList->end());
+  SmallVector<int64_t> newPromotedOperands;
+  for (auto [index, origOperandNum] : llvm::enumerate(keptOperands)) {
+    if (promotedOperandsSet.contains(origOperandNum)) {
+      newPromotedOperands.push_back(index); // Index into `keptOperands`.
+    }
+  }
+  return setPromotedOperandsList(rewriter.getContext(), origAttr,
+                                 newPromotedOperands);
+}
+// Splits `linalgOp` into one single-result linalg.generic per result.
+static LogicalResult
+decomposeHorizontallyFusedGemmOperations(RewriterBase &rewriter,
+                                         linalg::LinalgOp linalgOp) {
+  assert(IREE::LinalgExt::isaHorizontallyFusedContraction(linalgOp) &&
+         "expected op that is a horizontally fused contraction");
+
+  OpBuilder::InsertionGuard g(rewriter);
+  rewriter.setInsertionPoint(linalgOp);
+  // Create num_results linalg.generics, each producing a single result (and
+  // relying on canonicalizations to simplify).
+  for (int64_t resultNumber : llvm::seq<int64_t>(linalgOp->getNumResults())) {
+    rewriter.setInsertionPoint(linalgOp); // New ops go right before `linalgOp`.
+
+    auto yieldOp = cast<linalg::YieldOp>(linalgOp.getBlock()->getTerminator());
+    Value result = yieldOp.getOperand(resultNumber);
+
+    // Get all operations required to produce this result.
+    SetVector<Operation *> usedOperations;
+    SetVector<int64_t> usedInputs;
+    if (failed(captureUsedOperationsAndBlockArguements(
+            linalgOp, usedInputs, usedOperations, resultNumber))) {
+      return failure(); // NOTE(review): generics built for earlier results stay.
+    }
+
+    // Create a new linalg.generic operation for this result.
+    SmallVector<OpOperand *> inputs = llvm::map_to_vector(
+        usedInputs, [&](int64_t x) { return linalgOp.getDpsInputOperand(x); });
+    SmallVector<OpOperand *> inits = {linalgOp.getDpsInitOperand(resultNumber)};
+    // Indexing maps of the kept inputs, followed by the map of this result.
+    SmallVector<AffineMap> indexingMaps =
+        llvm::map_to_vector(usedInputs, [&](int64_t x) {
+          return linalgOp.getIndexingMapsArray()[x];
+        });
+    indexingMaps.push_back(linalgOp.getIndexingMapMatchingResult(
+        linalgOp->getOpResult(resultNumber)));
+    llvm::SmallBitVector unusedDims = getUnusedDimsBitVector(indexingMaps);
+    indexingMaps = compressUnusedDims(indexingMaps);
+    // Keep the iterator type of every loop dimension not marked unused.
+    SmallVector<utils::IteratorType> iteratorTypes;
+    for (int64_t i : llvm::seq<int64_t>(linalgOp.getNumLoops())) {
+      if (!unusedDims.test(i)) {
+        iteratorTypes.push_back(linalgOp.getIteratorTypesArray()[i]);
+      }
+    }
+
+    SmallVector<Value> inputVals = llvm::map_to_vector(
+        inputs, [](OpOperand *operand) { return operand->get(); });
+    SmallVector<Value> initVals = llvm::map_to_vector(
+        inits, [](OpOperand *operand) { return operand->get(); });
+    auto newOp = rewriter.create<linalg::GenericOp>(
+        linalgOp.getLoc(), TypeRange{inits[0]->get().getType()}, inputVals,
+        initVals, indexingMaps, iteratorTypes,
+        [&](OpBuilder &b, Location loc, ValueRange blockArgs) {
+          Block *oldBody = linalgOp.getBlock();
+          usedInputs.insert(resultNumber + linalgOp.getNumDpsInputs());
+          // `usedInputs` now ends with the old block-argument index of the init.
+          IRMapping regionMapping;
+          // Map each old body argument to its counterpart in the new body.
+          for (auto [oldBlockArgNum, newBlockArg] :
+               llvm::zip_equal(usedInputs, blockArgs)) {
+            regionMapping.map(oldBody->getArgument(oldBlockArgNum),
+                              newBlockArg);
+          }
+          // Clone the captured ops, rewiring operands through the mapping.
+          for (Operation *usedOperation : usedOperations) {
+            b.clone(*usedOperation, regionMapping);
+          }
+
+          b.create<linalg::YieldOp>(loc, regionMapping.lookup(result));
+        });
+
+    // If on decomposition any dims are unused propagating lowering config isn't
+    // well defined. So propagate lowering config only when no dim is unused.
+    if (unusedDims.none()) {
+      IREE::GPU::LoweringConfigAttr loweringConfigAttr =
+          getLoweringConfig<IREE::GPU::LoweringConfigAttr>(linalgOp);
+      if (loweringConfigAttr && getPromotedOperandList(loweringConfigAttr)) {
+        SmallVector<unsigned> operandNums =
+            llvm::map_to_vector(inputs, [](OpOperand *operand) {
+              return operand->getOperandNumber();
+            }); // Operand numbers of the kept inputs on the original op.
+        auto range = llvm::map_range(inits, [](OpOperand *operand) {
+          return operand->getOperandNumber();
+        }); // Operand number of this result's init on the original op.
+        operandNums.append(range.begin(), range.end());
+        IREE::GPU::LoweringConfigAttr newGPUAttr =
+            getModifiedLoweringConfigForDecomposedGemmOp(
+                rewriter, loweringConfigAttr, operandNums);
+        setLoweringConfig(newOp, newGPUAttr);
+      } // NOTE(review): configs without a promoted-operand list are not copied.
+    }
+
+    rewriter.replaceAllUsesWith(linalgOp->getResult(resultNumber),
+                                newOp.getResult(0));
+  }
+
+  rewriter.eraseOp(linalgOp); // Every result was redirected in the loop above.
+  return success();
+}
+// Collects every horizontally fused contraction in the function, then rewrites.
+void DecomposeHorizontallyFusedGemmsPass::runOnOperation() {
+  auto funcOp = getOperation();
+  IRRewriter rewriter(&getContext());
+  SmallVector<linalg::LinalgOp> horizontallyFusedOps;
+  funcOp.walk([&](linalg::LinalgOp linalgOp) {
+    if (IREE::LinalgExt::isaHorizontallyFusedContraction(linalgOp)) {
+      horizontallyFusedOps.push_back(linalgOp);
+    }
+  });
+  // Rewrite only after the walk: decomposition erases the matched op.
+  for (auto linalgOp : llvm::make_early_inc_range(horizontallyFusedOps)) {
+    if (failed(decomposeHorizontallyFusedGemmOperations(rewriter, linalgOp))) {
+      return signalPassFailure(); // Stops at the first op that cannot be split.
+    }
+  }
+}
+
+} // namespace mlir::iree_compiler
@@ -13,6 +13,18 @@ include "mlir/Pass/PassBase.td"
 // Common Passes used for GPU-like backends (keep alphabetical)
 //===---------------------------------------------------------------------===//
 
+def DecomposeHorizontallyFusedGemmsPass :
+    InterfacePass<"iree-codegen-gpu-decompose-horizontally-fused-gemms",
+                  "mlir::FunctionOpInterface"> { // Anchored on function-like ops.
+  let summary =
+      "Decomposes a horizontally fused GEMM back into its constituent GEMMs";
+  let dependentDialects = [ // Dialects of the ops/attributes this pass creates.
+    "::mlir::iree_compiler::IREE::GPU::IREEGPUDialect",
+    "::mlir::linalg::LinalgDialect",
+  ];
+}
+
+
 def GPUCheckResourceUsagePass :
     InterfacePass<"iree-codegen-gpu-check-resource-usage", "mlir::FunctionOpInterface"> {
   let summary = "Checks GPU specific resource usage constraints like shared memory limits";
 
@@ -18,6 +18,7 @@ iree_lit_test_suite(
     name = "lit",
     srcs = enforce_glob(
         [
+            "decompose_horizontally_fused_gemms.mlir",
             "gpu_apply_derived_thread_config.mlir",
             "gpu_apply_tiling_level.mlir",
             "gpu_check_resource_usage.mlir",
 
@@ -14,6 +14,7 @@ iree_lit_test_suite(
   NAME
     lit
   SRCS
+    "decompose_horizontally_fused_gemms.mlir"
     "gpu_apply_derived_thread_config.mlir"
     "gpu_apply_tiling_level.mlir"
     "gpu_check_resource_usage.mlir"
Original file line number	Diff line number	Diff line change
`@@ -18,6 +18,7 @@ iree_lit_test_suite(`
`18`	`18`	`name = "lit",`
`19`	`19`	`srcs = enforce_glob(`
`20`	`20`	`[`
	`21`	`+ "decompose_horizontally_fused_gemms.mlir",`
`21`	`22`	`"gpu_apply_derived_thread_config.mlir",`
`22`	`23`	`"gpu_apply_tiling_level.mlir",`
`23`	`24`	`"gpu_check_resource_usage.mlir",`