Commit 07203e6
[NFC] Expose internal LLVMGPU APIs for vector_distribute (iree-org#21161)
This allows external systems to make use of IREE's vector_distribute functionality in a composable manner.

Signed-off-by: Nicolas Vasilache <[email protected]>
1 parent 3beeb26 commit 07203e6
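
The diffs below take ContractionVectorLayoutOptions, TransformVectorLayoutOptions, and getContractionLayout out of file-local scope so other code can reach them. A minimal sketch of what such external use could look like, assuming ContractionVectorLayoutOptions is now declared in Utils/LLVMGPUUtils.h (as the new includes below suggest) and that the distribution driver is distributeVectorOps from Codegen/Common/GPU/GPUVectorDistribution.h; check those headers before relying on the exact signatures.

#include "iree/compiler/Codegen/Common/GPU/GPUVectorDistribution.h"
#include "iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h"

namespace mlir::iree_compiler {

// Drive IREE's vector distribution from an external pass or tool, reusing the
// same options/pattern set that LLVMGPUVectorDistributePass builds internally.
static LogicalResult distributeExternally(Operation *root, Value laneId,
                                          int64_t subgroupSize) {
  ContractionVectorLayoutOptions options(root, laneId, subgroupSize);
  RewritePatternSet &patterns = options.getPatterns();
  // Caller-specific patterns could be appended to `patterns` here before
  // running the driver; that is the "composable" part.
  return distributeVectorOps(root, patterns, options);
}

} // namespace mlir::iree_compiler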

9 files changed: +410 additions, -266 deletions

compiler/src/iree/compiler/Codegen/LLVMGPU/InternalAPI.h

Whitespace-only changes.

compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUConfigureTensorLayouts.cpp

Lines changed: 1 addition & 224 deletions

@@ -4,6 +4,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+#include "compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/GPULoweringConfigUtils.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
@@ -155,230 +156,6 @@ static NestedLayoutAttr createNestedLayout(
   return layoutAttr;
 }

-static FailureOr<std::tuple<IREE::VectorExt::VectorLayoutInterface,
-                            IREE::VectorExt::VectorLayoutInterface,
-                            IREE::VectorExt::VectorLayoutInterface>>
-getContractionLayout(IREE::GPU::MMAScheduleAttr schedule,
-                     VectorContractOpInfo &opInfo,
-                     linalg::LinalgOp contractOp) {
-  LLVM_DEBUG({
-    llvm::dbgs() << "Getting mma layouts for:\n" << contractOp << "\n";
-    llvm::dbgs() << "For schedule: " << schedule << "\n";
-  });
-
-  int64_t rank = contractOp.getIteratorTypesArray().size();
-  auto mmaAttr =
-      llvm::cast<IREE::GPU::MmaInterfaceAttr>(schedule.getIntrinsic());
-  MLIRContext *context = schedule.getContext();
-
-  SmallVector<int64_t> bounds = contractOp.getStaticLoopRanges();
-  if (llvm::any_of(bounds,
-                   [](int64_t x) { return x == ShapedType::kDynamic; })) {
-    return failure();
-  }
-
-  if (!llvm::all_of(opInfo.getBatchDims(),
-                    [&bounds](int64_t dim) { return bounds[dim] == 1; })) {
-    LLVM_DEBUG({ llvm::dbgs() << "non-unit batch dimension\n"; });
-    return failure();
-  }
-
-  // Get the concrete nested layout for each matrix. Note that the struct
-  // MMASingleSubgroupLayout contains the partial layout for the
-  // canonical (M, K) x (K, N) -> (M, N) matmul form; while the specific
-  // contract op we are looking at right now may not be exactly in that form.
-  // So here we need to permute/transpose the canonical layout to match with
-  // the concrete contract op.
-
-  // Note that no matter how we permute/transpose the input contraction
-  // problem, the way we view the hardware warps remain the same--that is,
-  // from the hardware's perspective, a single warp has the same warp ID no
-  // matter what part of the contraction it works on. Similarly here, we are
-  // delinearizing the linearized GPU hardware lane ID into a n-D concatenated
-  // logical warp+thread using the subgroup/thread basis, so the subgroup
-  // basis should remain the same for all A/B/C matrix.
-
-  auto [intrinsicM, intrinsicN, intrinsicK] = mmaAttr.getMNKShape();
-
-  SmallVector<int64_t, 2> subgroupMBasis;
-  SmallVector<int64_t, 2> batchMSizes;
-  int64_t currMCount = schedule.getSubgroupMCount();
-
-  auto divideGreedily = [](int64_t availableSubgroups, int64_t dimSize,
-                           int64_t minDimSize) -> std::pair<int64_t, int64_t> {
-    int64_t dividableDim = dimSize / minDimSize;
-    int64_t subgroupsUsed = std::gcd(availableSubgroups, dividableDim);
-    dividableDim /= subgroupsUsed;
-    int64_t batchesUsed = dividableDim;
-    return {subgroupsUsed, batchesUsed};
-  };
-
-  // Greedily break up the M subgroup and batch counts along the "M" iteration
-  // bounds. We distribute as many residual subgroups as possible per M dim,
-  // and then divide the remaining along batch dims. The inner most M dim is
-  // always the one used for the intrinsic, meaning for a valid schedule, the
-  // computed batch counts and subgroup basis will satisfy totalMSize /
-  // intrinsicM = product(batchMSizes) * product(subgroupMBasis)
-  for (auto dim : opInfo.getMDims()) {
-    // Get the number of subgroups and batches used for this dimension based
-    // on the intrinsic size and the bound size.
-    int64_t subgroupsUsed, batchesUsed;
-    if (dim == opInfo.getMDims().back()) {
-      std::tie(subgroupsUsed, batchesUsed) =
-          divideGreedily(currMCount, bounds[dim], intrinsicM);
-    } else {
-      std::tie(subgroupsUsed, batchesUsed) =
-          divideGreedily(currMCount, bounds[dim], 1);
-    }
-    subgroupMBasis.push_back(subgroupsUsed);
-    batchMSizes.push_back(batchesUsed);
-    // Update available subgroup count.
-    currMCount /= subgroupsUsed;
-  }
-
-  SmallVector<int64_t, 2> subgroupNBasis;
-  SmallVector<int64_t, 2> batchNSizes;
-  int64_t currNCount = schedule.getSubgroupNCount();
-
-  // Do the same for N dims.
-  for (auto dim : opInfo.getNDims()) {
-    // Get the number of subgroups and batches used for this dimension based
-    // on the intrinsic size and the bound size.
-    int64_t subgroupsUsed, batchesUsed;
-    if (dim == opInfo.getNDims().back()) {
-      std::tie(subgroupsUsed, batchesUsed) =
-          divideGreedily(currNCount, bounds[dim], intrinsicN);
-    } else {
-      std::tie(subgroupsUsed, batchesUsed) =
-          divideGreedily(currNCount, bounds[dim], 1);
-    }
-    subgroupNBasis.push_back(subgroupsUsed);
-    batchNSizes.push_back(batchesUsed);
-    // Update available subgroup count.
-    currNCount /= subgroupsUsed;
-  }
-
-  SmallVector<int64_t> subgroupMStrides(subgroupMBasis.size());
-  SmallVector<int64_t> subgroupNStrides(subgroupNBasis.size());
-
-  auto mDimVec = opInfo.getMDims();
-  llvm::SmallDenseSet<int64_t> mDims(mDimVec.begin(), mDimVec.end());
-  auto nDimVec = opInfo.getNDims();
-  llvm::SmallDenseSet<int64_t> nDims(nDimVec.begin(), nDimVec.end());
-  // Because we currently require all batch dimensions to be unit, the
-  // subgroup basis can be constructed from the M and N bases. To keep things
-  // simple, the current heuristic is to distribute the loop dimensions from
-  // outer to inner.
-  int64_t currStride = 1;
-  int64_t currM = subgroupMStrides.size() - 1;
-  int64_t currN = subgroupNStrides.size() - 1;
-  for (int64_t dim : llvm::reverse(llvm::seq<int64_t>(rank))) {
-    if (mDims.contains(dim)) {
-      subgroupMStrides[currM] = currStride;
-      currStride *= subgroupMBasis[currM];
-      currM--;
-      continue;
-    }
-
-    if (nDims.contains(dim)) {
-      subgroupNStrides[currN] = currStride;
-      currStride *= subgroupNBasis[currN];
-      currN--;
-      continue;
-    }
-  }
-
-  // C matrix layout
-  auto [m, n] = opInfo.getResultMNIndex();
-  int64_t cRank = opInfo.getCRank();
-
-  // Get the M and N dims w.r.t. the dimensions of the C matrix. cMDims and
-  // cNDims are the M and N dimensions of the C matrix in the order they are
-  // iterated over in the contraction.
-  SmallVector<int64_t> cMDims = opInfo.outMDims;
-  SmallVector<int64_t> cNDims = opInfo.outNDims;
-  SmallVector<int64_t> cBatchSizes(cRank, 1);
-  SmallVector<int64_t> cSubgroupSizes(cRank, 1);
-  SmallVector<int64_t> cSubgroupStrides(cRank, 0);
-  for (auto [i, dim] : llvm::enumerate(cMDims)) {
-    cBatchSizes[dim] = batchMSizes[i];
-    cSubgroupSizes[dim] = subgroupMBasis[i];
-    cSubgroupStrides[dim] = subgroupMStrides[i];
-  }
-  for (auto [i, dim] : llvm::enumerate(cNDims)) {
-    cBatchSizes[dim] = batchNSizes[i];
-    cSubgroupSizes[dim] = subgroupNBasis[i];
-    cSubgroupStrides[dim] = subgroupNStrides[i];
-  }
-
-  IREE::VectorExt::NestedLayoutAttr cLayout = createNestedLayout(
-      context, cRank, m, n,
-      /*subgroupCount=*/cSubgroupSizes,
-      /*subgroupStrides=*/cSubgroupStrides,
-      /*batchCount=*/cBatchSizes,
-      getSingleSubgroupLayout(mmaAttr, IREE::GPU::MMAFragment::Acc));
-  LLVM_DEBUG({ llvm::dbgs() << "C layout: " << cLayout << "\n"; });
-
-  // A matrix layout
-  auto [afm, bfn] = opInfo.getOperandMNIndex();
-  auto [afk, bfk] = opInfo.getOperandKIndex();
-
-  int64_t aRank = opInfo.getARank();
-
-  SmallVector<int64_t> aMDims = opInfo.lhsMDims;
-  SmallVector<int64_t> aBatchSizes(aRank, 1);
-  SmallVector<int64_t> aSubgroupSizes(aRank, 1);
-  SmallVector<int64_t> aSubgroupStrides(aRank, 0);
-  for (auto [i, dim] : llvm::enumerate(aMDims)) {
-    aBatchSizes[dim] = batchMSizes[i];
-    aSubgroupSizes[dim] = subgroupMBasis[i];
-    aSubgroupStrides[dim] = subgroupMStrides[i];
-  }
-  for (auto [kDim, lhsKDim] :
-       llvm::zip_equal(opInfo.getKDims(), opInfo.lhsKDim)) {
-    aBatchSizes[lhsKDim] = bounds[kDim];
-  }
-  aBatchSizes[afk] = bounds[opInfo.getKDims().back()] / intrinsicK;
-
-  IREE::VectorExt::NestedLayoutAttr aLayout = createNestedLayout(
-      context, aRank, afm, afk,
-      /*subgroupCount=*/aSubgroupSizes,
-      /*subgroupStrides=*/aSubgroupStrides,
-      /*batchCount=*/aBatchSizes,
-      getSingleSubgroupLayout(mmaAttr, IREE::GPU::MMAFragment::Lhs));
-  LLVM_DEBUG({ llvm::dbgs() << "A layout: " << aLayout << "\n"; });
-
-  int64_t bRank = opInfo.getBRank();
-
-  SmallVector<int64_t> bNDims = opInfo.rhsNDims;
-  SmallVector<int64_t> bBatchSizes(bRank, 1);
-  SmallVector<int64_t> bSubgroupSizes(bRank, 1);
-  SmallVector<int64_t> bSubgroupStrides(bRank, 0);
-  for (auto [i, dim] : llvm::enumerate(bNDims)) {
-    bBatchSizes[dim] = batchNSizes[i];
-    bSubgroupSizes[dim] = subgroupNBasis[i];
-    bSubgroupStrides[dim] = subgroupNStrides[i];
-  }
-  for (auto [kDim, rhsKDim] :
-       llvm::zip_equal(opInfo.getKDims(), opInfo.rhsKDim)) {
-    bBatchSizes[rhsKDim] = bounds[kDim];
-  }
-  bBatchSizes[bfk] = bounds[opInfo.getKDims().back()] / intrinsicK;
-
-  IREE::VectorExt::NestedLayoutAttr bLayout = createNestedLayout(
-      context, bRank, bfk, bfn,
-      /*subgroupCount=*/bSubgroupSizes,
-      /*subgroupStrides=*/bSubgroupStrides,
-      /*batchCount=*/bBatchSizes,
-      getSingleSubgroupLayout(mmaAttr, IREE::GPU::MMAFragment::Rhs));
-  LLVM_DEBUG({ llvm::dbgs() << "B layout: " << bLayout << "\n"; });
-
-  std::tuple<VectorLayoutInterface, VectorLayoutInterface,
-             VectorLayoutInterface>
-      result = {aLayout, bLayout, cLayout};
-  return result;
-}
-
 static LogicalResult setContractionAnchor(IREE::GPU::MMAScheduleAttr schedule,
                                           SmallVector<bool> promotedOperands,
                                           RewriterBase &rewriter,
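
The greedy split in the getContractionLayout function deleted above (presumably re-homed via the new LLVMGPUUtils.h include) is easiest to follow with concrete numbers. A self-contained sketch of the divideGreedily step, using illustrative values not taken from the commit (4 subgroups available on M, an M loop bound of 128, intrinsic M of 16); it checks the invariant spelled out in the comment, totalMSize / intrinsicM == product(subgroupMBasis) * product(batchMSizes).

#include <cassert>
#include <cstdint>
#include <numeric>
#include <utility>

// Mirror of the divideGreedily lambda from getContractionLayout.
static std::pair<int64_t, int64_t> divideGreedily(int64_t availableSubgroups,
                                                  int64_t dimSize,
                                                  int64_t minDimSize) {
  int64_t dividableDim = dimSize / minDimSize;                        // 128 / 16 = 8
  int64_t subgroupsUsed = std::gcd(availableSubgroups, dividableDim); // gcd(4, 8) = 4
  dividableDim /= subgroupsUsed;                                      // 8 / 4 = 2 batches
  return {subgroupsUsed, dividableDim};
}

int main() {
  auto [subgroups, batches] = divideGreedily(/*availableSubgroups=*/4,
                                             /*dimSize=*/128,
                                             /*minDimSize=*/16);
  // 4 subgroups * 2 batches * intrinsicM(16) == 128, i.e. the invariant holds.
  assert(subgroups == 4 && batches == 2);
  return 0;
}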

compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPUVectorDistribute.cpp

Lines changed: 23 additions & 27 deletions

@@ -9,6 +9,7 @@
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
 #include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.h"
 #include "iree/compiler/Codegen/LLVMGPU/Passes.h"
+#include "iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h"
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
@@ -26,35 +27,30 @@ namespace mlir::iree_compiler {
 #define GEN_PASS_DEF_LLVMGPUVECTORDISTRIBUTEPASS
 #include "iree/compiler/Codegen/LLVMGPU/Passes.h.inc"

-namespace {
-
-class ContractionVectorLayoutOptions : public VectorLayoutOptions {
-public:
-  ContractionVectorLayoutOptions(Operation *root, Value laneId,
-                                 int64_t subgroupSize)
-      : VectorLayoutOptions(root), patterns(root->getContext()) {
-    populateGPUDistributionPatterns(patterns);
-    populateGPUDistributeNestedLayoutAttrPatterns(patterns, laneId,
-                                                  subgroupSize);
-    populateGPUDistributeNestedLayoutContractAMDGPUPatterns(patterns);
+ContractionVectorLayoutOptions::ContractionVectorLayoutOptions(
+    Operation *root, Value laneId, int64_t subgroupSize)
+    : VectorLayoutOptions(root), patterns(root->getContext()) {
+  populateGPUDistributionPatterns(patterns);
+  populateGPUDistributeNestedLayoutAttrPatterns(patterns, laneId, subgroupSize);
+  populateGPUDistributeNestedLayoutContractAMDGPUPatterns(patterns);
+}
+
+RewritePatternSet &ContractionVectorLayoutOptions::getPatterns() {
+  return patterns;
+}
+
+VectorLayoutInterface
+ContractionVectorLayoutOptions::getDefaultLayout(VectorType type) const {
+  // We only allow a default layout for 0-d vectors for now.
+  if (type.getRank() > 0) {
+    return VectorLayoutInterface();
   }
+  ArrayRef<int64_t> empty = {};
+  return IREE::VectorExt::NestedLayoutAttr::get(
+      type.getContext(), empty, empty, empty, empty, empty, empty, empty);
+}

-  RewritePatternSet &getPatterns() { return patterns; }
-
-  VectorLayoutInterface getDefaultLayout(VectorType type) const override {
-    // We only allow a default layout for 0-d vectors for now.
-    if (type.getRank() > 0) {
-      return VectorLayoutInterface();
-    }
-    ArrayRef<int64_t> empty = {};
-    return IREE::VectorExt::NestedLayoutAttr::get(
-        type.getContext(), empty, empty, empty, empty, empty, empty, empty);
-  }
-
-private:
-  RewritePatternSet patterns;
-};
-
+namespace {
 struct LLVMGPUVectorDistributePass final
     : impl::LLVMGPUVectorDistributePassBase<LLVMGPUVectorDistributePass> {
   void getDependentDialects(DialectRegistry &registry) const override {
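
One behavioral detail worth noting from getDefaultLayout above: only rank-0 vectors receive a fallback layout (a NestedLayoutAttr with all-empty arrays); every other vector value must get its layout from an explicit anchor. A small sketch of that policy, assuming ContractionVectorLayoutOptions is declared in Utils/LLVMGPUUtils.h as suggested by the new include; the helper below is illustrative, not part of IREE.

#include <cassert>

#include "iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUUtils.h"
#include "mlir/IR/BuiltinTypes.h"

namespace mlir::iree_compiler {

static void checkDefaultLayoutPolicy(ContractionVectorLayoutOptions &options,
                                     MLIRContext *ctx) {
  auto f32 = Float32Type::get(ctx);
  // Rank-0 vector: gets the degenerate NestedLayoutAttr default.
  IREE::VectorExt::VectorLayoutInterface zeroD =
      options.getDefaultLayout(VectorType::get({}, f32));
  assert(zeroD && "0-d vectors have a default layout");
  // Rank-1 (or higher) vector: no default; a layout anchor is required.
  IREE::VectorExt::VectorLayoutInterface oneD =
      options.getDefaultLayout(VectorType::get({4}, f32));
  assert(!oneD && "non-0-d vectors have no default layout");
}

} // namespace mlir::iree_compiler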

compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp

Lines changed: 0 additions & 10 deletions

@@ -1458,16 +1458,6 @@ transform_dialect::PrefetchSharedMemoryCopiesOp::applyToOne(
   return DiagnosedSilenceableFailure::success();
 }

-class TransformVectorLayoutOptions : public VectorLayoutOptions {
-public:
-  TransformVectorLayoutOptions(Operation *root, bool fullConversion)
-      : VectorLayoutOptions(root, fullConversion) {}
-
-  VectorLayoutInterface getDefaultLayout(VectorType type) const override {
-    return VectorLayoutInterface();
-  }
-};
-
 DiagnosedSilenceableFailure
 transform_dialect::AMDGPUDistributeVectorsOp::applyToOne(
     transform::TransformRewriter &rewriter, mlir::FunctionOpInterface target,

compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.h

Lines changed: 11 additions & 0 deletions

@@ -7,6 +7,7 @@
 #ifndef IREE_COMPILER_CODEGEN_LLVMGPU_TRANSFORMEXTENSIONS_LLVMGPUEXTENSIONS_H_
 #define IREE_COMPILER_CODEGEN_LLVMGPU_TRANSFORMEXTENSIONS_LLVMGPUEXTENSIONS_H_

+#include "iree/compiler/Codegen/Common/GPU/GPUVectorDistribution.h"
 #include "mlir/Bytecode/BytecodeOpInterface.h"
 #include "mlir/Dialect/Transform/IR/TransformDialect.h"
 #include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
@@ -31,6 +32,16 @@ class WarpExecuteOnLane0Op;

 namespace mlir::iree_compiler {

+class TransformVectorLayoutOptions : public VectorLayoutOptions {
+public:
+  TransformVectorLayoutOptions(Operation *root, bool fullConversion)
+      : VectorLayoutOptions(root, fullConversion) {}
+
+  VectorLayoutInterface getDefaultLayout(VectorType type) const override {
+    return VectorLayoutInterface();
+  }
+};
+
 /// Registers Flow transformations that require IREE-specific information into
 /// the transform dialect.
 void registerTransformDialectLLVMGPUExtension(DialectRegistry &registry);
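
With TransformVectorLayoutOptions now visible to anything that includes LLVMGPUExtensions.h, an out-of-tree transform op can mirror what AMDGPUDistributeVectorsOp does in the .cpp above. A hedged sketch: the populate* helpers are the ones used in the diffs, distributeVectorOps and its exact signature are assumed from Codegen/Common/GPU/GPUVectorDistribution.h, and the fullConversion flag is purely illustrative.

#include "iree/compiler/Codegen/Common/GPU/GPUVectorDistribution.h"
#include "iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.h"

namespace mlir::iree_compiler {

static LogicalResult distributeForTransformOp(Operation *root, Value laneId,
                                              int64_t subgroupSize,
                                              bool fullConversion) {
  // No default layouts here: values without explicit layout anchors are left
  // untouched (getDefaultLayout returns a null interface).
  TransformVectorLayoutOptions options(root, fullConversion);
  RewritePatternSet patterns(root->getContext());
  populateGPUDistributionPatterns(patterns);
  populateGPUDistributeNestedLayoutAttrPatterns(patterns, laneId, subgroupSize);
  return distributeVectorOps(root, patterns, options);
}

} // namespace mlir::iree_compiler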
