Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#ifndef MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
#define MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_

#include "mlir/Dialect/GPU/Transforms/Utils.h"
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/Types.h"
Expand Down
2 changes: 1 addition & 1 deletion mlir/include/mlir/Dialect/GPU/Transforms/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
#ifndef MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_
#define MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_

#include "Utils.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Pass/Pass.h"
#include <optional>
Expand Down
59 changes: 59 additions & 0 deletions mlir/include/mlir/Dialect/GPU/Utils/DistributionUtils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
//===- DistributionUtils.h - Distribution Utilities -------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef MLIR_DIALECT_GPU_UTILS_DISTRIBUTIONUTILS_H_
#define MLIR_DIALECT_GPU_UTILS_DISTRIBUTIONUTILS_H_

#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/Value.h"

namespace mlir::gpu {
/// Abstract base for rewrite patterns rooted at a `WarpExecuteOnLane0Op`.
/// Derived patterns implement `matchAndRewrite` and can use the protected
/// helpers below to inspect the values yielded by the warp op, to rebuild the
/// op with a different result signature, and to split a lane id into
/// per-dimension ids.
struct WarpDistributionPattern : OpRewritePattern<WarpExecuteOnLane0Op> {
  using OpRewritePattern::OpRewritePattern;
  /// Alias of this class, available to derived patterns as `Base`.
  using Base = WarpDistributionPattern;

  virtual LogicalResult
  matchAndRewrite(WarpExecuteOnLane0Op op,
                  PatternRewriter &rewriter) const override = 0;

protected:
  /// Return the yield operand of `warpOp` whose defining operation satisfies
  /// the filter lambda `fn` and whose corresponding `warpOp` result is not
  /// dead (i.e. has at least one use). Returns nullptr when no yielded value
  /// qualifies.
  OpOperand *getWarpResult(WarpExecuteOnLane0Op warpOp,
                           llvm::function_ref<bool(Operation *)> fn) const;

  /// Helper to create a new WarpExecuteOnLane0Op with a different signature.
  /// The body of `warpOp` is moved into the new op, whose terminator is
  /// updated to yield `newYieldedValues`; the new op has result types
  /// `newReturnTypes`. `warpOp` itself is neither replaced nor erased.
  WarpExecuteOnLane0Op moveRegionToNewWarpOpAndReplaceReturns(
      RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
      ValueRange newYieldedValues, TypeRange newReturnTypes) const;

  /// Helper to create a new WarpExecuteOnLane0Op region with extra outputs.
  /// For each entry of `newYieldedValues`, the index of the new op's result
  /// that carries it is appended to `indices`; a value that is already
  /// yielded reuses its existing result instead of adding a new one.
  /// `warpOp` is replaced by the leading results of the new op.
  WarpExecuteOnLane0Op moveRegionToNewWarpOpAndAppendReturns(
      RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
      ValueRange newYieldedValues, TypeRange newReturnTypes,
      SmallVector<size_t> &indices) const;

  /// Delinearize the given `laneId` into multiple dimensions, where each
  /// dimension's size is determined by `originalShape` and `distributedShape`
  /// together. This function expects the total number of threads needed for
  /// distribution to be equal to `warpSize`. Returns true and updates
  /// `delinearizedIds` if so; returns false otherwise. When both shapes are
  /// identical, `delinearizedIds` is cleared and true is returned.
  bool delinearizeLaneId(OpBuilder &builder, Location loc,
                         ArrayRef<int64_t> originalShape,
                         ArrayRef<int64_t> distributedShape, int64_t warpSize,
                         Value laneId,
                         SmallVectorImpl<Value> &delinearizedIds) const;
};

} // namespace mlir::gpu

#endif // MLIR_DIALECT_GPU_UTILS_DISTRIBUTIONUTILS_H_
3 changes: 2 additions & 1 deletion mlir/lib/Dialect/GPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ add_mlir_dialect_library(MLIRGPUTransforms
Transforms/ShuffleRewriter.cpp
Transforms/SPIRVAttachTarget.cpp
Transforms/SubgroupReduceLowering.cpp
Transforms/Utils.cpp

OBJECT

Expand All @@ -59,6 +58,7 @@ add_mlir_dialect_library(MLIRGPUTransforms
MLIRDataLayoutInterfaces
MLIRExecutionEngineUtils
MLIRGPUDialect
MLIRGPUUtils
MLIRIR
MLIRIndexDialect
MLIRLLVMDialect
Expand All @@ -76,3 +76,4 @@ add_mlir_dialect_library(MLIRGPUTransforms

add_subdirectory(TransformOps)
add_subdirectory(Pipelines)
add_subdirectory(Utils)
2 changes: 1 addition & 1 deletion mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#include "mlir/Dialect/Async/IR/Async.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Utils.h"
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/IR/PatternMatch.h"
Expand Down
2 changes: 1 addition & 1 deletion mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#include "mlir/Dialect/DLTI/DLTI.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Utils.h"
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinAttributes.h"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Dialect/GPU/Transforms/Utils.h"
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Location.h"
Expand Down
14 changes: 14 additions & 0 deletions mlir/lib/Dialect/GPU/Utils/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# MLIRGPUUtils: GPU dialect utility library built from Utils.cpp and
# DistributionUtils.cpp.
add_mlir_dialect_library(MLIRGPUUtils
Utils.cpp
DistributionUtils.cpp

# Public headers for this library live under mlir/Dialect/GPU/Utils.
ADDITIONAL_HEADER_DIRS
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU/Utils

# Libraries linked publicly, so they propagate to targets that link
# against MLIRGPUUtils.
LINK_LIBS PUBLIC
MLIRArithDialect
MLIRAffineDialect
MLIRGPUDialect
MLIRSupport
MLIRIR
)
144 changes: 144 additions & 0 deletions mlir/lib/Dialect/GPU/Utils/DistributionUtils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
//===- DistributionUtils.cpp - Distribution tools for GPUOps --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements distribution utility methods.
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/IR/Value.h"

#include <numeric>

using namespace mlir;
using namespace mlir::gpu;

/// Builds a WarpExecuteOnLane0Op with result types `newReturnTypes` directly
/// in front of `warpOp`, moves the body of `warpOp` into it, and rewrites the
/// moved terminator so that it yields `newYieldedValues`. The original
/// `warpOp` is left in place for the caller to replace or erase.
WarpExecuteOnLane0Op
WarpDistributionPattern::moveRegionToNewWarpOpAndReplaceReturns(
    RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
    ValueRange newYieldedValues, TypeRange newReturnTypes) const {
  // Restore the caller's insertion point on exit.
  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPoint(warpOp);

  // Same lane id, warp size, arguments and block-argument types as the
  // original op; only the result types differ.
  auto newWarpOp = rewriter.create<WarpExecuteOnLane0Op>(
      warpOp.getLoc(), newReturnTypes, warpOp.getLaneid(), warpOp.getWarpSize(),
      warpOp.getArgs(), warpOp.getBody()->getArgumentTypes());

  // Move the original body in front of the block the new op was created
  // with, then drop that block so only the moved body remains.
  Region &targetRegion = newWarpOp.getBodyRegion();
  Block *createdBlock = &targetRegion.front();
  rewriter.inlineRegionBefore(warpOp.getBodyRegion(), targetRegion,
                              targetRegion.begin());
  rewriter.eraseBlock(createdBlock);
  assert(newWarpOp.getWarpRegion().hasOneBlock() &&
         "expected WarpOp with single block");

  // Point the moved terminator at the requested yielded values.
  auto terminator = cast<gpu::YieldOp>(targetRegion.front().getTerminator());
  rewriter.modifyOpInPlace(terminator, [&]() {
    terminator.getValuesMutable().assign(newYieldedValues);
  });
  return newWarpOp;
}

/// Creates a new WarpExecuteOnLane0Op that yields everything `warpOp` yields
/// plus any entry of `newYieldedValues` that is not yielded yet, and replaces
/// `warpOp` with the leading results of the new op.
///
/// For every entry of `newYieldedValues` (paired with the matching entry of
/// `newReturnTypes`), the index of the new op's result carrying that value is
/// appended to `indices`. A value that is already yielded reuses the index of
/// its first occurrence and does not add a result.
WarpExecuteOnLane0Op
WarpDistributionPattern::moveRegionToNewWarpOpAndAppendReturns(
    RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
    ValueRange newYieldedValues, TypeRange newReturnTypes,
    SmallVector<size_t> &indices) const {
  SmallVector<Type> types(warpOp.getResultTypes().begin(),
                          warpOp.getResultTypes().end());
  auto yield = cast<gpu::YieldOp>(
      warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
  // Keep the existing yield operands verbatim, duplicates included, so that
  // operand positions stay aligned with the result types copied into `types`.
  // A deduplicating set would drop repeated operands and desynchronize the
  // yield from the result list.
  SmallVector<Value> yieldValues(yield.getOperands().begin(),
                                 yield.getOperands().end());
  for (auto [value, type] : llvm::zip_equal(newYieldedValues, newReturnTypes)) {
    auto *existing = llvm::find(yieldValues, value);
    if (existing != yieldValues.end()) {
      // If the value already exits the region, don't create a new output;
      // reuse the position of its first occurrence.
      indices.push_back(static_cast<size_t>(existing - yieldValues.begin()));
      continue;
    }
    yieldValues.push_back(value);
    types.push_back(type);
    indices.push_back(yieldValues.size() - 1);
  }
  WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndReplaceReturns(
      rewriter, warpOp, yieldValues, types);
  // The original results are the leading results of the new op.
  rewriter.replaceOp(warpOp,
                     newWarpOp.getResults().take_front(warpOp.getNumResults()));
  return newWarpOp;
}

/// Scans the operands of the terminator of `warpOp`'s body in order and
/// returns the first one whose value is defined by an operation accepted by
/// `fn` and whose corresponding `warpOp` result has at least one use. Returns
/// nullptr when no operand qualifies.
OpOperand *WarpDistributionPattern::getWarpResult(
    WarpExecuteOnLane0Op warpOp,
    llvm::function_ref<bool(Operation *)> fn) const {
  Block &body = warpOp.getBodyRegion().front();
  auto terminator = cast<gpu::YieldOp>(body.getTerminator());
  for (OpOperand &candidate : terminator->getOpOperands()) {
    // Values without a defining operation, or rejected by the filter, are
    // skipped.
    Operation *producer = candidate.get().getDefiningOp();
    if (!producer || !fn(producer))
      continue;
    // Skip results nobody consumes.
    if (warpOp.getResult(candidate.getOperandNumber()).use_empty())
      continue;
    return &candidate;
  }
  return nullptr;
}

/// Splits `laneId` into one id per dimension, innermost dimension varying
/// fastest. The number of lanes along dimension `i` is
/// `originalShape[i] / distributedShape[i]`.
///
/// Returns false, leaving `delinearizedIds` untouched, when some original
/// dimension is not a multiple of its distributed dimension or when the
/// product of the per-dimension lane counts differs from `warpSize`.
/// Otherwise returns true with `delinearizedIds` holding one value per
/// dimension, or empty when the two shapes are identical.
bool WarpDistributionPattern::delinearizeLaneId(
    OpBuilder &builder, Location loc, ArrayRef<int64_t> originalShape,
    ArrayRef<int64_t> distributedShape, int64_t warpSize, Value laneId,
    SmallVectorImpl<Value> &delinearizedIds) const {
  // If the original shape and the distributed shape is the same, we don't
  // distribute at all--every thread is handling the whole. For such case, we
  // should not rely on lane IDs later. So just return an empty lane ID vector.
  if (originalShape == distributedShape) {
    delinearizedIds.clear();
    return true;
  }

  SmallVector<int64_t> sizes;
  for (auto [large, small] : llvm::zip_equal(originalShape, distributedShape)) {
    if (large % small != 0)
      return false;
    sizes.push_back(large / small);
  }
  // Seed with an int64_t: std::accumulate uses the type of the initial value
  // for its accumulator, so an `int` seed would truncate the running product.
  if (std::accumulate(sizes.begin(), sizes.end(), int64_t{1},
                      std::multiplies<int64_t>()) != warpSize)
    return false;

  AffineExpr s0;
  bindSymbols(builder.getContext(), s0);

  int64_t usedThreads = 1;

  // Dimensions that are never reached by the loop below keep a zero id.
  Value zero = builder.create<arith::ConstantIndexOp>(loc, 0);
  delinearizedIds.assign(sizes.size(), zero);

  for (int i = sizes.size() - 1; i >= 0; --i) {
    usedThreads *= sizes[i];
    if (usedThreads == warpSize) {
      // We've used up all available threads. Don't need to perform modulo
      // anymore. And we can stop the calculation for further dimensions.
      delinearizedIds[i] = laneId;
      break;
    }
    delinearizedIds[i] =
        affine::makeComposedAffineApply(builder, loc, s0 % sizes[i], {laneId});
    // `laneId` has already been divided by the sizes of all inner dimensions,
    // so only the current dimension's size must be stripped here. Dividing by
    // the cumulative `usedThreads` would divide by the inner sizes twice.
    laneId = affine::makeComposedAffineApply(builder, loc,
                                             s0.floorDiv(sizes[i]), {laneId});
  }
  return true;
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/Transforms/Utils.h"
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
#include "llvm/Support/ErrorHandling.h"

namespace mlir::gpu {
Expand Down
1 change: 1 addition & 0 deletions mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ add_mlir_dialect_library(MLIRVectorTransforms
MLIRArithDialect
MLIRDialectUtils
MLIRGPUDialect
MLIRGPUUtils
MLIRIR
MLIRLinalgDialect
MLIRMemRefDialect
Expand Down
Loading
Loading