Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 58 additions & 6 deletions mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

#include "mlir/Conversion/SCFToGPU/SCFToGPU.h"

#include "mlir/Analysis/AliasAnalysis/LocalAliasAnalysis.h"
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
Expand All @@ -27,6 +28,7 @@
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/Support/DebugLog.h"
#include <optional>

Expand Down Expand Up @@ -625,18 +627,49 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
bool seenSideeffects = false;
// Whether we have left a nesting scope (and hence are no longer innermost).
bool leftNestingScope = false;
LocalAliasAnalysis aliasAnalysis;
llvm::DenseSet<Value> writtenBuffer;
while (!worklist.empty()) {
Operation *op = worklist.pop_back_val();
// Now walk over the body and clone it.
// TODO: This is only correct if there either is no further scf.parallel
// nested or this code is side-effect free. Otherwise we might need
// predication. We are overly conservative for now and only allow
// side-effects in the innermost scope.
// nested, or this code has side effects but the buffers it writes do
// not alias any buffer accessed by the nested loop. Otherwise we might
// need predication.
if (auto nestedParallel = dyn_cast<ParallelOp>(op)) {
// Before entering a nested scope, make sure there have been no
// sideeffects until now.
if (seenSideeffects)
return failure();
// side effects until now, or that the nested operations do not access
// any buffer written by the outer scope.
if (seenSideeffects) {
WalkResult walkRes = nestedParallel.walk([&](Operation *nestedOp) {
if (isMemoryEffectFree(nestedOp))
return WalkResult::advance();

auto memEffectInterface = dyn_cast<MemoryEffectOpInterface>(nestedOp);
if (!memEffectInterface)
return WalkResult::advance();

SmallVector<MemoryEffects::EffectInstance> effects;
memEffectInterface.getEffects(effects);
for (const MemoryEffects::EffectInstance &effect : effects) {
if (isa<MemoryEffects::Read>(effect.getEffect()) ||
isa<MemoryEffects::Write>(effect.getEffect())) {
Value baseBuffer = effect.getValue();
if (!baseBuffer)
return WalkResult::interrupt();
for (Value val : writtenBuffer) {
if (aliasAnalysis.alias(baseBuffer, val) !=
AliasResult::NoAlias) {
return WalkResult::interrupt();
}
}
}
}
return WalkResult::advance();
});
if (walkRes.wasInterrupted())
return failure();
}
// A nested scf.parallel needs insertion of code to compute indices.
// Insert that now. This will also update the worklist with the loops
// body.
Expand All @@ -650,6 +683,7 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
rewriter.setInsertionPointAfter(parent);
leftNestingScope = true;
seenSideeffects = false;
writtenBuffer.clear();
} else if (auto reduceOp = dyn_cast<scf::ReduceOp>(op)) {
// Convert scf.reduction op
auto parentLoop = op->getParentOfType<ParallelOp>();
Expand Down Expand Up @@ -682,6 +716,24 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
Operation *clone = rewriter.clone(*op, cloningMap);
cloningMap.map(op->getResults(), clone->getResults());
// Check for side effects.
if (!isMemoryEffectFree(clone)) {
// Record the buffer accessed by the operations with write effects.
if (auto memEffectInterface =
dyn_cast<MemoryEffectOpInterface>(clone)) {
SmallVector<MemoryEffects::EffectInstance> effects;
memEffectInterface.getEffects(effects);
for (const MemoryEffects::EffectInstance &effect : effects) {
if (isa<MemoryEffects::Write>(effect.getEffect())) {
Value writtenBase = effect.getValue();
// Conservatively return failure if we cannot find the written
// address.
if (!writtenBase)
return failure();
writtenBuffer.insert(writtenBase);
}
}
}
}
// TODO: Handle region side effects properly.
seenSideeffects |=
!isMemoryEffectFree(clone) || clone->getNumRegions() != 0;
Expand Down
32 changes: 32 additions & 0 deletions mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -641,3 +641,35 @@ func.func @parallel_reduction_1d_outside() {
// CHECK: scf.parallel
// CHECK-NEXT: scf.parallel
// CHECK: scf.reduce

// -----

// CHECK-LABEL: @nested_parallel_with_side_effect
// Side-effecting ops (the load/store on %alloc_0 / %alloc_1) precede a nested
// scf.parallel that touches different buffers (%alloc_2 / %alloc_3), so the
// loop nest is still expected to convert to a single gpu.launch even though
// side effects were seen before entering the nested scope.
func.func @nested_parallel_with_side_effect() {
%c65536 = arith.constant 65536 : index
%c2 = arith.constant 2 : index
%c256 = arith.constant 256 : index
%c0 = arith.constant 0 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
// Buffers accessed only by the outer loop body.
%alloc_0 = memref.alloc() : memref<2x256x256xf32>
%alloc_1 = memref.alloc() : memref<2x4x256x256xf32>
// Buffers accessed only by the nested loop body.
%alloc_2 = memref.alloc() : memref<4x4xf32>
%alloc_3 = memref.alloc() : memref<4x4xf32>
// Outer loop: mapped to GPU blocks (block_z/block_y/block_x, see the mapping
// attribute below); writes %alloc_1 before the nested loop runs.
scf.parallel (%arg2, %arg3, %arg4) = (%c0, %c0, %c0) to (%c2, %c4, %c65536) step (%c1, %c1, %c1) {
%1 = arith.remsi %arg4, %c256 : index
%2 = arith.divsi %arg4, %c256 : index
%4 = memref.load %alloc_0[%arg2, %2, %1] : memref<2x256x256xf32>
memref.store %4, %alloc_1[%arg2, %arg3, %2, %1] : memref<2x4x256x256xf32>
// Nested loop: mapped to GPU threads (thread_x). Its accesses (%alloc_2,
// %alloc_3) are distinct allocations from the buffer written above
// (%alloc_1), so they cannot alias it.
scf.parallel (%arg5) = (%c0) to (%c4) step (%c1) {
%5 = memref.load %alloc_2[%arg5, %c0] : memref<4x4xf32>
memref.store %5, %alloc_3[%arg5, %c0] : memref<4x4xf32>
scf.reduce
} {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
scf.reduce
} {mapping = [#gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
return
}

// CHECK: gpu.launch
// CHECK-NOT: scf.parallel