Skip to content

Commit 2e1c004

Browse files
committed
[mlir][gpu] Loosen the condition to convert scf.parallel to gpu.launch
Use LocalAliasAnalysis to improve handling of side effects around nested scf.parallel ops. If the memory written outside a nested scf.parallel does not alias the memory accessed inside the nested loop, the loop nest can still be converted to gpu.launch.
1 parent 89b18f0 commit 2e1c004

File tree

2 files changed

+82
-2
lines changed

2 files changed

+82
-2
lines changed

mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
#include "mlir/Conversion/SCFToGPU/SCFToGPU.h"
1616

17+
#include "mlir/Analysis/AliasAnalysis/LocalAliasAnalysis.h"
1718
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
1819
#include "mlir/Dialect/Affine/IR/AffineOps.h"
1920
#include "mlir/Dialect/Arith/IR/Arith.h"
@@ -27,6 +28,7 @@
2728
#include "mlir/Interfaces/SideEffectInterfaces.h"
2829
#include "mlir/Transforms/DialectConversion.h"
2930
#include "mlir/Transforms/RegionUtils.h"
31+
#include "llvm/ADT/DenseSet.h"
3032
#include "llvm/Support/DebugLog.h"
3133
#include <optional>
3234

@@ -625,6 +627,8 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
625627
bool seenSideeffects = false;
626628
// Whether we have left a nesting scope (and hence are no longer innermost).
627629
bool leftNestingScope = false;
630+
LocalAliasAnalysis aliasAnalysis;
631+
llvm::DenseSet<Value> writtenBuffer;
628632
while (!worklist.empty()) {
629633
Operation *op = worklist.pop_back_val();
630634
// Now walk over the body and clone it.
@@ -635,8 +639,39 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
635639
if (auto nestedParallel = dyn_cast<ParallelOp>(op)) {
636640
// Before entering a nested scope, make sure there have been no
637641
// sideeffects until now.
638-
if (seenSideeffects)
639-
return failure();
642+
if (seenSideeffects) {
643+
// Go through all operations in the nested parallel and check if any
644+
// of the side-effecting operations access buffers that have been
645+
// written to in the outer scope.
646+
bool accessesWrittenBuffer = false;
647+
nestedParallel.walk([&](Operation *nestedOp) {
648+
if (accessesWrittenBuffer)
649+
return;
650+
if (isMemoryEffectFree(nestedOp))
651+
return;
652+
653+
if (auto memEffectInterface =
654+
dyn_cast<MemoryEffectOpInterface>(nestedOp)) {
655+
SmallVector<MemoryEffects::EffectInstance> effects;
656+
memEffectInterface.getEffects(effects);
657+
for (const auto &effect : effects) {
658+
if (isa<MemoryEffects::Read>(effect.getEffect()) ||
659+
isa<MemoryEffects::Write>(effect.getEffect())) {
660+
Value baseBuffer = effect.getValue();
661+
for (auto val : writtenBuffer) {
662+
if (aliasAnalysis.alias(baseBuffer, val) !=
663+
AliasResult::NoAlias) {
664+
accessesWrittenBuffer = true;
665+
return;
666+
}
667+
}
668+
}
669+
}
670+
}
671+
});
672+
if (accessesWrittenBuffer)
673+
return failure();
674+
}
640675
// A nested scf.parallel needs insertion of code to compute indices.
641676
// Insert that now. This will also update the worklist with the loops
642677
// body.
@@ -650,6 +685,7 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
650685
rewriter.setInsertionPointAfter(parent);
651686
leftNestingScope = true;
652687
seenSideeffects = false;
688+
writtenBuffer.clear();
653689
} else if (auto reduceOp = dyn_cast<scf::ReduceOp>(op)) {
654690
// Convert scf.reduction op
655691
auto parentLoop = op->getParentOfType<ParallelOp>();
@@ -682,6 +718,18 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
682718
Operation *clone = rewriter.clone(*op, cloningMap);
683719
cloningMap.map(op->getResults(), clone->getResults());
684720
// Check for side effects.
721+
if (!isMemoryEffectFree(clone)) {
722+
// Record the buffer accessed by the operations with write effects.
723+
if (auto memEffectInterface =
724+
dyn_cast<MemoryEffectOpInterface>(clone)) {
725+
SmallVector<MemoryEffects::EffectInstance> effects;
726+
memEffectInterface.getEffects(effects);
727+
for (const auto &effect : effects) {
728+
if (isa<MemoryEffects::Write>(effect.getEffect()))
729+
writtenBuffer.insert(effect.getValue());
730+
}
731+
}
732+
}
685733
// TODO: Handle region side effects properly.
686734
seenSideeffects |=
687735
!isMemoryEffectFree(clone) || clone->getNumRegions() != 0;

mlir/test/Conversion/SCFToGPU/parallel_loop.mlir

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -641,3 +641,35 @@ func.func @parallel_reduction_1d_outside() {
641641
// CHECK: scf.parallel
642642
// CHECK-NEXT: scf.parallel
643643
// CHECK: scf.reduce
644+
645+
// -----
646+
647+
// CHECK-LABEL: @nested_parallel_with_side_effect
648+
func.func @nested_parallel_with_side_effect() {
649+
%c65536 = arith.constant 65536 : index
650+
%c2 = arith.constant 2 : index
651+
%c256 = arith.constant 256 : index
652+
%c0 = arith.constant 0 : index
653+
%c4 = arith.constant 4 : index
654+
%c1 = arith.constant 1 : index
655+
%alloc_0 = memref.alloc() : memref<2x256x256xf32>
656+
%alloc_1 = memref.alloc() : memref<2x4x256x256xf32>
657+
%alloc_2 = memref.alloc() : memref<4x4xf32>
658+
%alloc_3 = memref.alloc() : memref<4x4xf32>
659+
scf.parallel (%arg2, %arg3, %arg4) = (%c0, %c0, %c0) to (%c2, %c4, %c65536) step (%c1, %c1, %c1) {
660+
%1 = arith.remsi %arg4, %c256 : index
661+
%2 = arith.divsi %arg4, %c256 : index
662+
%4 = memref.load %alloc_0[%arg2, %2, %1] : memref<2x256x256xf32>
663+
memref.store %4, %alloc_1[%arg2, %arg3, %2, %1] : memref<2x4x256x256xf32>
664+
scf.parallel (%arg5) = (%c0) to (%c4) step (%c1) {
665+
%5 = memref.load %alloc_2[%arg5, %c0] : memref<4x4xf32>
666+
memref.store %5, %alloc_3[%arg5, %c0] : memref<4x4xf32>
667+
scf.reduce
668+
} {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
669+
scf.reduce
670+
} {mapping = [#gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
671+
return
672+
}
673+
674+
// CHECK: gpu.launch
675+
// CHECK-NOT: scf.parallel

0 commit comments

Comments
 (0)