Skip to content

Commit db530ac

Browse files
Hsiangkai and Debadri Basak
authored and committed
[mlir][gpu] Loosen the condition to convert scf.parallel to gpu.launch (llvm#164978)
Use LocalAliasAnalysis to improve handling of side effects in nested scf.parallel. If the memory written outside the nested scf.parallel does not alias the memory accessed inside the nested loop, we can convert it to gpu.launch.
1 parent 069ff69 commit db530ac

File tree

2 files changed

+90
-6
lines changed

2 files changed

+90
-6
lines changed

mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp

Lines changed: 58 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
#include "mlir/Conversion/SCFToGPU/SCFToGPU.h"
1616

17+
#include "mlir/Analysis/AliasAnalysis/LocalAliasAnalysis.h"
1718
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
1819
#include "mlir/Dialect/Affine/IR/AffineOps.h"
1920
#include "mlir/Dialect/Arith/IR/Arith.h"
@@ -27,6 +28,7 @@
2728
#include "mlir/Interfaces/SideEffectInterfaces.h"
2829
#include "mlir/Transforms/DialectConversion.h"
2930
#include "mlir/Transforms/RegionUtils.h"
31+
#include "llvm/ADT/DenseSet.h"
3032
#include "llvm/Support/DebugLog.h"
3133
#include <optional>
3234

@@ -625,18 +627,49 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
625627
bool seenSideeffects = false;
626628
// Whether we have left a nesting scope (and hence are no longer innermost).
627629
bool leftNestingScope = false;
630+
LocalAliasAnalysis aliasAnalysis;
631+
llvm::DenseSet<Value> writtenBuffer;
628632
while (!worklist.empty()) {
629633
Operation *op = worklist.pop_back_val();
630634
// Now walk over the body and clone it.
631635
// TODO: This is only correct if there either is no further scf.parallel
632-
// nested or this code is side-effect free. Otherwise we might need
633-
// predication. We are overly conservative for now and only allow
634-
// side-effects in the innermost scope.
636+
// nested or this code has side effects but the written buffers do not
637+
// alias any buffer accessed inside the nested loop. Otherwise we might
638+
// need predication.
635639
if (auto nestedParallel = dyn_cast<ParallelOp>(op)) {
636640
// Before entering a nested scope, make sure there have been no
637-
// sideeffects until now.
638-
if (seenSideeffects)
639-
return failure();
641+
// side effects until now, or the nested operations do not access any
642+
// buffer written by the outer scope.
643+
if (seenSideeffects) {
644+
WalkResult walkRes = nestedParallel.walk([&](Operation *nestedOp) {
645+
if (isMemoryEffectFree(nestedOp))
646+
return WalkResult::advance();
647+
648+
auto memEffectInterface = dyn_cast<MemoryEffectOpInterface>(nestedOp);
649+
if (!memEffectInterface)
650+
return WalkResult::advance();
651+
652+
SmallVector<MemoryEffects::EffectInstance> effects;
653+
memEffectInterface.getEffects(effects);
654+
for (const MemoryEffects::EffectInstance &effect : effects) {
655+
if (isa<MemoryEffects::Read>(effect.getEffect()) ||
656+
isa<MemoryEffects::Write>(effect.getEffect())) {
657+
Value baseBuffer = effect.getValue();
658+
if (!baseBuffer)
659+
return WalkResult::interrupt();
660+
for (Value val : writtenBuffer) {
661+
if (aliasAnalysis.alias(baseBuffer, val) !=
662+
AliasResult::NoAlias) {
663+
return WalkResult::interrupt();
664+
}
665+
}
666+
}
667+
}
668+
return WalkResult::advance();
669+
});
670+
if (walkRes.wasInterrupted())
671+
return failure();
672+
}
640673
// A nested scf.parallel needs insertion of code to compute indices.
641674
// Insert that now. This will also update the worklist with the loops
642675
// body.
@@ -650,6 +683,7 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
650683
rewriter.setInsertionPointAfter(parent);
651684
leftNestingScope = true;
652685
seenSideeffects = false;
686+
writtenBuffer.clear();
653687
} else if (auto reduceOp = dyn_cast<scf::ReduceOp>(op)) {
654688
// Convert scf.reduction op
655689
auto parentLoop = op->getParentOfType<ParallelOp>();
@@ -682,6 +716,24 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
682716
Operation *clone = rewriter.clone(*op, cloningMap);
683717
cloningMap.map(op->getResults(), clone->getResults());
684718
// Check for side effects.
719+
if (!isMemoryEffectFree(clone)) {
720+
// Record the buffer accessed by the operations with write effects.
721+
if (auto memEffectInterface =
722+
dyn_cast<MemoryEffectOpInterface>(clone)) {
723+
SmallVector<MemoryEffects::EffectInstance> effects;
724+
memEffectInterface.getEffects(effects);
725+
for (const MemoryEffects::EffectInstance &effect : effects) {
726+
if (isa<MemoryEffects::Write>(effect.getEffect())) {
727+
Value writtenBase = effect.getValue();
728+
// Conservatively return failure if we cannot find the written
729+
// address.
730+
if (!writtenBase)
731+
return failure();
732+
writtenBuffer.insert(writtenBase);
733+
}
734+
}
735+
}
736+
}
685737
// TODO: Handle region side effects properly.
686738
seenSideeffects |=
687739
!isMemoryEffectFree(clone) || clone->getNumRegions() != 0;

mlir/test/Conversion/SCFToGPU/parallel_loop.mlir

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -641,3 +641,35 @@ func.func @parallel_reduction_1d_outside() {
641641
// CHECK: scf.parallel
642642
// CHECK-NEXT: scf.parallel
643643
// CHECK: scf.reduce
644+
645+
// -----
646+
647+
// CHECK-LABEL: @nested_parallel_with_side_effect
648+
func.func @nested_parallel_with_side_effect() {
649+
%c65536 = arith.constant 65536 : index
650+
%c2 = arith.constant 2 : index
651+
%c256 = arith.constant 256 : index
652+
%c0 = arith.constant 0 : index
653+
%c4 = arith.constant 4 : index
654+
%c1 = arith.constant 1 : index
655+
%alloc_0 = memref.alloc() : memref<2x256x256xf32>
656+
%alloc_1 = memref.alloc() : memref<2x4x256x256xf32>
657+
%alloc_2 = memref.alloc() : memref<4x4xf32>
658+
%alloc_3 = memref.alloc() : memref<4x4xf32>
659+
scf.parallel (%arg2, %arg3, %arg4) = (%c0, %c0, %c0) to (%c2, %c4, %c65536) step (%c1, %c1, %c1) {
660+
%1 = arith.remsi %arg4, %c256 : index
661+
%2 = arith.divsi %arg4, %c256 : index
662+
%4 = memref.load %alloc_0[%arg2, %2, %1] : memref<2x256x256xf32>
663+
memref.store %4, %alloc_1[%arg2, %arg3, %2, %1] : memref<2x4x256x256xf32>
664+
scf.parallel (%arg5) = (%c0) to (%c4) step (%c1) {
665+
%5 = memref.load %alloc_2[%arg5, %c0] : memref<4x4xf32>
666+
memref.store %5, %alloc_3[%arg5, %c0] : memref<4x4xf32>
667+
scf.reduce
668+
} {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
669+
scf.reduce
670+
} {mapping = [#gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
671+
return
672+
}
673+
674+
// CHECK: gpu.launch
675+
// CHECK-NOT: scf.parallel

0 commit comments

Comments
 (0)