[mlir][gpu] Loose the condition to convert scf.parallel to gpu.launch #164978
Conversation
Use LocalAliasAnalysis to improve the handling of side effects in nested scf.parallel. If the memory written outside the nested scf.parallel does not alias the memory accessed inside the nested loop, we can convert it to gpu.launch.
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-gpu

Author: Hsiangkai Wang (Hsiangkai)

Changes

Use LocalAliasAnalysis to improve the handling of side effects in nested scf.parallel. If the memory written outside the nested scf.parallel does not alias the memory accessed inside the nested loop, we can convert it to gpu.launch.

Full diff: https://github.com/llvm/llvm-project/pull/164978.diff

2 Files Affected:
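At its core, the relaxation is a pairwise alias query between every buffer written before the nested loop and every buffer the nested loop reads or writes. As a minimal sketch of that query — the helper name mayConflict is purely illustrative and does not appear in the diff below:

#include "mlir/Analysis/AliasAnalysis/LocalAliasAnalysis.h"
#include "mlir/IR/Value.h"
#include "llvm/ADT/ArrayRef.h"

// Illustrative helper: treat anything other than a definite NoAlias result
// as a potential conflict that blocks the gpu.launch conversion.
static bool mayConflict(mlir::LocalAliasAnalysis &aliasAnalysis,
                        mlir::Value nestedAccess,
                        llvm::ArrayRef<mlir::Value> writtenBuffers) {
  for (mlir::Value written : writtenBuffers)
    if (!aliasAnalysis.alias(nestedAccess, written).isNo())
      return true; // MayAlias, PartialAlias, and MustAlias all conflict.
  return false;
}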
diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
index 7d0a236b6f69a..9f977802f4404 100644
--- a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
+++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
@@ -14,6 +14,7 @@
#include "mlir/Conversion/SCFToGPU/SCFToGPU.h"
+#include "mlir/Analysis/AliasAnalysis/LocalAliasAnalysis.h"
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
@@ -27,6 +28,7 @@
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/RegionUtils.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/Support/DebugLog.h"
#include <optional>
@@ -625,6 +627,8 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
bool seenSideeffects = false;
// Whether we have left a nesting scope (and hence are no longer innermost).
bool leftNestingScope = false;
+ LocalAliasAnalysis aliasAnalysis;
+ llvm::DenseSet<Value> writtenBuffer;
while (!worklist.empty()) {
Operation *op = worklist.pop_back_val();
// Now walk over the body and clone it.
@@ -635,8 +639,39 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
if (auto nestedParallel = dyn_cast<ParallelOp>(op)) {
// Before entering a nested scope, make sure there have been no
// sideeffects until now.
- if (seenSideeffects)
- return failure();
+ if (seenSideeffects) {
+ // Go through all operations in the nested parallel and check if any
+ // of the side-effecting operations access buffers that have been
+ // written to in the outer scope.
+ bool accessesWrittenBuffer = false;
+ nestedParallel.walk([&](Operation *nestedOp) {
+ if (accessesWrittenBuffer)
+ return;
+ if (isMemoryEffectFree(nestedOp))
+ return;
+
+ if (auto memEffectInterface =
+ dyn_cast<MemoryEffectOpInterface>(nestedOp)) {
+ SmallVector<MemoryEffects::EffectInstance> effects;
+ memEffectInterface.getEffects(effects);
+ for (const auto &effect : effects) {
+ if (isa<MemoryEffects::Read>(effect.getEffect()) ||
+ isa<MemoryEffects::Write>(effect.getEffect())) {
+ Value baseBuffer = effect.getValue();
+ for (auto val : writtenBuffer) {
+ if (aliasAnalysis.alias(baseBuffer, val) !=
+ AliasResult::NoAlias) {
+ accessesWrittenBuffer = true;
+ return;
+ }
+ }
+ }
+ }
+ }
+ });
+ if (accessesWrittenBuffer)
+ return failure();
+ }
// A nested scf.parallel needs insertion of code to compute indices.
// Insert that now. This will also update the worklist with the loops
// body.
@@ -650,6 +685,7 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
rewriter.setInsertionPointAfter(parent);
leftNestingScope = true;
seenSideeffects = false;
+ writtenBuffer.clear();
} else if (auto reduceOp = dyn_cast<scf::ReduceOp>(op)) {
// Convert scf.reduction op
auto parentLoop = op->getParentOfType<ParallelOp>();
@@ -682,6 +718,18 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
Operation *clone = rewriter.clone(*op, cloningMap);
cloningMap.map(op->getResults(), clone->getResults());
// Check for side effects.
+ if (!isMemoryEffectFree(clone)) {
+ // Record the buffer accessed by the operations with write effects.
+ if (auto memEffectInterface =
+ dyn_cast<MemoryEffectOpInterface>(clone)) {
+ SmallVector<MemoryEffects::EffectInstance> effects;
+ memEffectInterface.getEffects(effects);
+ for (const auto &effect : effects) {
+ if (isa<MemoryEffects::Write>(effect.getEffect()))
+ writtenBuffer.insert(effect.getValue());
+ }
+ }
+ }
// TODO: Handle region side effects properly.
seenSideeffects |=
!isMemoryEffectFree(clone) || clone->getNumRegions() != 0;
diff --git a/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir b/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
index 1dbce05be85b4..26f5a3e1f0ac0 100644
--- a/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
+++ b/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
@@ -641,3 +641,35 @@ func.func @parallel_reduction_1d_outside() {
// CHECK: scf.parallel
// CHECK-NEXT: scf.parallel
// CHECK: scf.reduce
+
+// -----
+
+// CHECK-LABEL: @nested_parallel_with_side_effect
+func.func @nested_parallel_with_side_effect() {
+ %c65536 = arith.constant 65536 : index
+ %c2 = arith.constant 2 : index
+ %c256 = arith.constant 256 : index
+ %c0 = arith.constant 0 : index
+ %c4 = arith.constant 4 : index
+ %c1 = arith.constant 1 : index
+ %alloc_0 = memref.alloc() : memref<2x256x256xf32>
+ %alloc_1 = memref.alloc() : memref<2x4x256x256xf32>
+ %alloc_2 = memref.alloc() : memref<4x4xf32>
+ %alloc_3 = memref.alloc() : memref<4x4xf32>
+ scf.parallel (%arg2, %arg3, %arg4) = (%c0, %c0, %c0) to (%c2, %c4, %c65536) step (%c1, %c1, %c1) {
+ %1 = arith.remsi %arg4, %c256 : index
+ %2 = arith.divsi %arg4, %c256 : index
+ %4 = memref.load %alloc_0[%arg2, %2, %1] : memref<2x256x256xf32>
+ memref.store %4, %alloc_1[%arg2, %arg3, %2, %1] : memref<2x4x256x256xf32>
+ scf.parallel (%arg5) = (%c0) to (%c4) step (%c1) {
+ %5 = memref.load %alloc_2[%arg5, %c0] : memref<4x4xf32>
+ memref.store %5, %alloc_3[%arg5, %c0] : memref<4x4xf32>
+ scf.reduce
+ } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+ scf.reduce
+ } {mapping = [#gpu.loop_dim_map<processor = block_z, map = (d0) -> (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map<processor = block_y, map = (d0) -> (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+ return
+}
+
+// CHECK: gpu.launch
+// CHECK-NOT: scf.parallel
memEffectInterface.getEffects(effects);
for (const auto &effect : effects) {
  if (isa<MemoryEffects::Write>(effect.getEffect()))
    writtenBuffer.insert(effect.getValue());
can effect.getValue() return null?
Checked the value before use.
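Indeed, EffectInstance::getValue() returns a null Value when an effect is not tied to a specific SSA value. The updated code is not shown in the diff above; a plausible shape for the guard, as a sketch only:

for (const auto &effect : effects) {
  if (!isa<MemoryEffects::Write>(effect.getEffect()))
    continue;
  // getValue() returns a null Value when the effect has no associated
  // SSA value; skip those instead of inserting null into the set.
  if (Value written = effect.getValue())
    writtenBuffer.insert(written);
}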
SmallVector<MemoryEffects::EffectInstance> effects;
memEffectInterface.getEffects(effects);
for (const auto &effect : effects) {
  if (isa<MemoryEffects::Read>(effect.getEffect()) ||
nit: can we invert this logic and early-return if the effect is not a Read/Write? That removes the following level of nesting.
Done.
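The post-review code is not included above; inverted as suggested, the loop body would presumably read something like the sketch below (the null-value and walk-interruption concerns raised in the following threads are folded into the final sketch at the end):

for (const auto &effect : effects) {
  // Early-continue on effects other than Read/Write, removing one level
  // of nesting relative to the original if-block.
  if (!isa<MemoryEffects::Read, MemoryEffects::Write>(effect.getEffect()))
    continue;
  Value baseBuffer = effect.getValue();
  for (Value val : writtenBuffer) {
    if (aliasAnalysis.alias(baseBuffer, val) != AliasResult::NoAlias) {
      accessesWrittenBuffer = true;
      return;
    }
  }
}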
for (const auto &effect : effects) {
  if (isa<MemoryEffects::Read>(effect.getEffect()) ||
      isa<MemoryEffects::Write>(effect.getEffect())) {
    Value baseBuffer = effect.getValue();
Can getValue be nullptr?
Done.
if (aliasAnalysis.alias(baseBuffer, val) !=
    AliasResult::NoAlias) {
  accessesWrittenBuffer = true;
  return;
I suppose here you want to interrupt the walk? Hence why you have the
if (accessesWrittenBuffer) return;
check at the beginning. But return doesn't stop the walk, does it? Maybe you can explicitly interrupt here and remove the flag check at the beginning?
Done.
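Returning from a void walk callback only ends that particular visit, not the traversal; stopping the walk requires a callback that returns WalkResult. Folding in the earlier review points as well, the final version presumably looks roughly like this sketch (not the exact landed code):

WalkResult result = nestedParallel.walk([&](Operation *nestedOp) -> WalkResult {
  if (isMemoryEffectFree(nestedOp))
    return WalkResult::advance();
  // Mirrors the original diff: ops without the interface are not inspected.
  auto memEffectInterface = dyn_cast<MemoryEffectOpInterface>(nestedOp);
  if (!memEffectInterface)
    return WalkResult::advance();
  SmallVector<MemoryEffects::EffectInstance> effects;
  memEffectInterface.getEffects(effects);
  for (const auto &effect : effects) {
    if (!isa<MemoryEffects::Read, MemoryEffects::Write>(effect.getEffect()))
      continue;
    Value baseBuffer = effect.getValue();
    if (!baseBuffer)
      return WalkResult::interrupt(); // Unknown target: conservatively bail.
    for (Value val : writtenBuffer)
      if (aliasAnalysis.alias(baseBuffer, val) != AliasResult::NoAlias)
        return WalkResult::interrupt(); // Stops the entire traversal.
  }
  return WalkResult::advance();
});
if (result.wasInterrupted())
  return failure();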