-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[flang][OpenMP] Support multi-block reduction combiner regions on the GPU #156837
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-mlir-llvm @llvm/pr-subscribers-flang-openmp Author: Kareem Ergawy (ergawy) ChangesFixes a bug related to insertion points when inlining multi-block combiner reduction regions. The IP at the end of the inlined region was not used resulting in emitting BBs with multiple terminators. Full diff: https://github.com/llvm/llvm-project/pull/156837.diff 2 Files Affected:
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 3d5e487c8990f..fe00a2a5696dc 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -3506,6 +3506,8 @@ Expected<Function *> OpenMPIRBuilder::createReductionFunction(
return AfterIP.takeError();
if (!Builder.GetInsertBlock())
return ReductionFunc;
+
+ Builder.SetInsertPoint(AfterIP->getBlock(), AfterIP->getPoint());
Builder.CreateStore(Reduced, LHSPtr);
}
}
@@ -3750,6 +3752,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
RI.ReductionGen(Builder.saveIP(), RHSValue, LHSValue, Reduced);
if (!AfterIP)
return AfterIP.takeError();
+ Builder.SetInsertPoint(AfterIP->getBlock(), AfterIP->getPoint());
Builder.CreateStore(Reduced, LHS, false);
}
}
diff --git a/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir b/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir
new file mode 100644
index 0000000000000..aaf06d2d0e0c2
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-multi-block-reduction.mlir
@@ -0,0 +1,85 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// Verifies that the IR builder can handle reductions with multi-block combiner
+// regions on the GPU.
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<"dlti.alloca_memory_space" = 5 : ui64, "dlti.global_memory_space" = 1 : ui64>, llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} {
+ llvm.func @bar() {}
+ llvm.func @baz() {}
+
+ omp.declare_reduction @add_reduction_byref_box_5xf32 : !llvm.ptr alloc {
+ %0 = llvm.mlir.constant(1 : i64) : i64
+ %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> : (i64) -> !llvm.ptr<5>
+ %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr
+ omp.yield(%2 : !llvm.ptr)
+ } init {
+ ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+ omp.yield(%arg1 : !llvm.ptr)
+ } combiner {
+ ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+ llvm.call @bar() : () -> ()
+ llvm.br ^bb3
+
+ ^bb3: // pred: ^bb1
+ llvm.call @baz() : () -> ()
+ omp.yield(%arg0 : !llvm.ptr)
+ }
+ llvm.func @foo_() {
+ %c1 = llvm.mlir.constant(1 : i64) : i64
+ %10 = llvm.alloca %c1 x !llvm.array<5 x f32> {bindc_name = "x"} : (i64) -> !llvm.ptr<5>
+ %11 = llvm.addrspacecast %10 : !llvm.ptr<5> to !llvm.ptr
+ %74 = omp.map.info var_ptr(%11 : !llvm.ptr, !llvm.array<5 x f32>) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "x"}
+ omp.target map_entries(%74 -> %arg0 : !llvm.ptr) {
+ %c1_2 = llvm.mlir.constant(1 : i32) : i32
+ %c10 = llvm.mlir.constant(10 : i32) : i32
+ omp.teams reduction(byref @add_reduction_byref_box_5xf32 %arg0 -> %arg2 : !llvm.ptr) {
+ omp.parallel {
+ omp.distribute {
+ omp.wsloop {
+ omp.loop_nest (%arg5) : i32 = (%c1_2) to (%c10) inclusive step (%c1_2) {
+ omp.yield
+ }
+ } {omp.composite}
+ } {omp.composite}
+ omp.terminator
+ } {omp.composite}
+ omp.terminator
+ }
+ omp.terminator
+ }
+ llvm.return
+ }
+}
+
+// CHECK: call void @__kmpc_parallel_51({{.*}}, i32 1, i32 -1, i32 -1,
+// CHECK-SAME: ptr @[[PAR_OUTLINED:.*]], ptr null, ptr %2, i64 1)
+
+// CHECK: define internal void @[[PAR_OUTLINED]]{{.*}} {
+// CHECK: .omp.reduction.then:
+// CHECK: br label %omp.reduction.nonatomic.body
+
+// CHECK: omp.reduction.nonatomic.body:
+// CHECK: call void @bar()
+// CHECK: br label %[[BODY_2ND_BB:.*]]
+
+// CHECK: [[BODY_2ND_BB]]:
+// CHECK: call void @baz()
+// CHECK: br label %[[CONT_BB:.*]]
+
+// CHECK: [[CONT_BB]]:
+// CHECK: br label %.omp.reduction.done
+// CHECK: }
+
+// CHECK: define internal void @"{{.*}}$reduction$reduction_func"(ptr noundef %0, ptr noundef %1) #0 {
+// CHECK: br label %omp.reduction.nonatomic.body
+
+// CHECK: [[BODY_2ND_BB:.*]]:
+// CHECK: call void @baz()
+// CHECK: br label %omp.region.cont
+
+
+// CHECK: omp.reduction.nonatomic.body:
+// CHECK: call void @bar()
+// CHECK: br label %[[BODY_2ND_BB]]
+
+// CHECK: }
|
6987182 to
9a7ae05
Compare
49715c2 to
3c3f326
Compare
…ide values (#155754) Following up on #154483, this PR introduces further refactoring to extract some shared utils between OpenMP lowering and `do concurrent` conversion pass. In particular, this PR extracts 2 utils that handle mapping or cloning values used inside target regions but defined outside. Later `do concurrent` PR(s) will also use these utils. PR stack: - #155754◀️ - #155987 - #155992 - #155993 - #156589 - #156610 - #156837
9a7ae05 to
6d02115
Compare
3c3f326 to
8467dbc
Compare
… clone outside values (#155754) Following up on #154483, this PR introduces further refactoring to extract some shared utils between OpenMP lowering and `do concurrent` conversion pass. In particular, this PR extracts 2 utils that handle mapping or cloning values used inside target regions but defined outside. Later `do concurrent` PR(s) will also use these utils. PR stack: - llvm/llvm-project#155754◀️ - llvm/llvm-project#155987 - llvm/llvm-project#155992 - llvm/llvm-project#155993 - llvm/llvm-project#156589 - llvm/llvm-project#156610 - llvm/llvm-project#156837
6d02115 to
7fb93a3
Compare
8467dbc to
57bd38b
Compare
7fb93a3 to
afd1552
Compare
57bd38b to
f0e9a11
Compare
afd1552 to
31bf2c1
Compare
f0e9a11 to
adf9d42
Compare
| @@ -3506,6 +3506,8 @@ Expected<Function *> OpenMPIRBuilder::createReductionFunction( | |||
| return AfterIP.takeError(); | |||
| if (!Builder.GetInsertBlock()) | |||
| return ReductionFunc; | |||
|
|
|||
| Builder.SetInsertPoint(AfterIP->getBlock(), AfterIP->getPoint()); | |||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| Builder.SetInsertPoint(AfterIP->getBlock(), AfterIP->getPoint()); | |
| Builder.restoreIP(*AfterIP); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
| @@ -3750,6 +3752,7 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU( | |||
| RI.ReductionGen(Builder.saveIP(), RHSValue, LHSValue, Reduced); | |||
| if (!AfterIP) | |||
| return AfterIP.takeError(); | |||
| Builder.SetInsertPoint(AfterIP->getBlock(), AfterIP->getPoint()); | |||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| Builder.SetInsertPoint(AfterIP->getBlock(), AfterIP->getPoint()); | |
| Builder.restoreIP(*AfterIP); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
31bf2c1 to
3b73016
Compare
adf9d42 to
c7d6552
Compare
…#155987) Upstreams further parts of `do concurrent` to OpenMP conversion pass from AMD's fork. This PR extends the pass by adding support for mapping to the device. PR stack: - llvm/llvm-project#155754 - llvm/llvm-project#155987◀️ - llvm/llvm-project#155992 - llvm/llvm-project#155993 - llvm/llvm-project#157638 - llvm/llvm-project#156610 - llvm/llvm-project#156837
Seems like it. Sorry for the noise. |
Extends support for mapping `do concurrent` on the device by adding support for `local` specifiers. The changes in this PR map the local variable to the `omp.target` op and uses the mapped value as the `private` clause operand in the nested `omp.parallel` op. - #155754 - #155987 - #155992 - #155993 - #157638◀️ - #156610 - #156837
76c4b9a to
f698b21
Compare
f77070f to
0c61084
Compare
… (#157638) Extends support for mapping `do concurrent` on the device by adding support for `local` specifiers. The changes in this PR map the local variable to the `omp.target` op and uses the mapped value as the `private` clause operand in the nested `omp.parallel` op. - llvm/llvm-project#155754 - llvm/llvm-project#155987 - llvm/llvm-project#155992 - llvm/llvm-project#155993 - llvm/llvm-project#157638◀️ - llvm/llvm-project#156610 - llvm/llvm-project#156837
Extends `do concurrent` to OpenMP device mapping by adding support for mapping `reduce` specifiers to omp `reduction` clauses. The changes attach 2 `reduction` clauses to the mapped OpenMP construct: one on the `teams` part of the construct and one on the `wloop` part. - #155754 - #155987 - #155992 - #155993 - #157638 - #156610◀️ - #156837
… GPU Fixes a bug related to insertion points when inlining multi-block combiner reduction regions. The IP at the end of the inlined region was not used resulting in emitting BBs with multiple terminators.
0c61084 to
d3b28d0
Compare
…e (#156610) Extends `do concurrent` to OpenMP device mapping by adding support for mapping `reduce` specifiers to omp `reduction` clauses. The changes attach 2 `reduction` clauses to the mapped OpenMP construct: one on the `teams` part of the construct and one on the `wloop` part. - llvm/llvm-project#155754 - llvm/llvm-project#155987 - llvm/llvm-project#155992 - llvm/llvm-project#155993 - llvm/llvm-project#157638 - llvm/llvm-project#156610◀️ - llvm/llvm-project#156837
…ions on the GPU (#156837) Fixes a bug related to insertion points when inlining multi-block combiner reduction regions. The IP at the end of the inlined region was not used resulting in emitting BBs with multiple terminators. PR stack: - llvm/llvm-project#155754 - llvm/llvm-project#155987 - llvm/llvm-project#155992 - llvm/llvm-project#155993 - llvm/llvm-project#157638 - llvm/llvm-project#156610 - llvm/llvm-project#156837◀️
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/66/builds/19591 Here is the relevant piece of the build log for the reference |
|
Failure seems unlrelated to the changes in the PR. Let me know if I missed anything. |
Fixes a bug related to insertion points when inlining multi-block combiner reduction regions. The IP at the end of the inlined region was not used resulting in emitting BBs with multiple terminators.
PR stack:
do concurrentmapping to device #155987do concurrentto device mapping lit tests #155992do concurrent: supportlocalon device #157638do concurrent: supportreduceon device #156610