Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2494,7 +2494,12 @@ LogicalResult PrivateClauseOp::verify() {
<< "Did not expect any values to be yielded.";
}

if (yieldedTypes.size() == 1 && yieldedTypes.front() == symType)
if (yieldedTypes.size() != 1)
return mlir::emitError(terminator->getLoc())
<< "Expected exactly 1 yielded value, but got "
<< yieldedTypes.size();

if (yieldedTypes.front() == symType)
return success();

auto error = mlir::emitError(yieldOp.getLoc())
Expand Down
131 changes: 130 additions & 1 deletion mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "mlir/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.h"
#include "mlir/Analysis/TopologicalSortUtils.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/OpenMP/OpenMPClauseOperands.h"
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
#include "mlir/Dialect/OpenMP/OpenMPInterfaces.h"
#include "mlir/IR/IRMapping.h"
Expand Down Expand Up @@ -3241,12 +3242,140 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder,
DataLayout dl = DataLayout(opInst.getParentOfType<ModuleOp>());
SmallVector<Value> mapVars = targetOp.getMapVars();
llvm::Function *llvmOutlinedFn = nullptr;

// TODO: It can also be false if a compile-time constant `false` IF clause is
// specified.
bool isOffloadEntry =
isTargetDevice || !ompBuilder->Config.TargetTriples.empty();

if (!targetOp.getPrivateVars().empty()) {
OperandRange privateVars = targetOp.getPrivateVars();
std::optional<mlir::ArrayAttr> privateSyms = targetOp.getPrivateSyms();
auto &targetRegion = opInst.getRegion(0);
auto &firstTargetBlock = targetRegion.front();
auto *regionArgsStart = firstTargetBlock.getArguments().begin();
auto *privArgsStart = regionArgsStart + targetOp.getMapVars().size();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to remove any target-specific logic as much as possible (we probably can outline this into a util that other ops use as well, for example distribute which I am working on now).

Suggested change
auto *privArgsStart = regionArgsStart + targetOp.getMapVars().size();
auto *privArgsStart = regionArgsStart + targetOp.getPrivateVars().getBeginOperandIndex();

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just as a heads-up, I'm planning to create a PR stack this week to revamp the handling of clause-related block arguments such as these. At that point we won't have to worry so much about the order they appear in, but instead we'll be able to access them through an MLIR interface.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Excellent. That would be great, because the present usage is extremely brittle.

auto *privArgsEnd = privArgsStart + targetOp.getPrivateVars().size();
BitVector blockArgsBV(firstTargetBlock.getNumArguments(), false);
omp::PrivateClauseOps newPrivateClauses;
MutableArrayRef argSubRangePrivates(privArgsStart, privArgsEnd);
for (auto [privVar, privatizerNameAttr, blockArg] :
llvm::zip_equal(privateVars, *privateSyms, argSubRangePrivates)) {

SymbolRefAttr privSym = llvm::cast<SymbolRefAttr>(privatizerNameAttr);

// 1. Look up the privatizer in the symbol table
MLIRContext &context = moduleTranslation.getContext();
mlir::IRRewriter rewriter(&context);
omp::PrivateClauseOp privatizer =
SymbolTable::lookupNearestSymbolFrom<omp::PrivateClauseOp>(&opInst,
privSym);
// Handle only private for now. Also, not handling allocatables yet.
if (privatizer.getDataSharingType() !=
omp::DataSharingClauseType::Private ||
!privatizer.getDeallocRegion().empty()) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Otherwise could you return to indicate failure?

newPrivateClauses.privateSyms.push_back(privSym);
newPrivateClauses.privateVars.push_back(privVar);
continue;
}

auto &allocRegion = privatizer.getAllocRegion();
// `alloc` region should have only 1 argument
assert(allocRegion.getNumArguments() == 1 &&
"alloc region of omp::PrivateClauseOp should have one argument");
auto allocRegionArg = allocRegion.getArgument(0);
// Assuming we have the following targetOp and Privatizer
//
// omp.private {type = private} @x.privatizer : !llvm.ptr alloc {
// %1 = alloca...
// omp.yield(%1)
// }
// omp.target map(..) map(..) private(privVar) {
// ^bb0(map_arg0, ..., map_argn, priv_arg0):
// ...
// ...
// store %v, %priv_arg0
// omp.terminator
// }
// Roughly, we do the following -
// Split the first block (bb0) of the target region into two blocks.
// Then, clone the alloc region of the privatizer between the two
// new blocks. When cloning replace the alloc argument with privVar.
// We'll then have
//
// omp.target map(..) map(..) private(privVar) {
// ^bb0(map_arg0, ..., map_argn, priv_arg0, ..., priv_argn):
// ^bb1: (cloned region) // no predecessor
// %1 = alloca...
// omp.yield(%1)
// ^bb2: // no predecessor
// ...
// ...
// store %v, %priv_arg0
// omp.terminator
// }
// Next, add an unconditional branch from ^bb0 to ^bb1 and change the
// yield in the last block of the cloned alloc region to an unconditional
// branch to the other split block before replacing all uses of
// 'priv_arg0' with the yielded value to finally get the following code
// structure
//
// omp.target map(..) map(..) private(privVar) {
// ^bb0(map_arg0, ..., map_argn, priv_arg0, ..., priv_argn):
// llvm.br ^bb1
// ^bb1: (cloned region) // pred: ^bb0
// %1 = alloca...
// llvm.br ^bb2
// ^bb2: // pred: ^bb1
// ...
// ...
// store %v, %1 // %priv_arg0 replaced with the yield value
// omp.terminator
// }
Block *newBlock =
rewriter.splitBlock(&firstTargetBlock, firstTargetBlock.begin());
rewriter.setInsertionPointToStart(&firstTargetBlock);
Location loc = targetOp.getLoc();
rewriter.create<LLVM::BrOp>(loc, ValueRange(), newBlock);

IRMapping cloneMap;
cloneMap.map(allocRegionArg, privVar);
auto newBlockIter = std::next(targetRegion.begin(), 1);
allocRegion.cloneInto(&targetRegion, newBlockIter, cloneMap);

unsigned allocRegNumBlocks = allocRegion.getBlocks().size();
newBlockIter = std::next(targetRegion.begin(), 1);
auto clonedAllocRegionBackIter =
std::next(newBlockIter, (allocRegNumBlocks - 1));
Block &clonedAllocRegionLastBlock = *clonedAllocRegionBackIter;

LLVM::BrOp brOp = dyn_cast<LLVM::BrOp>(firstTargetBlock.getTerminator());
omp::YieldOp yieldOp =
dyn_cast<omp::YieldOp>(clonedAllocRegionLastBlock.getTerminator());
auto newValue = yieldOp.getResults().front();
rewriter.setInsertionPointAfter(yieldOp);
auto *oldSucc = brOp.getSuccessor();
brOp.setSuccessor(&*newBlockIter);
// TODO:Consider cloning brOp and adding it to clonedAllocRegEngBlock
// TODO: Can we not simply merge clonedAllocRegEngBlock and oldSucc?
rewriter.create<LLVM::BrOp>(loc, ValueRange(), oldSucc);
blockArgsBV.set(blockArg.getArgNumber());
rewriter.replaceAllUsesWith(blockArg, newValue);
rewriter.eraseOp(yieldOp);
}
// Do some fix ups now.
// First, remove the blockArguments that we just privatized
firstTargetBlock.eraseArguments(blockArgsBV);
// Then remove the private vars and privatizers that we have
// processed i.e privatized just now.
if (newPrivateClauses.privateSyms.empty()) {
targetOp.getPrivateVarsMutable().clear();
targetOp.removePrivateSymsAttr();
} else {
targetOp.setPrivateSymsAttr(mlir::ArrayAttr::get(
targetOp.getContext(), newPrivateClauses.privateSyms));
targetOp.getPrivateVarsMutable().assign(newPrivateClauses.privateVars);
Comment on lines +3374 to +3376
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do we do with the un-privatized variables now?

I am thinking, we should emit an error in this case. We have openmp-enable-delayed-privatization-staging to hide incomplete implementations of delayed privatization so we can just emit an error to clearly say which variables were not successfully privatized.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That would convert what is essentially silently failing right now (ie not getting lowered to LLVMIR silently) into hard failures. This change would mean omp.private for allocatables will start failing with errors. For firstprivate we already fail early during parsing. Let me know if you still want me to make this change. I am leaning towards doing what you are suggesting because a hard failure is better than silently failing but let me know if you change your mind about what you'd prefer in light of this comment of mine.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think a hard failure would good. At least it will allow someone working on the code to find the proper extension point to support allocatables (or whatever is not supported) on this layer.

}
}
LogicalResult bodyGenStatus = success();
using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
auto bodyCB = [&](InsertPointTy allocaIP,
Expand Down
2 changes: 1 addition & 1 deletion mlir/test/Dialect/OpenMP/invalid.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -2219,7 +2219,7 @@ omp.private {type = private} @x.privatizer : i32 alloc {

omp.private {type = private} @x.privatizer : i32 alloc {
^bb0(%arg0: i32):
// expected-error @below {{Invalid yielded value. Expected type: 'i32', got: None}}
// expected-error @below {{Expected exactly 1 yielded value, but got 0}}
omp.yield
}

Expand Down
72 changes: 72 additions & 0 deletions mlir/test/Target/LLVMIR/openmp-target-private.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s

omp.private {type = private} @simple_var.privatizer : !llvm.ptr alloc {
^bb0(%arg0: !llvm.ptr):
%0 = llvm.mlir.constant(1 : i64) : i64
%1 = llvm.alloca %0 x i32 {bindc_name = "simple_var", pinned} : (i64) -> !llvm.ptr
omp.yield(%1 : !llvm.ptr)
}
llvm.func @target_map_single_private() attributes {fir.internal_name = "_QPtarget_map_single_private"} {
%0 = llvm.mlir.constant(1 : i64) : i64
%1 = llvm.alloca %0 x i32 {bindc_name = "simple_var"} : (i64) -> !llvm.ptr
%2 = llvm.mlir.constant(1 : i64) : i64
%3 = llvm.alloca %2 x i32 {bindc_name = "a"} : (i64) -> !llvm.ptr
%4 = llvm.mlir.constant(2 : i32) : i32
llvm.store %4, %3 : i32, !llvm.ptr
%5 = omp.map.info var_ptr(%3 : !llvm.ptr, i32) map_clauses(to) capture(ByRef) -> !llvm.ptr {name = "a"}
omp.target map_entries(%5 -> %arg0 : !llvm.ptr) private(@simple_var.privatizer %1 -> %arg1 : !llvm.ptr) {
^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
%6 = llvm.mlir.constant(10 : i32) : i32
%7 = llvm.load %arg0 : !llvm.ptr -> i32
%8 = llvm.add %7, %6 : i32
llvm.store %8, %arg1 : i32, !llvm.ptr
omp.terminator
}
llvm.return
}
// CHECK: define internal void @__omp_offloading_fd00
// CHECK-NOT: define {{.*}}
// CHECK-DAG: %[[PRIV_ALLOC:.*]] = alloca i32, i64 1, align 4
// CHECK-DAG: %[[ADD:.*]] = add i32 {{.*}}, 10
// CHECK-DAG: store i32 %[[ADD]], ptr %[[PRIV_ALLOC]], align 4
Comment on lines +29 to +31
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: Avoid using CHECK-DAG when possible. They are more expensive than normal CHECKs and imply that there is really a DAG ordering happening.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I used CHECk-DAG because the def-use chain between the interesting instructions_ spans different basic blocks. As such then the test itself becomes subject to the order in which blocks are laid out. Isn't CHECK-DAG meant to address such situations?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is usually save to assume that the block order doesn't change. While the order is not relevant (apart from the entry block), it is deterministic.
In my experience, CHECK-DAG is mainly used for constructs that are very susceptible to order changes, like locations or complicated attributes.

Note that this is not necessarily a big issue, and if you assume that this order might indeed change at some point in the future, feel free to keep it as is.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just out of curiosity, what would be some examples of OpenMP constructs which might be susceptible to order changes and therefore require the more rigorous CHECK-DAG?


omp.private {type = private} @n.privatizer : !llvm.ptr alloc {
^bb0(%arg0: !llvm.ptr):
%0 = llvm.mlir.constant(1 : i64) : i64
%1 = llvm.alloca %0 x f32 {bindc_name = "n", pinned} : (i64) -> !llvm.ptr
omp.yield(%1 : !llvm.ptr)
}
llvm.func @target_map_2_privates() attributes {fir.internal_name = "_QPtarget_map_2_privates"} {
%0 = llvm.mlir.constant(1 : i64) : i64
%1 = llvm.alloca %0 x i32 {bindc_name = "simple_var"} : (i64) -> !llvm.ptr
%3 = llvm.alloca %0 x f32 {bindc_name = "n"} : (i64) -> !llvm.ptr
%5 = llvm.alloca %0 x i32 {bindc_name = "a"} : (i64) -> !llvm.ptr
%6 = llvm.mlir.constant(2 : i32) : i32
llvm.store %6, %5 : i32, !llvm.ptr
%7 = omp.map.info var_ptr(%5 : !llvm.ptr, i32) map_clauses(to) capture(ByRef) -> !llvm.ptr {name = "a"}
omp.target map_entries(%7 -> %arg0 : !llvm.ptr) private(@simple_var.privatizer %1 -> %arg1 : !llvm.ptr, @n.privatizer %3 -> %arg2 : !llvm.ptr) {
^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !llvm.ptr):
%8 = llvm.mlir.constant(1.100000e+01 : f32) : f32
%9 = llvm.mlir.constant(10 : i32) : i32
%10 = llvm.load %arg0 : !llvm.ptr -> i32
%11 = llvm.add %10, %9 : i32
llvm.store %11, %arg1 : i32, !llvm.ptr
%12 = llvm.load %arg1 : !llvm.ptr -> i32
%13 = llvm.sitofp %12 : i32 to f32
%14 = llvm.fadd %13, %8 {fastmathFlags = #llvm.fastmath<contract>} : f32
llvm.store %14, %arg2 : f32, !llvm.ptr
omp.terminator
}
llvm.return
}

// CHECK: define internal void @__omp_offloading_fd00
// CHECK-DAG: %[[PRIV_I32_ALLOC:.*]] = alloca i32, i64 1, align 4
// CHECK-DAG: %[[PRIV_FLOAT_ALLOC:.*]] = alloca float, i64 1, align 4
// CHECK-DAG: %[[ADD_I32:.*]] = add i32 {{.*}}, 10
// CHECK-DAG: store i32 %[[ADD_I32]], ptr %[[PRIV_I32_ALLOC]], align 4
// CHECK-DAG: %[[LOAD_I32_AGAIN:.*]] = load i32, ptr %6, align 4
// CHECK-DAG: %[[CAST_TO_FLOAT:.*]] = sitofp i32 %[[LOAD_I32_AGAIN]] to float
// CHECK-DAG: %[[ADD_FLOAT:.*]] = fadd contract float %[[CAST_TO_FLOAT]], 1.100000e+01
// CHECK-DAG: store float %[[ADD_FLOAT]], ptr %[[PRIV_FLOAT_ALLOC]], align 4

41 changes: 41 additions & 0 deletions offload/test/offloading/fortran/target-private-map.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
! Test target private
! REQUIRES: flang, amdgcn-amd-amdhsa
! UNSUPPORTED: nvptx64-nvidia-cuda
! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
! UNSUPPORTED: aarch64-unknown-linux-gnu
! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
! UNSUPPORTED: x86_64-pc-linux-gnu
! UNSUPPORTED: x86_64-pc-linux-gnu-LTO

! RUN: %libomptarget-compile-fortran-run-and-check-generic
program main
integer :: a = 0
call target_private(a)
print*, "======= FORTRAN Test passed! ======="
print*, "foo(a) should not return 20, got " , a
if (a /= 20) then
stop 0
else
stop 1
end if

! stop 0
end program main
subroutine target_private(r)
implicit none
integer, dimension(2) :: simple_vars
integer :: a, r
! set a to 5
a = 5
simple_vars(1) = a
simple_vars(2) = a
!$omp target map(tofrom: simple_vars) private(a)
! Without private(a), a would be firstprivate, meaning it's value would be 5
! with private(a), it's value would be uninitialized, which means it'd have
! a very small chance of being 5.
Comment on lines +34 to +35
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ultra nit: This case is UB, so anything can happen. Not sure if there would be another case that would not invoke UB in case of missing private support?

! So, simple_vars(1|2) should be 5 + a_private
simple_vars(1) = simple_vars(1) + a
simple_vars(2) = simple_vars(2) + a
!$omp end target
r = simple_vars(1) + simple_vars(2)
end subroutine target_private