Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions clang/lib/CIR/CodeGen/CIRGenFunction.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@ namespace {
class ScalarExprEmitter;
} // namespace

namespace mlir {
namespace acc {
class LoopOp;
} // namespace acc
} // namespace mlir

namespace clang::CIRGen {

class CIRGenFunction : public CIRGenTypeCache {
Expand Down Expand Up @@ -1082,6 +1088,12 @@ class CIRGenFunction : public CIRGenTypeCache {
OpenACCDirectiveKind dirKind, SourceLocation dirLoc,
ArrayRef<const OpenACCClause *> clauses);

// The OpenACC LoopOp requires that we have auto, seq, or independent on all
// LoopOp operations for the 'none' device type case. This function checks if
// the LoopOp has one, else it updates it to have one.
void updateLoopOpParallelism(mlir::acc::LoopOp &op, bool isOrphan,
OpenACCDirectiveKind dk);

public:
mlir::LogicalResult
emitOpenACCComputeConstruct(const OpenACCComputeConstruct &s);
Expand Down
2 changes: 2 additions & 0 deletions clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ mlir::LogicalResult CIRGenFunction::emitOpenACCOpCombinedConstruct(

emitOpenACCClauses(computeOp, loopOp, dirKind, dirLoc, clauses);

updateLoopOpParallelism(loopOp, /*isOrphan=*/false, dirKind);

builder.create<TermOp>(end);
}

Expand Down
60 changes: 60 additions & 0 deletions clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,63 @@ using namespace clang::CIRGen;
using namespace cir;
using namespace mlir::acc;

void CIRGenFunction::updateLoopOpParallelism(mlir::acc::LoopOp &op,
bool isOrphan,
OpenACCDirectiveKind dk) {
// Check that at least one of auto, independent, or seq is present
// for the device-independent default clauses.
auto hasDeviceNone = [](mlir::acc::DeviceTypeAttr attr) -> bool {
return attr.getValue() == mlir::acc::DeviceType::None;
};
bool hasDefaultSeq =
op.getSeqAttr()
? llvm::any_of(
op.getSeqAttr().getAsRange<mlir::acc::DeviceTypeAttr>(),
hasDeviceNone)
: false;
bool hasDefaultIndependent =
op.getIndependentAttr()
? llvm::any_of(
op.getIndependentAttr().getAsRange<mlir::acc::DeviceTypeAttr>(),
hasDeviceNone)
: false;
bool hasDefaultAuto =
op.getAuto_Attr()
? llvm::any_of(
op.getAuto_Attr().getAsRange<mlir::acc::DeviceTypeAttr>(),
hasDeviceNone)
: false;

if (hasDefaultSeq || hasDefaultIndependent || hasDefaultAuto)
return;

// Orphan or parallel results in 'independent'.
if (isOrphan || dk == OpenACCDirectiveKind::Parallel ||
dk == OpenACCDirectiveKind::ParallelLoop) {
op.addIndependent(builder.getContext(), {});
return;
}

// Kernels always results in 'auto'.
if (dk == OpenACCDirectiveKind::Kernels ||
dk == OpenACCDirectiveKind::KernelsLoop) {
op.addAuto(builder.getContext(), {});
return;
}

// Serial should use 'seq' unless there is a gang, worker, or vector clause,
// in which case, it should use 'auto'.
assert(dk == OpenACCDirectiveKind::Serial ||
dk == OpenACCDirectiveKind::SerialLoop);

if (op.getWorkerAttr() || op.getVectorAttr() || op.getGangAttr()) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is an acc dialect problem - but basically we encode in a different fields whether a loop has gang or gang(value) or gang(dim:). The more complete check is:

    bool hasDefaultGangWorkerOrVector =
        loopOp.hasVector() || loopOp.getVectorValue() || loopOp.hasWorker() ||
        loopOp.getWorkerValue() || loopOp.hasGang() ||
        loopOp.getGangValue(mlir::acc::GangArgType::Num) ||
        loopOp.getGangValue(mlir::acc::GangArgType::Dim) ||
        loopOp.getGangValue(mlir::acc::GangArgType::Static);

And probably it should be in a utility in acc dialect itself.

op.addAuto(builder.getContext(), {});
return;
}

op.addSeq(builder.getContext(), {});
}

mlir::LogicalResult
CIRGenFunction::emitOpenACCLoopConstruct(const OpenACCLoopConstruct &s) {
mlir::Location start = getLoc(s.getSourceRange().getBegin());
Expand Down Expand Up @@ -90,6 +147,9 @@ CIRGenFunction::emitOpenACCLoopConstruct(const OpenACCLoopConstruct &s) {
emitOpenACCClauses(op, s.getDirectiveKind(), s.getDirectiveLoc(),
s.clauses());

updateLoopOpParallelism(op, s.isOrphanedLoopConstruct(),
s.getParentComputeConstructKind());

mlir::LogicalResult stmtRes = mlir::success();
// Emit body.
{
Expand Down
69 changes: 62 additions & 7 deletions clang/test/CIR/CodeGenOpenACC/combined.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ extern "C" void acc_combined(int N, int cond) {
// CHECK: acc.serial combined(loop) {
// CHECK: acc.loop combined(serial) {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {seq = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
// CHECK-NEXT: } attributes {seq = [#acc.device_type<nvidia>, #acc.device_type<radeon>, #acc.device_type<none>]} loc
// CHECK: acc.yield
// CHECK-NEXT: } loc
#pragma acc kernels loop seq device_type(nvidia, radeon)
Expand All @@ -99,7 +99,7 @@ extern "C" void acc_combined(int N, int cond) {
// CHECK: acc.serial combined(loop) {
// CHECK: acc.loop combined(serial) {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<nvidia>, #acc.device_type<radeon>], seq = [#acc.device_type<none>]} loc
// CHECK: acc.yield
// CHECK-NEXT: } loc
#pragma acc kernels loop auto device_type(nvidia, radeon)
Expand All @@ -124,7 +124,7 @@ extern "C" void acc_combined(int N, int cond) {
// CHECK: acc.serial combined(loop) {
// CHECK: acc.loop combined(serial) {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {independent = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
// CHECK-NEXT: } attributes {independent = [#acc.device_type<nvidia>, #acc.device_type<radeon>], seq = [#acc.device_type<none>]} loc
// CHECK: acc.yield
// CHECK-NEXT: } loc
#pragma acc kernels loop independent device_type(nvidia, radeon)
Expand All @@ -143,7 +143,7 @@ extern "C" void acc_combined(int N, int cond) {
// CHECK: acc.parallel combined(loop) {
// CHECK: acc.loop combined(parallel) {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type<none>]}
// CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type<none>], independent = [#acc.device_type<none>]}
// CHECK: acc.yield
// CHECK-NEXT: } loc

Expand All @@ -154,7 +154,7 @@ extern "C" void acc_combined(int N, int cond) {
// CHECK: acc.serial combined(loop) {
// CHECK: acc.loop combined(serial) {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>]}
// CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>], seq = [#acc.device_type<none>]}
// CHECK: acc.yield
// CHECK-NEXT: } loc

Expand All @@ -165,7 +165,7 @@ extern "C" void acc_combined(int N, int cond) {
// CHECK: acc.kernels combined(loop) {
// CHECK: acc.loop combined(kernels) {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>]}
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>], collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>]}
// CHECK: acc.terminator
// CHECK-NEXT: } loc
#pragma acc parallel loop collapse(1) device_type(radeon, nvidia) collapse(2) device_type(host) collapse(3)
Expand All @@ -175,7 +175,7 @@ extern "C" void acc_combined(int N, int cond) {
// CHECK: acc.parallel combined(loop) {
// CHECK: acc.loop combined(parallel) {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>, #acc.device_type<host>]}
// CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>, #acc.device_type<host>], independent = [#acc.device_type<none>]}
// CHECK: acc.yield
// CHECK-NEXT: } loc

Expand Down Expand Up @@ -1184,4 +1184,59 @@ extern "C" void acc_combined_data_clauses(int *arg1, int *arg2) {
// CHECK-NEXT: } loc
// CHECK-NEXT: acc.detach accPtr(%[[ATTACH2]] : !cir.ptr<!cir.ptr<!s32i>>) async([#acc.device_type<host>]) {dataClause = #acc<data_clause acc_attach>, name = "arg2"}
// CHECK-NEXT: acc.detach accPtr(%[[ATTACH1]] : !cir.ptr<!cir.ptr<!s32i>>) async([#acc.device_type<host>]) {dataClause = #acc<data_clause acc_attach>, name = "arg1"}

// Checking the automatic-addition of parallelism clauses.
#pragma acc parallel loop
for(unsigned I = 0; I < 5; ++I);
// CHECK-NEXT: acc.parallel combined(loop) {
// CHECK-NEXT: acc.loop combined(parallel) {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {independent = [#acc.device_type<none>]} loc
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } loc

#pragma acc kernels loop
for(unsigned I = 0; I < 5; ++I);
// CHECK-NEXT: acc.kernels combined(loop) {
// CHECK-NEXT: acc.loop combined(kernels) {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
// CHECK-NEXT: acc.terminator
// CHECK-NEXT: } loc

#pragma acc serial loop
for(unsigned I = 0; I < 5; ++I);
// CHECK-NEXT: acc.serial combined(loop) {
// CHECK-NEXT: acc.loop combined(serial) {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {seq = [#acc.device_type<none>]} loc
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } loc

#pragma acc serial loop worker
for(unsigned I = 0; I < 5; ++I);
// CHECK-NEXT: acc.serial combined(loop) {
// CHECK-NEXT: acc.loop combined(serial) worker {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } loc

#pragma acc serial loop vector
for(unsigned I = 0; I < 5; ++I);
// CHECK-NEXT: acc.serial combined(loop) {
// CHECK-NEXT: acc.loop combined(serial) vector {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } loc

#pragma acc serial loop gang
for(unsigned I = 0; I < 5; ++I);
// CHECK-NEXT: acc.serial combined(loop) {
// CHECK-NEXT: acc.loop combined(serial) gang {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } loc
}
101 changes: 91 additions & 10 deletions clang/test/CIR/CodeGenOpenACC/loop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,12 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
for(unsigned I = 0; I < N; ++I);
// CHECK: acc.loop {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {seq = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
// CHECK-NEXT: } attributes {independent = [#acc.device_type<none>], seq = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
#pragma acc loop device_type(radeon) seq
for(unsigned I = 0; I < N; ++I);
// CHECK: acc.loop {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {seq = [#acc.device_type<radeon>]} loc
// CHECK-NEXT: } attributes {independent = [#acc.device_type<none>], seq = [#acc.device_type<radeon>]} loc
#pragma acc loop seq device_type(nvidia, radeon)
for(unsigned I = 0; I < N; ++I);
// CHECK: acc.loop {
Expand All @@ -67,12 +67,12 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
for(unsigned I = 0; I < N; ++I);
// CHECK: acc.loop {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {independent = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
// CHECK-NEXT: } attributes {independent = [#acc.device_type<nvidia>, #acc.device_type<radeon>, #acc.device_type<none>]} loc
#pragma acc loop device_type(radeon) independent
for(unsigned I = 0; I < N; ++I);
// CHECK: acc.loop {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {independent = [#acc.device_type<radeon>]} loc
// CHECK-NEXT: } attributes {independent = [#acc.device_type<radeon>, #acc.device_type<none>]} loc
#pragma acc loop independent device_type(nvidia, radeon)
for(unsigned I = 0; I < N; ++I);
// CHECK: acc.loop {
Expand All @@ -93,12 +93,12 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
for(unsigned I = 0; I < N; ++I);
// CHECK: acc.loop {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<nvidia>, #acc.device_type<radeon>]} loc
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<nvidia>, #acc.device_type<radeon>], independent = [#acc.device_type<none>]} loc
#pragma acc loop device_type(radeon) auto
for(unsigned I = 0; I < N; ++I);
// CHECK: acc.loop {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<radeon>]} loc
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<radeon>], independent = [#acc.device_type<none>]} loc
#pragma acc loop auto device_type(nvidia, radeon)
for(unsigned I = 0; I < N; ++I);
// CHECK: acc.loop {
Expand All @@ -116,30 +116,30 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
for(unsigned K = 0; K < N; ++K);
// CHECK: acc.loop {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type<none>]}
// CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type<none>], independent = [#acc.device_type<none>]}

#pragma acc loop collapse(1) device_type(radeon) collapse (2)
for(unsigned I = 0; I < N; ++I)
for(unsigned J = 0; J < N; ++J)
for(unsigned K = 0; K < N; ++K);
// CHECK: acc.loop {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>]}
// CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>], independent = [#acc.device_type<none>]}

#pragma acc loop collapse(1) device_type(radeon, nvidia) collapse (2)
for(unsigned I = 0; I < N; ++I)
for(unsigned J = 0; J < N; ++J)
for(unsigned K = 0; K < N; ++K);
// CHECK: acc.loop {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>]}
// CHECK-NEXT: } attributes {collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>], independent = [#acc.device_type<none>]}
#pragma acc loop collapse(1) device_type(radeon, nvidia) collapse(2) device_type(host) collapse(3)
for(unsigned I = 0; I < N; ++I)
for(unsigned J = 0; J < N; ++J)
for(unsigned K = 0; K < N; ++K);
// CHECK: acc.loop {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>, #acc.device_type<host>]}
// CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type<none>, #acc.device_type<radeon>, #acc.device_type<nvidia>, #acc.device_type<host>], independent = [#acc.device_type<none>]}

#pragma acc loop tile(1, 2, 3)
for(unsigned I = 0; I < N; ++I)
Expand Down Expand Up @@ -392,4 +392,85 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
// CHECK: acc.yield
// CHECK-NEXT: } loc
}
// CHECK-NEXT: acc.terminator
// CHECK-NEXT: } loc

// Checking the automatic-addition of parallelism clauses.
#pragma acc loop
for(unsigned I = 0; I < N; ++I);
// CHECK-NEXT: acc.loop {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {independent = [#acc.device_type<none>]} loc

#pragma acc parallel
{
// CHECK-NEXT: acc.parallel {
#pragma acc loop
for(unsigned I = 0; I < N; ++I);
// CHECK-NEXT: acc.loop {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {independent = [#acc.device_type<none>]} loc
}
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } loc

#pragma acc kernels
{
// CHECK-NEXT: acc.kernels {
#pragma acc loop
for(unsigned I = 0; I < N; ++I);
// CHECK-NEXT: acc.loop {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
}
// CHECK-NEXT: acc.terminator
// CHECK-NEXT: } loc

#pragma acc serial
{
// CHECK-NEXT: acc.serial {
#pragma acc loop
for(unsigned I = 0; I < N; ++I);
// CHECK-NEXT: acc.loop {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {seq = [#acc.device_type<none>]} loc
}
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } loc

#pragma acc serial
{
// CHECK-NEXT: acc.serial {
#pragma acc loop worker
for(unsigned I = 0; I < N; ++I);
// CHECK-NEXT: acc.loop worker {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
}
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } loc

#pragma acc serial
{
// CHECK-NEXT: acc.serial {
#pragma acc loop vector
for(unsigned I = 0; I < N; ++I);
// CHECK-NEXT: acc.loop vector {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
}
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } loc

#pragma acc serial
{
// CHECK-NEXT: acc.serial {
#pragma acc loop gang
for(unsigned I = 0; I < N; ++I);
// CHECK-NEXT: acc.loop gang {
// CHECK: acc.yield
// CHECK-NEXT: } attributes {auto_ = [#acc.device_type<none>]} loc
}
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } loc
}
Loading