Skip to content

Commit 716062d

Browse files
committed
[OpenACC][CIR] Lowering for vector_length on combined constructs
Another simple one, added tests and implemented, just like num_gangs and num_workers.
1 parent 97a58b0 commit 716062d

File tree

2 files changed

+71
-5
lines changed

2 files changed

+71
-5
lines changed

clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -242,12 +242,10 @@ class OpenACCClauseCIREmitter final
242242
operation.addVectorLengthOperand(builder.getContext(),
243243
createIntExpr(clause.getIntExpr()),
244244
lastDeviceTypeValues);
245-
} else if constexpr (isOneOfTypes<OpTy, mlir::acc::SerialOp>) {
246-
llvm_unreachable("vector_length not valid on serial");
245+
} else if constexpr (isCombinedType<OpTy>) {
246+
applyToComputeOp(clause);
247247
} else {
248-
// TODO: When we've implemented this for everything, switch this to an
249-
// unreachable. Combined constructs remain.
250-
return clauseNotImplemented(clause);
248+
llvm_unreachable("Unknown construct kind in VisitVectorLengthClause");
251249
}
252250
}
253251

clang/test/CIR/CodeGenOpenACC/combined.cpp

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -879,4 +879,72 @@ extern "C" void acc_combined(int N, int cond) {
879879
// CHECK-NEXT: } loc
880880
// CHECK-NEXT: acc.yield
881881
// CHECK-NEXT: } loc
882+
//
883+
#pragma acc parallel loop vector_length(cond)
884+
for(unsigned I = 0; I < N; ++I);
885+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
886+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
887+
// CHECK-NEXT: acc.parallel combined(loop) vector_length(%[[CONV_CAST]] : si32) {
888+
// CHECK-NEXT: acc.loop combined(parallel) {
889+
// CHECK: acc.yield
890+
// CHECK-NEXT: } loc
891+
// CHECK-NEXT: acc.yield
892+
// CHECK-NEXT: } loc
893+
894+
#pragma acc kernels loop vector_length(cond) device_type(nvidia) vector_length(2u)
895+
for(unsigned I = 0; I < N; ++I);
896+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
897+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
898+
// CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !u32i
899+
// CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !u32i to ui32
900+
// CHECK-NEXT: acc.kernels combined(loop) vector_length(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : ui32 [#acc.device_type<nvidia>]) {
901+
// CHECK-NEXT: acc.loop combined(kernels) {
902+
// CHECK: acc.yield
903+
// CHECK-NEXT: } loc
904+
// CHECK-NEXT: acc.terminator
905+
// CHECK-NEXT: } loc
906+
907+
#pragma acc parallel loop vector_length(cond) device_type(nvidia, host) vector_length(2) device_type(radeon) vector_length(3)
908+
for(unsigned I = 0; I < N; ++I);
909+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
910+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
911+
// CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
912+
// CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
913+
// CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
914+
// CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
915+
// CHECK-NEXT: acc.parallel combined(loop) vector_length(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[TWO_CAST]] : si32 [#acc.device_type<host>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>]) {
916+
// CHECK-NEXT: acc.loop combined(parallel) {
917+
// CHECK: acc.yield
918+
// CHECK-NEXT: } loc
919+
// CHECK-NEXT: acc.yield
920+
// CHECK-NEXT: } loc
921+
922+
#pragma acc kernels loop vector_length(cond) device_type(nvidia) vector_length(2) device_type(radeon, multicore) vector_length(4)
923+
for(unsigned I = 0; I < N; ++I);
924+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
925+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
926+
// CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
927+
// CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
928+
// CHECK-NEXT: %[[FOUR_LITERAL:.*]] = cir.const #cir.int<4> : !s32i
929+
// CHECK-NEXT: %[[FOUR_CAST:.*]] = builtin.unrealized_conversion_cast %[[FOUR_LITERAL]] : !s32i to si32
930+
// CHECK-NEXT: acc.kernels combined(loop) vector_length(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[FOUR_CAST]] : si32 [#acc.device_type<radeon>], %[[FOUR_CAST]] : si32 [#acc.device_type<multicore>]) {
931+
// CHECK-NEXT: acc.loop combined(kernels) {
932+
// CHECK: acc.yield
933+
// CHECK-NEXT: } loc
934+
// CHECK-NEXT: acc.terminator
935+
// CHECK-NEXT: } loc
936+
937+
#pragma acc parallel loop device_type(nvidia) vector_length(2) device_type(radeon) vector_length(3)
938+
for(unsigned I = 0; I < N; ++I);
939+
// CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
940+
// CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
941+
// CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
942+
// CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
943+
// CHECK-NEXT: acc.parallel combined(loop) vector_length(%[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>]) {
944+
// CHECK-NEXT: acc.loop combined(parallel) {
945+
// CHECK: acc.yield
946+
// CHECK-NEXT: } loc
947+
// CHECK-NEXT: acc.yield
948+
// CHECK-NEXT: } loc
949+
882950
}

0 commit comments

Comments
 (0)