Skip to content

Commit 31fd77a

Browse files
committed
[OpenACC][CIR] worker/vector lowering for combined constructs
Another set of 2 line changes, but makes sure to add sufficient testing.
1 parent df78e28 commit 31fd77a

File tree

2 files changed

+176
-0
lines changed

2 files changed

+176
-0
lines changed

clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -465,6 +465,8 @@ class OpenACCClauseCIREmitter final
465465
else
466466
operation.addEmptyWorker(builder.getContext(), lastDeviceTypeValues);
467467

468+
} else if constexpr (isCombinedType<OpTy>) {
469+
applyToLoopOp(clause);
468470
} else {
469471
// TODO: When we've implemented this for everything, switch this to an
470472
// unreachable. Combined constructs remain.
@@ -481,6 +483,8 @@ class OpenACCClauseCIREmitter final
481483
else
482484
operation.addEmptyVector(builder.getContext(), lastDeviceTypeValues);
483485

486+
} else if constexpr (isCombinedType<OpTy>) {
487+
applyToLoopOp(clause);
484488
} else {
485489
// TODO: When we've implemented this for everything, switch this to an
486490
// unreachable. Combined constructs remain.

clang/test/CIR/CodeGenOpenACC/combined.cpp

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -401,4 +401,176 @@ extern "C" void acc_combined(int N) {
401401
// CHECK-NEXT: } loc
402402
// CHECK-NEXT: acc.terminator
403403
// CHECK-NEXT: } loc
404+
405+
#pragma acc kernels loop worker
406+
for(unsigned I = 0; I < N; ++I);
407+
// CHECK-NEXT: acc.kernels combined(loop) {
408+
// CHECK-NEXT: acc.loop combined(kernels) worker {
409+
// CHECK: acc.yield
410+
// CHECK-NEXT: } loc
411+
// CHECK: acc.terminator
412+
// CHECK-NEXT: } loc
413+
414+
#pragma acc kernels loop worker(N)
415+
for(unsigned I = 0; I < N; ++I);
416+
// CHECK-NEXT: acc.kernels combined(loop) {
417+
// CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
418+
// CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
419+
// CHECK-NEXT: acc.loop combined(kernels) worker(%[[N_CONV]] : si32) {
420+
// CHECK: acc.yield
421+
// CHECK-NEXT: } loc
422+
// CHECK: acc.terminator
423+
// CHECK-NEXT: } loc
424+
425+
#pragma acc kernels loop worker device_type(nvidia, radeon) worker
426+
for(unsigned I = 0; I < N; ++I);
427+
// CHECK-NEXT: acc.kernels combined(loop) {
428+
// CHECK-NEXT: acc.loop combined(kernels) worker([#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]) {
429+
// CHECK: acc.yield
430+
// CHECK-NEXT: } loc
431+
// CHECK: acc.terminator
432+
// CHECK-NEXT: } loc
433+
434+
#pragma acc kernels loop worker(N) device_type(nvidia, radeon) worker
435+
for(unsigned I = 0; I < N; ++I);
436+
// CHECK-NEXT: acc.kernels combined(loop) {
437+
// CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
438+
// CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
439+
// CHECK-NEXT: acc.loop combined(kernels) worker([#acc.device_type<nvidia>, #acc.device_type<radeon>], %[[N_CONV]] : si32) {
440+
// CHECK: acc.yield
441+
// CHECK-NEXT: } loc
442+
// CHECK: acc.terminator
443+
// CHECK-NEXT: } loc
444+
445+
#pragma acc kernels loop worker device_type(nvidia, radeon) worker(N)
446+
for(unsigned I = 0; I < N; ++I);
447+
// CHECK-NEXT: acc.kernels combined(loop) {
448+
// CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
449+
// CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
450+
// CHECK-NEXT: acc.loop combined(kernels) worker([#acc.device_type<none>], %[[N_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_CONV]] : si32 [#acc.device_type<radeon>]) {
451+
// CHECK: acc.yield
452+
// CHECK-NEXT: } loc
453+
// CHECK: acc.terminator
454+
// CHECK-NEXT: } loc
455+
456+
#pragma acc kernels loop worker(N) device_type(nvidia, radeon) worker(N + 1)
457+
for(unsigned I = 0; I < N; ++I);
458+
// CHECK-NEXT: acc.kernels combined(loop) {
459+
// CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
460+
// CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
461+
// CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
462+
// CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
463+
// CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD2]], %[[ONE_CONST]]) nsw : !s32i
464+
// CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
465+
// CHECK-NEXT: acc.loop combined(kernels) worker(%[[N_CONV]] : si32, %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) {
466+
// CHECK: acc.yield
467+
// CHECK-NEXT: } loc
468+
// CHECK: acc.terminator
469+
// CHECK-NEXT: } loc
470+
471+
#pragma acc kernels loop device_type(nvidia, radeon) worker(num:N + 1)
472+
for(unsigned I = 0; I < N; ++I);
473+
// CHECK-NEXT: acc.kernels combined(loop) {
474+
// CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
475+
// CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
476+
// CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD]], %[[ONE_CONST]]) nsw : !s32i
477+
// CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
478+
// CHECK-NEXT: acc.loop combined(kernels) worker(%[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) {
479+
// CHECK: acc.terminator
480+
// CHECK-NEXT: } loc
481+
482+
483+
#pragma acc kernels loop worker vector device_type(nvidia) worker vector
484+
for(unsigned I = 0; I < N; ++I);
485+
// CHECK-NEXT: acc.kernels combined(loop) {
486+
// CHECK-NEXT: acc.loop combined(kernels) worker([#acc.device_type<none>, #acc.device_type<nvidia>]) vector([#acc.device_type<none>, #acc.device_type<nvidia>])
487+
// CHECK: acc.yield
488+
// CHECK-NEXT: } loc
489+
// CHECK: acc.terminator
490+
// CHECK-NEXT: } loc
491+
492+
#pragma acc kernels loop vector
493+
for(unsigned I = 0; I < N; ++I);
494+
// CHECK-NEXT: acc.kernels combined(loop) {
495+
// CHECK: acc.loop combined(kernels) vector {
496+
// CHECK: acc.yield
497+
// CHECK-NEXT: } loc
498+
// CHECK: acc.terminator
499+
// CHECK-NEXT: } loc
500+
501+
#pragma acc kernels loop vector(N)
502+
for(unsigned I = 0; I < N; ++I);
503+
// CHECK-NEXT: acc.kernels combined(loop) {
504+
// CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
505+
// CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
506+
// CHECK-NEXT: acc.loop combined(kernels) vector(%[[N_CONV]] : si32) {
507+
// CHECK: acc.yield
508+
// CHECK-NEXT: } loc
509+
// CHECK: acc.terminator
510+
// CHECK-NEXT: } loc
511+
512+
#pragma acc kernels loop vector device_type(nvidia, radeon) vector
513+
for(unsigned I = 0; I < N; ++I);
514+
// CHECK-NEXT: acc.kernels combined(loop) {
515+
// CHECK-NEXT: acc.loop combined(kernels) vector([#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]) {
516+
// CHECK: acc.yield
517+
// CHECK-NEXT: } loc
518+
// CHECK: acc.terminator
519+
// CHECK-NEXT: } loc
520+
521+
#pragma acc kernels loop vector(N) device_type(nvidia, radeon) vector
522+
for(unsigned I = 0; I < N; ++I);
523+
// CHECK-NEXT: acc.kernels combined(loop) {
524+
// CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
525+
// CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
526+
// CHECK-NEXT: acc.loop combined(kernels) vector([#acc.device_type<nvidia>, #acc.device_type<radeon>], %[[N_CONV]] : si32) {
527+
// CHECK: acc.yield
528+
// CHECK-NEXT: } loc
529+
// CHECK: acc.terminator
530+
// CHECK-NEXT: } loc
531+
532+
#pragma acc kernels loop vector(N) device_type(nvidia, radeon) vector(N + 1)
533+
for(unsigned I = 0; I < N; ++I);
534+
// CHECK-NEXT: acc.kernels combined(loop) {
535+
// CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
536+
// CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
537+
// CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
538+
// CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
539+
// CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD2]], %[[ONE_CONST]]) nsw : !s32i
540+
// CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
541+
// CHECK-NEXT: acc.loop combined(kernels) vector(%[[N_CONV]] : si32, %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) {
542+
// CHECK: acc.yield
543+
// CHECK-NEXT: } loc
544+
// CHECK: acc.terminator
545+
// CHECK-NEXT: } loc
546+
547+
#pragma acc kernels loop device_type(nvidia, radeon) vector(length:N + 1)
548+
for(unsigned I = 0; I < N; ++I);
549+
// CHECK-NEXT: acc.kernels combined(loop) {
550+
// CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
551+
// CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
552+
// CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD]], %[[ONE_CONST]]) nsw : !s32i
553+
// CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
554+
// CHECK-NEXT: acc.loop combined(kernels) vector(%[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) {
555+
// CHECK: acc.yield
556+
// CHECK-NEXT: } loc
557+
// CHECK: acc.terminator
558+
// CHECK-NEXT: } loc
559+
560+
#pragma acc kernels loop worker(N) vector(N) device_type(nvidia) worker(N) vector(N)
561+
for(unsigned I = 0; I < N; ++I);
562+
// CHECK-NEXT: acc.kernels combined(loop) {
563+
// CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
564+
// CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
565+
// CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
566+
// CHECK-NEXT: %[[N_CONV2:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD2]] : !s32i to si32
567+
// CHECK-NEXT: %[[N_LOAD3:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
568+
// CHECK-NEXT: %[[N_CONV3:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD3]] : !s32i to si32
569+
// CHECK-NEXT: %[[N_LOAD4:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
570+
// CHECK-NEXT: %[[N_CONV4:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD4]] : !s32i to si32
571+
// CHECK-NEXT: acc.loop combined(kernels) worker(%[[N_CONV]] : si32, %[[N_CONV3]] : si32 [#acc.device_type<nvidia>]) vector(%[[N_CONV2]] : si32, %[[N_CONV4]] : si32 [#acc.device_type<nvidia>]) {
572+
// CHECK: acc.yield
573+
// CHECK-NEXT: } loc
574+
// CHECK: acc.terminator
575+
// CHECK-NEXT: } loc
404576
}

0 commit comments

Comments
 (0)