@@ -401,4 +401,176 @@ extern "C" void acc_combined(int N) {
401401 // CHECK-NEXT: } loc
402402 // CHECK-NEXT: acc.terminator
403403 // CHECK-NEXT: } loc
404+
405+ #pragma acc kernels loop worker
406+ for (unsigned I = 0 ; I < N; ++I);
407+ // CHECK-NEXT: acc.kernels combined(loop) {
408+ // CHECK-NEXT: acc.loop combined(kernels) worker {
409+ // CHECK: acc.yield
410+ // CHECK-NEXT: } loc
411+ // CHECK: acc.terminator
412+ // CHECK-NEXT: } loc
413+
414+ #pragma acc kernels loop worker(N)
415+ for (unsigned I = 0 ; I < N; ++I);
416+ // CHECK-NEXT: acc.kernels combined(loop) {
417+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
418+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
419+ // CHECK-NEXT: acc.loop combined(kernels) worker(%[[N_CONV]] : si32) {
420+ // CHECK: acc.yield
421+ // CHECK-NEXT: } loc
422+ // CHECK: acc.terminator
423+ // CHECK-NEXT: } loc
424+
425+ #pragma acc kernels loop worker device_type(nvidia, radeon) worker
426+ for (unsigned I = 0 ; I < N; ++I);
427+ // CHECK-NEXT: acc.kernels combined(loop) {
428+ // CHECK-NEXT: acc.loop combined(kernels) worker([#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]) {
429+ // CHECK: acc.yield
430+ // CHECK-NEXT: } loc
431+ // CHECK: acc.terminator
432+ // CHECK-NEXT: } loc
433+
434+ #pragma acc kernels loop worker(N) device_type(nvidia, radeon) worker
435+ for (unsigned I = 0 ; I < N; ++I);
436+ // CHECK-NEXT: acc.kernels combined(loop) {
437+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
438+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
439+ // CHECK-NEXT: acc.loop combined(kernels) worker([#acc.device_type<nvidia>, #acc.device_type<radeon>], %[[N_CONV]] : si32) {
440+ // CHECK: acc.yield
441+ // CHECK-NEXT: } loc
442+ // CHECK: acc.terminator
443+ // CHECK-NEXT: } loc
444+
445+ #pragma acc kernels loop worker device_type(nvidia, radeon) worker(N)
446+ for (unsigned I = 0 ; I < N; ++I);
447+ // CHECK-NEXT: acc.kernels combined(loop) {
448+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
449+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
450+ // CHECK-NEXT: acc.loop combined(kernels) worker([#acc.device_type<none>], %[[N_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_CONV]] : si32 [#acc.device_type<radeon>]) {
451+ // CHECK: acc.yield
452+ // CHECK-NEXT: } loc
453+ // CHECK: acc.terminator
454+ // CHECK-NEXT: } loc
455+
456+ #pragma acc kernels loop worker(N) device_type(nvidia, radeon) worker(N + 1)
457+ for (unsigned I = 0 ; I < N; ++I);
458+ // CHECK-NEXT: acc.kernels combined(loop) {
459+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
460+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
461+ // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
462+ // CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
463+ // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD2]], %[[ONE_CONST]]) nsw : !s32i
464+ // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
465+ // CHECK-NEXT: acc.loop combined(kernels) worker(%[[N_CONV]] : si32, %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) {
466+ // CHECK: acc.yield
467+ // CHECK-NEXT: } loc
468+ // CHECK: acc.terminator
469+ // CHECK-NEXT: } loc
470+
471+ #pragma acc kernels loop device_type(nvidia, radeon) worker(num:N + 1)
472+ for (unsigned I = 0 ; I < N; ++I);
473+ // CHECK-NEXT: acc.kernels combined(loop) {
474+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
475+ // CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
476+ // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD]], %[[ONE_CONST]]) nsw : !s32i
477+ // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
478+ // CHECK-NEXT: acc.loop combined(kernels) worker(%[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) {
479+ // CHECK: acc.terminator
480+ // CHECK-NEXT: } loc
481+
482+
483+ #pragma acc kernels loop worker vector device_type(nvidia) worker vector
484+ for (unsigned I = 0 ; I < N; ++I);
485+ // CHECK-NEXT: acc.kernels combined(loop) {
486+ // CHECK-NEXT: acc.loop combined(kernels) worker([#acc.device_type<none>, #acc.device_type<nvidia>]) vector([#acc.device_type<none>, #acc.device_type<nvidia>])
487+ // CHECK: acc.yield
488+ // CHECK-NEXT: } loc
489+ // CHECK: acc.terminator
490+ // CHECK-NEXT: } loc
491+
492+ #pragma acc kernels loop vector
493+ for (unsigned I = 0 ; I < N; ++I);
494+ // CHECK-NEXT: acc.kernels combined(loop) {
495+ // CHECK: acc.loop combined(kernels) vector {
496+ // CHECK: acc.yield
497+ // CHECK-NEXT: } loc
498+ // CHECK: acc.terminator
499+ // CHECK-NEXT: } loc
500+
501+ #pragma acc kernels loop vector(N)
502+ for (unsigned I = 0 ; I < N; ++I);
503+ // CHECK-NEXT: acc.kernels combined(loop) {
504+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
505+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
506+ // CHECK-NEXT: acc.loop combined(kernels) vector(%[[N_CONV]] : si32) {
507+ // CHECK: acc.yield
508+ // CHECK-NEXT: } loc
509+ // CHECK: acc.terminator
510+ // CHECK-NEXT: } loc
511+
512+ #pragma acc kernels loop vector device_type(nvidia, radeon) vector
513+ for (unsigned I = 0 ; I < N; ++I);
514+ // CHECK-NEXT: acc.kernels combined(loop) {
515+ // CHECK-NEXT: acc.loop combined(kernels) vector([#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]) {
516+ // CHECK: acc.yield
517+ // CHECK-NEXT: } loc
518+ // CHECK: acc.terminator
519+ // CHECK-NEXT: } loc
520+
521+ #pragma acc kernels loop vector(N) device_type(nvidia, radeon) vector
522+ for (unsigned I = 0 ; I < N; ++I);
523+ // CHECK-NEXT: acc.kernels combined(loop) {
524+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
525+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
526+ // CHECK-NEXT: acc.loop combined(kernels) vector([#acc.device_type<nvidia>, #acc.device_type<radeon>], %[[N_CONV]] : si32) {
527+ // CHECK: acc.yield
528+ // CHECK-NEXT: } loc
529+ // CHECK: acc.terminator
530+ // CHECK-NEXT: } loc
531+
532+ #pragma acc kernels loop vector(N) device_type(nvidia, radeon) vector(N + 1)
533+ for (unsigned I = 0 ; I < N; ++I);
534+ // CHECK-NEXT: acc.kernels combined(loop) {
535+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
536+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
537+ // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
538+ // CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
539+ // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD2]], %[[ONE_CONST]]) nsw : !s32i
540+ // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
541+ // CHECK-NEXT: acc.loop combined(kernels) vector(%[[N_CONV]] : si32, %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) {
542+ // CHECK: acc.yield
543+ // CHECK-NEXT: } loc
544+ // CHECK: acc.terminator
545+ // CHECK-NEXT: } loc
546+
547+ #pragma acc kernels loop device_type(nvidia, radeon) vector(length:N + 1)
548+ for (unsigned I = 0 ; I < N; ++I);
549+ // CHECK-NEXT: acc.kernels combined(loop) {
550+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
551+ // CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
552+ // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD]], %[[ONE_CONST]]) nsw : !s32i
553+ // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
554+ // CHECK-NEXT: acc.loop combined(kernels) vector(%[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) {
555+ // CHECK: acc.yield
556+ // CHECK-NEXT: } loc
557+ // CHECK: acc.terminator
558+ // CHECK-NEXT: } loc
559+
560+ #pragma acc kernels loop worker(N) vector(N) device_type(nvidia) worker(N) vector(N)
561+ for (unsigned I = 0 ; I < N; ++I);
562+ // CHECK-NEXT: acc.kernels combined(loop) {
563+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
564+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
565+ // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
566+ // CHECK-NEXT: %[[N_CONV2:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD2]] : !s32i to si32
567+ // CHECK-NEXT: %[[N_LOAD3:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
568+ // CHECK-NEXT: %[[N_CONV3:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD3]] : !s32i to si32
569+ // CHECK-NEXT: %[[N_LOAD4:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
570+ // CHECK-NEXT: %[[N_CONV4:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD4]] : !s32i to si32
571+ // CHECK-NEXT: acc.loop combined(kernels) worker(%[[N_CONV]] : si32, %[[N_CONV3]] : si32 [#acc.device_type<nvidia>]) vector(%[[N_CONV2]] : si32, %[[N_CONV4]] : si32 [#acc.device_type<nvidia>]) {
572+ // CHECK: acc.yield
573+ // CHECK-NEXT: } loc
574+ // CHECK: acc.terminator
575+ // CHECK-NEXT: } loc
404576}
0 commit comments