@@ -319,4 +319,86 @@ extern "C" void acc_combined(int N) {
319319 // CHECK-NEXT: acc.yield
320320 // CHECK-NEXT: } loc
321321
322+ #pragma acc parallel loop gang
323+ for (unsigned I = 0 ; I < N; ++I);
324+ // CHECK-NEXT: acc.parallel combined(loop) {
325+ // CHECK-NEXT: acc.loop combined(parallel) gang {
326+ // CHECK: acc.yield
327+ // CHECK-NEXT: } loc
328+ // CHECK-NEXT: acc.yield
329+ // CHECK-NEXT: } loc
330+ #pragma acc parallel loop gang device_type(nvidia) gang
331+ for (unsigned I = 0 ; I < N; ++I);
332+ // CHECK-NEXT: acc.parallel combined(loop) {
333+ // CHECK-NEXT: acc.loop combined(parallel) gang([#acc.device_type<none>, #acc.device_type<nvidia>]) {
334+ // CHECK: acc.yield
335+ // CHECK-NEXT: } loc
336+ // CHECK-NEXT: acc.yield
337+ // CHECK-NEXT: } loc
338+ #pragma acc parallel loop gang(dim:1) device_type(nvidia) gang(dim:2)
339+ for (unsigned I = 0 ; I < N; ++I);
340+ // CHECK-NEXT: acc.parallel combined(loop) {
341+ // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
342+ // CHECK-NEXT: %[[TWO_CONST:.*]] = arith.constant 2 : i64
343+ // CHECK-NEXT: acc.loop combined(parallel) gang({dim=%[[ONE_CONST]] : i64}, {dim=%[[TWO_CONST]] : i64} [#acc.device_type<nvidia>]) {
344+ // CHECK: acc.yield
345+ // CHECK-NEXT: } loc
346+ // CHECK-NEXT: acc.yield
347+ // CHECK-NEXT: } loc
348+ #pragma acc parallel loop gang(static:N, dim: 1) device_type(nvidia, radeon) gang(static:*, dim : 2)
349+ for (unsigned I = 0 ; I < N; ++I);
350+ // CHECK-NEXT: acc.parallel combined(loop) {
351+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
352+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
353+ // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
354+ // CHECK-NEXT: %[[STAR_CONST:.*]] = arith.constant -1 : i64
355+ // CHECK-NEXT: %[[TWO_CONST:.*]] = arith.constant 2 : i64
356+ // CHECK-NEXT: acc.loop combined(parallel) gang({static=%[[N_CONV]] : si32, dim=%[[ONE_CONST]] : i64}, {static=%[[STAR_CONST]] : i64, dim=%[[TWO_CONST]] : i64} [#acc.device_type<nvidia>], {static=%[[STAR_CONST]] : i64, dim=%[[TWO_CONST]] : i64} [#acc.device_type<radeon>]) {
357+ // CHECK: acc.yield
358+ // CHECK-NEXT: } loc
359+ // CHECK-NEXT: acc.yield
360+ // CHECK-NEXT: } loc
361+
362+ #pragma acc kernels loop gang(num:N) device_type(nvidia, radeon) gang(num:N)
363+ for (unsigned I = 0 ; I < N; ++I);
364+ // CHECK-NEXT: acc.kernels combined(loop) {
365+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
366+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
367+ // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
368+ // CHECK-NEXT: %[[N_CONV2:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD2]] : !s32i to si32
369+ // CHECK-NEXT: acc.loop combined(kernels) gang({num=%[[N_CONV]] : si32}, {num=%[[N_CONV2]] : si32} [#acc.device_type<nvidia>], {num=%[[N_CONV2]] : si32} [#acc.device_type<radeon>]) {
370+ // CHECK: acc.yield
371+ // CHECK-NEXT: } loc
372+ // CHECK-NEXT: acc.terminator
373+ // CHECK-NEXT: } loc
374+ #pragma acc kernels loop gang(static:N) device_type(nvidia) gang(static:*)
375+ for (unsigned I = 0 ; I < N; ++I);
376+ // CHECK-NEXT: acc.kernels combined(loop) {
377+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
378+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
379+ // CHECK-NEXT: %[[STAR_CONST:.*]] = arith.constant -1 : i64
380+ // CHECK-NEXT: acc.loop combined(kernels) gang({static=%[[N_CONV]] : si32}, {static=%[[STAR_CONST]] : i64} [#acc.device_type<nvidia>]) {
381+ // CHECK: acc.yield
382+ // CHECK-NEXT: } loc
383+ // CHECK-NEXT: acc.terminator
384+ // CHECK-NEXT: } loc
385+ #pragma acc kernels loop gang(static:N, num: N + 1) device_type(nvidia) gang(static:*, num : N + 2)
386+ for (unsigned I = 0 ; I < N; ++I);
387+ // CHECK-NEXT: acc.kernels combined(loop) {
388+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
389+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
390+ // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
391+ // CHECK-NEXT: %[[CIR_ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
392+ // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD2]], %[[CIR_ONE_CONST]]) nsw : !s32i
393+ // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
394+ // CHECK-NEXT: %[[STAR_CONST:.*]] = arith.constant -1 : i64
395+ // CHECK-NEXT: %[[N_LOAD3:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
396+ // CHECK-NEXT: %[[CIR_TWO_CONST:.*]] = cir.const #cir.int<2> : !s32i
397+ // CHECK-NEXT: %[[N_PLUS_TWO:.*]] = cir.binop(add, %[[N_LOAD3]], %[[CIR_TWO_CONST]]) nsw : !s32i
398+ // CHECK-NEXT: %[[N_PLUS_TWO_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_TWO]] : !s32i to si32
399+ // CHECK-NEXT: acc.loop combined(kernels) gang({static=%[[N_CONV]] : si32, num=%[[N_PLUS_ONE_CONV]] : si32}, {static=%[[STAR_CONST]] : i64, num=%[[N_PLUS_TWO_CONV]] : si32} [#acc.device_type<nvidia>]) {
400+ // CHECK: acc.yield
401+ // CHECK-NEXT: } loc
402+ // CHECK-NEXT: acc.terminator
403+ // CHECK-NEXT: } loc
322404}
0 commit comments