@@ -323,4 +323,73 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
323323 // CHECK: acc.yield
324324 // CHECK-NEXT: } loc
325325 }
326+
327+ #pragma acc parallel
328+ // CHECK: acc.parallel {
329+ {
330+ #pragma acc loop gang
331+ for (unsigned I = 0 ; I < N; ++I);
332+ // CHECK-NEXT: acc.loop gang {
333+ // CHECK: acc.yield
334+ // CHECK-NEXT: } loc
335+ #pragma acc loop gang device_type(nvidia) gang
336+ for (unsigned I = 0 ; I < N; ++I);
337+ // CHECK-NEXT: acc.loop gang([#acc.device_type<none>, #acc.device_type<nvidia>]) {
338+ // CHECK: acc.yield
339+ // CHECK-NEXT: } loc
340+ #pragma acc loop gang(dim:1) device_type(nvidia) gang(dim:2)
341+ for (unsigned I = 0 ; I < N; ++I);
342+ // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
343+ // CHECK-NEXT: %[[TWO_CONST:.*]] = arith.constant 2 : i64
344+ // CHECK-NEXT: acc.loop gang({dim=%[[ONE_CONST]] : i64}, {dim=%[[TWO_CONST]] : i64} [#acc.device_type<nvidia>]) {
345+ // CHECK: acc.yield
346+ // CHECK-NEXT: } loc
347+ #pragma acc loop gang(static:N, dim: 1) device_type(nvidia, radeon) gang(static:*, dim : 2)
348+ for (unsigned I = 0 ; I < N; ++I);
349+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
350+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
351+ // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 : i64
352+ // CHECK-NEXT: %[[STAR_CONST:.*]] = arith.constant -1 : i64
353+ // CHECK-NEXT: %[[TWO_CONST:.*]] = arith.constant 2 : i64
354+ // CHECK-NEXT: acc.loop gang({static=%[[N_CONV]] : si32, dim=%[[ONE_CONST]] : i64}, {static=%[[STAR_CONST]] : i64, dim=%[[TWO_CONST]] : i64} [#acc.device_type<nvidia>], {static=%[[STAR_CONST]] : i64, dim=%[[TWO_CONST]] : i64} [#acc.device_type<radeon>]) {
355+ // CHECK: acc.yield
356+ // CHECK-NEXT: } loc
357+ }
358+ #pragma acc kernels
359+ // CHECK: acc.kernels {
360+ {
361+ #pragma acc loop gang(num:N) device_type(nvidia, radeon) gang(num:N)
362+ for (unsigned I = 0 ; I < N; ++I);
363+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
364+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
365+ // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
366+ // CHECK-NEXT: %[[N_CONV2:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD2]] : !s32i to si32
367+ // CHECK-NEXT: acc.loop gang({num=%[[N_CONV]] : si32}, {num=%[[N_CONV2]] : si32} [#acc.device_type<nvidia>], {num=%[[N_CONV2]] : si32} [#acc.device_type<radeon>]) {
368+ // CHECK: acc.yield
369+ // CHECK-NEXT: } loc
370+ #pragma acc loop gang(static:N) device_type(nvidia) gang(static:*)
371+ for (unsigned I = 0 ; I < N; ++I);
372+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
373+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
374+ // CHECK-NEXT: %[[STAR_CONST:.*]] = arith.constant -1 : i64
375+ // CHECK-NEXT: acc.loop gang({static=%[[N_CONV]] : si32}, {static=%[[STAR_CONST]] : i64} [#acc.device_type<nvidia>]) {
376+ // CHECK: acc.yield
377+ // CHECK-NEXT: } loc
378+ #pragma acc loop gang(static:N, num: N + 1) device_type(nvidia) gang(static:*, num : N + 2)
379+ for (unsigned I = 0 ; I < N; ++I);
380+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
381+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
382+ // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
383+ // CHECK-NEXT: %[[CIR_ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
384+ // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD2]], %[[CIR_ONE_CONST]]) nsw : !s32i
385+ // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
386+ // CHECK-NEXT: %[[STAR_CONST:.*]] = arith.constant -1 : i64
387+ // CHECK-NEXT: %[[N_LOAD3:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
388+ // CHECK-NEXT: %[[CIR_TWO_CONST:.*]] = cir.const #cir.int<2> : !s32i
389+ // CHECK-NEXT: %[[N_PLUS_TWO:.*]] = cir.binop(add, %[[N_LOAD3]], %[[CIR_TWO_CONST]]) nsw : !s32i
390+ // CHECK-NEXT: %[[N_PLUS_TWO_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_TWO]] : !s32i to si32
391+ // CHECK-NEXT: acc.loop gang({static=%[[N_CONV]] : si32, num=%[[N_PLUS_ONE_CONV]] : si32}, {static=%[[STAR_CONST]] : i64, num=%[[N_PLUS_TWO_CONV]] : si32} [#acc.device_type<nvidia>]) {
392+ // CHECK: acc.yield
393+ // CHECK-NEXT: } loc
394+ }
326395}
0 commit comments