[OpenACC][CIR] Implement 'worker'/'vector' lowering (#138765)

erichkeane · web-flow · commit f4e7ba02cc7f · 2025-05-07T13:48:17.000-07:00
This patch implements worker and vector lowering for the loop construct,
which are fairly simple clauses, except that they also have a 'no
argument' form which requires a touch more work. Else, these are just
like a handful of other clauses where we just keep the device_type array
and operands in sync.
diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h
@@ -392,6 +392,38 @@ class OpenACCClauseCIREmitter final
       return clauseNotImplemented(clause);
     }
   }
+
+  void VisitWorkerClause(const OpenACCWorkerClause &clause) {
+    if constexpr (isOneOfTypes<OpTy, mlir::acc::LoopOp>) {
+      if (clause.hasIntExpr())
+        operation.addWorkerNumOperand(builder.getContext(),
+                                      createIntExpr(clause.getIntExpr()),
+                                      lastDeviceTypeValues);
+      else
+        operation.addEmptyWorker(builder.getContext(), lastDeviceTypeValues);
+
+    } else {
+      // TODO: When we've implemented this for everything, switch this to an
+      // unreachable. Combined constructs remain.
+      return clauseNotImplemented(clause);
+    }
+  }
+
+  void VisitVectorClause(const OpenACCVectorClause &clause) {
+    if constexpr (isOneOfTypes<OpTy, mlir::acc::LoopOp>) {
+      if (clause.hasIntExpr())
+        operation.addVectorOperand(builder.getContext(),
+                                   createIntExpr(clause.getIntExpr()),
+                                   lastDeviceTypeValues);
+      else
+        operation.addEmptyVector(builder.getContext(), lastDeviceTypeValues);
+
+    } else {
+      // TODO: When we've implemented this for everything, switch this to an
+      // unreachable. Combined constructs remain.
+      return clauseNotImplemented(clause);
+    }
+  }
 };
 
 template <typename OpTy>
diff --git a/clang/test/CIR/CodeGenOpenACC/loop.cpp b/clang/test/CIR/CodeGenOpenACC/loop.cpp
@@ -193,4 +193,134 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
   // CHECK: acc.yield
   // CHECK-NEXT: } loc
 
+
+#pragma acc kernels
+  {
+
+#pragma acc loop worker
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK: acc.loop worker {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc loop worker(N)
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.loop worker(%[[N_CONV]] : si32) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc loop worker device_type(nvidia, radeon) worker
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT: acc.loop worker([#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc loop worker(N) device_type(nvidia, radeon) worker
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.loop worker([#acc.device_type<nvidia>, #acc.device_type<radeon>], %[[N_CONV]] : si32) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc loop worker device_type(nvidia, radeon) worker(N)
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.loop worker([#acc.device_type<none>], %[[N_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_CONV]] : si32 [#acc.device_type<radeon>]) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc loop worker(N) device_type(nvidia, radeon) worker(N + 1)
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+  // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD2]], %[[ONE_CONST]]) nsw : !s32i
+  // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
+  // CHECK-NEXT: acc.loop worker(%[[N_CONV]] : si32, %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc loop device_type(nvidia, radeon) worker(num:N + 1)
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD]], %[[ONE_CONST]]) nsw : !s32i
+  // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
+  // CHECK-NEXT: acc.loop worker(%[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) {
+
+#pragma acc loop vector
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK: acc.loop vector {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc loop vector(N)
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.loop vector(%[[N_CONV]] : si32) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc loop vector device_type(nvidia, radeon) vector
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT: acc.loop vector([#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc loop vector(N) device_type(nvidia, radeon) vector
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.loop vector([#acc.device_type<nvidia>, #acc.device_type<radeon>], %[[N_CONV]] : si32) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc loop vector(N) device_type(nvidia, radeon) vector(N + 1)
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+  // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD2]], %[[ONE_CONST]]) nsw : !s32i
+  // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
+  // CHECK-NEXT: acc.loop vector(%[[N_CONV]] : si32, %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc loop device_type(nvidia, radeon) vector(length:N + 1)
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
+  // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD]], %[[ONE_CONST]]) nsw : !s32i
+  // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
+  // CHECK-NEXT: acc.loop vector(%[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc loop worker vector device_type(nvidia) worker vector
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT: acc.loop worker([#acc.device_type<none>, #acc.device_type<nvidia>]) vector([#acc.device_type<none>, #acc.device_type<nvidia>])
+  // CHECK: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc loop worker(N) vector(N) device_type(nvidia) worker(N) vector(N)
+  for(unsigned I = 0; I < N; ++I);
+  // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+  // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[N_CONV2:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD2]] : !s32i to si32
+  // CHECK-NEXT: %[[N_LOAD3:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[N_CONV3:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD3]] : !s32i to si32
+  // CHECK-NEXT: %[[N_LOAD4:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[N_CONV4:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD4]] : !s32i to si32
+  // CHECK-NEXT: acc.loop worker(%[[N_CONV]] : si32, %[[N_CONV3]] : si32 [#acc.device_type<nvidia>]) vector(%[[N_CONV2]] : si32, %[[N_CONV4]] : si32 [#acc.device_type<nvidia>]) {
+  // CHECK: acc.yield
+  // CHECK-NEXT: } loc
+  }
 }
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
@@ -2216,6 +2216,21 @@ def OpenACC_LoopOp : OpenACC_Op<"loop",
     // values should be integral constants, with the '*' represented as a '-1'.
     void setTileForDeviceTypes(MLIRContext *, llvm::ArrayRef<DeviceType>,
                                mlir::ValueRange);
+
+    // Add a value to the 'vector' list with a current list of device_types.
+    void addVectorOperand(MLIRContext *, mlir::Value,
+                          llvm::ArrayRef<DeviceType>);
+    // Add an empty value to the 'vector' list with a current list of
+    // device_types. This is for the case where there is no expression specified
+    // in a 'vector'.
+    void addEmptyVector(MLIRContext *, llvm::ArrayRef<DeviceType>);
+    // Add a value to the 'worker' list with a current list of device_types.
+    void addWorkerNumOperand(MLIRContext *, mlir::Value,
+                             llvm::ArrayRef<DeviceType>);
+    // Add an empty value to the 'worker' list with a current list of
+    // device_types. This is for the case where there is no expression specified
+    // in a 'worker'.
+    void addEmptyWorker(MLIRContext *, llvm::ArrayRef<DeviceType>);
   }];
 
   let hasCustomAssemblyFormat = 1;
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -2720,6 +2720,34 @@ void acc::LoopOp::setTileForDeviceTypes(
   setTileOperandsSegments(segments);
 }
 
+void acc::LoopOp::addVectorOperand(
+    MLIRContext *context, mlir::Value newValue,
+    llvm::ArrayRef<DeviceType> effectiveDeviceTypes) {
+  setVectorOperandsDeviceTypeAttr(addDeviceTypeAffectedOperandHelper(
+      context, getVectorOperandsDeviceTypeAttr(), effectiveDeviceTypes,
+      newValue, getVectorOperandsMutable()));
+}
+
+void acc::LoopOp::addEmptyVector(
+    MLIRContext *context, llvm::ArrayRef<DeviceType> effectiveDeviceTypes) {
+  setVectorAttr(addDeviceTypeAffectedOperandHelper(context, getVectorAttr(),
+                                                   effectiveDeviceTypes));
+}
+
+void acc::LoopOp::addWorkerNumOperand(
+    MLIRContext *context, mlir::Value newValue,
+    llvm::ArrayRef<DeviceType> effectiveDeviceTypes) {
+  setWorkerNumOperandsDeviceTypeAttr(addDeviceTypeAffectedOperandHelper(
+      context, getWorkerNumOperandsDeviceTypeAttr(), effectiveDeviceTypes,
+      newValue, getWorkerNumOperandsMutable()));
+}
+
+void acc::LoopOp::addEmptyWorker(
+    MLIRContext *context, llvm::ArrayRef<DeviceType> effectiveDeviceTypes) {
+  setWorkerAttr(addDeviceTypeAffectedOperandHelper(context, getWorkerAttr(),
+                                                   effectiveDeviceTypes));
+}
+
 //===----------------------------------------------------------------------===//
 // DataOp
 //===----------------------------------------------------------------------===//