// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --version 5
// expected-no-diagnostics

// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple amdgcn-amd-amdhsa -emit-llvm %s -o - | FileCheck %s --check-prefix=AMDGCN
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-nvidia-cuda -emit-llvm %s -o - | FileCheck %s --check-prefix=NVPTX
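// This file exercises '#pragma omp allocate' on global, constant, and local
// variables with a range of allocator handles, together with 'target',
// 'parallel', and 'atomic' constructs, compiled directly for the amdgcn and
// nvptx64 triples.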

typedef enum omp_allocator_handle_t {
  omp_null_allocator = 0,
  omp_default_mem_alloc = 1,
  omp_large_cap_mem_alloc = 2,
  omp_const_mem_alloc = 3,
  omp_high_bw_mem_alloc = 4,
  omp_low_lat_mem_alloc = 5,
  omp_cgroup_mem_alloc = 6,
  omp_pteam_mem_alloc = 7,
  omp_thread_mem_alloc = 8,
  KMP_ALLOCATOR_MAX_HANDLE = ~(0LU)
} omp_allocator_handle_t;

int d = 0;
#pragma omp allocate(d) allocator(omp_default_mem_alloc)

int g = 0;
#pragma omp allocate(g) allocator(omp_cgroup_mem_alloc)

extern const int c = 0;
#pragma omp allocate(c) allocator(omp_const_mem_alloc)


int foo() {
  int t = 0;
#pragma omp allocate(t) allocator(omp_thread_mem_alloc)
  return t;
}

void bar() {
#pragma omp target
  ;
#pragma omp parallel
  ;
}

void baz(int *p) {
#pragma omp atomic
  *p += 1;
}

int qux() {
#if defined(__NVPTX__)
  return 1;
#elif defined(__AMDGPU__)
  return 2;
#endif
}
//.
// AMDGCN: @c = addrspace(4) constant i32 0, align 4
// AMDGCN: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
// AMDGCN: @[[GLOB1:[0-9]+]] = private unnamed_addr addrspace(1) constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
// AMDGCN: @d = global i32 0, align 4
// AMDGCN: @g = global i32 0, align 4
// AMDGCN: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500
//.
// NVPTX: @d = global i32 0, align 4
// NVPTX: @g = global i32 0, align 4
// NVPTX: @c = addrspace(4) constant i32 0, align 4
// NVPTX: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
// NVPTX: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
//.
// AMDGCN-LABEL: define dso_local noundef i32 @_Z3foov(
// AMDGCN-SAME: ) #[[ATTR0:[0-9]+]] {
// AMDGCN-NEXT: [[ENTRY:.*:]]
// AMDGCN-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN-NEXT: [[T:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
// AMDGCN-NEXT: [[T_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[T]] to ptr
// AMDGCN-NEXT: store i32 0, ptr [[T_ASCAST]], align 4
// AMDGCN-NEXT: [[TMP0:%.*]] = load i32, ptr [[T_ASCAST]], align 4
// AMDGCN-NEXT: ret i32 [[TMP0]]
//
//
// AMDGCN-LABEL: define dso_local void @_Z3barv(
// AMDGCN-SAME: ) #[[ATTR0]] {
// AMDGCN-NEXT: [[ENTRY:.*:]]
// AMDGCN-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8, addrspace(5)
// AMDGCN-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
// AMDGCN-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr
// AMDGCN-NEXT: call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @_Z3barv_omp_outlined, ptr @_Z3barv_omp_outlined_wrapper, ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0)
// AMDGCN-NEXT: ret void
//
//
// AMDGCN-LABEL: define internal void @_Z3barv_omp_outlined(
// AMDGCN-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1:[0-9]+]] {
// AMDGCN-NEXT: [[ENTRY:.*:]]
// AMDGCN-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// AMDGCN-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// AMDGCN-NEXT: [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
// AMDGCN-NEXT: [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
// AMDGCN-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
// AMDGCN-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
// AMDGCN-NEXT: ret void
//
//
// AMDGCN-LABEL: define internal void @_Z3barv_omp_outlined_wrapper(
// AMDGCN-SAME: i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
// AMDGCN-NEXT: [[ENTRY:.*:]]
// AMDGCN-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2, addrspace(5)
// AMDGCN-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8, addrspace(5)
// AMDGCN-NEXT: [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
// AMDGCN-NEXT: [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
// AMDGCN-NEXT: [[DOTZERO_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTZERO_ADDR]] to ptr
// AMDGCN-NEXT: [[GLOBAL_ARGS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[GLOBAL_ARGS]] to ptr
// AMDGCN-NEXT: store i16 [[TMP0]], ptr [[DOTADDR_ASCAST]], align 2
// AMDGCN-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
// AMDGCN-NEXT: store i32 0, ptr [[DOTZERO_ADDR_ASCAST]], align 4
// AMDGCN-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS_ASCAST]])
// AMDGCN-NEXT: call void @_Z3barv_omp_outlined(ptr [[DOTADDR1_ASCAST]], ptr [[DOTZERO_ADDR_ASCAST]]) #[[ATTR3:[0-9]+]]
// AMDGCN-NEXT: ret void
//
//
// AMDGCN-LABEL: define dso_local void @_Z3bazPi(
// AMDGCN-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
// AMDGCN-NEXT: [[ENTRY:.*:]]
// AMDGCN-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// AMDGCN-NEXT: [[P_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_ADDR]] to ptr
// AMDGCN-NEXT: store ptr [[P]], ptr [[P_ADDR_ASCAST]], align 8
// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
// AMDGCN-NEXT: [[TMP1:%.*]] = atomicrmw add ptr [[TMP0]], i32 1 monotonic, align 4
// AMDGCN-NEXT: ret void
//
//
// AMDGCN-LABEL: define dso_local noundef i32 @_Z3quxv(
// AMDGCN-SAME: ) #[[ATTR0]] {
// AMDGCN-NEXT: [[ENTRY:.*:]]
// AMDGCN-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
// AMDGCN-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
// AMDGCN-NEXT: ret i32 2
//
//
// NVPTX-LABEL: define dso_local noundef i32 @_Z3foov(
// NVPTX-SAME: ) #[[ATTR0:[0-9]+]] {
// NVPTX-NEXT: [[ENTRY:.*:]]
// NVPTX-NEXT: [[T:%.*]] = alloca i32, align 4
// NVPTX-NEXT: store i32 0, ptr [[T]], align 4
// NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[T]], align 4
// NVPTX-NEXT: ret i32 [[TMP0]]
//
//
// NVPTX-LABEL: define dso_local void @_Z3barv(
// NVPTX-SAME: ) #[[ATTR0]] {
// NVPTX-NEXT: [[ENTRY:.*:]]
// NVPTX-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
// NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @_Z3barv_omp_outlined, ptr @_Z3barv_omp_outlined_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
// NVPTX-NEXT: ret void
//
//
// NVPTX-LABEL: define internal void @_Z3barv_omp_outlined(
// NVPTX-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]]) #[[ATTR1:[0-9]+]] {
// NVPTX-NEXT: [[ENTRY:.*:]]
// NVPTX-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
// NVPTX-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
// NVPTX-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
// NVPTX-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
// NVPTX-NEXT: ret void
//
//
// NVPTX-LABEL: define internal void @_Z3barv_omp_outlined_wrapper(
// NVPTX-SAME: i16 noundef zeroext [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
// NVPTX-NEXT: [[ENTRY:.*:]]
// NVPTX-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// NVPTX-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
// NVPTX-NEXT: store i16 [[TMP0]], ptr [[DOTADDR]], align 2
// NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
// NVPTX-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4
// NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
// NVPTX-NEXT: call void @_Z3barv_omp_outlined(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3:[0-9]+]]
// NVPTX-NEXT: ret void
//
//
// NVPTX-LABEL: define dso_local void @_Z3bazPi(
// NVPTX-SAME: ptr noundef [[P:%.*]]) #[[ATTR0]] {
// NVPTX-NEXT: [[ENTRY:.*:]]
// NVPTX-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 8
// NVPTX-NEXT: store ptr [[P]], ptr [[P_ADDR]], align 8
// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR]], align 8
// NVPTX-NEXT: [[TMP1:%.*]] = atomicrmw add ptr [[TMP0]], i32 1 monotonic, align 4
// NVPTX-NEXT: ret void
//
//
// NVPTX-LABEL: define dso_local noundef i32 @_Z3quxv(
// NVPTX-SAME: ) #[[ATTR0]] {
// NVPTX-NEXT: [[ENTRY:.*:]]
// NVPTX-NEXT: ret i32 1
//
//.
// AMDGCN: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
// AMDGCN: attributes #[[ATTR1]] = { convergent noinline norecurse nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
// AMDGCN: attributes #[[ATTR2]] = { convergent noinline norecurse nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
// AMDGCN: attributes #[[ATTR3]] = { nounwind }
// AMDGCN: attributes #[[ATTR4:[0-9]+]] = { alwaysinline }
//.
// NVPTX: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" }
// NVPTX: attributes #[[ATTR1]] = { convergent noinline norecurse nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" }
// NVPTX: attributes #[[ATTR2]] = { convergent noinline norecurse nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32" }
// NVPTX: attributes #[[ATTR3]] = { nounwind }
// NVPTX: attributes #[[ATTR4:[0-9]+]] = { alwaysinline }
//.
// AMDGCN: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
// AMDGCN: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
// AMDGCN: [[META2:![0-9]+]] = !{i32 7, !"openmp", i32 45}
// AMDGCN: [[META3:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
//.
// NVPTX: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
// NVPTX: [[META1:![0-9]+]] = !{i32 7, !"openmp", i32 45}
// NVPTX: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
//.