-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[offload][OpenMP] Remove device code for num_threads strict #157893
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Due to potential performance issues, this commit temporarily removes support for the num_threads 'strict' modifier and its corresponding message and severity clauses on the device.
|
@llvm/pr-subscribers-clang @llvm/pr-subscribers-clang-codegen Author: Robert Imschweiler (ro-i) ChangesDue to potential performance issues, this commit temporarily removes support for the num_threads 'strict' modifier and its corresponding message and severity clauses on the device. Patch is 542.09 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/157893.diff 11 Files Affected:
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index a80d9fd68ef2f..8a402fc3859cf 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1210,9 +1210,9 @@ void CGOpenMPRuntimeGPU::emitParallelCall(
if (!CGF.HaveInsertPoint())
return;
- auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond, NumThreads,
- NumThreadsModifier, Severity, Message](
- CodeGenFunction &CGF, PrePostActionTy &Action) {
+ auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
+ NumThreads](CodeGenFunction &CGF,
+ PrePostActionTy &Action) {
CGBuilderTy &Bld = CGF.Builder;
llvm::Value *NumThreadsVal = NumThreads;
llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
@@ -1260,22 +1260,21 @@ void CGOpenMPRuntimeGPU::emitParallelCall(
NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty);
assert(IfCondVal && "Expected a value");
- RuntimeFunction FnID = OMPRTL___kmpc_parallel_51;
llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
- llvm::SmallVector<llvm::Value *, 10> Args(
- {RTLoc, getThreadID(CGF, Loc), IfCondVal, NumThreadsVal,
- llvm::ConstantInt::get(CGF.Int32Ty, -1), FnPtr, ID,
- Bld.CreateBitOrPointerCast(CapturedVarsAddrs.emitRawPointer(CGF),
- CGF.VoidPtrPtrTy),
- llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())});
- if (NumThreadsModifier == OMPC_NUMTHREADS_strict) {
- FnID = OMPRTL___kmpc_parallel_60;
- Args.append({llvm::ConstantInt::get(CGM.Int32Ty, true),
- emitSeverityClause(Severity),
- emitMessageClause(CGF, Message)});
- }
- CGF.EmitRuntimeCall(
- OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), FnID), Args);
+ llvm::Value *Args[] = {
+ RTLoc,
+ getThreadID(CGF, Loc),
+ IfCondVal,
+ NumThreadsVal,
+ llvm::ConstantInt::get(CGF.Int32Ty, -1),
+ FnPtr,
+ ID,
+ Bld.CreateBitOrPointerCast(CapturedVarsAddrs.emitRawPointer(CGF),
+ CGF.VoidPtrPtrTy),
+ llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
+ CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), OMPRTL___kmpc_parallel_51),
+ Args);
};
RegionCodeGenTy RCG(ParallelGen);
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
index 3e367088a47f8..665221b7d7890 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -165,11 +165,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
/// Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32
/// global_tid, kmp_int32 num_threads) to generate code for 'num_threads'
/// clause.
- /// If the modifier 'strict' is given:
- /// Emits call to void __kmpc_push_num_threads_strict(ident_t *loc, kmp_int32
- /// global_tid, kmp_int32 num_threads, int severity, const char *message) to
- /// generate code for 'num_threads' clause with 'strict' modifier.
- /// \param NumThreads An integer value of threads.
void emitNumThreadsClause(
CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc,
OpenMPNumThreadsClauseModifier Modifier = OMPC_NUMTHREADS_unknown,
@@ -238,11 +233,11 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
/// \param NumThreads The value corresponding to the num_threads clause, if
/// any, or nullptr.
/// \param NumThreadsModifier The modifier of the num_threads clause, if
- /// any, ignored otherwise.
+ /// any, ignored otherwise. Currently unused on the device.
/// \param Severity The severity corresponding to the num_threads clause, if
- /// any, ignored otherwise.
+ /// any, ignored otherwise. Currently unused on the device.
/// \param Message The message string corresponding to the num_threads clause,
- /// if any, or nullptr.
+ /// if any, or nullptr. Currently unused on the device.
void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
llvm::Function *OutlinedFn,
ArrayRef<llvm::Value *> CapturedVars,
diff --git a/clang/test/OpenMP/amdgcn_target_parallel_num_threads_codegen.cpp b/clang/test/OpenMP/amdgcn_target_parallel_num_threads_codegen.cpp
deleted file mode 100644
index 806a79eba80e9..0000000000000
--- a/clang/test/OpenMP/amdgcn_target_parallel_num_threads_codegen.cpp
+++ /dev/null
@@ -1,1095 +0,0 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
-// Test target codegen - host bc file has to be created first.
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=OMP45_1
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=OMP45_2
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=OMP45_2
-
-// RUN: %clang_cc1 -DOMP60 -verify -fopenmp -fopenmp-version=60 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -DOMP60 -verify -fopenmp -fopenmp-version=60 -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefixes=OMP60_1
-// RUN: %clang_cc1 -DOMP60 -verify -fopenmp -fopenmp-version=60 -x c++ -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -DOMP60 -verify -fopenmp -fopenmp-version=60 -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefixes=OMP60_2
-// RUN: %clang_cc1 -DOMP60 -verify -fopenmp -fopenmp-version=60 -fexceptions -fcxx-exceptions -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefixes=OMP60_2
-
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK1
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK2
-// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK2
-
-// expected-no-diagnostics
-#ifndef HEADER
-#define HEADER
-
-template<typename tx>
-tx ftemplate(int n) {
- tx a = 0;
- short aa = 0;
- tx b[10];
-
- #pragma omp target parallel map(tofrom: aa) num_threads(1024)
- {
- aa += 1;
- }
- #ifdef OMP60
- char str[] = "msg";
- #pragma omp target parallel map(tofrom: aa) num_threads(strict: 1024) severity(warning) message(str)
- {
- aa += 1;
- }
- #endif
-
- #pragma omp target parallel map(tofrom:a, aa, b) if(target: n>40) num_threads(n)
- {
- a += 1;
- aa += 1;
- b[2] += 1;
- }
- #ifdef OMP60
- const char *str1 = "msg1";
- #pragma omp target parallel map(tofrom:a, aa, b) if(target: n>40) num_threads(strict: n) severity(warning) message(str1)
- {
- a += 1;
- aa += 1;
- b[2] += 1;
- }
- #endif
-
- return a;
-}
-
-int bar(int n){
- int a = 0;
-
- a += ftemplate<int>(n);
-
- return a;
-}
-
-#endif
-// OMP45_1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l31
-// OMP45_1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0:[0-9]+]] {
-// OMP45_1-NEXT: entry:
-// OMP45_1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// OMP45_1-NEXT: [[AA_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// OMP45_1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
-// OMP45_1-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
-// OMP45_1-NEXT: [[AA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[AA_ADDR]] to ptr
-// OMP45_1-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr
-// OMP45_1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: store ptr [[AA]], ptr [[AA_ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR_ASCAST]], align 8, !nonnull [[META7:![0-9]+]], !align [[META8:![0-9]+]]
-// OMP45_1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l31_kernel_environment to ptr), ptr [[DYN_PTR]])
-// OMP45_1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
-// OMP45_1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
-// OMP45_1: user_code.entry:
-// OMP45_1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr))
-// OMP45_1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0
-// OMP45_1-NEXT: store ptr [[TMP0]], ptr [[TMP3]], align 8
-// OMP45_1-NEXT: call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP2]], i32 1, i32 1024, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l31_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 1)
-// OMP45_1-NEXT: call void @__kmpc_target_deinit()
-// OMP45_1-NEXT: ret void
-// OMP45_1: worker.exit:
-// OMP45_1-NEXT: ret void
-//
-//
-// OMP45_1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l31_omp_outlined
-// OMP45_1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR1:[0-9]+]] {
-// OMP45_1-NEXT: entry:
-// OMP45_1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// OMP45_1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// OMP45_1-NEXT: [[AA_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// OMP45_1-NEXT: [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
-// OMP45_1-NEXT: [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
-// OMP45_1-NEXT: [[AA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[AA_ADDR]] to ptr
-// OMP45_1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: store ptr [[AA]], ptr [[AA_ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR_ASCAST]], align 8, !nonnull [[META7]], !align [[META8]]
-// OMP45_1-NEXT: [[TMP1:%.*]] = load i16, ptr [[TMP0]], align 2
-// OMP45_1-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32
-// OMP45_1-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1
-// OMP45_1-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16
-// OMP45_1-NEXT: store i16 [[CONV1]], ptr [[TMP0]], align 2
-// OMP45_1-NEXT: ret void
-//
-//
-// OMP45_1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43
-// OMP45_1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 2 dereferenceable(2) [[AA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR4:[0-9]+]] {
-// OMP45_1-NEXT: entry:
-// OMP45_1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// OMP45_1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// OMP45_1-NEXT: [[AA_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// OMP45_1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// OMP45_1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8, addrspace(5)
-// OMP45_1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x ptr], align 8, addrspace(5)
-// OMP45_1-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
-// OMP45_1-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// OMP45_1-NEXT: [[AA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[AA_ADDR]] to ptr
-// OMP45_1-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// OMP45_1-NEXT: [[DOTCAPTURE_EXPR__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCAPTURE_EXPR__ADDR]] to ptr
-// OMP45_1-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr
-// OMP45_1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: store ptr [[AA]], ptr [[AA_ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8, !nonnull [[META7]], !align [[META9:![0-9]+]]
-// OMP45_1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[AA_ADDR_ASCAST]], align 8, !nonnull [[META7]], !align [[META8]]
-// OMP45_1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8, !nonnull [[META7]], !align [[META9]]
-// OMP45_1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43_kernel_environment to ptr), ptr [[DYN_PTR]])
-// OMP45_1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP3]], -1
-// OMP45_1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
-// OMP45_1: user_code.entry:
-// OMP45_1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
-// OMP45_1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR_ASCAST]], align 4
-// OMP45_1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0
-// OMP45_1-NEXT: store ptr [[TMP0]], ptr [[TMP6]], align 8
-// OMP45_1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 1
-// OMP45_1-NEXT: store ptr [[TMP1]], ptr [[TMP7]], align 8
-// OMP45_1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 2
-// OMP45_1-NEXT: store ptr [[TMP2]], ptr [[TMP8]], align 8
-// OMP45_1-NEXT: call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP4]], i32 1, i32 [[TMP5]], i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 3)
-// OMP45_1-NEXT: call void @__kmpc_target_deinit()
-// OMP45_1-NEXT: ret void
-// OMP45_1: worker.exit:
-// OMP45_1-NEXT: ret void
-//
-//
-// OMP45_1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43_omp_outlined
-// OMP45_1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 2 dereferenceable(2) [[AA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
-// OMP45_1-NEXT: entry:
-// OMP45_1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// OMP45_1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// OMP45_1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// OMP45_1-NEXT: [[AA_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// OMP45_1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// OMP45_1-NEXT: [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
-// OMP45_1-NEXT: [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
-// OMP45_1-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// OMP45_1-NEXT: [[AA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[AA_ADDR]] to ptr
-// OMP45_1-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// OMP45_1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: store ptr [[AA]], ptr [[AA_ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8, !nonnull [[META7]], !align [[META9]]
-// OMP45_1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[AA_ADDR_ASCAST]], align 8, !nonnull [[META7]], !align [[META8]]
-// OMP45_1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8, !nonnull [[META7]], !align [[META9]]
-// OMP45_1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4
-// OMP45_1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1
-// OMP45_1-NEXT: store i32 [[ADD]], ptr [[TMP0]], align 4
-// OMP45_1-NEXT: [[TMP4:%.*]] = load i16, ptr [[TMP1]], align 2
-// OMP45_1-NEXT: [[CONV:%.*]] = sext i16 [[TMP4]] to i32
-// OMP45_1-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1
-// OMP45_1-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16
-// OMP45_1-NEXT: store i16 [[CONV2]], ptr [[TMP1]], align 2
-// OMP45_1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP2]], i64 0, i64 2
-// OMP45_1-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-// OMP45_1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP5]], 1
-// OMP45_1-NEXT: store i32 [[ADD3]], ptr [[ARRAYIDX]], align 4
-// OMP45_1-NEXT: ret void
-//
-//
-// OMP45_2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l31
-// OMP45_2-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0:[0-9]+]] {
-// OMP45_2-NEXT: entry:
-// OMP45_2-NEXT: [[DYN_PTR_ADDR:%.*]] = allo...
[truncated]
|
|
@llvm/pr-subscribers-flang-openmp Author: Robert Imschweiler (ro-i) ChangesDue to potential performance issues, this commit temporarily removes support for the num_threads 'strict' modifier and its corresponding message and severity clauses on the device. Patch is 542.09 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/157893.diff 11 Files Affected:
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index a80d9fd68ef2f..8a402fc3859cf 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1210,9 +1210,9 @@ void CGOpenMPRuntimeGPU::emitParallelCall(
if (!CGF.HaveInsertPoint())
return;
- auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond, NumThreads,
- NumThreadsModifier, Severity, Message](
- CodeGenFunction &CGF, PrePostActionTy &Action) {
+ auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
+ NumThreads](CodeGenFunction &CGF,
+ PrePostActionTy &Action) {
CGBuilderTy &Bld = CGF.Builder;
llvm::Value *NumThreadsVal = NumThreads;
llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
@@ -1260,22 +1260,21 @@ void CGOpenMPRuntimeGPU::emitParallelCall(
NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty);
assert(IfCondVal && "Expected a value");
- RuntimeFunction FnID = OMPRTL___kmpc_parallel_51;
llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
- llvm::SmallVector<llvm::Value *, 10> Args(
- {RTLoc, getThreadID(CGF, Loc), IfCondVal, NumThreadsVal,
- llvm::ConstantInt::get(CGF.Int32Ty, -1), FnPtr, ID,
- Bld.CreateBitOrPointerCast(CapturedVarsAddrs.emitRawPointer(CGF),
- CGF.VoidPtrPtrTy),
- llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())});
- if (NumThreadsModifier == OMPC_NUMTHREADS_strict) {
- FnID = OMPRTL___kmpc_parallel_60;
- Args.append({llvm::ConstantInt::get(CGM.Int32Ty, true),
- emitSeverityClause(Severity),
- emitMessageClause(CGF, Message)});
- }
- CGF.EmitRuntimeCall(
- OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), FnID), Args);
+ llvm::Value *Args[] = {
+ RTLoc,
+ getThreadID(CGF, Loc),
+ IfCondVal,
+ NumThreadsVal,
+ llvm::ConstantInt::get(CGF.Int32Ty, -1),
+ FnPtr,
+ ID,
+ Bld.CreateBitOrPointerCast(CapturedVarsAddrs.emitRawPointer(CGF),
+ CGF.VoidPtrPtrTy),
+ llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
+ CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), OMPRTL___kmpc_parallel_51),
+ Args);
};
RegionCodeGenTy RCG(ParallelGen);
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
index 3e367088a47f8..665221b7d7890 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -165,11 +165,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
/// Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32
/// global_tid, kmp_int32 num_threads) to generate code for 'num_threads'
/// clause.
- /// If the modifier 'strict' is given:
- /// Emits call to void __kmpc_push_num_threads_strict(ident_t *loc, kmp_int32
- /// global_tid, kmp_int32 num_threads, int severity, const char *message) to
- /// generate code for 'num_threads' clause with 'strict' modifier.
- /// \param NumThreads An integer value of threads.
void emitNumThreadsClause(
CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc,
OpenMPNumThreadsClauseModifier Modifier = OMPC_NUMTHREADS_unknown,
@@ -238,11 +233,11 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
/// \param NumThreads The value corresponding to the num_threads clause, if
/// any, or nullptr.
/// \param NumThreadsModifier The modifier of the num_threads clause, if
- /// any, ignored otherwise.
+ /// any, ignored otherwise. Currently unused on the device.
/// \param Severity The severity corresponding to the num_threads clause, if
- /// any, ignored otherwise.
+ /// any, ignored otherwise. Currently unused on the device.
/// \param Message The message string corresponding to the num_threads clause,
- /// if any, or nullptr.
+ /// if any, or nullptr. Currently unused on the device.
void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
llvm::Function *OutlinedFn,
ArrayRef<llvm::Value *> CapturedVars,
diff --git a/clang/test/OpenMP/amdgcn_target_parallel_num_threads_codegen.cpp b/clang/test/OpenMP/amdgcn_target_parallel_num_threads_codegen.cpp
deleted file mode 100644
index 806a79eba80e9..0000000000000
--- a/clang/test/OpenMP/amdgcn_target_parallel_num_threads_codegen.cpp
+++ /dev/null
@@ -1,1095 +0,0 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
-// Test target codegen - host bc file has to be created first.
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=OMP45_1
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=OMP45_2
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=OMP45_2
-
-// RUN: %clang_cc1 -DOMP60 -verify -fopenmp -fopenmp-version=60 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -DOMP60 -verify -fopenmp -fopenmp-version=60 -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefixes=OMP60_1
-// RUN: %clang_cc1 -DOMP60 -verify -fopenmp -fopenmp-version=60 -x c++ -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -DOMP60 -verify -fopenmp -fopenmp-version=60 -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefixes=OMP60_2
-// RUN: %clang_cc1 -DOMP60 -verify -fopenmp -fopenmp-version=60 -fexceptions -fcxx-exceptions -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefixes=OMP60_2
-
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK1
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK2
-// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK2
-
-// expected-no-diagnostics
-#ifndef HEADER
-#define HEADER
-
-template<typename tx>
-tx ftemplate(int n) {
- tx a = 0;
- short aa = 0;
- tx b[10];
-
- #pragma omp target parallel map(tofrom: aa) num_threads(1024)
- {
- aa += 1;
- }
- #ifdef OMP60
- char str[] = "msg";
- #pragma omp target parallel map(tofrom: aa) num_threads(strict: 1024) severity(warning) message(str)
- {
- aa += 1;
- }
- #endif
-
- #pragma omp target parallel map(tofrom:a, aa, b) if(target: n>40) num_threads(n)
- {
- a += 1;
- aa += 1;
- b[2] += 1;
- }
- #ifdef OMP60
- const char *str1 = "msg1";
- #pragma omp target parallel map(tofrom:a, aa, b) if(target: n>40) num_threads(strict: n) severity(warning) message(str1)
- {
- a += 1;
- aa += 1;
- b[2] += 1;
- }
- #endif
-
- return a;
-}
-
-int bar(int n){
- int a = 0;
-
- a += ftemplate<int>(n);
-
- return a;
-}
-
-#endif
-// OMP45_1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l31
-// OMP45_1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0:[0-9]+]] {
-// OMP45_1-NEXT: entry:
-// OMP45_1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// OMP45_1-NEXT: [[AA_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// OMP45_1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
-// OMP45_1-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
-// OMP45_1-NEXT: [[AA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[AA_ADDR]] to ptr
-// OMP45_1-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr
-// OMP45_1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: store ptr [[AA]], ptr [[AA_ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR_ASCAST]], align 8, !nonnull [[META7:![0-9]+]], !align [[META8:![0-9]+]]
-// OMP45_1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l31_kernel_environment to ptr), ptr [[DYN_PTR]])
-// OMP45_1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
-// OMP45_1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
-// OMP45_1: user_code.entry:
-// OMP45_1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1:[0-9]+]] to ptr))
-// OMP45_1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0
-// OMP45_1-NEXT: store ptr [[TMP0]], ptr [[TMP3]], align 8
-// OMP45_1-NEXT: call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP2]], i32 1, i32 1024, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l31_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 1)
-// OMP45_1-NEXT: call void @__kmpc_target_deinit()
-// OMP45_1-NEXT: ret void
-// OMP45_1: worker.exit:
-// OMP45_1-NEXT: ret void
-//
-//
-// OMP45_1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l31_omp_outlined
-// OMP45_1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR1:[0-9]+]] {
-// OMP45_1-NEXT: entry:
-// OMP45_1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// OMP45_1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// OMP45_1-NEXT: [[AA_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// OMP45_1-NEXT: [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
-// OMP45_1-NEXT: [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
-// OMP45_1-NEXT: [[AA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[AA_ADDR]] to ptr
-// OMP45_1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: store ptr [[AA]], ptr [[AA_ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[AA_ADDR_ASCAST]], align 8, !nonnull [[META7]], !align [[META8]]
-// OMP45_1-NEXT: [[TMP1:%.*]] = load i16, ptr [[TMP0]], align 2
-// OMP45_1-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i32
-// OMP45_1-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV]], 1
-// OMP45_1-NEXT: [[CONV1:%.*]] = trunc i32 [[ADD]] to i16
-// OMP45_1-NEXT: store i16 [[CONV1]], ptr [[TMP0]], align 2
-// OMP45_1-NEXT: ret void
-//
-//
-// OMP45_1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43
-// OMP45_1-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 2 dereferenceable(2) [[AA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]], i64 noundef [[DOTCAPTURE_EXPR_:%.*]]) #[[ATTR4:[0-9]+]] {
-// OMP45_1-NEXT: entry:
-// OMP45_1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// OMP45_1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// OMP45_1-NEXT: [[AA_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// OMP45_1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// OMP45_1-NEXT: [[DOTCAPTURE_EXPR__ADDR:%.*]] = alloca i64, align 8, addrspace(5)
-// OMP45_1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x ptr], align 8, addrspace(5)
-// OMP45_1-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
-// OMP45_1-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// OMP45_1-NEXT: [[AA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[AA_ADDR]] to ptr
-// OMP45_1-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// OMP45_1-NEXT: [[DOTCAPTURE_EXPR__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCAPTURE_EXPR__ADDR]] to ptr
-// OMP45_1-NEXT: [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr
-// OMP45_1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: store ptr [[AA]], ptr [[AA_ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: store i64 [[DOTCAPTURE_EXPR_]], ptr [[DOTCAPTURE_EXPR__ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8, !nonnull [[META7]], !align [[META9:![0-9]+]]
-// OMP45_1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[AA_ADDR_ASCAST]], align 8, !nonnull [[META7]], !align [[META8]]
-// OMP45_1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8, !nonnull [[META7]], !align [[META9]]
-// OMP45_1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43_kernel_environment to ptr), ptr [[DYN_PTR]])
-// OMP45_1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP3]], -1
-// OMP45_1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
-// OMP45_1: user_code.entry:
-// OMP45_1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
-// OMP45_1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR__ADDR_ASCAST]], align 4
-// OMP45_1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 0
-// OMP45_1-NEXT: store ptr [[TMP0]], ptr [[TMP6]], align 8
-// OMP45_1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 1
-// OMP45_1-NEXT: store ptr [[TMP1]], ptr [[TMP7]], align 8
-// OMP45_1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 0, i64 2
-// OMP45_1-NEXT: store ptr [[TMP2]], ptr [[TMP8]], align 8
-// OMP45_1-NEXT: call void @__kmpc_parallel_51(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP4]], i32 1, i32 [[TMP5]], i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43_omp_outlined, ptr null, ptr [[CAPTURED_VARS_ADDRS_ASCAST]], i64 3)
-// OMP45_1-NEXT: call void @__kmpc_target_deinit()
-// OMP45_1-NEXT: ret void
-// OMP45_1: worker.exit:
-// OMP45_1-NEXT: ret void
-//
-//
-// OMP45_1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l43_omp_outlined
-// OMP45_1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[A:%.*]], ptr noundef nonnull align 2 dereferenceable(2) [[AA:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1]] {
-// OMP45_1-NEXT: entry:
-// OMP45_1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// OMP45_1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// OMP45_1-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// OMP45_1-NEXT: [[AA_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// OMP45_1-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
-// OMP45_1-NEXT: [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
-// OMP45_1-NEXT: [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
-// OMP45_1-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
-// OMP45_1-NEXT: [[AA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[AA_ADDR]] to ptr
-// OMP45_1-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
-// OMP45_1-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: store ptr [[AA]], ptr [[AA_ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8
-// OMP45_1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8, !nonnull [[META7]], !align [[META9]]
-// OMP45_1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[AA_ADDR_ASCAST]], align 8, !nonnull [[META7]], !align [[META8]]
-// OMP45_1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8, !nonnull [[META7]], !align [[META9]]
-// OMP45_1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4
-// OMP45_1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP3]], 1
-// OMP45_1-NEXT: store i32 [[ADD]], ptr [[TMP0]], align 4
-// OMP45_1-NEXT: [[TMP4:%.*]] = load i16, ptr [[TMP1]], align 2
-// OMP45_1-NEXT: [[CONV:%.*]] = sext i16 [[TMP4]] to i32
-// OMP45_1-NEXT: [[ADD1:%.*]] = add nsw i32 [[CONV]], 1
-// OMP45_1-NEXT: [[CONV2:%.*]] = trunc i32 [[ADD1]] to i16
-// OMP45_1-NEXT: store i16 [[CONV2]], ptr [[TMP1]], align 2
-// OMP45_1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP2]], i64 0, i64 2
-// OMP45_1-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-// OMP45_1-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP5]], 1
-// OMP45_1-NEXT: store i32 [[ADD3]], ptr [[ARRAYIDX]], align 4
-// OMP45_1-NEXT: ret void
-//
-//
-// OMP45_2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l31
-// OMP45_2-SAME: (ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 2 dereferenceable(2) [[AA:%.*]]) #[[ATTR0:[0-9]+]] {
-// OMP45_2-NEXT: entry:
-// OMP45_2-NEXT: [[DYN_PTR_ADDR:%.*]] = allo...
[truncated]
|
jhuber6
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we keep the OMPKinds definitions? I figure we'll need those for the CPU side.
|
you mean the parallel_60 thingies? No, host is using __kmpc_push_num_threads_strict, which had already been added a while ago in d30b082 |
|
That was copied from a different failing test case, IIRC the important part was the |
|
No, there will be no |
|
A quick check seems to suggest that it should work, yes. |
4b9b46a to
113bb4c
Compare
…lvm#157893)" This reverts commit 23302a2.
Due to potential performance issues, this commit temporarily removes support for the num_threads 'strict' modifier and its corresponding message and severity clauses on the device.