diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt index 099634e211e7a..8f2a1fd01fabc 100644 --- a/offload/DeviceRTL/CMakeLists.txt +++ b/offload/DeviceRTL/CMakeLists.txt @@ -95,11 +95,10 @@ set (LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL "${LIBOMPTARGET_LLVM_INCLUDE_DIRS} list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL PREPEND "-I") # Set flags for LLVM Bitcode compilation. -set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden - ${clang_opt_flags} --offload-device-only - -nocudalib -nogpulib -nogpuinc -nostdlibinc - -fopenmp -fopenmp-cuda-mode - -Wno-unknown-cuda-version -Wno-openmp-target +set(bc_flags -c -flto -std=c++17 -fvisibility=hidden + ${clang_opt_flags} -nogpulib -nostdlibinc + -fno-rtti -fno-exceptions -fconvergent-functions + -Wno-unknown-cuda-version -DOMPTARGET_DEVICE_RUNTIME -I${include_directory} -I${devicertl_base_directory}/../include @@ -123,8 +122,7 @@ function(compileDeviceRTLLibrary target_name target_triple) add_custom_command(OUTPUT ${outfile} COMMAND ${CLANG_TOOL} ${bc_flags} - -fopenmp-targets=${target_triple} - -Xopenmp-target=${target_triple} -march= + --target=${target_triple} ${target_bc_flags} -MD -MF ${depfile} ${infile} -o ${outfile} @@ -242,10 +240,8 @@ function(compileDeviceRTLLibrary target_name target_triple) set(ide_target_name omptarget-ide-${target_name}) add_library(${ide_target_name} STATIC EXCLUDE_FROM_ALL ${src_files}) target_compile_options(${ide_target_name} PRIVATE - -fopenmp-targets=${target_triple} -Xopenmp-target=${target_triple} -march= - -fopenmp -fopenmp-cuda-mode -mllvm -openmp-opt-disable - -foffload-lto -fvisibility=hidden --offload-device-only - -nocudalib -nogpulib -nogpuinc -nostdlibinc -Wno-unknown-cuda-version + -fvisibility=hidden --target=${target_triple} + -nogpulib -nostdlibinc -Wno-unknown-cuda-version ) target_compile_definitions(${ide_target_name} PRIVATE SHARED_SCRATCHPAD_SIZE=512) target_include_directories(${ide_target_name} PRIVATE diff --git a/offload/DeviceRTL/include/Allocator.h b/offload/DeviceRTL/include/Allocator.h index 475f6a21bb47e..79c69a2a96b4e 100644 --- a/offload/DeviceRTL/include/Allocator.h +++ b/offload/DeviceRTL/include/Allocator.h @@ -17,8 +17,6 @@ // Forward declaration. struct KernelEnvironmentTy; -#pragma omp begin declare target device_type(nohost) - namespace ompx { namespace allocator { @@ -44,6 +42,4 @@ extern "C" { [[gnu::weak]] void free(void *Ptr); } -#pragma omp end declare target - #endif diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h index 1cd044f432e56..308109b0749f0 100644 --- a/offload/DeviceRTL/include/DeviceTypes.h +++ b/offload/DeviceRTL/include/DeviceTypes.h @@ -99,14 +99,7 @@ struct TaskDescriptorTy { TaskFnTy TaskFn; }; -#pragma omp begin declare variant match(device = {arch(amdgcn)}) using LaneMaskTy = uint64_t; -#pragma omp end declare variant - -#pragma omp begin declare variant match( \ - device = {arch(amdgcn)}, implementation = {extension(match_none)}) -using LaneMaskTy = uint64_t; -#pragma omp end declare variant namespace lanes { enum : LaneMaskTy { All = ~(LaneMaskTy)0 }; @@ -163,8 +156,7 @@ typedef enum omp_allocator_handle_t { #define OMP_PRAGMA(STR) __PRAGMA(omp STR) #define SHARED(NAME) \ - NAME [[clang::loader_uninitialized]]; \ - OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc)) + [[clang::address_space(3)]] NAME [[clang::loader_uninitialized]]; // TODO: clang should use address space 5 for omp_thread_mem_alloc, but right // now that's not the case. diff --git a/offload/DeviceRTL/include/DeviceUtils.h b/offload/DeviceRTL/include/DeviceUtils.h index 2243673aef61c..b92514ee9838a 100644 --- a/offload/DeviceRTL/include/DeviceUtils.h +++ b/offload/DeviceRTL/include/DeviceUtils.h @@ -15,8 +15,6 @@ #include "DeviceTypes.h" #include "Shared/Utils.h" -#pragma omp begin declare target device_type(nohost) - namespace utils { template struct type_identity { @@ -95,6 +93,4 @@ bool isThreadLocalMemPtr(void *Ptr); } // namespace utils -#pragma omp end declare target - #endif diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h index 2217eb7616b38..f892a025159d4 100644 --- a/offload/DeviceRTL/include/Mapping.h +++ b/offload/DeviceRTL/include/Mapping.h @@ -24,12 +24,8 @@ enum { DIM_Z = 2, }; -#pragma omp begin declare target device_type(nohost) - inline constexpr uint32_t MaxThreadsPerTeam = 1024; -#pragma omp end declare target - /// Initialize the mapping machinery. void init(bool IsSPMD); diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h index f0500c1083d7f..58b619ff1072a 100644 --- a/offload/DeviceRTL/include/State.h +++ b/offload/DeviceRTL/include/State.h @@ -22,8 +22,6 @@ // Forward declaration. struct KernelEnvironmentTy; -#pragma omp begin declare target device_type(nohost) - namespace ompx { namespace memory { @@ -88,8 +86,7 @@ struct TeamStateTy { ParallelRegionFnTy ParallelRegionFnVar; }; -extern TeamStateTy TeamState; -#pragma omp allocate(TeamState) allocator(omp_pteam_mem_alloc) +extern TeamStateTy [[clang::address_space(3)]] TeamState; struct ThreadStateTy { @@ -115,8 +112,7 @@ struct ThreadStateTy { } }; -extern ThreadStateTy **ThreadStates; -#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc) +extern ThreadStateTy **[[clang::address_space(3)]] ThreadStates; /// Initialize the state machinery. Must be called by all threads. void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, @@ -378,6 +374,4 @@ inline state::Value RunSched; } // namespace ompx -#pragma omp end declare target - #endif diff --git a/offload/DeviceRTL/include/Synchronization.h b/offload/DeviceRTL/include/Synchronization.h index 5045d3c2c99a3..f9eb8d0d23198 100644 --- a/offload/DeviceRTL/include/Synchronization.h +++ b/offload/DeviceRTL/include/Synchronization.h @@ -15,8 +15,6 @@ #include "DeviceTypes.h" #include "DeviceUtils.h" -#pragma omp begin declare target device_type(nohost) - namespace ompx { namespace atomic { @@ -220,6 +218,4 @@ void system(atomic::OrderingTy Ordering); } // namespace ompx -#pragma omp end declare target - #endif diff --git a/offload/DeviceRTL/include/Workshare.h b/offload/DeviceRTL/include/Workshare.h index fa9b3b2430b8c..554c3271c334c 100644 --- a/offload/DeviceRTL/include/Workshare.h +++ b/offload/DeviceRTL/include/Workshare.h @@ -12,8 +12,6 @@ #ifndef OMPTARGET_WORKSHARE_H #define OMPTARGET_WORKSHARE_H -#pragma omp begin declare target device_type(nohost) - namespace ompx { namespace workshare { @@ -25,6 +23,4 @@ void init(bool IsSPMD); } // namespace ompx -#pragma omp end declare target - #endif diff --git a/offload/DeviceRTL/src/Allocator.cpp b/offload/DeviceRTL/src/Allocator.cpp index ac662c48d4f5f..aac2a6005158e 100644 --- a/offload/DeviceRTL/src/Allocator.cpp +++ b/offload/DeviceRTL/src/Allocator.cpp @@ -19,8 +19,6 @@ using namespace ompx; -#pragma omp begin declare target device_type(nohost) - [[gnu::used, gnu::retain, gnu::weak, gnu::visibility( "protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool; @@ -77,5 +75,3 @@ void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); } void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); } ///} - -#pragma omp end declare target diff --git a/offload/DeviceRTL/src/Configuration.cpp b/offload/DeviceRTL/src/Configuration.cpp index 0b488b8034178..a2dfa4a02a094 100644 --- a/offload/DeviceRTL/src/Configuration.cpp +++ b/offload/DeviceRTL/src/Configuration.cpp @@ -17,8 +17,6 @@ using namespace ompx; -#pragma omp begin declare target device_type(nohost) - // Weak definitions will be overridden by CGOpenmpRuntimeGPU if enabled. [[gnu::weak]] extern const uint32_t __omp_rtl_debug_kind = 0; [[gnu::weak]] extern const uint32_t __omp_rtl_assume_no_thread_state = 0; @@ -85,5 +83,3 @@ bool config::mayUseNestedParallelism() { return false; return state::getKernelEnvironment().Configuration.MayUseNestedParallelism; } - -#pragma omp end declare target diff --git a/offload/DeviceRTL/src/Debug.cpp b/offload/DeviceRTL/src/Debug.cpp index 1d9c962885422..5b5482d766b1d 100644 --- a/offload/DeviceRTL/src/Debug.cpp +++ b/offload/DeviceRTL/src/Debug.cpp @@ -21,8 +21,6 @@ using namespace ompx; -#pragma omp begin declare target device_type(nohost) - extern "C" { void __assert_assume(bool condition) { __builtin_assume(condition); } @@ -44,5 +42,3 @@ void __assert_fail_internal(const char *expr, const char *msg, const char *file, __builtin_trap(); } } - -#pragma omp end declare target diff --git a/offload/DeviceRTL/src/DeviceUtils.cpp b/offload/DeviceRTL/src/DeviceUtils.cpp index c204a7be73b1f..d8109537832e9 100644 --- a/offload/DeviceRTL/src/DeviceUtils.cpp +++ b/offload/DeviceRTL/src/DeviceUtils.cpp @@ -15,14 +15,10 @@ #include "Interface.h" #include "Mapping.h" -#pragma omp begin declare target device_type(nohost) - using namespace ompx; namespace impl { -bool isSharedMemPtr(const void *Ptr) { return false; } - void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) { static_assert(sizeof(unsigned long) == 8, ""); *LowBits = static_cast(Val & 0x00000000FFFFFFFFUL); @@ -42,7 +38,7 @@ uint64_t ballotSync(uint64_t Mask, int32_t Pred); /// AMDGCN Implementation /// ///{ -#pragma omp begin declare variant match(device = {arch(amdgcn)}) +#ifdef __AMDGPU__ int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) { int Self = mapping::getThreadIdInWarp(); @@ -66,15 +62,13 @@ bool isSharedMemPtr(const void *Ptr) { return __builtin_amdgcn_is_shared( (const __attribute__((address_space(0))) void *)Ptr); } -#pragma omp end declare variant +#endif ///} /// NVPTX Implementation /// ///{ -#pragma omp begin declare variant match( \ - device = {arch(nvptx, nvptx64)}, \ - implementation = {extension(match_any)}) +#ifdef __NVPTX__ int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) { return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, Width - 1); @@ -91,7 +85,7 @@ uint64_t ballotSync(uint64_t Mask, int32_t Pred) { bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); } -#pragma omp end declare variant +#endif ///} } // namespace impl @@ -137,5 +131,3 @@ int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) { return utils::shuffleDown(lanes::All, Val, Delta, Width); } } - -#pragma omp end declare target diff --git a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp index 8bb275eae776c..9bb89573dc0cb 100644 --- a/offload/DeviceRTL/src/Kernel.cpp +++ b/offload/DeviceRTL/src/Kernel.cpp @@ -25,8 +25,6 @@ using namespace ompx; -#pragma omp begin declare target device_type(nohost) - static void inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { @@ -155,5 +153,3 @@ void __kmpc_target_deinit() { int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); } } - -#pragma omp end declare target diff --git a/offload/DeviceRTL/src/LibC.cpp b/offload/DeviceRTL/src/LibC.cpp index e55008f46269f..83f9233d94803 100644 --- a/offload/DeviceRTL/src/LibC.cpp +++ b/offload/DeviceRTL/src/LibC.cpp @@ -8,8 +8,6 @@ #include "LibC.h" -#pragma omp begin declare target device_type(nohost) - #if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC) extern "C" int vprintf(const char *format, __builtin_va_list) { return -1; } #else @@ -48,5 +46,3 @@ namespace ompx { return ::vprintf(Format, vlist); } } // namespace ompx - -#pragma omp end declare target diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp index 8583a539824c8..a0c0f6721a84c 100644 --- a/offload/DeviceRTL/src/Mapping.cpp +++ b/offload/DeviceRTL/src/Mapping.cpp @@ -15,8 +15,6 @@ #include "Interface.h" #include "State.h" -#pragma omp begin declare target device_type(nohost) - #include "llvm/Frontend/OpenMP/OMPGridValues.h" using namespace ompx; @@ -24,24 +22,10 @@ using namespace ompx; namespace ompx { namespace impl { -// Forward declarations defined to be defined for AMDGCN and NVPTX. -LaneMaskTy activemask(); -LaneMaskTy lanemaskLT(); -LaneMaskTy lanemaskGT(); -uint32_t getThreadIdInWarp(); -uint32_t getThreadIdInBlock(int32_t Dim); -uint32_t getNumberOfThreadsInBlock(int32_t Dim); -uint32_t getNumberOfThreadsInKernel(); -uint32_t getBlockIdInKernel(int32_t Dim); -uint32_t getNumberOfBlocksInKernel(int32_t Dim); -uint32_t getWarpIdInBlock(); -uint32_t getNumberOfWarpsInBlock(); -uint32_t getWarpSize(); - /// AMDGCN Implementation /// ///{ -#pragma omp begin declare variant match(device = {arch(amdgcn)}) +#ifdef __AMDGPU__ uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); } @@ -128,15 +112,13 @@ uint32_t getNumberOfWarpsInBlock() { return mapping::getNumberOfThreadsInBlock() / mapping::getWarpSize(); } -#pragma omp end declare variant +#endif ///} /// NVPTX Implementation /// ///{ -#pragma omp begin declare variant match( \ - device = {arch(nvptx, nvptx64)}, \ - implementation = {extension(match_any)}) +#ifdef __NVPTX__ uint32_t getNumberOfThreadsInBlock(int32_t Dim) { switch (Dim) { @@ -214,7 +196,7 @@ uint32_t getNumberOfWarpsInBlock() { mapping::getWarpSize(); } -#pragma omp end declare variant +#endif ///} } // namespace impl @@ -376,7 +358,7 @@ float ompx_shfl_down_sync_f(uint64_t mask, float var, unsigned delta, } long ompx_shfl_down_sync_l(uint64_t mask, long var, unsigned delta, int width) { - return utils::shuffleDown(mask, var, delta, width); + return utils::shuffleDown(mask, utils::bitCast(var), delta, width); } double ompx_shfl_down_sync_d(uint64_t mask, double var, unsigned delta, @@ -385,5 +367,3 @@ double ompx_shfl_down_sync_d(uint64_t mask, double var, unsigned delta, utils::shuffleDown(mask, utils::bitCast(var), delta, width)); } } - -#pragma omp end declare target diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp index 010474b1c4a74..734e937f03920 100644 --- a/offload/DeviceRTL/src/Misc.cpp +++ b/offload/DeviceRTL/src/Misc.cpp @@ -17,19 +17,13 @@ #include "Debug.h" -#pragma omp begin declare target device_type(nohost) - namespace ompx { namespace impl { -double getWTick(); - -double getWTime(); - /// AMDGCN Implementation /// ///{ -#pragma omp begin declare variant match(device = {arch(amdgcn)}) +#ifdef __AMDGPU__ double getWTick() { // The number of ticks per second for the AMDGPU clock varies by card and can @@ -42,14 +36,12 @@ double getWTime() { return static_cast(__builtin_readsteadycounter()) * getWTick(); } -#pragma omp end declare variant +#endif /// NVPTX Implementation /// ///{ -#pragma omp begin declare variant match( \ - device = {arch(nvptx, nvptx64)}, \ - implementation = {extension(match_any)}) +#ifdef __NVPTX__ double getWTick() { // Timer precision is 1ns @@ -61,7 +53,7 @@ double getWTime() { return static_cast(nsecs) * getWTick(); } -#pragma omp end declare variant +#endif /// Lookup a device-side function using a host pointer /p HstPtr using the table /// provided by the device plugin. The table is an ordered pair of host and @@ -171,4 +163,3 @@ unsigned long long __llvm_omp_host_call(void *fn, void *data, size_t size) { } ///} -#pragma omp end declare target diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp index a87e363349b1e..08ce616aee1c4 100644 --- a/offload/DeviceRTL/src/Parallelism.cpp +++ b/offload/DeviceRTL/src/Parallelism.cpp @@ -43,8 +43,6 @@ using namespace ompx; -#pragma omp begin declare target device_type(nohost) - namespace { uint32_t determineNumberOfThreads(int32_t NumThreadsClause) { @@ -311,5 +309,3 @@ void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams, void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {} } - -#pragma omp end declare target diff --git a/offload/DeviceRTL/src/Profiling.cpp b/offload/DeviceRTL/src/Profiling.cpp index bb3caaadcc03d..df141af5ebeea 100644 --- a/offload/DeviceRTL/src/Profiling.cpp +++ b/offload/DeviceRTL/src/Profiling.cpp @@ -8,8 +8,6 @@ #include "Profiling.h" -#pragma omp begin declare target device_type(nohost) - extern "C" { // Provides empty implementations for certain functions in compiler-rt @@ -18,5 +16,3 @@ void __llvm_profile_register_function(void *Ptr) {} void __llvm_profile_register_names_function(void *Ptr, long int I) {} void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2) {} } - -#pragma omp end declare target diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp index 382f6cf392e91..25f34005532f7 100644 --- a/offload/DeviceRTL/src/Reduction.cpp +++ b/offload/DeviceRTL/src/Reduction.cpp @@ -22,8 +22,6 @@ using namespace ompx; namespace { -#pragma omp begin declare target device_type(nohost) - void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) { for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) { shflFct(reduce_data, /*LaneId - not used= */ 0, @@ -316,5 +314,3 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2( void *__kmpc_reduction_get_fixed_buffer() { return state::getKernelLaunchEnvironment().ReductionBuffer; } - -#pragma omp end declare target diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp index 100bc8ab47983..89edb4802198c 100644 --- a/offload/DeviceRTL/src/State.cpp +++ b/offload/DeviceRTL/src/State.cpp @@ -23,16 +23,13 @@ using namespace ompx; -#pragma omp begin declare target device_type(nohost) - /// Memory implementation /// ///{ /// External symbol to access dynamic shared memory. -[[gnu::aligned( - allocator::ALIGNMENT)]] extern unsigned char DynamicSharedBuffer[]; -#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc) +[[gnu::aligned(allocator::ALIGNMENT)]] extern unsigned char + [[clang::address_space(3)]] DynamicSharedBuffer[]; /// The kernel environment passed to the init method by the compiler. static KernelEnvironmentTy *SHARED(KernelEnvironmentPtr); @@ -452,13 +449,10 @@ void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); } /// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication. constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64; -[[clang::loader_uninitialized]] static void - *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM]; -#pragma omp allocate(SharedMemVariableSharingSpace) \ - allocator(omp_pteam_mem_alloc) -[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr; -#pragma omp allocate(SharedMemVariableSharingSpacePtr) \ - allocator(omp_pteam_mem_alloc) +[[clang::loader_uninitialized]] static void *[[clang::address_space( + 3)]] SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM]; +[[clang::loader_uninitialized]] static void **[[clang::address_space( + 3)]] SharedMemVariableSharingSpacePtr; void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) { if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) { @@ -481,4 +475,3 @@ void __kmpc_get_shared_variables(void ***GlobalArgs) { *GlobalArgs = SharedMemVariableSharingSpacePtr; } } -#pragma omp end declare target diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp index b09d4801faa01..a5090b96560c8 100644 --- a/offload/DeviceRTL/src/Synchronization.cpp +++ b/offload/DeviceRTL/src/Synchronization.cpp @@ -19,8 +19,6 @@ #include "Mapping.h" #include "State.h" -#pragma omp begin declare target device_type(nohost) - using namespace ompx; namespace impl { @@ -28,34 +26,12 @@ namespace impl { /// Atomics /// ///{ -/// NOTE: This function needs to be implemented by every target. -uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering, - atomic::MemScopeTy MemScope); ///} -// Forward declarations defined to be defined for AMDGCN and NVPTX. -uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering, - atomic::MemScopeTy MemScope); -void namedBarrierInit(); -void namedBarrier(); -void fenceTeam(atomic::OrderingTy Ordering); -void fenceKernel(atomic::OrderingTy Ordering); -void fenceSystem(atomic::OrderingTy Ordering); -void syncWarp(__kmpc_impl_lanemask_t); -void syncThreads(atomic::OrderingTy Ordering); -void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); } -void unsetLock(omp_lock_t *); -int testLock(omp_lock_t *); -void initLock(omp_lock_t *); -void destroyLock(omp_lock_t *); -void setLock(omp_lock_t *); -void unsetCriticalLock(omp_lock_t *); -void setCriticalLock(omp_lock_t *); - /// AMDGCN Implementation /// ///{ -#pragma omp begin declare variant match(device = {arch(amdgcn)}) +#ifdef __AMDGPU__ uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering, atomic::MemScopeTy MemScope) { @@ -202,15 +178,13 @@ void setCriticalLock(omp_lock_t *Lock) { } } -#pragma omp end declare variant +#endif ///} /// NVPTX Implementation /// ///{ -#pragma omp begin declare variant match( \ - device = {arch(nvptx, nvptx64)}, \ - implementation = {extension(match_any)}) +#ifdef __NVPTX__ uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering, atomic::MemScopeTy MemScope) { @@ -283,7 +257,7 @@ void unsetCriticalLock(omp_lock_t *Lock) { unsetLock(Lock); } void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); } -#pragma omp end declare variant +#endif ///} } // namespace impl @@ -401,5 +375,3 @@ void ompx_sync_block_divergent(int Ordering) { impl::syncThreads(atomic::OrderingTy(Ordering)); } } // extern "C" - -#pragma omp end declare target diff --git a/offload/DeviceRTL/src/Tasking.cpp b/offload/DeviceRTL/src/Tasking.cpp index 23a967c1a337e..d0be0ace50dff 100644 --- a/offload/DeviceRTL/src/Tasking.cpp +++ b/offload/DeviceRTL/src/Tasking.cpp @@ -20,8 +20,6 @@ using namespace ompx; -#pragma omp begin declare target device_type(nohost) - extern "C" { TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t, @@ -29,7 +27,7 @@ TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t, size_t SharedValuesSize, TaskFnTy TaskFn) { auto TaskSizeInclPrivateValuesPadded = - utils::roundUp(TaskSizeInclPrivateValues, uint64_t(sizeof(void *))); + utils::roundUp(TaskSizeInclPrivateValues, sizeof(void *)); auto TaskSizeTotal = TaskSizeInclPrivateValuesPadded + SharedValuesSize; TaskDescriptorTy *TaskDescriptor = (TaskDescriptorTy *)memory::allocGlobal( TaskSizeTotal, "explicit task descriptor"); @@ -103,5 +101,3 @@ int omp_in_final(void) { int omp_get_max_task_priority(void) { return 0; } } - -#pragma omp end declare target diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp index cb83f1b670c9e..b1f037a11bddf 100644 --- a/offload/DeviceRTL/src/Workshare.cpp +++ b/offload/DeviceRTL/src/Workshare.cpp @@ -44,8 +44,6 @@ struct DynamicScheduleTracker { #define NOT_FINISHED 1 #define LAST_CHUNK 2 -#pragma omp begin declare target device_type(nohost) - // TODO: This variable is a hack inherited from the old runtime. static uint64_t SHARED(Cnt); @@ -935,5 +933,3 @@ OMP_LOOP_ENTRY(_4u, uint32_t) OMP_LOOP_ENTRY(_8, int64_t) OMP_LOOP_ENTRY(_8u, uint64_t) } - -#pragma omp end declare target