From ca1c59bf74d78b009f7ba42eb2960b41d01b7e01 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 9 Jan 2025 18:45:09 -0600 Subject: [PATCH] [OpenMP] Add explicit attributes to every function declaration Summary: Instead of having the scoped attributes, add this to every function. --- offload/DeviceRTL/CMakeLists.txt | 2 +- offload/DeviceRTL/include/Allocator.h | 11 +- offload/DeviceRTL/include/Configuration.h | 26 +-- offload/DeviceRTL/include/Debug.h | 11 +- offload/DeviceRTL/include/DeviceTypes.h | 4 +- offload/DeviceRTL/include/DeviceUtils.h | 21 +- offload/DeviceRTL/include/Interface.h | 230 ++++++++++---------- offload/DeviceRTL/include/LibC.h | 6 +- offload/DeviceRTL/include/Mapping.h | 44 ++-- offload/DeviceRTL/include/Profiling.h | 8 +- offload/DeviceRTL/include/State.h | 95 ++++---- offload/DeviceRTL/include/Synchronization.h | 56 ++--- offload/DeviceRTL/include/Workshare.h | 4 +- offload/DeviceRTL/src/Allocator.cpp | 13 +- offload/DeviceRTL/src/Configuration.cpp | 28 +-- offload/DeviceRTL/src/Debug.cpp | 12 +- offload/DeviceRTL/src/DeviceUtils.cpp | 67 +++--- offload/DeviceRTL/src/Kernel.cpp | 13 +- offload/DeviceRTL/src/LibC.cpp | 17 +- offload/DeviceRTL/src/Mapping.cpp | 141 ++++++------ offload/DeviceRTL/src/Misc.cpp | 36 +-- offload/DeviceRTL/src/Parallelism.cpp | 38 ++-- offload/DeviceRTL/src/Profiling.cpp | 6 +- offload/DeviceRTL/src/Reduction.cpp | 54 ++--- offload/DeviceRTL/src/State.cpp | 171 ++++++++------- offload/DeviceRTL/src/Synchronization.cpp | 204 +++++++++-------- offload/DeviceRTL/src/Tasking.cpp | 52 ++--- offload/DeviceRTL/src/Workshare.cpp | 216 +++++++++--------- 28 files changed, 848 insertions(+), 738 deletions(-) diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt index 099634e211e7a..e6859ab3d9e9e 100644 --- a/offload/DeviceRTL/CMakeLists.txt +++ b/offload/DeviceRTL/CMakeLists.txt @@ -98,7 +98,7 @@ list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL PREPEND "-I") set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden ${clang_opt_flags} --offload-device-only -nocudalib -nogpulib -nogpuinc -nostdlibinc - -fopenmp -fopenmp-cuda-mode + -fopenmp -fopenmp-cuda-mode -Wno-unknown-assumption -Wno-unknown-cuda-version -Wno-openmp-target -DOMPTARGET_DEVICE_RUNTIME -I${include_directory} diff --git a/offload/DeviceRTL/include/Allocator.h b/offload/DeviceRTL/include/Allocator.h index 475f6a21bb47e..d3ff7185bb29b 100644 --- a/offload/DeviceRTL/include/Allocator.h +++ b/offload/DeviceRTL/include/Allocator.h @@ -26,22 +26,23 @@ namespace allocator { static uint64_t constexpr ALIGNMENT = 16; /// Initialize the allocator according to \p KernelEnvironment -void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment); +OMP_ATTRS void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment); /// Allocate \p Size bytes. -[[gnu::alloc_size(1), gnu::assume_aligned(ALIGNMENT), gnu::malloc]] void * +[[gnu::alloc_size(1), gnu::assume_aligned(ALIGNMENT), + gnu::malloc]] OMP_ATTRS void * alloc(uint64_t Size); /// Free the allocation pointed to by \p Ptr. 
-void free(void *Ptr); +OMP_ATTRS void free(void *Ptr); } // namespace allocator } // namespace ompx extern "C" { -[[gnu::weak]] void *malloc(size_t Size); -[[gnu::weak]] void free(void *Ptr); +[[gnu::weak]] OMP_ATTRS void *malloc(size_t Size); +[[gnu::weak]] OMP_ATTRS void free(void *Ptr); } #pragma omp end declare target diff --git a/offload/DeviceRTL/include/Configuration.h b/offload/DeviceRTL/include/Configuration.h index f8b7a6c3c6c9d..cf638838a7d38 100644 --- a/offload/DeviceRTL/include/Configuration.h +++ b/offload/DeviceRTL/include/Configuration.h @@ -22,45 +22,45 @@ namespace config { /// Return the number of devices in the system, same number as returned on the /// host by omp_get_num_devices. -uint32_t getNumDevices(); +OMP_ATTRS uint32_t getNumDevices(); /// Return the device number in the system for omp_get_device_num. -uint32_t getDeviceNum(); +OMP_ATTRS uint32_t getDeviceNum(); /// Return the user choosen debug level. -uint32_t getDebugKind(); +OMP_ATTRS uint32_t getDebugKind(); /// Return if teams oversubscription is assumed -uint32_t getAssumeTeamsOversubscription(); +OMP_ATTRS uint32_t getAssumeTeamsOversubscription(); /// Return if threads oversubscription is assumed -uint32_t getAssumeThreadsOversubscription(); +OMP_ATTRS uint32_t getAssumeThreadsOversubscription(); /// Return the amount of dynamic shared memory that was allocated at launch. -uint64_t getDynamicMemorySize(); +OMP_ATTRS uint64_t getDynamicMemorySize(); /// Returns the cycles per second of the device's fixed frequency clock. -uint64_t getClockFrequency(); +OMP_ATTRS uint64_t getClockFrequency(); /// Returns the pointer to the beginning of the indirect call table. -void *getIndirectCallTablePtr(); +OMP_ATTRS void *getIndirectCallTablePtr(); /// Returns the size of the indirect call table. -uint64_t getIndirectCallTableSize(); +OMP_ATTRS uint64_t getIndirectCallTableSize(); /// Returns the size of the indirect call table. -uint64_t getHardwareParallelism(); +OMP_ATTRS uint64_t getHardwareParallelism(); /// Return if debugging is enabled for the given debug kind. -bool isDebugMode(DeviceDebugKind Level); +OMP_ATTRS bool isDebugMode(DeviceDebugKind Level); /// Indicates if this kernel may require thread-specific states, or if it was /// explicitly disabled by the user. -bool mayUseThreadStates(); +OMP_ATTRS bool mayUseThreadStates(); /// Indicates if this kernel may require data environments for nested /// parallelism, or if it was explicitly disabled by the user. 
-bool mayUseNestedParallelism(); +OMP_ATTRS bool mayUseNestedParallelism(); } // namespace config } // namespace ompx diff --git a/offload/DeviceRTL/include/Debug.h b/offload/DeviceRTL/include/Debug.h index 22998f44a5bea..31b465fe425b3 100644 --- a/offload/DeviceRTL/include/Debug.h +++ b/offload/DeviceRTL/include/Debug.h @@ -19,11 +19,12 @@ /// /// { extern "C" { -void __assert_assume(bool condition); -void __assert_fail(const char *expr, const char *file, unsigned line, - const char *function); -void __assert_fail_internal(const char *expr, const char *msg, const char *file, - unsigned line, const char *function); +OMP_ATTRS void __assert_assume(bool condition); +OMP_ATTRS void __assert_fail(const char *expr, const char *file, unsigned line, + const char *function); +OMP_ATTRS void __assert_fail_internal(const char *expr, const char *msg, + const char *file, unsigned line, + const char *function); } #define ASSERT(expr, msg) \ diff --git a/offload/DeviceRTL/include/DeviceTypes.h b/offload/DeviceRTL/include/DeviceTypes.h index 259bc008f91d1..b20ccfb6b037a 100644 --- a/offload/DeviceRTL/include/DeviceTypes.h +++ b/offload/DeviceRTL/include/DeviceTypes.h @@ -20,9 +20,7 @@ // another function but only inline assembly that performs some operation or // side-effect and then continues execution with something on the existing call // stack. -// -// TODO: Find a good place for this -#pragma omp assumes ext_no_call_asm +#define OMP_ATTRS [[omp::assume("ext_no_call_asm"), gnu::visibility("hidden")]] enum omp_proc_bind_t { omp_proc_bind_false = 0, diff --git a/offload/DeviceRTL/include/DeviceUtils.h b/offload/DeviceRTL/include/DeviceUtils.h index fa66b973a4f5e..fddd0c8722f3f 100644 --- a/offload/DeviceRTL/include/DeviceUtils.h +++ b/offload/DeviceRTL/include/DeviceUtils.h @@ -60,32 +60,35 @@ struct remove_addrspace : type_identity {}; template using remove_addrspace_t = typename remove_addrspace::type; -template inline To bitCast(From V) { +template OMP_ATTRS inline To bitCast(From V) { static_assert(sizeof(To) == sizeof(From), "Bad conversion"); return __builtin_bit_cast(To, V); } /// Return the value \p Var from thread Id \p SrcLane in the warp if the thread /// is identified by \p Mask. -int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width); +OMP_ATTRS int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, + int32_t Width); -int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width); +OMP_ATTRS int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, + int32_t Width); -int64_t shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, int32_t Width); +OMP_ATTRS int64_t shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, + int32_t Width); -uint64_t ballotSync(uint64_t Mask, int32_t Pred); +OMP_ATTRS uint64_t ballotSync(uint64_t Mask, int32_t Pred); /// Return \p LowBits and \p HighBits packed into a single 64 bit value. -uint64_t pack(uint32_t LowBits, uint32_t HighBits); +OMP_ATTRS uint64_t pack(uint32_t LowBits, uint32_t HighBits); /// Unpack \p Val into \p LowBits and \p HighBits. -void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits); +OMP_ATTRS void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits); /// Return true iff \p Ptr is pointing into shared (local) memory (AS(3)). -bool isSharedMemPtr(void *Ptr); +OMP_ATTRS bool isSharedMemPtr(void *Ptr); /// Return true iff \p Ptr is pointing into (thread) local memory (AS(5)). 
-bool isThreadLocalMemPtr(void *Ptr); +OMP_ATTRS bool isThreadLocalMemPtr(void *Ptr); /// A pointer variable that has by design an `undef` value. Use with care. [[clang::loader_uninitialized]] static void *const UndefPtr; diff --git a/offload/DeviceRTL/include/Interface.h b/offload/DeviceRTL/include/Interface.h index c4bfaaa2404b4..cb0bfed8edc9d 100644 --- a/offload/DeviceRTL/include/Interface.h +++ b/offload/DeviceRTL/include/Interface.h @@ -28,8 +28,8 @@ extern "C" { /// getter: returns 0. /// ///{ -void omp_set_dynamic(int); -int omp_get_dynamic(void); +OMP_ATTRS void omp_set_dynamic(int); +OMP_ATTRS int omp_get_dynamic(void); ///} /// ICV: nthreads-var, integer @@ -43,8 +43,8 @@ int omp_get_dynamic(void); /// /// ///{ -void omp_set_num_threads(int); -int omp_get_max_threads(void); +OMP_ATTRS void omp_set_num_threads(int); +OMP_ATTRS int omp_get_max_threads(void); ///} /// ICV: thread-limit-var, computed @@ -52,7 +52,7 @@ int omp_get_max_threads(void); /// getter: returns thread limited defined during launch. /// ///{ -int omp_get_thread_limit(void); +OMP_ATTRS int omp_get_thread_limit(void); ///} /// ICV: max-active-level-var, constant 1 @@ -61,8 +61,8 @@ int omp_get_thread_limit(void); /// getter: returns 1. /// ///{ -void omp_set_max_active_levels(int); -int omp_get_max_active_levels(void); +OMP_ATTRS void omp_set_max_active_levels(int); +OMP_ATTRS int omp_get_max_active_levels(void); ///} /// ICV: places-partition-var @@ -76,7 +76,7 @@ int omp_get_max_active_levels(void); /// getter: returns 0 or 1. /// ///{ -int omp_get_active_level(void); +OMP_ATTRS int omp_get_active_level(void); ///} /// ICV: level-var @@ -84,88 +84,88 @@ int omp_get_active_level(void); /// getter: returns parallel region nesting /// ///{ -int omp_get_level(void); +OMP_ATTRS int omp_get_level(void); ///} /// ICV: run-sched-var /// /// ///{ -void omp_set_schedule(omp_sched_t, int); -void omp_get_schedule(omp_sched_t *, int *); +OMP_ATTRS void omp_set_schedule(omp_sched_t, int); +OMP_ATTRS void omp_get_schedule(omp_sched_t *, int *); ///} /// TODO this is incomplete. 
-int omp_get_num_threads(void); -int omp_get_thread_num(void); -void omp_set_nested(int); +OMP_ATTRS int omp_get_num_threads(void); +OMP_ATTRS int omp_get_thread_num(void); +OMP_ATTRS void omp_set_nested(int); -int omp_get_nested(void); +OMP_ATTRS int omp_get_nested(void); -void omp_set_max_active_levels(int Level); +OMP_ATTRS void omp_set_max_active_levels(int Level); -int omp_get_max_active_levels(void); +OMP_ATTRS int omp_get_max_active_levels(void); -omp_proc_bind_t omp_get_proc_bind(void); +OMP_ATTRS omp_proc_bind_t omp_get_proc_bind(void); -int omp_get_num_places(void); +OMP_ATTRS int omp_get_num_places(void); -int omp_get_place_num_procs(int place_num); +OMP_ATTRS int omp_get_place_num_procs(int place_num); -void omp_get_place_proc_ids(int place_num, int *ids); +OMP_ATTRS void omp_get_place_proc_ids(int place_num, int *ids); -int omp_get_place_num(void); +OMP_ATTRS int omp_get_place_num(void); -int omp_get_partition_num_places(void); +OMP_ATTRS int omp_get_partition_num_places(void); -void omp_get_partition_place_nums(int *place_nums); +OMP_ATTRS void omp_get_partition_place_nums(int *place_nums); -int omp_get_cancellation(void); +OMP_ATTRS int omp_get_cancellation(void); -void omp_set_default_device(int deviceId); +OMP_ATTRS void omp_set_default_device(int deviceId); -int omp_get_default_device(void); +OMP_ATTRS int omp_get_default_device(void); -int omp_get_num_devices(void); +OMP_ATTRS int omp_get_num_devices(void); -int omp_get_device_num(void); +OMP_ATTRS int omp_get_device_num(void); -int omp_get_num_teams(void); +OMP_ATTRS int omp_get_num_teams(void); -int omp_get_team_num(); +OMP_ATTRS int omp_get_team_num(); -int omp_get_initial_device(void); +OMP_ATTRS int omp_get_initial_device(void); -void *llvm_omp_target_dynamic_shared_alloc(); +OMP_ATTRS void *llvm_omp_target_dynamic_shared_alloc(); /// Synchronization /// ///{ -void omp_init_lock(omp_lock_t *Lock); +OMP_ATTRS void omp_init_lock(omp_lock_t *Lock); -void omp_destroy_lock(omp_lock_t *Lock); +OMP_ATTRS void omp_destroy_lock(omp_lock_t *Lock); -void omp_set_lock(omp_lock_t *Lock); +OMP_ATTRS void omp_set_lock(omp_lock_t *Lock); -void omp_unset_lock(omp_lock_t *Lock); +OMP_ATTRS void omp_unset_lock(omp_lock_t *Lock); -int omp_test_lock(omp_lock_t *Lock); +OMP_ATTRS int omp_test_lock(omp_lock_t *Lock); ///} /// Tasking /// ///{ -int omp_in_final(void); +OMP_ATTRS int omp_in_final(void); -int omp_get_max_task_priority(void); +OMP_ATTRS int omp_get_max_task_priority(void); ///} /// Misc /// ///{ -double omp_get_wtick(void); +OMP_ATTRS double omp_get_wtick(void); -double omp_get_wtime(void); +OMP_ATTRS double omp_get_wtime(void); ///} } @@ -173,16 +173,16 @@ extern "C" { /// Allocate \p Bytes in "shareable" memory and return the address. Needs to be /// called balanced with __kmpc_free_shared like a stack (push/pop). Can be /// called by any thread, allocation happens *per thread*. -void *__kmpc_alloc_shared(uint64_t Bytes); +OMP_ATTRS void *__kmpc_alloc_shared(uint64_t Bytes); /// Deallocate \p Ptr. Needs to be called balanced with __kmpc_alloc_shared like /// a stack (push/pop). Can be called by any thread. \p Ptr has to be the /// allocated by __kmpc_alloc_shared by the same thread. -void __kmpc_free_shared(void *Ptr, uint64_t Bytes); +OMP_ATTRS void __kmpc_free_shared(void *Ptr, uint64_t Bytes); /// Get a pointer to the memory buffer containing dynamically allocated shared /// memory configured at launch. 
-void *__kmpc_get_dynamic_shared(); +OMP_ATTRS void *__kmpc_get_dynamic_shared(); /// Allocate sufficient space for \p NumArgs sequential `void*` and store the /// allocation address in \p GlobalArgs. @@ -191,27 +191,28 @@ void *__kmpc_get_dynamic_shared(); /// /// We also remember it in GlobalArgsPtr to ensure the worker threads and /// deallocation function know the allocation address too. -void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t NumArgs); +OMP_ATTRS void __kmpc_begin_sharing_variables(void ***GlobalArgs, + uint64_t NumArgs); /// Deallocate the memory allocated by __kmpc_begin_sharing_variables. /// /// Called by the main thread after a parallel region. -void __kmpc_end_sharing_variables(); +OMP_ATTRS void __kmpc_end_sharing_variables(); /// Store the allocation address obtained via __kmpc_begin_sharing_variables in /// \p GlobalArgs. /// /// Called by the worker threads in the parallel region (function). -void __kmpc_get_shared_variables(void ***GlobalArgs); +OMP_ATTRS void __kmpc_get_shared_variables(void ***GlobalArgs); /// External interface to get the thread ID. -uint32_t __kmpc_get_hardware_thread_id_in_block(); +OMP_ATTRS uint32_t __kmpc_get_hardware_thread_id_in_block(); /// External interface to get the number of threads. -uint32_t __kmpc_get_hardware_num_threads_in_block(); +OMP_ATTRS uint32_t __kmpc_get_hardware_num_threads_in_block(); /// External interface to get the warp size. -uint32_t __kmpc_get_warp_size(); +OMP_ATTRS uint32_t __kmpc_get_warp_size(); /// Kernel /// @@ -219,27 +220,26 @@ uint32_t __kmpc_get_warp_size(); // Forward declaration struct KernelEnvironmentTy; -int8_t __kmpc_is_spmd_exec_mode(); +OMP_ATTRS int8_t __kmpc_is_spmd_exec_mode(); -int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment); +OMP_ATTRS int32_t +__kmpc_target_init(KernelEnvironmentTy &KernelEnvironment, + KernelLaunchEnvironmentTy &KernelLaunchEnvironment); -void __kmpc_target_deinit(); +OMP_ATTRS void __kmpc_target_deinit(); ///} /// Reduction /// ///{ -void *__kmpc_reduction_get_fixed_buffer(); +OMP_ATTRS void *__kmpc_reduction_get_fixed_buffer(); -int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc, - uint64_t reduce_data_size, - void *reduce_data, - ShuffleReductFnTy shflFct, - InterWarpCopyFnTy cpyFct); +OMP_ATTRS int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( + IdentTy *Loc, uint64_t reduce_data_size, void *reduce_data, + ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct); -int32_t __kmpc_nvptx_teams_reduce_nowait_v2( +OMP_ATTRS int32_t __kmpc_nvptx_teams_reduce_nowait_v2( IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records, uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, @@ -249,116 +249,120 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2( /// Synchronization /// ///{ -void __kmpc_ordered(IdentTy *Loc, int32_t TId); +OMP_ATTRS void __kmpc_ordered(IdentTy *Loc, int32_t TId); -void __kmpc_end_ordered(IdentTy *Loc, int32_t TId); +OMP_ATTRS void __kmpc_end_ordered(IdentTy *Loc, int32_t TId); -int32_t __kmpc_cancel_barrier(IdentTy *Loc_ref, int32_t TId); +OMP_ATTRS int32_t __kmpc_cancel_barrier(IdentTy *Loc_ref, int32_t TId); -void __kmpc_barrier(IdentTy *Loc_ref, int32_t TId); +OMP_ATTRS void __kmpc_barrier(IdentTy *Loc_ref, int32_t TId); -void __kmpc_barrier_simple_spmd(IdentTy *Loc_ref, int32_t TId); +OMP_ATTRS void __kmpc_barrier_simple_spmd(IdentTy *Loc_ref, int32_t TId); -void 
__kmpc_barrier_simple_generic(IdentTy *Loc_ref, int32_t TId); +OMP_ATTRS void __kmpc_barrier_simple_generic(IdentTy *Loc_ref, int32_t TId); -int32_t __kmpc_master(IdentTy *Loc, int32_t TId); +OMP_ATTRS int32_t __kmpc_master(IdentTy *Loc, int32_t TId); -void __kmpc_end_master(IdentTy *Loc, int32_t TId); +OMP_ATTRS void __kmpc_end_master(IdentTy *Loc, int32_t TId); -int32_t __kmpc_masked(IdentTy *Loc, int32_t TId, int32_t Filter); +OMP_ATTRS int32_t __kmpc_masked(IdentTy *Loc, int32_t TId, int32_t Filter); -void __kmpc_end_masked(IdentTy *Loc, int32_t TId); +OMP_ATTRS void __kmpc_end_masked(IdentTy *Loc, int32_t TId); -int32_t __kmpc_single(IdentTy *Loc, int32_t TId); +OMP_ATTRS int32_t __kmpc_single(IdentTy *Loc, int32_t TId); -void __kmpc_end_single(IdentTy *Loc, int32_t TId); +OMP_ATTRS void __kmpc_end_single(IdentTy *Loc, int32_t TId); -void __kmpc_flush(IdentTy *Loc); +OMP_ATTRS void __kmpc_flush(IdentTy *Loc); -uint64_t __kmpc_warp_active_thread_mask(void); +OMP_ATTRS uint64_t __kmpc_warp_active_thread_mask(void); -void __kmpc_syncwarp(uint64_t Mask); +OMP_ATTRS void __kmpc_syncwarp(uint64_t Mask); -void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name); +OMP_ATTRS void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name); -void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name); +OMP_ATTRS void __kmpc_end_critical(IdentTy *Loc, int32_t TId, + CriticalNameTy *Name); ///} /// Parallelism /// ///{ /// TODO -void __kmpc_kernel_prepare_parallel(ParallelRegionFnTy WorkFn); +OMP_ATTRS void __kmpc_kernel_prepare_parallel(ParallelRegionFnTy WorkFn); /// TODO -bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn); +OMP_ATTRS bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn); /// TODO -void __kmpc_kernel_end_parallel(); +OMP_ATTRS void __kmpc_kernel_end_parallel(); /// TODO -void __kmpc_push_proc_bind(IdentTy *Loc, uint32_t TId, int ProcBind); +OMP_ATTRS void __kmpc_push_proc_bind(IdentTy *Loc, uint32_t TId, int ProcBind); /// TODO -void __kmpc_push_num_teams(IdentTy *Loc, int32_t TId, int32_t NumTeams, - int32_t ThreadLimit); +OMP_ATTRS void __kmpc_push_num_teams(IdentTy *Loc, int32_t TId, + int32_t NumTeams, int32_t ThreadLimit); /// TODO -uint16_t __kmpc_parallel_level(IdentTy *Loc, uint32_t); +OMP_ATTRS uint16_t __kmpc_parallel_level(IdentTy *Loc, uint32_t); ///} /// Tasking /// ///{ -TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t, - size_t TaskSizeInclPrivateValues, - size_t SharedValuesSize, - TaskFnTy TaskFn); +OMP_ATTRS TaskDescriptorTy * +__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t, + size_t TaskSizeInclPrivateValues, size_t SharedValuesSize, + TaskFnTy TaskFn); -int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor); +OMP_ATTRS int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor); -int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor, int32_t, - void *, int32_t, void *); +OMP_ATTRS int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor, + int32_t, void *, int32_t, void *); -void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor); +OMP_ATTRS void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor); -void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor); +OMP_ATTRS void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId, + 
TaskDescriptorTy *TaskDescriptor); -void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t, - void *); +OMP_ATTRS void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, + int32_t, void *); -void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId); +OMP_ATTRS void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId); -void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId); +OMP_ATTRS void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId); -int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int); +OMP_ATTRS int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int); -int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId); +OMP_ATTRS int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId); -void __kmpc_taskloop(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor, int, - uint64_t *LowerBound, uint64_t *UpperBound, int64_t, int, - int32_t, uint64_t, void *); +OMP_ATTRS void __kmpc_taskloop(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor, int, + uint64_t *LowerBound, uint64_t *UpperBound, + int64_t, int, int32_t, uint64_t, void *); ///} /// Misc /// ///{ -int32_t __kmpc_cancellationpoint(IdentTy *Loc, int32_t TId, int32_t CancelVal); +OMP_ATTRS int32_t __kmpc_cancellationpoint(IdentTy *Loc, int32_t TId, + int32_t CancelVal); -int32_t __kmpc_cancel(IdentTy *Loc, int32_t TId, int32_t CancelVal); +OMP_ATTRS int32_t __kmpc_cancel(IdentTy *Loc, int32_t TId, int32_t CancelVal); ///} /// Shuffle /// ///{ -int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size); -int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size); +OMP_ATTRS int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, + int16_t size); +OMP_ATTRS int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, + int16_t size); ///} } diff --git a/offload/DeviceRTL/include/LibC.h b/offload/DeviceRTL/include/LibC.h index 03febdb508342..9c722d1ac28f8 100644 --- a/offload/DeviceRTL/include/LibC.h +++ b/offload/DeviceRTL/include/LibC.h @@ -16,9 +16,9 @@ extern "C" { -int memcmp(const void *lhs, const void *rhs, size_t count); -void memset(void *dst, int C, size_t count); -int printf(const char *format, ...); +OMP_ATTRS int memcmp(const void *lhs, const void *rhs, size_t count); +OMP_ATTRS void memset(void *dst, int C, size_t count); +OMP_ATTRS int printf(const char *format, ...); } #endif diff --git a/offload/DeviceRTL/include/Mapping.h b/offload/DeviceRTL/include/Mapping.h index 2fb87abe5418c..92afb41f10e90 100644 --- a/offload/DeviceRTL/include/Mapping.h +++ b/offload/DeviceRTL/include/Mapping.h @@ -31,67 +31,67 @@ inline constexpr uint32_t MaxThreadsPerTeam = 1024; #pragma omp end declare target /// Initialize the mapping machinery. -void init(bool IsSPMD); +OMP_ATTRS void init(bool IsSPMD); /// Return true if the kernel is executed in SPMD mode. -bool isSPMDMode(); +OMP_ATTRS bool isSPMDMode(); /// Return true if the kernel is executed in generic mode. -bool isGenericMode(); +OMP_ATTRS bool isGenericMode(); /// Return true if the executing thread is the main thread in generic mode. /// These functions will lookup state and it is required that that is OK for the /// thread and location. See also `isInitialThreadInLevel0` for a stateless /// alternative for certain situations, e.g. during initialization. -bool isMainThreadInGenericMode(); -bool isMainThreadInGenericMode(bool IsSPMD); +OMP_ATTRS bool isMainThreadInGenericMode(); +OMP_ATTRS bool isMainThreadInGenericMode(bool IsSPMD); /// Return true if this thread is the initial thread in parallel level 0. 
/// /// The thread for which this returns true should be used for single threaded /// initialization tasks. We pick a special thread to ensure there are no /// races between the initialization and the first read of initialized state. -bool isInitialThreadInLevel0(bool IsSPMD); +OMP_ATTRS bool isInitialThreadInLevel0(bool IsSPMD); /// Return true if the executing thread has the lowest Id of the active threads /// in the warp. -bool isLeaderInWarp(); +OMP_ATTRS bool isLeaderInWarp(); /// Return a mask describing all active threads in the warp. -LaneMaskTy activemask(); +OMP_ATTRS LaneMaskTy activemask(); /// Return a mask describing all threads with a smaller Id in the warp. -LaneMaskTy lanemaskLT(); +OMP_ATTRS LaneMaskTy lanemaskLT(); /// Return a mask describing all threads with a larget Id in the warp. -LaneMaskTy lanemaskGT(); +OMP_ATTRS LaneMaskTy lanemaskGT(); /// Return the thread Id in the warp, in [0, getWarpSize()). -uint32_t getThreadIdInWarp(); +OMP_ATTRS uint32_t getThreadIdInWarp(); /// Return the warp size, thus number of threads in the warp. -uint32_t getWarpSize(); +OMP_ATTRS uint32_t getWarpSize(); /// Return the warp id in the block, in [0, getNumberOfWarpsInBlock()] -uint32_t getWarpIdInBlock(); +OMP_ATTRS uint32_t getWarpIdInBlock(); /// Return the number of warps in the block. -uint32_t getNumberOfWarpsInBlock(); +OMP_ATTRS uint32_t getNumberOfWarpsInBlock(); /// Return the thread Id in the block, in [0, getNumberOfThreadsInBlock(Dim)). -uint32_t getThreadIdInBlock(int32_t Dim = DIM_X); +OMP_ATTRS uint32_t getThreadIdInBlock(int32_t Dim = DIM_X); /// Return the block size, thus number of threads in the block. -uint32_t getNumberOfThreadsInBlock(int32_t Dim = DIM_X); +OMP_ATTRS uint32_t getNumberOfThreadsInBlock(int32_t Dim = DIM_X); /// Return the block Id in the kernel, in [0, getNumberOfBlocksInKernel(Dim)). -uint32_t getBlockIdInKernel(int32_t Dim = DIM_X); +OMP_ATTRS uint32_t getBlockIdInKernel(int32_t Dim = DIM_X); /// Return the number of blocks in the kernel. -uint32_t getNumberOfBlocksInKernel(int32_t Dim = DIM_X); +OMP_ATTRS uint32_t getNumberOfBlocksInKernel(int32_t Dim = DIM_X); /// Return the kernel size, thus number of threads in the kernel. -uint32_t getNumberOfThreadsInKernel(); +OMP_ATTRS uint32_t getNumberOfThreadsInKernel(); /// Return the maximal number of threads in the block usable for a team (= /// parallel region). @@ -99,11 +99,11 @@ uint32_t getNumberOfThreadsInKernel(); /// Note: The version taking \p IsSPMD mode explicitly can be used during the /// initialization of the target region, that is before `mapping::isSPMDMode()` /// can be called by any thread other than the main one. -uint32_t getMaxTeamThreads(); -uint32_t getMaxTeamThreads(bool IsSPMD); +OMP_ATTRS uint32_t getMaxTeamThreads(); +OMP_ATTRS uint32_t getMaxTeamThreads(bool IsSPMD); /// Return the number of processing elements on the device. 
-uint32_t getNumberOfProcessorElements(); +OMP_ATTRS uint32_t getNumberOfProcessorElements(); } // namespace mapping diff --git a/offload/DeviceRTL/include/Profiling.h b/offload/DeviceRTL/include/Profiling.h index d994752254121..560581939c540 100644 --- a/offload/DeviceRTL/include/Profiling.h +++ b/offload/DeviceRTL/include/Profiling.h @@ -12,10 +12,12 @@ #ifndef OMPTARGET_DEVICERTL_PROFILING_H #define OMPTARGET_DEVICERTL_PROFILING_H +#include "DeviceTypes.h" + extern "C" { -void __llvm_profile_register_function(void *Ptr); -void __llvm_profile_register_names_function(void *Ptr, long int I); -void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2); +OMP_ATTRS void __llvm_profile_register_function(void *Ptr); +OMP_ATTRS void __llvm_profile_register_names_function(void *Ptr, long int I); +OMP_ATTRS void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2); } #endif diff --git a/offload/DeviceRTL/include/State.h b/offload/DeviceRTL/include/State.h index 565235cd48a91..f491d88adbe39 100644 --- a/offload/DeviceRTL/include/State.h +++ b/offload/DeviceRTL/include/State.h @@ -31,21 +31,21 @@ namespace memory { /// Alloca \p Size bytes in shared memory, if possible, for \p Reason. /// /// Note: See the restrictions on __kmpc_alloc_shared for proper usage. -void *allocShared(uint64_t Size, const char *Reason); +OMP_ATTRS void *allocShared(uint64_t Size, const char *Reason); /// Free \p Ptr, alloated via allocShared, for \p Reason. /// /// Note: See the restrictions on __kmpc_free_shared for proper usage. -void freeShared(void *Ptr, uint64_t Bytes, const char *Reason); +OMP_ATTRS void freeShared(void *Ptr, uint64_t Bytes, const char *Reason); /// Alloca \p Size bytes in global memory, if possible, for \p Reason. -void *allocGlobal(uint64_t Size, const char *Reason); +OMP_ATTRS void *allocGlobal(uint64_t Size, const char *Reason); /// Return a pointer to the dynamic shared memory buffer. -void *getDynamicBuffer(); +OMP_ATTRS void *getDynamicBuffer(); /// Free \p Ptr, alloated via allocGlobal, for \p Reason. -void freeGlobal(void *Ptr, const char *Reason); +OMP_ATTRS void freeGlobal(void *Ptr, const char *Reason); } // namespace memory @@ -62,17 +62,17 @@ struct ICVStateTy { uint32_t RunSchedVar; uint32_t RunSchedChunkVar; - bool operator==(const ICVStateTy &Other) const; + OMP_ATTRS bool operator==(const ICVStateTy &Other) const; - void assertEqual(const ICVStateTy &Other) const; + OMP_ATTRS void assertEqual(const ICVStateTy &Other) const; }; struct TeamStateTy { - void init(bool IsSPMD); + OMP_ATTRS void init(bool IsSPMD); - bool operator==(const TeamStateTy &) const; + OMP_ATTRS bool operator==(const TeamStateTy &) const; - void assertEqual(TeamStateTy &Other) const; + OMP_ATTRS void assertEqual(TeamStateTy &Other) const; /// ICVs /// @@ -104,12 +104,12 @@ struct ThreadStateTy { ThreadStateTy *PreviousThreadState; - void init() { + OMP_ATTRS void init() { ICVState = TeamState.ICVState; PreviousThreadState = nullptr; } - void init(ThreadStateTy *PreviousTS) { + OMP_ATTRS void init(ThreadStateTy *PreviousTS) { ICVState = PreviousTS ? PreviousTS->ICVState : TeamState.ICVState; PreviousThreadState = PreviousTS; } @@ -119,15 +119,15 @@ extern ThreadStateTy **ThreadStates; #pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc) /// Initialize the state machinery. Must be called by all threads. 
-void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment); +OMP_ATTRS void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, + KernelLaunchEnvironmentTy &KernelLaunchEnvironment); /// Return the kernel and kernel launch environment associated with the current /// kernel. The former is static and contains compile time information that /// holds for all instances of the kernel. The latter is dynamic and provides /// per-launch information. -KernelEnvironmentTy &getKernelEnvironment(); -KernelLaunchEnvironmentTy &getKernelLaunchEnvironment(); +OMP_ATTRS KernelEnvironmentTy &getKernelEnvironment(); +OMP_ATTRS KernelLaunchEnvironmentTy &getKernelLaunchEnvironment(); /// TODO enum ValueKind { @@ -144,22 +144,23 @@ enum ValueKind { }; /// TODO -void enterDataEnvironment(IdentTy *Ident); +OMP_ATTRS void enterDataEnvironment(IdentTy *Ident); /// TODO -void exitDataEnvironment(); +OMP_ATTRS void exitDataEnvironment(); /// TODO struct DateEnvironmentRAII { - DateEnvironmentRAII(IdentTy *Ident) { enterDataEnvironment(Ident); } - ~DateEnvironmentRAII() { exitDataEnvironment(); } + OMP_ATTRS DateEnvironmentRAII(IdentTy *Ident) { enterDataEnvironment(Ident); } + OMP_ATTRS ~DateEnvironmentRAII() { exitDataEnvironment(); } }; /// TODO -void resetStateForThread(uint32_t TId); +OMP_ATTRS void resetStateForThread(uint32_t TId); -inline uint32_t &lookupForModify32Impl(uint32_t state::ICVStateTy::*Var, - IdentTy *Ident, bool ForceTeamState) { +OMP_ATTRS inline uint32_t & +lookupForModify32Impl(uint32_t state::ICVStateTy::*Var, IdentTy *Ident, + bool ForceTeamState) { if (OMP_LIKELY(ForceTeamState || !config::mayUseThreadStates() || !TeamState.HasThreadState)) return TeamState.ICVState.*Var; @@ -174,8 +175,8 @@ inline uint32_t &lookupForModify32Impl(uint32_t state::ICVStateTy::*Var, return ThreadStates[TId]->ICVState.*Var; } -inline uint32_t &lookupImpl(uint32_t state::ICVStateTy::*Var, - bool ForceTeamState) { +OMP_ATTRS inline uint32_t &lookupImpl(uint32_t state::ICVStateTy::*Var, + bool ForceTeamState) { auto TId = mapping::getThreadIdInBlock(); if (OMP_UNLIKELY(!ForceTeamState && config::mayUseThreadStates() && TeamState.HasThreadState && ThreadStates[TId])) @@ -183,7 +184,7 @@ inline uint32_t &lookupImpl(uint32_t state::ICVStateTy::*Var, return TeamState.ICVState.*Var; } -[[gnu::always_inline, gnu::flatten]] inline uint32_t & +[[gnu::always_inline, gnu::flatten]] OMP_ATTRS inline uint32_t & lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) { switch (Kind) { case state::VK_NThreads: @@ -225,7 +226,7 @@ lookup32(ValueKind Kind, bool IsReadonly, IdentTy *Ident, bool ForceTeamState) { __builtin_unreachable(); } -[[gnu::always_inline, gnu::flatten]] inline void *& +[[gnu::always_inline, gnu::flatten]] OMP_ATTRS inline void *& lookupPtr(ValueKind Kind, bool IsReadonly, bool ForceTeamState) { switch (Kind) { case state::VK_ParallelRegionFn: @@ -239,45 +240,48 @@ lookupPtr(ValueKind Kind, bool IsReadonly, bool ForceTeamState) { /// A class without actual state used to provide a nice interface to lookup and /// update ICV values we can declare in global scope. 
template struct Value { - [[gnu::flatten, gnu::always_inline]] operator Ty() { + [[gnu::flatten, gnu::always_inline]] OMP_ATTRS operator Ty() { return lookup(/*IsReadonly=*/true, /*IdentTy=*/nullptr, /*ForceTeamState=*/false); } - [[gnu::flatten, gnu::always_inline]] Value &operator=(const Ty &Other) { + [[gnu::flatten, gnu::always_inline]] OMP_ATTRS Value & + operator=(const Ty &Other) { set(Other, /*IdentTy=*/nullptr); return *this; } - [[gnu::flatten, gnu::always_inline]] Value &operator++() { + [[gnu::flatten, gnu::always_inline]] OMP_ATTRS Value &operator++() { inc(1, /*IdentTy=*/nullptr); return *this; } - [[gnu::flatten, gnu::always_inline]] Value &operator--() { + [[gnu::flatten, gnu::always_inline]] OMP_ATTRS Value &operator--() { inc(-1, /*IdentTy=*/nullptr); return *this; } - [[gnu::flatten, gnu::always_inline]] void + [[gnu::flatten, gnu::always_inline]] OMP_ATTRS void assert_eq(const Ty &V, IdentTy *Ident = nullptr, bool ForceTeamState = false) { ASSERT(lookup(/*IsReadonly=*/true, Ident, ForceTeamState) == V, nullptr); } private: - [[gnu::flatten, gnu::always_inline]] Ty & + [[gnu::flatten, gnu::always_inline]] OMP_ATTRS Ty & lookup(bool IsReadonly, IdentTy *Ident, bool ForceTeamState) { Ty &t = lookup32(Kind, IsReadonly, Ident, ForceTeamState); return t; } - [[gnu::flatten, gnu::always_inline]] Ty &inc(int UpdateVal, IdentTy *Ident) { + [[gnu::flatten, gnu::always_inline]] OMP_ATTRS Ty &inc(int UpdateVal, + IdentTy *Ident) { return (lookup(/*IsReadonly=*/false, Ident, /*ForceTeamState=*/false) += UpdateVal); } - [[gnu::flatten, gnu::always_inline]] Ty &set(Ty UpdateVal, IdentTy *Ident) { + [[gnu::flatten, gnu::always_inline]] OMP_ATTRS Ty &set(Ty UpdateVal, + IdentTy *Ident) { return (lookup(/*IsReadonly=*/false, Ident, /*ForceTeamState=*/false) = UpdateVal); } @@ -289,22 +293,23 @@ template struct Value { /// a nice interface to lookup and update ICV values /// we can declare in global scope. template struct PtrValue { - [[gnu::flatten, gnu::always_inline]] operator Ty() { + [[gnu::flatten, gnu::always_inline]] OMP_ATTRS operator Ty() { return lookup(/*IsReadonly=*/true, /*IdentTy=*/nullptr, /*ForceTeamState=*/false); } - [[gnu::flatten, gnu::always_inline]] PtrValue &operator=(const Ty Other) { + [[gnu::flatten, gnu::always_inline]] OMP_ATTRS PtrValue & + operator=(const Ty Other) { set(Other); return *this; } private: - Ty &lookup(bool IsReadonly, IdentTy *, bool ForceTeamState) { + OMP_ATTRS Ty &lookup(bool IsReadonly, IdentTy *, bool ForceTeamState) { return lookupPtr(Kind, IsReadonly, ForceTeamState); } - Ty &set(Ty UpdateVal) { + OMP_ATTRS Ty &set(Ty UpdateVal) { return (lookup(/*IsReadonly=*/false, /*IdentTy=*/nullptr, /*ForceTeamState=*/false) = UpdateVal); } @@ -313,8 +318,8 @@ template struct PtrValue { }; template struct ValueRAII { - ValueRAII(VTy &V, Ty NewValue, Ty OldValue, bool Active, IdentTy *Ident, - bool ForceTeamState = false) + OMP_ATTRS ValueRAII(VTy &V, Ty NewValue, Ty OldValue, bool Active, + IdentTy *Ident, bool ForceTeamState = false) : Ptr(Active ? 
&V.lookup(/*IsReadonly=*/false, Ident, ForceTeamState) : (Ty *)utils::UndefPtr), Val(OldValue), Active(Active) { @@ -323,7 +328,7 @@ template struct ValueRAII { ASSERT(*Ptr == OldValue, "ValueRAII initialization with wrong old value!"); *Ptr = NewValue; } - ~ValueRAII() { + OMP_ATTRS ~ValueRAII() { if (Active) *Ptr = Val; } @@ -347,12 +352,12 @@ inline state::Value HasThreadState; inline state::PtrValue ParallelRegionFn; -void runAndCheckState(void(Func(void))); +OMP_ATTRS void runAndCheckState(void(Func(void))); -void assumeInitialState(bool IsSPMD); +OMP_ATTRS void assumeInitialState(bool IsSPMD); /// Return the value of the ParallelTeamSize ICV. -int getEffectivePTeamSize(); +OMP_ATTRS int getEffectivePTeamSize(); } // namespace state diff --git a/offload/DeviceRTL/include/Synchronization.h b/offload/DeviceRTL/include/Synchronization.h index e1968675550d4..b4eb6ce30d390 100644 --- a/offload/DeviceRTL/include/Synchronization.h +++ b/offload/DeviceRTL/include/Synchronization.h @@ -43,39 +43,40 @@ enum MemScopeTy { }; /// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics. -uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering, - MemScopeTy MemScope = MemScopeTy::all); +OMP_ATTRS uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering, + MemScopeTy MemScope = MemScopeTy::all); /// Atomically perform on \p V and \p *Addr with \p Ordering semantics. The /// result is stored in \p *Addr; /// { template > -bool cas(Ty *Address, V ExpectedV, V DesiredV, atomic::OrderingTy OrderingSucc, - atomic::OrderingTy OrderingFail) { +OMP_ATTRS bool cas(Ty *Address, V ExpectedV, V DesiredV, + atomic::OrderingTy OrderingSucc, + atomic::OrderingTy OrderingFail) { return __scoped_atomic_compare_exchange(Address, &ExpectedV, &DesiredV, false, OrderingSucc, OrderingFail, __MEMORY_SCOPE_DEVICE); } template > -V add(Ty *Address, V Val, atomic::OrderingTy Ordering) { +OMP_ATTRS V add(Ty *Address, V Val, atomic::OrderingTy Ordering) { return __scoped_atomic_fetch_add(Address, Val, Ordering, __MEMORY_SCOPE_DEVICE); } template > -V load(Ty *Address, atomic::OrderingTy Ordering) { +OMP_ATTRS V load(Ty *Address, atomic::OrderingTy Ordering) { return add(Address, Ty(0), Ordering); } template > -void store(Ty *Address, V Val, atomic::OrderingTy Ordering) { +OMP_ATTRS void store(Ty *Address, V Val, atomic::OrderingTy Ordering) { __scoped_atomic_store_n(Address, Val, Ordering, __MEMORY_SCOPE_DEVICE); } template > -V mul(Ty *Address, V Val, atomic::OrderingTy Ordering) { +OMP_ATTRS V mul(Ty *Address, V Val, atomic::OrderingTy Ordering) { Ty TypedCurrentVal, TypedResultVal, TypedNewVal; bool Success; do { @@ -88,14 +89,14 @@ V mul(Ty *Address, V Val, atomic::OrderingTy Ordering) { } template > -utils::enable_if_t, V> +OMP_ATTRS utils::enable_if_t, V> max(Ty *Address, V Val, atomic::OrderingTy Ordering) { return __scoped_atomic_fetch_max(Address, Val, Ordering, __MEMORY_SCOPE_DEVICE); } template > -utils::enable_if_t, V> +OMP_ATTRS utils::enable_if_t, V> max(Ty *Address, V Val, atomic::OrderingTy Ordering) { if (Val >= 0) return utils::bitCast( @@ -105,7 +106,7 @@ max(Ty *Address, V Val, atomic::OrderingTy Ordering) { } template > -utils::enable_if_t, V> +OMP_ATTRS utils::enable_if_t, V> max(Ty *Address, V Val, atomic::OrderingTy Ordering) { if (Val >= 0) return utils::bitCast( @@ -115,7 +116,7 @@ max(Ty *Address, V Val, atomic::OrderingTy Ordering) { } template > -utils::enable_if_t, V> +OMP_ATTRS utils::enable_if_t, V> min(Ty *Address, V Val, atomic::OrderingTy Ordering) { return 
__scoped_atomic_fetch_min(Address, Val, Ordering, __MEMORY_SCOPE_DEVICE); @@ -123,7 +124,7 @@ min(Ty *Address, V Val, atomic::OrderingTy Ordering) { // TODO: Implement this with __atomic_fetch_max and remove the duplication. template > -utils::enable_if_t, V> +OMP_ATTRS utils::enable_if_t, V> min(Ty *Address, V Val, atomic::OrderingTy Ordering) { if (Val >= 0) return utils::bitCast( @@ -134,7 +135,7 @@ min(Ty *Address, V Val, atomic::OrderingTy Ordering) { // TODO: Implement this with __atomic_fetch_max and remove the duplication. template > -utils::enable_if_t, V> +OMP_ATTRS utils::enable_if_t, V> min(Ty *Address, utils::remove_addrspace_t Val, atomic::OrderingTy Ordering) { if (Val >= 0) @@ -145,25 +146,25 @@ min(Ty *Address, utils::remove_addrspace_t Val, } template > -V bit_or(Ty *Address, V Val, atomic::OrderingTy Ordering) { +OMP_ATTRS V bit_or(Ty *Address, V Val, atomic::OrderingTy Ordering) { return __scoped_atomic_fetch_or(Address, Val, Ordering, __MEMORY_SCOPE_DEVICE); } template > -V bit_and(Ty *Address, V Val, atomic::OrderingTy Ordering) { +OMP_ATTRS V bit_and(Ty *Address, V Val, atomic::OrderingTy Ordering) { return __scoped_atomic_fetch_and(Address, Val, Ordering, __MEMORY_SCOPE_DEVICE); } template > -V bit_xor(Ty *Address, V Val, atomic::OrderingTy Ordering) { +OMP_ATTRS V bit_xor(Ty *Address, V Val, atomic::OrderingTy Ordering) { return __scoped_atomic_fetch_xor(Address, Val, Ordering, __MEMORY_SCOPE_DEVICE); } -static inline uint32_t atomicExchange(uint32_t *Address, uint32_t Val, - atomic::OrderingTy Ordering) { +OMP_ATTRS static inline uint32_t atomicExchange(uint32_t *Address, uint32_t Val, + atomic::OrderingTy Ordering) { uint32_t R; __scoped_atomic_exchange(Address, &Val, &R, Ordering, __MEMORY_SCOPE_DEVICE); return R; @@ -176,15 +177,15 @@ static inline uint32_t atomicExchange(uint32_t *Address, uint32_t Val, namespace synchronize { /// Initialize the synchronization machinery. Must be called by all threads. -void init(bool IsSPMD); +OMP_ATTRS void init(bool IsSPMD); /// Synchronize all threads in a warp identified by \p Mask. -void warp(LaneMaskTy Mask); +OMP_ATTRS void warp(LaneMaskTy Mask); /// Synchronize all threads in a block and perform a fence before and after the /// barrier according to \p Ordering. Note that the fence might be part of the /// barrier. -void threads(atomic::OrderingTy Ordering); +OMP_ATTRS void threads(atomic::OrderingTy Ordering); /// Synchronizing threads is allowed even if they all hit different instances of /// `synchronize::threads()`. However, `synchronize::threadsAligned()` is more @@ -192,15 +193,14 @@ void threads(atomic::OrderingTy Ordering); /// noinline is removed by the openmp-opt pass and helps to preserve the /// information till then. ///{ -#pragma omp begin assumes ext_aligned_barrier /// Synchronize all threads in a block, they are reaching the same instruction /// (hence all threads in the block are "aligned"). Also perform a fence before /// and after the barrier according to \p Ordering. Note that the /// fence might be part of the barrier if the target offers this. -[[gnu::noinline]] void threadsAligned(atomic::OrderingTy Ordering); +[[gnu::noinline, omp::assume("ext_aligned_barrier")]] OMP_ATTRS void +threadsAligned(atomic::OrderingTy Ordering); -#pragma omp end assumes ///} } // namespace synchronize @@ -208,13 +208,13 @@ void threads(atomic::OrderingTy Ordering); namespace fence { /// Memory fence with \p Ordering semantics for the team. 
-void team(atomic::OrderingTy Ordering); +OMP_ATTRS void team(atomic::OrderingTy Ordering); /// Memory fence with \p Ordering semantics for the contention group. -void kernel(atomic::OrderingTy Ordering); +OMP_ATTRS void kernel(atomic::OrderingTy Ordering); /// Memory fence with \p Ordering semantics for the system. -void system(atomic::OrderingTy Ordering); +OMP_ATTRS void system(atomic::OrderingTy Ordering); } // namespace fence diff --git a/offload/DeviceRTL/include/Workshare.h b/offload/DeviceRTL/include/Workshare.h index fa9b3b2430b8c..14127b1841e6c 100644 --- a/offload/DeviceRTL/include/Workshare.h +++ b/offload/DeviceRTL/include/Workshare.h @@ -12,6 +12,8 @@ #ifndef OMPTARGET_WORKSHARE_H #define OMPTARGET_WORKSHARE_H +#include "DeviceTypes.h" + #pragma omp begin declare target device_type(nohost) namespace ompx { @@ -19,7 +21,7 @@ namespace ompx { namespace workshare { /// Initialize the worksharing machinery. -void init(bool IsSPMD); +OMP_ATTRS void init(bool IsSPMD); } // namespace workshare diff --git a/offload/DeviceRTL/src/Allocator.cpp b/offload/DeviceRTL/src/Allocator.cpp index ac662c48d4f5f..c970742b8b596 100644 --- a/offload/DeviceRTL/src/Allocator.cpp +++ b/offload/DeviceRTL/src/Allocator.cpp @@ -32,7 +32,7 @@ using namespace ompx; /// directly. struct BumpAllocatorTy final { - void *alloc(uint64_t Size) { + OMP_ATTRS void *alloc(uint64_t Size) { Size = utils::roundUp(Size, uint64_t(allocator::ALIGNMENT)); if (config::isDebugMode(DeviceDebugKind::AllocationTracker)) { @@ -58,7 +58,7 @@ struct BumpAllocatorTy final { return reinterpret_cast(OldData); } - void free(void *) {} + OMP_ATTRS void free(void *) {} }; BumpAllocatorTy BumpAllocator; @@ -67,14 +67,17 @@ BumpAllocatorTy BumpAllocator; /// ///{ -void allocator::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) { +OMP_ATTRS void allocator::init(bool IsSPMD, + KernelEnvironmentTy &KernelEnvironment) { // TODO: Check KernelEnvironment for an allocator choice as soon as we have // more than one. 
} -void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); } +OMP_ATTRS void *allocator::alloc(uint64_t Size) { + return BumpAllocator.alloc(Size); +} -void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); } +OMP_ATTRS void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); } ///} diff --git a/offload/DeviceRTL/src/Configuration.cpp b/offload/DeviceRTL/src/Configuration.cpp index 9e14c203d4a04..8ef990c5b1a5f 100644 --- a/offload/DeviceRTL/src/Configuration.cpp +++ b/offload/DeviceRTL/src/Configuration.cpp @@ -33,54 +33,56 @@ using namespace ompx; gnu::visibility("protected")]] DeviceEnvironmentTy CONSTANT(__omp_rtl_device_environment); -uint32_t config::getAssumeTeamsOversubscription() { +OMP_ATTRS uint32_t config::getAssumeTeamsOversubscription() { return __omp_rtl_assume_teams_oversubscription; } -uint32_t config::getAssumeThreadsOversubscription() { +OMP_ATTRS uint32_t config::getAssumeThreadsOversubscription() { return __omp_rtl_assume_threads_oversubscription; } -uint32_t config::getDebugKind() { +OMP_ATTRS uint32_t config::getDebugKind() { return __omp_rtl_debug_kind & __omp_rtl_device_environment.DeviceDebugKind; } -uint32_t config::getNumDevices() { +OMP_ATTRS uint32_t config::getNumDevices() { return __omp_rtl_device_environment.NumDevices; } -uint32_t config::getDeviceNum() { +OMP_ATTRS uint32_t config::getDeviceNum() { return __omp_rtl_device_environment.DeviceNum; } -uint64_t config::getDynamicMemorySize() { +OMP_ATTRS uint64_t config::getDynamicMemorySize() { return __omp_rtl_device_environment.DynamicMemSize; } -uint64_t config::getClockFrequency() { +OMP_ATTRS uint64_t config::getClockFrequency() { return __omp_rtl_device_environment.ClockFrequency; } -void *config::getIndirectCallTablePtr() { +OMP_ATTRS void *config::getIndirectCallTablePtr() { return reinterpret_cast( __omp_rtl_device_environment.IndirectCallTable); } -uint64_t config::getHardwareParallelism() { +OMP_ATTRS uint64_t config::getHardwareParallelism() { return __omp_rtl_device_environment.HardwareParallelism; } -uint64_t config::getIndirectCallTableSize() { +OMP_ATTRS uint64_t config::getIndirectCallTableSize() { return __omp_rtl_device_environment.IndirectCallTableSize; } -bool config::isDebugMode(DeviceDebugKind Kind) { +OMP_ATTRS bool config::isDebugMode(DeviceDebugKind Kind) { return config::getDebugKind() & uint32_t(Kind); } -bool config::mayUseThreadStates() { return !__omp_rtl_assume_no_thread_state; } +OMP_ATTRS bool config::mayUseThreadStates() { + return !__omp_rtl_assume_no_thread_state; +} -bool config::mayUseNestedParallelism() { +OMP_ATTRS bool config::mayUseNestedParallelism() { if (__omp_rtl_assume_no_nested_parallelism) return false; return state::getKernelEnvironment().Configuration.MayUseNestedParallelism; diff --git a/offload/DeviceRTL/src/Debug.cpp b/offload/DeviceRTL/src/Debug.cpp index b451f17c6bbd8..07743b58898d9 100644 --- a/offload/DeviceRTL/src/Debug.cpp +++ b/offload/DeviceRTL/src/Debug.cpp @@ -24,17 +24,19 @@ using namespace ompx; #pragma omp begin declare target device_type(nohost) extern "C" { -void __assert_assume(bool condition) { __builtin_assume(condition); } +OMP_ATTRS void __assert_assume(bool condition) { __builtin_assume(condition); } #ifndef OMPTARGET_HAS_LIBC -[[gnu::weak]] void __assert_fail(const char *expr, const char *file, - unsigned line, const char *function) { +[[gnu::weak]] OMP_ATTRS void __assert_fail(const char *expr, const char *file, + unsigned line, + const char *function) { __assert_fail_internal(expr, nullptr, file, line, 
function); } #endif -void __assert_fail_internal(const char *expr, const char *msg, const char *file, - unsigned line, const char *function) { +OMP_ATTRS void __assert_fail_internal(const char *expr, const char *msg, + const char *file, unsigned line, + const char *function) { if (msg) { PRINTF("%s:%u: %s: Assertion %s (`%s`) failed.\n", file, line, function, msg, expr); diff --git a/offload/DeviceRTL/src/DeviceUtils.cpp b/offload/DeviceRTL/src/DeviceUtils.cpp index c204a7be73b1f..41a8aae619d9c 100644 --- a/offload/DeviceRTL/src/DeviceUtils.cpp +++ b/offload/DeviceRTL/src/DeviceUtils.cpp @@ -21,48 +21,50 @@ using namespace ompx; namespace impl { -bool isSharedMemPtr(const void *Ptr) { return false; } +OMP_ATTRS bool isSharedMemPtr(const void *Ptr) { return false; } -void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) { +OMP_ATTRS void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) { static_assert(sizeof(unsigned long) == 8, ""); *LowBits = static_cast(Val & 0x00000000FFFFFFFFUL); *HighBits = static_cast((Val & 0xFFFFFFFF00000000UL) >> 32); } -uint64_t Pack(uint32_t LowBits, uint32_t HighBits) { +OMP_ATTRS uint64_t Pack(uint32_t LowBits, uint32_t HighBits) { return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits; } -int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width); -int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta, - int32_t Width); +OMP_ATTRS int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, + int32_t Width); +OMP_ATTRS int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta, + int32_t Width); -uint64_t ballotSync(uint64_t Mask, int32_t Pred); +OMP_ATTRS uint64_t ballotSync(uint64_t Mask, int32_t Pred); /// AMDGCN Implementation /// ///{ #pragma omp begin declare variant match(device = {arch(amdgcn)}) -int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) { +OMP_ATTRS int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, + int32_t Width) { int Self = mapping::getThreadIdInWarp(); int Index = SrcLane + (Self & ~(Width - 1)); return __builtin_amdgcn_ds_bpermute(Index << 2, Var); } -int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta, - int32_t Width) { +OMP_ATTRS int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta, + int32_t Width) { int Self = mapping::getThreadIdInWarp(); int Index = Self + LaneDelta; Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? 
Self : Index; return __builtin_amdgcn_ds_bpermute(Index << 2, Var); } -uint64_t ballotSync(uint64_t Mask, int32_t Pred) { +OMP_ATTRS uint64_t ballotSync(uint64_t Mask, int32_t Pred) { return Mask & __builtin_amdgcn_ballot_w64(Pred); } -bool isSharedMemPtr(const void *Ptr) { +OMP_ATTRS bool isSharedMemPtr(const void *Ptr) { return __builtin_amdgcn_is_shared( (const __attribute__((address_space(0))) void *)Ptr); } @@ -76,45 +78,50 @@ bool isSharedMemPtr(const void *Ptr) { device = {arch(nvptx, nvptx64)}, \ implementation = {extension(match_any)}) -int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) { +OMP_ATTRS int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, + int32_t Width) { return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, Width - 1); } -int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) { +OMP_ATTRS int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, + int32_t Width) { int32_t T = ((mapping::getWarpSize() - Width) << 8) | 0x1f; return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T); } -uint64_t ballotSync(uint64_t Mask, int32_t Pred) { +OMP_ATTRS uint64_t ballotSync(uint64_t Mask, int32_t Pred) { return __nvvm_vote_ballot_sync(static_cast(Mask), Pred); } -bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); } +OMP_ATTRS bool isSharedMemPtr(const void *Ptr) { + return __nvvm_isspacep_shared(Ptr); +} #pragma omp end declare variant ///} } // namespace impl -uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) { +OMP_ATTRS uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) { return impl::Pack(LowBits, HighBits); } -void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) { +OMP_ATTRS void utils::unpack(uint64_t Val, uint32_t &LowBits, + uint32_t &HighBits) { impl::Unpack(Val, &LowBits, &HighBits); } -int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, - int32_t Width) { +OMP_ATTRS int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, + int32_t Width) { return impl::shuffle(Mask, Var, SrcLane, Width); } -int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, - int32_t Width) { +OMP_ATTRS int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, + int32_t Width) { return impl::shuffleDown(Mask, Var, Delta, Width); } -int64_t utils::shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, - int32_t Width) { +OMP_ATTRS int64_t utils::shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, + int32_t Width) { uint32_t Lo, Hi; utils::unpack(Var, Lo, Hi); Hi = impl::shuffleDown(Mask, Hi, Delta, Width); @@ -122,18 +129,22 @@ int64_t utils::shuffleDown(uint64_t Mask, int64_t Var, uint32_t Delta, return utils::pack(Lo, Hi); } -uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) { +OMP_ATTRS uint64_t utils::ballotSync(uint64_t Mask, int32_t Pred) { return impl::ballotSync(Mask, Pred); } -bool utils::isSharedMemPtr(void *Ptr) { return impl::isSharedMemPtr(Ptr); } +OMP_ATTRS bool utils::isSharedMemPtr(void *Ptr) { + return impl::isSharedMemPtr(Ptr); +} extern "C" { -int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) { +OMP_ATTRS int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, + int16_t SrcLane) { return impl::shuffleDown(lanes::All, Val, Delta, SrcLane); } -int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) { +OMP_ATTRS int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, + int16_t Width) { return utils::shuffleDown(lanes::All, Val, Delta, Width); } } diff --git 
a/offload/DeviceRTL/src/Kernel.cpp b/offload/DeviceRTL/src/Kernel.cpp index 8bb275eae776c..dd6fb74e03d10 100644 --- a/offload/DeviceRTL/src/Kernel.cpp +++ b/offload/DeviceRTL/src/Kernel.cpp @@ -27,7 +27,7 @@ using namespace ompx; #pragma omp begin declare target device_type(nohost) -static void +OMP_ATTRS static void inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { // Order is important here. @@ -39,7 +39,7 @@ inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, } /// Simple generic state machine for worker threads. -static void genericStateMachine(IdentTy *Ident) { +OMP_ATTRS static void genericStateMachine(IdentTy *Ident) { uint32_t TId = mapping::getThreadIdInBlock(); do { @@ -73,8 +73,9 @@ extern "C" { /// /// \param Ident Source location identification, can be NULL. /// -int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { +OMP_ATTRS int32_t +__kmpc_target_init(KernelEnvironmentTy &KernelEnvironment, + KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration; bool IsSPMD = Configuration.ExecMode & llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD; @@ -130,7 +131,7 @@ int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment, /// /// \param Ident Source location identification, can be NULL. /// -void __kmpc_target_deinit() { +OMP_ATTRS void __kmpc_target_deinit() { bool IsSPMD = mapping::isSPMDMode(); if (IsSPMD) return; @@ -153,7 +154,7 @@ void __kmpc_target_deinit() { } } -int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); } +OMP_ATTRS int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); } } #pragma omp end declare target diff --git a/offload/DeviceRTL/src/LibC.cpp b/offload/DeviceRTL/src/LibC.cpp index 291ceb023a69c..b11f4368a07b4 100644 --- a/offload/DeviceRTL/src/LibC.cpp +++ b/offload/DeviceRTL/src/LibC.cpp @@ -11,7 +11,7 @@ #pragma omp begin declare target device_type(nohost) namespace impl { -int32_t omp_vprintf(const char *Format, __builtin_va_list vlist); +OMP_ATTRS int32_t omp_vprintf(const char *Format, __builtin_va_list vlist); } #ifndef OMPTARGET_HAS_LIBC @@ -19,26 +19,27 @@ namespace impl { #pragma omp begin declare variant match( \ device = {arch(nvptx, nvptx64)}, \ implementation = {extension(match_any)}) -extern "C" int vprintf(const char *format, ...); -int omp_vprintf(const char *Format, __builtin_va_list vlist) { +extern "C" OMP_ATTRS int vprintf(const char *format, ...); +OMP_ATTRS int omp_vprintf(const char *Format, __builtin_va_list vlist) { return vprintf(Format, vlist); } #pragma omp end declare variant #pragma omp begin declare variant match(device = {arch(amdgcn)}) -int omp_vprintf(const char *Format, __builtin_va_list) { return -1; } +OMP_ATTRS int omp_vprintf(const char *Format, __builtin_va_list) { return -1; } #pragma omp end declare variant } // namespace impl -extern "C" int printf(const char *Format, ...) { +extern "C" OMP_ATTRS int printf(const char *Format, ...) 
{ __builtin_va_list vlist; __builtin_va_start(vlist, Format); return impl::omp_vprintf(Format, vlist); } #endif // OMPTARGET_HAS_LIBC extern "C" { -[[gnu::weak]] int memcmp(const void *lhs, const void *rhs, size_t count) { +[[gnu::weak]] OMP_ATTRS int memcmp(const void *lhs, const void *rhs, + size_t count) { auto *L = reinterpret_cast<const unsigned char *>(lhs); auto *R = reinterpret_cast<const unsigned char *>(rhs); @@ -49,7 +50,7 @@ extern "C" { return 0; } -[[gnu::weak]] void memset(void *dst, int C, size_t count) { +[[gnu::weak]] OMP_ATTRS void memset(void *dst, int C, size_t count) { auto *dstc = reinterpret_cast<char *>(dst); for (size_t I = 0; I < count; ++I) dstc[I] = C; diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp index 8583a539824c8..885a19c14ba19 100644 --- a/offload/DeviceRTL/src/Mapping.cpp +++ b/offload/DeviceRTL/src/Mapping.cpp @@ -25,27 +25,27 @@ namespace ompx { namespace impl { // Forward declarations defined to be defined for AMDGCN and NVPTX. -LaneMaskTy activemask(); -LaneMaskTy lanemaskLT(); -LaneMaskTy lanemaskGT(); -uint32_t getThreadIdInWarp(); -uint32_t getThreadIdInBlock(int32_t Dim); -uint32_t getNumberOfThreadsInBlock(int32_t Dim); -uint32_t getNumberOfThreadsInKernel(); -uint32_t getBlockIdInKernel(int32_t Dim); -uint32_t getNumberOfBlocksInKernel(int32_t Dim); -uint32_t getWarpIdInBlock(); -uint32_t getNumberOfWarpsInBlock(); -uint32_t getWarpSize(); +OMP_ATTRS LaneMaskTy activemask(); +OMP_ATTRS LaneMaskTy lanemaskLT(); +OMP_ATTRS LaneMaskTy lanemaskGT(); +OMP_ATTRS uint32_t getThreadIdInWarp(); +OMP_ATTRS uint32_t getThreadIdInBlock(int32_t Dim); +OMP_ATTRS uint32_t getNumberOfThreadsInBlock(int32_t Dim); +OMP_ATTRS uint32_t getNumberOfThreadsInKernel(); +OMP_ATTRS uint32_t getBlockIdInKernel(int32_t Dim); +OMP_ATTRS uint32_t getNumberOfBlocksInKernel(int32_t Dim); +OMP_ATTRS uint32_t getWarpIdInBlock(); +OMP_ATTRS uint32_t getNumberOfWarpsInBlock(); +OMP_ATTRS uint32_t getWarpSize(); /// AMDGCN Implementation /// ///{ #pragma omp begin declare variant match(device = {arch(amdgcn)}) -uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); } +OMP_ATTRS uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); } -uint32_t getNumberOfThreadsInBlock(int32_t Dim) { +OMP_ATTRS uint32_t getNumberOfThreadsInBlock(int32_t Dim) { switch (Dim) { case 0: return __builtin_amdgcn_workgroup_size_x(); @@ -57,16 +57,16 @@ uint32_t getNumberOfThreadsInBlock(int32_t Dim) { UNREACHABLE("Dim outside range!"); } -LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); } +OMP_ATTRS LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); } -LaneMaskTy lanemaskLT() { +OMP_ATTRS LaneMaskTy lanemaskLT() { uint32_t Lane = mapping::getThreadIdInWarp(); int64_t Ballot = mapping::activemask(); uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1; return Mask & Ballot; } -LaneMaskTy lanemaskGT() { +OMP_ATTRS LaneMaskTy lanemaskGT() { uint32_t Lane = mapping::getThreadIdInWarp(); if (Lane == (mapping::getWarpSize() - 1)) return 0; @@ -75,11 +75,11 @@ LaneMaskTy lanemaskGT() { return Mask & Ballot; } -uint32_t getThreadIdInWarp() { +OMP_ATTRS uint32_t getThreadIdInWarp() { return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); } -uint32_t getThreadIdInBlock(int32_t Dim) { +OMP_ATTRS uint32_t getThreadIdInBlock(int32_t Dim) { switch (Dim) { case 0: return __builtin_amdgcn_workitem_id_x(); @@ -91,12 +91,12 @@ uint32_t getThreadIdInBlock(int32_t Dim) { UNREACHABLE("Dim outside range!"); } -uint32_t 
getNumberOfThreadsInKernel() { +OMP_ATTRS uint32_t getNumberOfThreadsInKernel() { return __builtin_amdgcn_grid_size_x() * __builtin_amdgcn_grid_size_y() * __builtin_amdgcn_grid_size_z(); } -uint32_t getBlockIdInKernel(int32_t Dim) { +OMP_ATTRS uint32_t getBlockIdInKernel(int32_t Dim) { switch (Dim) { case 0: return __builtin_amdgcn_workgroup_id_x(); @@ -108,7 +108,7 @@ uint32_t getBlockIdInKernel(int32_t Dim) { UNREACHABLE("Dim outside range!"); } -uint32_t getNumberOfBlocksInKernel(int32_t Dim) { +OMP_ATTRS uint32_t getNumberOfBlocksInKernel(int32_t Dim) { switch (Dim) { case 0: return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x(); @@ -120,11 +120,11 @@ uint32_t getNumberOfBlocksInKernel(int32_t Dim) { UNREACHABLE("Dim outside range!"); } -uint32_t getWarpIdInBlock() { +OMP_ATTRS uint32_t getWarpIdInBlock() { return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize(); } -uint32_t getNumberOfWarpsInBlock() { +OMP_ATTRS uint32_t getNumberOfWarpsInBlock() { return mapping::getNumberOfThreadsInBlock() / mapping::getWarpSize(); } @@ -138,7 +138,7 @@ uint32_t getNumberOfWarpsInBlock() { device = {arch(nvptx, nvptx64)}, \ implementation = {extension(match_any)}) -uint32_t getNumberOfThreadsInBlock(int32_t Dim) { +OMP_ATTRS uint32_t getNumberOfThreadsInBlock(int32_t Dim) { switch (Dim) { case 0: return __nvvm_read_ptx_sreg_ntid_x(); @@ -150,15 +150,15 @@ uint32_t getNumberOfThreadsInBlock(int32_t Dim) { UNREACHABLE("Dim outside range!"); } -uint32_t getWarpSize() { return __nvvm_read_ptx_sreg_warpsize(); } +OMP_ATTRS uint32_t getWarpSize() { return __nvvm_read_ptx_sreg_warpsize(); } -LaneMaskTy activemask() { return __nvvm_activemask(); } +OMP_ATTRS LaneMaskTy activemask() { return __nvvm_activemask(); } -LaneMaskTy lanemaskLT() { return __nvvm_read_ptx_sreg_lanemask_lt(); } +OMP_ATTRS LaneMaskTy lanemaskLT() { return __nvvm_read_ptx_sreg_lanemask_lt(); } -LaneMaskTy lanemaskGT() { return __nvvm_read_ptx_sreg_lanemask_gt(); } +OMP_ATTRS LaneMaskTy lanemaskGT() { return __nvvm_read_ptx_sreg_lanemask_gt(); } -uint32_t getThreadIdInBlock(int32_t Dim) { +OMP_ATTRS uint32_t getThreadIdInBlock(int32_t Dim) { switch (Dim) { case 0: return __nvvm_read_ptx_sreg_tid_x(); @@ -170,9 +170,9 @@ uint32_t getThreadIdInBlock(int32_t Dim) { UNREACHABLE("Dim outside range!"); } -uint32_t getThreadIdInWarp() { return __nvvm_read_ptx_sreg_laneid(); } +OMP_ATTRS uint32_t getThreadIdInWarp() { return __nvvm_read_ptx_sreg_laneid(); } -uint32_t getBlockIdInKernel(int32_t Dim) { +OMP_ATTRS uint32_t getBlockIdInKernel(int32_t Dim) { switch (Dim) { case 0: return __nvvm_read_ptx_sreg_ctaid_x(); @@ -184,7 +184,7 @@ uint32_t getBlockIdInKernel(int32_t Dim) { UNREACHABLE("Dim outside range!"); } -uint32_t getNumberOfBlocksInKernel(int32_t Dim) { +OMP_ATTRS uint32_t getNumberOfBlocksInKernel(int32_t Dim) { switch (Dim) { case 0: return __nvvm_read_ptx_sreg_nctaid_x(); @@ -196,7 +196,7 @@ uint32_t getNumberOfBlocksInKernel(int32_t Dim) { UNREACHABLE("Dim outside range!"); } -uint32_t getNumberOfThreadsInKernel() { +OMP_ATTRS uint32_t getNumberOfThreadsInKernel() { return impl::getNumberOfThreadsInBlock(0) * impl::getNumberOfBlocksInKernel(0) * impl::getNumberOfThreadsInBlock(1) * @@ -205,11 +205,11 @@ uint32_t getNumberOfThreadsInKernel() { impl::getNumberOfBlocksInKernel(2); } -uint32_t getWarpIdInBlock() { +OMP_ATTRS uint32_t getWarpIdInBlock() { return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize(); } -uint32_t getNumberOfWarpsInBlock() { +OMP_ATTRS uint32_t 
getNumberOfWarpsInBlock() { return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) / mapping::getWarpSize(); } @@ -224,13 +224,13 @@ uint32_t getNumberOfWarpsInBlock() { /// below to avoid repeating assumptions or including irrelevant ones. ///{ -static bool isInLastWarp() { +OMP_ATTRS static bool isInLastWarp() { uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - 1) & ~(mapping::getWarpSize() - 1); return mapping::getThreadIdInBlock() == MainTId; } -bool mapping::isMainThreadInGenericMode(bool IsSPMD) { +OMP_ATTRS bool mapping::isMainThreadInGenericMode(bool IsSPMD) { if (IsSPMD || icv::Level) return false; @@ -238,83 +238,83 @@ bool mapping::isMainThreadInGenericMode(bool IsSPMD) { return isInLastWarp(); } -bool mapping::isMainThreadInGenericMode() { +OMP_ATTRS bool mapping::isMainThreadInGenericMode() { return mapping::isMainThreadInGenericMode(mapping::isSPMDMode()); } -bool mapping::isInitialThreadInLevel0(bool IsSPMD) { +OMP_ATTRS bool mapping::isInitialThreadInLevel0(bool IsSPMD) { if (IsSPMD) return mapping::getThreadIdInBlock() == 0; return isInLastWarp(); } -bool mapping::isLeaderInWarp() { +OMP_ATTRS bool mapping::isLeaderInWarp() { __kmpc_impl_lanemask_t Active = mapping::activemask(); __kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT(); return utils::popc(Active & LaneMaskLT) == 0; } -LaneMaskTy mapping::activemask() { return impl::activemask(); } +OMP_ATTRS LaneMaskTy mapping::activemask() { return impl::activemask(); } -LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); } +OMP_ATTRS LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); } -LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); } +OMP_ATTRS LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); } -uint32_t mapping::getThreadIdInWarp() { +OMP_ATTRS uint32_t mapping::getThreadIdInWarp() { uint32_t ThreadIdInWarp = impl::getThreadIdInWarp(); ASSERT(ThreadIdInWarp < impl::getWarpSize(), nullptr); return ThreadIdInWarp; } -uint32_t mapping::getThreadIdInBlock(int32_t Dim) { +OMP_ATTRS uint32_t mapping::getThreadIdInBlock(int32_t Dim) { uint32_t ThreadIdInBlock = impl::getThreadIdInBlock(Dim); return ThreadIdInBlock; } -uint32_t mapping::getWarpSize() { return impl::getWarpSize(); } +OMP_ATTRS uint32_t mapping::getWarpSize() { return impl::getWarpSize(); } -uint32_t mapping::getMaxTeamThreads(bool IsSPMD) { +OMP_ATTRS uint32_t mapping::getMaxTeamThreads(bool IsSPMD) { uint32_t BlockSize = mapping::getNumberOfThreadsInBlock(); // If we are in SPMD mode, remove one warp. 
return BlockSize - (!IsSPMD * impl::getWarpSize()); } -uint32_t mapping::getMaxTeamThreads() { +OMP_ATTRS uint32_t mapping::getMaxTeamThreads() { return mapping::getMaxTeamThreads(mapping::isSPMDMode()); } -uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) { +OMP_ATTRS uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) { return impl::getNumberOfThreadsInBlock(Dim); } -uint32_t mapping::getNumberOfThreadsInKernel() { +OMP_ATTRS uint32_t mapping::getNumberOfThreadsInKernel() { return impl::getNumberOfThreadsInKernel(); } -uint32_t mapping::getWarpIdInBlock() { +OMP_ATTRS uint32_t mapping::getWarpIdInBlock() { uint32_t WarpID = impl::getWarpIdInBlock(); ASSERT(WarpID < impl::getNumberOfWarpsInBlock(), nullptr); return WarpID; } -uint32_t mapping::getBlockIdInKernel(int32_t Dim) { +OMP_ATTRS uint32_t mapping::getBlockIdInKernel(int32_t Dim) { uint32_t BlockId = impl::getBlockIdInKernel(Dim); ASSERT(BlockId < impl::getNumberOfBlocksInKernel(Dim), nullptr); return BlockId; } -uint32_t mapping::getNumberOfWarpsInBlock() { +OMP_ATTRS uint32_t mapping::getNumberOfWarpsInBlock() { uint32_t NumberOfWarpsInBlocks = impl::getNumberOfWarpsInBlock(); ASSERT(impl::getWarpIdInBlock() < NumberOfWarpsInBlocks, nullptr); return NumberOfWarpsInBlocks; } -uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) { +OMP_ATTRS uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) { uint32_t NumberOfBlocks = impl::getNumberOfBlocksInKernel(Dim); ASSERT(impl::getBlockIdInKernel(Dim) < NumberOfBlocks, nullptr); return NumberOfBlocks; } -uint32_t mapping::getNumberOfProcessorElements() { +OMP_ATTRS uint32_t mapping::getNumberOfProcessorElements() { return static_cast(config::getHardwareParallelism()); } @@ -328,26 +328,27 @@ uint32_t mapping::getNumberOfProcessorElements() { // the TU. We will need to solve this more correctly in the future. 
[[gnu::weak]] int SHARED(IsSPMDMode); -void mapping::init(bool IsSPMD) { +OMP_ATTRS void mapping::init(bool IsSPMD) { if (mapping::isInitialThreadInLevel0(IsSPMD)) IsSPMDMode = IsSPMD; } -bool mapping::isSPMDMode() { return IsSPMDMode; } +OMP_ATTRS bool mapping::isSPMDMode() { return IsSPMDMode; } -bool mapping::isGenericMode() { return !isSPMDMode(); } +OMP_ATTRS bool mapping::isGenericMode() { return !isSPMDMode(); } ///} extern "C" { -[[gnu::noinline]] uint32_t __kmpc_get_hardware_thread_id_in_block() { +[[gnu::noinline]] OMP_ATTRS uint32_t __kmpc_get_hardware_thread_id_in_block() { return mapping::getThreadIdInBlock(); } -[[gnu::noinline]] uint32_t __kmpc_get_hardware_num_threads_in_block() { +[[gnu::noinline]] OMP_ATTRS uint32_t +__kmpc_get_hardware_num_threads_in_block() { return impl::getNumberOfThreadsInBlock(mapping::DIM_X); } -[[gnu::noinline]] uint32_t __kmpc_get_warp_size() { +[[gnu::noinline]] OMP_ATTRS uint32_t __kmpc_get_warp_size() { return impl::getWarpSize(); } } @@ -361,26 +362,28 @@ _TGT_KERNEL_LANGUAGE(block_dim, getNumberOfThreadsInBlock) _TGT_KERNEL_LANGUAGE(grid_dim, getNumberOfBlocksInKernel) extern "C" { -uint64_t ompx_ballot_sync(uint64_t mask, int pred) { +OMP_ATTRS uint64_t ompx_ballot_sync(uint64_t mask, int pred) { return utils::ballotSync(mask, pred); } -int ompx_shfl_down_sync_i(uint64_t mask, int var, unsigned delta, int width) { +OMP_ATTRS int ompx_shfl_down_sync_i(uint64_t mask, int var, unsigned delta, + int width) { return utils::shuffleDown(mask, var, delta, width); } -float ompx_shfl_down_sync_f(uint64_t mask, float var, unsigned delta, - int width) { +OMP_ATTRS float ompx_shfl_down_sync_f(uint64_t mask, float var, unsigned delta, + int width) { return utils::bitCast<float>( utils::shuffleDown(mask, utils::bitCast<int32_t>(var), delta, width)); } -long ompx_shfl_down_sync_l(uint64_t mask, long var, unsigned delta, int width) { +OMP_ATTRS long ompx_shfl_down_sync_l(uint64_t mask, long var, unsigned delta, + int width) { return utils::shuffleDown(mask, var, delta, width); } -double ompx_shfl_down_sync_d(uint64_t mask, double var, unsigned delta, - int width) { +OMP_ATTRS double ompx_shfl_down_sync_d(uint64_t mask, double var, + unsigned delta, int width) { return utils::bitCast<double>( utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width)); } diff --git a/offload/DeviceRTL/src/Misc.cpp b/offload/DeviceRTL/src/Misc.cpp index ba6fbf5d5c7e3..8489e1e011e93 100644 --- a/offload/DeviceRTL/src/Misc.cpp +++ b/offload/DeviceRTL/src/Misc.cpp @@ -22,23 +22,23 @@ namespace ompx { namespace impl { -double getWTick(); +OMP_ATTRS double getWTick(); -double getWTime(); +OMP_ATTRS double getWTime(); /// AMDGCN Implementation /// ///{ #pragma omp begin declare variant match(device = {arch(amdgcn)}) -double getWTick() { +OMP_ATTRS double getWTick() { // The number of ticks per second for the AMDGPU clock varies by card and can // only be retrived by querying the driver. We rely on the device environment // to inform us what the proper frequency is. 
return 1.0 / config::getClockFrequency(); } -double getWTime() { +OMP_ATTRS double getWTime() { return static_cast(__builtin_readsteadycounter()) * getWTick(); } @@ -51,12 +51,12 @@ double getWTime() { device = {arch(nvptx, nvptx64)}, \ implementation = {extension(match_any)}) -double getWTick() { +OMP_ATTRS double getWTick() { // Timer precision is 1ns return ((double)1E-9); } -double getWTime() { +OMP_ATTRS double getWTime() { uint64_t nsecs = __nvvm_read_ptx_sreg_globaltimer(); return static_cast(nsecs) * getWTick(); } @@ -66,7 +66,7 @@ double getWTime() { /// Lookup a device-side function using a host pointer /p HstPtr using the table /// provided by the device plugin. The table is an ordered pair of host and /// device pointers sorted on the value of the host pointer. -void *indirectCallLookup(void *HstPtr) { +OMP_ATTRS void *indirectCallLookup(void *HstPtr) { if (!HstPtr) return nullptr; @@ -111,7 +111,8 @@ void *indirectCallLookup(void *HstPtr) { [[gnu::visibility("protected"), gnu::weak, gnu::retain]] rpc::Client Client asm("__llvm_rpc_client"); #else -[[gnu::visibility("protected"), gnu::weak]] rpc::Client Client asm("__llvm_rpc_client"); +[[gnu::visibility("protected"), + gnu::weak]] rpc::Client Client asm("__llvm_rpc_client"); #endif } // namespace impl @@ -122,19 +123,21 @@ void *indirectCallLookup(void *HstPtr) { ///{ extern "C" { -int32_t __kmpc_cancellationpoint(IdentTy *, int32_t, int32_t) { return 0; } +OMP_ATTRS int32_t __kmpc_cancellationpoint(IdentTy *, int32_t, int32_t) { + return 0; +} -int32_t __kmpc_cancel(IdentTy *, int32_t, int32_t) { return 0; } +OMP_ATTRS int32_t __kmpc_cancel(IdentTy *, int32_t, int32_t) { return 0; } -double omp_get_wtick(void) { return ompx::impl::getWTick(); } +OMP_ATTRS double omp_get_wtick(void) { return ompx::impl::getWTick(); } -double omp_get_wtime(void) { return ompx::impl::getWTime(); } +OMP_ATTRS double omp_get_wtime(void) { return ompx::impl::getWTime(); } -void *__llvm_omp_indirect_call_lookup(void *HstPtr) { +OMP_ATTRS void *__llvm_omp_indirect_call_lookup(void *HstPtr) { return ompx::impl::indirectCallLookup(HstPtr); } -void *omp_alloc(size_t size, omp_allocator_handle_t allocator) { +OMP_ATTRS void *omp_alloc(size_t size, omp_allocator_handle_t allocator) { switch (allocator) { case omp_default_mem_alloc: case omp_large_cap_mem_alloc: @@ -147,7 +150,7 @@ void *omp_alloc(size_t size, omp_allocator_handle_t allocator) { } } -void omp_free(void *ptr, omp_allocator_handle_t allocator) { +OMP_ATTRS void omp_free(void *ptr, omp_allocator_handle_t allocator) { switch (allocator) { case omp_default_mem_alloc: case omp_large_cap_mem_alloc: @@ -161,7 +164,8 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) { } } -unsigned long long __llvm_omp_host_call(void *fn, void *data, size_t size) { +OMP_ATTRS unsigned long long __llvm_omp_host_call(void *fn, void *data, + size_t size) { rpc::Client::Port Port = ompx::impl::Client.open(); Port.send_n(data, size); Port.send([=](rpc::Buffer *buffer, uint32_t) { diff --git a/offload/DeviceRTL/src/Parallelism.cpp b/offload/DeviceRTL/src/Parallelism.cpp index 5286d53b623f0..8473da4c5fa54 100644 --- a/offload/DeviceRTL/src/Parallelism.cpp +++ b/offload/DeviceRTL/src/Parallelism.cpp @@ -46,7 +46,7 @@ using namespace ompx; namespace { -uint32_t determineNumberOfThreads(int32_t NumThreadsClause) { +OMP_ATTRS uint32_t determineNumberOfThreads(int32_t NumThreadsClause) { uint32_t NThreadsICV = NumThreadsClause != -1 ? 
NumThreadsClause : icv::NThreads; uint32_t NumThreads = mapping::getMaxTeamThreads(); @@ -68,9 +68,10 @@ uint32_t determineNumberOfThreads(int32_t NumThreadsClause) { } // Invoke an outlined parallel function unwrapping arguments (up to 32). -[[clang::always_inline]] void invokeMicrotask(int32_t global_tid, - int32_t bound_tid, void *fn, - void **args, int64_t nargs) { +[[clang::always_inline]] OMP_ATTRS void invokeMicrotask(int32_t global_tid, + int32_t bound_tid, + void *fn, void **args, + int64_t nargs) { switch (nargs) { #include "generated_microtask_cases.gen" default: @@ -83,10 +84,9 @@ uint32_t determineNumberOfThreads(int32_t NumThreadsClause) { extern "C" { -[[clang::always_inline]] void __kmpc_parallel_spmd(IdentTy *ident, - int32_t num_threads, - void *fn, void **args, - const int64_t nargs) { +[[clang::always_inline]] OMP_ATTRS void +__kmpc_parallel_spmd(IdentTy *ident, int32_t num_threads, void *fn, void **args, + const int64_t nargs) { uint32_t TId = mapping::getThreadIdInBlock(); uint32_t NumThreads = determineNumberOfThreads(num_threads); uint32_t PTeamSize = @@ -141,7 +141,7 @@ extern "C" { return; } -[[clang::always_inline]] void +[[clang::always_inline]] OMP_ATTRS void __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, int32_t num_threads, int proc_bind, void *fn, void *wrapper_fn, void **args, int64_t nargs) { @@ -278,7 +278,8 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, __kmpc_end_sharing_variables(); } -[[clang::noinline]] bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) { +[[clang::noinline]] OMP_ATTRS bool +__kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) { // Work function and arguments for L1 parallel region. *WorkFn = state::ParallelRegionFn; @@ -292,7 +293,7 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, return ThreadIsActive; } -[[clang::noinline]] void __kmpc_kernel_end_parallel() { +[[clang::noinline]] OMP_ATTRS void __kmpc_kernel_end_parallel() { // In case we have modified an ICV for this thread before a ThreadState was // created. We drop it now to not contaminate the next parallel region. ASSERT(!mapping::isSPMDMode(), nullptr); @@ -301,14 +302,19 @@ __kmpc_parallel_51(IdentTy *ident, int32_t, int32_t if_expr, ASSERT(!mapping::isSPMDMode(), nullptr); } -uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); } +OMP_ATTRS uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { + return omp_get_level(); +} -int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); } +OMP_ATTRS int32_t __kmpc_global_thread_num(IdentTy *) { + return omp_get_thread_num(); +} -void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams, - int32_t thread_limit) {} +OMP_ATTRS void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, + int32_t num_teams, int32_t thread_limit) {} -void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {} +OMP_ATTRS void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, + int proc_bind) {} } #pragma omp end declare target diff --git a/offload/DeviceRTL/src/Profiling.cpp b/offload/DeviceRTL/src/Profiling.cpp index bb3caaadcc03d..0a0be5a2f028d 100644 --- a/offload/DeviceRTL/src/Profiling.cpp +++ b/offload/DeviceRTL/src/Profiling.cpp @@ -14,9 +14,9 @@ extern "C" { // Provides empty implementations for certain functions in compiler-rt // that are emitted by the PGO instrumentation. 
-void __llvm_profile_register_function(void *Ptr) {} -void __llvm_profile_register_names_function(void *Ptr, long int I) {} -void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2) {} +OMP_ATTRS void __llvm_profile_register_function(void *Ptr) {} +OMP_ATTRS void __llvm_profile_register_names_function(void *Ptr, long int I) {} +OMP_ATTRS void __llvm_profile_instrument_memop(long int I, void *Ptr, int I2) {} } #pragma omp end declare target diff --git a/offload/DeviceRTL/src/Reduction.cpp b/offload/DeviceRTL/src/Reduction.cpp index d3b4528401953..5ecefddfaa51f 100644 --- a/offload/DeviceRTL/src/Reduction.cpp +++ b/offload/DeviceRTL/src/Reduction.cpp @@ -24,15 +24,17 @@ namespace { #pragma omp begin declare target device_type(nohost) -void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) { +OMP_ATTRS void gpu_regular_warp_reduce(void *reduce_data, + ShuffleReductFnTy shflFct) { for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) { shflFct(reduce_data, /*LaneId - not used= */ 0, /*Offset = */ mask, /*AlgoVersion=*/0); } } -void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct, - uint32_t size, uint32_t tid) { +OMP_ATTRS void gpu_irregular_warp_reduce(void *reduce_data, + ShuffleReductFnTy shflFct, + uint32_t size, uint32_t tid) { uint32_t curr_size; uint32_t mask; curr_size = size; @@ -44,8 +46,8 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct, } } -static uint32_t gpu_irregular_simd_reduce(void *reduce_data, - ShuffleReductFnTy shflFct) { +OMP_ATTRS static uint32_t gpu_irregular_simd_reduce(void *reduce_data, + ShuffleReductFnTy shflFct) { uint32_t size, remote_id, physical_lane_id; physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize(); __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT(); @@ -63,9 +65,9 @@ static uint32_t gpu_irregular_simd_reduce(void *reduce_data, return (logical_lane_id == 0); } -static int32_t nvptx_parallel_reduce_nowait(void *reduce_data, - ShuffleReductFnTy shflFct, - InterWarpCopyFnTy cpyFct) { +OMP_ATTRS static int32_t +nvptx_parallel_reduce_nowait(void *reduce_data, ShuffleReductFnTy shflFct, + InterWarpCopyFnTy cpyFct) { uint32_t BlockThreadId = mapping::getThreadIdInBlock(); if (mapping::isMainThreadInGenericMode(/*IsSPMD=*/false)) BlockThreadId = 0; @@ -73,16 +75,16 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data, if (NumThreads == 1) return 1; - // - // This reduce function handles reduction within a team. It handles - // parallel regions in both L1 and L2 parallelism levels. It also - // supports Generic, SPMD, and NoOMP modes. - // - // 1. Reduce within a warp. - // 2. Warp master copies value to warp 0 via shared memory. - // 3. Warp 0 reduces to a single value. - // 4. The reduced value is available in the thread that returns 1. - // + // + // This reduce function handles reduction within a team. It handles + // parallel regions in both L1 and L2 parallelism levels. It also + // supports Generic, SPMD, and NoOMP modes. + // + // 1. Reduce within a warp. + // 2. Warp master copies value to warp 0 via shared memory. + // 3. Warp 0 reduces to a single value. + // 4. The reduced value is available in the thread that returns 1. 
+ // #if __has_builtin(__nvvm_reflect) if (__nvvm_reflect("__CUDA_ARCH") >= 700) { @@ -157,26 +159,24 @@ static int32_t nvptx_parallel_reduce_nowait(void *reduce_data, return BlockThreadId == 0; } -uint32_t roundToWarpsize(uint32_t s) { +OMP_ATTRS uint32_t roundToWarpsize(uint32_t s) { if (s < mapping::getWarpSize()) return 1; return (s & ~(unsigned)(mapping::getWarpSize() - 1)); } -uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; } +OMP_ATTRS uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; } } // namespace extern "C" { -int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(IdentTy *Loc, - uint64_t reduce_data_size, - void *reduce_data, - ShuffleReductFnTy shflFct, - InterWarpCopyFnTy cpyFct) { +OMP_ATTRS int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( + IdentTy *Loc, uint64_t reduce_data_size, void *reduce_data, + ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) { return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct); } -int32_t __kmpc_nvptx_teams_reduce_nowait_v2( +OMP_ATTRS int32_t __kmpc_nvptx_teams_reduce_nowait_v2( IdentTy *Loc, void *GlobalBuffer, uint32_t num_of_records, uint64_t reduce_data_size, void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct, ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, @@ -313,7 +313,7 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2( } } -void *__kmpc_reduction_get_fixed_buffer() { +OMP_ATTRS void *__kmpc_reduction_get_fixed_buffer() { return state::getKernelLaunchEnvironment().ReductionBuffer; } diff --git a/offload/DeviceRTL/src/State.cpp b/offload/DeviceRTL/src/State.cpp index 855c74fa58e0a..baa8d7364d75e 100644 --- a/offload/DeviceRTL/src/State.cpp +++ b/offload/DeviceRTL/src/State.cpp @@ -53,13 +53,15 @@ namespace { extern "C" { #ifdef __AMDGPU__ -[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); } -[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); } +[[gnu::weak]] OMP_ATTRS void *malloc(size_t Size) { + return allocator::alloc(Size); +} +[[gnu::weak]] OMP_ATTRS void free(void *Ptr) { allocator::free(Ptr); } #else -[[gnu::weak, gnu::leaf]] void *malloc(size_t Size); -[[gnu::weak, gnu::leaf]] void free(void *Ptr); +[[gnu::weak, gnu::leaf]] OMP_ATTRS void *malloc(size_t Size); +[[gnu::weak, gnu::leaf]] OMP_ATTRS void free(void *Ptr); #endif } @@ -76,19 +78,19 @@ extern "C" { /// struct SharedMemorySmartStackTy { /// Initialize the stack. Must be called by all threads. - void init(bool IsSPMD); + OMP_ATTRS void init(bool IsSPMD); /// Allocate \p Bytes on the stack for the encountering thread. Each thread /// can call this function. - void *push(uint64_t Bytes); + OMP_ATTRS void *push(uint64_t Bytes); /// Deallocate the last allocation made by the encountering thread and pointed /// to by \p Ptr from the stack. Each thread can call this function. - void pop(void *Ptr, uint64_t Bytes); + OMP_ATTRS void pop(void *Ptr, uint64_t Bytes); private: /// Compute the size of the storage space reserved for a thread. - uint32_t computeThreadStorageTotal() { + OMP_ATTRS uint32_t computeThreadStorageTotal() { uint32_t NumLanesInBlock = mapping::getNumberOfThreadsInBlock(); return utils::alignDown((state::SharedScratchpadSize / NumLanesInBlock), allocator::ALIGNMENT); @@ -96,7 +98,7 @@ struct SharedMemorySmartStackTy { /// Return the top address of the warp data stack, that is the first address /// this warp will allocate memory at next. 
- void *getThreadDataTop(uint32_t TId) { + OMP_ATTRS void *getThreadDataTop(uint32_t TId) { return &Data[computeThreadStorageTotal() * TId + Usage[TId]]; } @@ -113,11 +115,11 @@ static_assert(state::SharedScratchpadSize / mapping::MaxThreadsPerTeam <= 256, /// The allocation of a single shared memory scratchpad. static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack); -void SharedMemorySmartStackTy::init(bool IsSPMD) { +OMP_ATTRS void SharedMemorySmartStackTy::init(bool IsSPMD) { Usage[mapping::getThreadIdInBlock()] = 0; } -void *SharedMemorySmartStackTy::push(uint64_t Bytes) { +OMP_ATTRS void *SharedMemorySmartStackTy::push(uint64_t Bytes) { // First align the number of requested bytes. /// FIXME: The stack shouldn't require worst-case padding. Alignment needs to /// be passed in as an argument and the stack rewritten to support it. @@ -148,7 +150,7 @@ void *SharedMemorySmartStackTy::push(uint64_t Bytes) { return GlobalMemory; } -void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) { +OMP_ATTRS void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) { uint64_t AlignedBytes = utils::alignPtr(Bytes, allocator::ALIGNMENT); if (utils::isSharedMemPtr(Ptr)) { int TId = mapping::getThreadIdInBlock(); @@ -160,28 +162,29 @@ void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) { } // namespace -void *memory::getDynamicBuffer() { return DynamicSharedBuffer; } +OMP_ATTRS void *memory::getDynamicBuffer() { return DynamicSharedBuffer; } -void *memory::allocShared(uint64_t Bytes, const char *Reason) { +OMP_ATTRS void *memory::allocShared(uint64_t Bytes, const char *Reason) { return SharedMemorySmartStack.push(Bytes); } -void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) { +OMP_ATTRS void memory::freeShared(void *Ptr, uint64_t Bytes, + const char *Reason) { SharedMemorySmartStack.pop(Ptr, Bytes); } -void *memory::allocGlobal(uint64_t Bytes, const char *Reason) { +OMP_ATTRS void *memory::allocGlobal(uint64_t Bytes, const char *Reason) { void *Ptr = malloc(Bytes); if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr) PRINT("nullptr returned by malloc!\n"); return Ptr; } -void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); } +OMP_ATTRS void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); } ///} -bool state::ICVStateTy::operator==(const ICVStateTy &Other) const { +OMP_ATTRS bool state::ICVStateTy::operator==(const ICVStateTy &Other) const { return (NThreadsVar == Other.NThreadsVar) & (LevelVar == Other.LevelVar) & (ActiveLevelVar == Other.ActiveLevelVar) & (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) & @@ -189,7 +192,7 @@ bool state::ICVStateTy::operator==(const ICVStateTy &Other) const { (RunSchedChunkVar == Other.RunSchedChunkVar); } -void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const { +OMP_ATTRS void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const { ASSERT(NThreadsVar == Other.NThreadsVar, nullptr); ASSERT(LevelVar == Other.LevelVar, nullptr); ASSERT(ActiveLevelVar == Other.ActiveLevelVar, nullptr); @@ -198,7 +201,7 @@ void state::ICVStateTy::assertEqual(const ICVStateTy &Other) const { ASSERT(RunSchedChunkVar == Other.RunSchedChunkVar, nullptr); } -void state::TeamStateTy::init(bool IsSPMD) { +OMP_ATTRS void state::TeamStateTy::init(bool IsSPMD) { ICVState.NThreadsVar = 0; ICVState.LevelVar = 0; ICVState.ActiveLevelVar = 0; @@ -211,13 +214,13 @@ void state::TeamStateTy::init(bool IsSPMD) { ParallelRegionFnVar = nullptr; } -bool state::TeamStateTy::operator==(const 
TeamStateTy &Other) const { +OMP_ATTRS bool state::TeamStateTy::operator==(const TeamStateTy &Other) const { return (ICVState == Other.ICVState) & (HasThreadState == Other.HasThreadState) & (ParallelTeamSize == Other.ParallelTeamSize); } -void state::TeamStateTy::assertEqual(TeamStateTy &Other) const { +OMP_ATTRS void state::TeamStateTy::assertEqual(TeamStateTy &Other) const { ICVState.assertEqual(Other.ICVState); ASSERT(ParallelTeamSize == Other.ParallelTeamSize, nullptr); ASSERT(HasThreadState == Other.HasThreadState, nullptr); @@ -228,8 +231,8 @@ state::ThreadStateTy **SHARED(ompx::state::ThreadStates); namespace { -int returnValIfLevelIsActive(int Level, int Val, int DefaultVal, - int OutOfBoundsVal = -1) { +OMP_ATTRS int returnValIfLevelIsActive(int Level, int Val, int DefaultVal, + int OutOfBoundsVal = -1) { if (Level == 0) return DefaultVal; int LevelVar = omp_get_level(); @@ -243,8 +246,8 @@ int returnValIfLevelIsActive(int Level, int Val, int DefaultVal, } // namespace -void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, - KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { +OMP_ATTRS void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, + KernelLaunchEnvironmentTy &KernelLaunchEnvironment) { SharedMemorySmartStack.init(IsSPMD); if (mapping::isInitialThreadInLevel0(IsSPMD)) { TeamState.init(IsSPMD); @@ -254,15 +257,15 @@ void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, } } -KernelEnvironmentTy &state::getKernelEnvironment() { +OMP_ATTRS KernelEnvironmentTy &state::getKernelEnvironment() { return *KernelEnvironmentPtr; } -KernelLaunchEnvironmentTy &state::getKernelLaunchEnvironment() { +OMP_ATTRS KernelLaunchEnvironmentTy &state::getKernelLaunchEnvironment() { return *KernelLaunchEnvironmentPtr; } -void state::enterDataEnvironment(IdentTy *Ident) { +OMP_ATTRS void state::enterDataEnvironment(IdentTy *Ident) { ASSERT(config::mayUseThreadStates(), "Thread state modified while explicitly disabled!"); if (!config::mayUseThreadStates()) @@ -291,7 +294,7 @@ void state::enterDataEnvironment(IdentTy *Ident) { ThreadStates[TId] = NewThreadState; } -void state::exitDataEnvironment() { +OMP_ATTRS void state::exitDataEnvironment() { ASSERT(config::mayUseThreadStates(), "Thread state modified while explicitly disabled!"); @@ -299,7 +302,7 @@ void state::exitDataEnvironment() { resetStateForThread(TId); } -void state::resetStateForThread(uint32_t TId) { +OMP_ATTRS void state::resetStateForThread(uint32_t TId) { if (!config::mayUseThreadStates()) return; if (OMP_LIKELY(!TeamState.HasThreadState || !ThreadStates[TId])) @@ -310,7 +313,7 @@ void state::resetStateForThread(uint32_t TId) { ThreadStates[TId] = PreviousThreadState; } -void state::runAndCheckState(void(Func(void))) { +OMP_ATTRS void state::runAndCheckState(void(Func(void))) { TeamStateTy OldTeamState = TeamState; OldTeamState.assertEqual(TeamState); @@ -319,133 +322,146 @@ void state::runAndCheckState(void(Func(void))) { OldTeamState.assertEqual(TeamState); } -void state::assumeInitialState(bool IsSPMD) { +OMP_ATTRS void state::assumeInitialState(bool IsSPMD) { TeamStateTy InitialTeamState; InitialTeamState.init(IsSPMD); InitialTeamState.assertEqual(TeamState); ASSERT(mapping::isSPMDMode() == IsSPMD, nullptr); } -int state::getEffectivePTeamSize() { +OMP_ATTRS int state::getEffectivePTeamSize() { int PTeamSize = state::ParallelTeamSize; return PTeamSize ? 
PTeamSize : mapping::getMaxTeamThreads(); } extern "C" { -void omp_set_dynamic(int V) {} +OMP_ATTRS void omp_set_dynamic(int V) {} -int omp_get_dynamic(void) { return 0; } +OMP_ATTRS int omp_get_dynamic(void) { return 0; } -void omp_set_num_threads(int V) { icv::NThreads = V; } +OMP_ATTRS void omp_set_num_threads(int V) { icv::NThreads = V; } -int omp_get_max_threads(void) { +OMP_ATTRS int omp_get_max_threads(void) { int NT = icv::NThreads; return NT > 0 ? NT : mapping::getMaxTeamThreads(); } -int omp_get_level(void) { +OMP_ATTRS int omp_get_level(void) { int LevelVar = icv::Level; ASSERT(LevelVar >= 0, nullptr); return LevelVar; } -int omp_get_active_level(void) { return !!icv::ActiveLevel; } +OMP_ATTRS int omp_get_active_level(void) { return !!icv::ActiveLevel; } -int omp_in_parallel(void) { return !!icv::ActiveLevel; } +OMP_ATTRS int omp_in_parallel(void) { return !!icv::ActiveLevel; } -void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) { +OMP_ATTRS void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) { *ScheduleKind = static_cast((int)icv::RunSched); *ChunkSize = state::RunSchedChunk; } -void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) { +OMP_ATTRS void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) { icv::RunSched = (int)ScheduleKind; state::RunSchedChunk = ChunkSize; } -int omp_get_ancestor_thread_num(int Level) { +OMP_ATTRS int omp_get_ancestor_thread_num(int Level) { return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 0); } -int omp_get_thread_num(void) { +OMP_ATTRS int omp_get_thread_num(void) { return omp_get_ancestor_thread_num(omp_get_level()); } -int omp_get_team_size(int Level) { +OMP_ATTRS int omp_get_team_size(int Level) { return returnValIfLevelIsActive(Level, state::getEffectivePTeamSize(), 1); } -int omp_get_num_threads(void) { +OMP_ATTRS int omp_get_num_threads(void) { return omp_get_level() != 1 ? 1 : state::getEffectivePTeamSize(); } -int omp_get_thread_limit(void) { return mapping::getMaxTeamThreads(); } +OMP_ATTRS int omp_get_thread_limit(void) { + return mapping::getMaxTeamThreads(); +} -int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); } +OMP_ATTRS int omp_get_num_procs(void) { + return mapping::getNumberOfProcessorElements(); +} -void omp_set_nested(int) {} +OMP_ATTRS void omp_set_nested(int) {} -int omp_get_nested(void) { return false; } +OMP_ATTRS int omp_get_nested(void) { return false; } -void omp_set_max_active_levels(int Levels) { +OMP_ATTRS void omp_set_max_active_levels(int Levels) { icv::MaxActiveLevels = Levels > 0 ? 
1 : 0; } -int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; } +OMP_ATTRS int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; } -omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; } +OMP_ATTRS omp_proc_bind_t omp_get_proc_bind(void) { + return omp_proc_bind_false; +} -int omp_get_num_places(void) { return 0; } +OMP_ATTRS int omp_get_num_places(void) { return 0; } -int omp_get_place_num_procs(int) { return omp_get_num_procs(); } +OMP_ATTRS int omp_get_place_num_procs(int) { return omp_get_num_procs(); } -void omp_get_place_proc_ids(int, int *) { +OMP_ATTRS void omp_get_place_proc_ids(int, int *) { // TODO } -int omp_get_place_num(void) { return 0; } +OMP_ATTRS int omp_get_place_num(void) { return 0; } -int omp_get_partition_num_places(void) { return 0; } +OMP_ATTRS int omp_get_partition_num_places(void) { return 0; } -void omp_get_partition_place_nums(int *) { +OMP_ATTRS void omp_get_partition_place_nums(int *) { // TODO } -int omp_get_cancellation(void) { return 0; } +OMP_ATTRS int omp_get_cancellation(void) { return 0; } -void omp_set_default_device(int) {} +OMP_ATTRS void omp_set_default_device(int) {} -int omp_get_default_device(void) { return -1; } +OMP_ATTRS int omp_get_default_device(void) { return -1; } -int omp_get_num_devices(void) { return config::getNumDevices(); } +OMP_ATTRS int omp_get_num_devices(void) { return config::getNumDevices(); } -int omp_get_device_num(void) { return config::getDeviceNum(); } +OMP_ATTRS int omp_get_device_num(void) { return config::getDeviceNum(); } -int omp_get_num_teams(void) { return mapping::getNumberOfBlocksInKernel(); } +OMP_ATTRS int omp_get_num_teams(void) { + return mapping::getNumberOfBlocksInKernel(); +} -int omp_get_team_num() { return mapping::getBlockIdInKernel(); } +OMP_ATTRS int omp_get_team_num() { return mapping::getBlockIdInKernel(); } -int omp_get_initial_device(void) { return -1; } +OMP_ATTRS int omp_get_initial_device(void) { return -1; } -int omp_is_initial_device(void) { return 0; } +OMP_ATTRS int omp_is_initial_device(void) { return 0; } } extern "C" { -[[clang::noinline]] void *__kmpc_alloc_shared(uint64_t Bytes) { +[[clang::noinline]] OMP_ATTRS void *__kmpc_alloc_shared(uint64_t Bytes) { return memory::allocShared(Bytes, "Frontend alloc shared"); } -[[clang::noinline]] void __kmpc_free_shared(void *Ptr, uint64_t Bytes) { +[[clang::noinline]] OMP_ATTRS void __kmpc_free_shared(void *Ptr, + uint64_t Bytes) { memory::freeShared(Ptr, Bytes, "Frontend free shared"); } -void *__kmpc_get_dynamic_shared() { return memory::getDynamicBuffer(); } +OMP_ATTRS void *__kmpc_get_dynamic_shared() { + return memory::getDynamicBuffer(); +} -void *llvm_omp_target_dynamic_shared_alloc() { +OMP_ATTRS void *llvm_omp_target_dynamic_shared_alloc() { return __kmpc_get_dynamic_shared(); } -void *llvm_omp_get_dynamic_shared() { return __kmpc_get_dynamic_shared(); } +OMP_ATTRS void *llvm_omp_get_dynamic_shared() { + return __kmpc_get_dynamic_shared(); +} /// Allocate storage in shared memory to communicate arguments from the main /// thread to the workers in generic mode. 
If we exceed @@ -460,7 +476,8 @@ constexpr uint64_t NUM_SHARED_VARIABLES_IN_SHARED_MEM = 64; #pragma omp allocate(SharedMemVariableSharingSpacePtr) \ allocator(omp_pteam_mem_alloc) -void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) { +OMP_ATTRS void __kmpc_begin_sharing_variables(void ***GlobalArgs, + uint64_t nArgs) { if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) { SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0]; } else { @@ -472,12 +489,12 @@ void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t nArgs) { *GlobalArgs = SharedMemVariableSharingSpacePtr; } -void __kmpc_end_sharing_variables() { +OMP_ATTRS void __kmpc_end_sharing_variables() { if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0]) memory::freeGlobal(SharedMemVariableSharingSpacePtr, "new extended args"); } -void __kmpc_get_shared_variables(void ***GlobalArgs) { +OMP_ATTRS void __kmpc_get_shared_variables(void ***GlobalArgs) { *GlobalArgs = SharedMemVariableSharingSpacePtr; } } diff --git a/offload/DeviceRTL/src/Synchronization.cpp b/offload/DeviceRTL/src/Synchronization.cpp index 72a97ae3fcfb4..f054a8add1313 100644 --- a/offload/DeviceRTL/src/Synchronization.cpp +++ b/offload/DeviceRTL/src/Synchronization.cpp @@ -29,36 +29,41 @@ namespace impl { /// ///{ /// NOTE: This function needs to be implemented by every target. -uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering, - atomic::MemScopeTy MemScope); +OMP_ATTRS uint32_t atomicInc(uint32_t *Address, uint32_t Val, + atomic::OrderingTy Ordering, + atomic::MemScopeTy MemScope); ///} // Forward declarations defined to be defined for AMDGCN and NVPTX. -uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering, - atomic::MemScopeTy MemScope); -void namedBarrierInit(); -void namedBarrier(); -void fenceTeam(atomic::OrderingTy Ordering); -void fenceKernel(atomic::OrderingTy Ordering); -void fenceSystem(atomic::OrderingTy Ordering); -void syncWarp(__kmpc_impl_lanemask_t); -void syncThreads(atomic::OrderingTy Ordering); -void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); } -void unsetLock(omp_lock_t *); -int testLock(omp_lock_t *); -void initLock(omp_lock_t *); -void destroyLock(omp_lock_t *); -void setLock(omp_lock_t *); -void unsetCriticalLock(omp_lock_t *); -void setCriticalLock(omp_lock_t *); +OMP_ATTRS uint32_t atomicInc(uint32_t *A, uint32_t V, + atomic::OrderingTy Ordering, + atomic::MemScopeTy MemScope); +OMP_ATTRS void namedBarrierInit(); +OMP_ATTRS void namedBarrier(); +OMP_ATTRS void fenceTeam(atomic::OrderingTy Ordering); +OMP_ATTRS void fenceKernel(atomic::OrderingTy Ordering); +OMP_ATTRS void fenceSystem(atomic::OrderingTy Ordering); +OMP_ATTRS void syncWarp(__kmpc_impl_lanemask_t); +OMP_ATTRS void syncThreads(atomic::OrderingTy Ordering); +OMP_ATTRS void syncThreadsAligned(atomic::OrderingTy Ordering) { + syncThreads(Ordering); +} +OMP_ATTRS void unsetLock(omp_lock_t *); +OMP_ATTRS int testLock(omp_lock_t *); +OMP_ATTRS void initLock(omp_lock_t *); +OMP_ATTRS void destroyLock(omp_lock_t *); +OMP_ATTRS void setLock(omp_lock_t *); +OMP_ATTRS void unsetCriticalLock(omp_lock_t *); +OMP_ATTRS void setCriticalLock(omp_lock_t *); /// AMDGCN Implementation /// ///{ #pragma omp begin declare variant match(device = {arch(amdgcn)}) -uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering, - atomic::MemScopeTy MemScope) { +OMP_ATTRS uint32_t atomicInc(uint32_t *A, uint32_t V, + atomic::OrderingTy Ordering, + 
atomic::MemScopeTy MemScope) { // builtin_amdgcn_atomic_inc32 should expand to this switch when // passed a runtime value, but does not do so yet. Workaround here. @@ -91,12 +96,12 @@ uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering, uint32_t SHARED(namedBarrierTracker); -void namedBarrierInit() { +OMP_ATTRS void namedBarrierInit() { // Don't have global ctors, and shared memory is not zero init atomic::store(&namedBarrierTracker, 0u, atomic::release); } -void namedBarrier() { +OMP_ATTRS void namedBarrier() { uint32_t NumThreads = omp_get_num_threads(); // assert(NumThreads % 32 == 0); @@ -143,25 +148,25 @@ void namedBarrier() { fence::team(atomic::release); } -void fenceTeam(atomic::OrderingTy Ordering) { +OMP_ATTRS void fenceTeam(atomic::OrderingTy Ordering) { return __scoped_atomic_thread_fence(Ordering, atomic::workgroup); } -void fenceKernel(atomic::OrderingTy Ordering) { +OMP_ATTRS void fenceKernel(atomic::OrderingTy Ordering) { return __scoped_atomic_thread_fence(Ordering, atomic::device_); } -void fenceSystem(atomic::OrderingTy Ordering) { +OMP_ATTRS void fenceSystem(atomic::OrderingTy Ordering) { return __scoped_atomic_thread_fence(Ordering, atomic::system); } -void syncWarp(__kmpc_impl_lanemask_t) { +OMP_ATTRS void syncWarp(__kmpc_impl_lanemask_t) { // This is a no-op on current AMDGPU hardware but it is used by the optimizer // to enforce convergent behaviour between control flow graphs. __builtin_amdgcn_wave_barrier(); } -void syncThreads(atomic::OrderingTy Ordering) { +OMP_ATTRS void syncThreads(atomic::OrderingTy Ordering) { if (Ordering != atomic::relaxed) fenceTeam(Ordering == atomic::acq_rel ? atomic::release : atomic::seq_cst); @@ -170,23 +175,25 @@ void syncThreads(atomic::OrderingTy Ordering) { if (Ordering != atomic::relaxed) fenceTeam(Ordering == atomic::acq_rel ? atomic::aquire : atomic::seq_cst); } -void syncThreadsAligned(atomic::OrderingTy Ordering) { syncThreads(Ordering); } +OMP_ATTRS void syncThreadsAligned(atomic::OrderingTy Ordering) { + syncThreads(Ordering); +} // TODO: Don't have wavefront lane locks. Possibly can't have them. 
-void unsetLock(omp_lock_t *) { __builtin_trap(); } -int testLock(omp_lock_t *) { __builtin_trap(); } -void initLock(omp_lock_t *) { __builtin_trap(); } -void destroyLock(omp_lock_t *) { __builtin_trap(); } -void setLock(omp_lock_t *) { __builtin_trap(); } +OMP_ATTRS void unsetLock(omp_lock_t *) { __builtin_trap(); } +OMP_ATTRS int testLock(omp_lock_t *) { __builtin_trap(); } +OMP_ATTRS void initLock(omp_lock_t *) { __builtin_trap(); } +OMP_ATTRS void destroyLock(omp_lock_t *) { __builtin_trap(); } +OMP_ATTRS void setLock(omp_lock_t *) { __builtin_trap(); } constexpr uint32_t UNSET = 0; constexpr uint32_t SET = 1; -void unsetCriticalLock(omp_lock_t *Lock) { +OMP_ATTRS void unsetCriticalLock(omp_lock_t *Lock) { (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::acq_rel); } -void setCriticalLock(omp_lock_t *Lock) { +OMP_ATTRS void setCriticalLock(omp_lock_t *Lock) { uint64_t LowestActiveThread = utils::ffs(mapping::activemask()) - 1; if (mapping::getThreadIdInWarp() == LowestActiveThread) { fenceKernel(atomic::release); @@ -208,14 +215,15 @@ void setCriticalLock(omp_lock_t *Lock) { device = {arch(nvptx, nvptx64)}, \ implementation = {extension(match_any)}) -uint32_t atomicInc(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering, - atomic::MemScopeTy MemScope) { +OMP_ATTRS uint32_t atomicInc(uint32_t *Address, uint32_t Val, + atomic::OrderingTy Ordering, + atomic::MemScopeTy MemScope) { return __nvvm_atom_inc_gen_ui(Address, Val); } -void namedBarrierInit() {} +OMP_ATTRS void namedBarrierInit() {} -void namedBarrier() { +OMP_ATTRS void namedBarrier() { uint32_t NumThreads = omp_get_num_threads(); ASSERT(NumThreads % 32 == 0, nullptr); @@ -225,20 +233,24 @@ void namedBarrier() { __nvvm_barrier_sync_cnt(BarrierNo, NumThreads); } -void fenceTeam(atomic::OrderingTy) { __nvvm_membar_cta(); } +OMP_ATTRS void fenceTeam(atomic::OrderingTy) { __nvvm_membar_cta(); } -void fenceKernel(atomic::OrderingTy) { __nvvm_membar_gl(); } +OMP_ATTRS void fenceKernel(atomic::OrderingTy) { __nvvm_membar_gl(); } -void fenceSystem(atomic::OrderingTy) { __nvvm_membar_sys(); } +OMP_ATTRS void fenceSystem(atomic::OrderingTy) { __nvvm_membar_sys(); } -void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); } +OMP_ATTRS void syncWarp(__kmpc_impl_lanemask_t Mask) { + __nvvm_bar_warp_sync(Mask); +} -void syncThreads(atomic::OrderingTy Ordering) { +OMP_ATTRS void syncThreads(atomic::OrderingTy Ordering) { constexpr int BarrierNo = 8; __nvvm_barrier_sync(BarrierNo); } -void syncThreadsAligned(atomic::OrderingTy Ordering) { __syncthreads(); } +OMP_ATTRS void syncThreadsAligned(atomic::OrderingTy Ordering) { + __syncthreads(); +} constexpr uint32_t OMP_SPIN = 1000; constexpr uint32_t UNSET = 0; @@ -247,19 +259,19 @@ constexpr uint32_t SET = 1; // TODO: This seems to hide a bug in the declare variant handling. If it is // called before it is defined // here the overload won't happen. Investigate lalter! 
-void unsetLock(omp_lock_t *Lock) { +OMP_ATTRS void unsetLock(omp_lock_t *Lock) { (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::seq_cst); } -int testLock(omp_lock_t *Lock) { +OMP_ATTRS int testLock(omp_lock_t *Lock) { return atomic::add((uint32_t *)Lock, 0u, atomic::seq_cst); } -void initLock(omp_lock_t *Lock) { unsetLock(Lock); } +OMP_ATTRS void initLock(omp_lock_t *Lock) { unsetLock(Lock); } -void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); } +OMP_ATTRS void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); } -void setLock(omp_lock_t *Lock) { +OMP_ATTRS void setLock(omp_lock_t *Lock) { // TODO: not sure spinning is a good idea here.. while (atomic::cas((uint32_t *)Lock, UNSET, SET, atomic::seq_cst, atomic::seq_cst) != UNSET) { @@ -275,56 +287,63 @@ void setLock(omp_lock_t *Lock) { } // wait for 0 to be the read value } -void unsetCriticalLock(omp_lock_t *Lock) { unsetLock(Lock); } +OMP_ATTRS void unsetCriticalLock(omp_lock_t *Lock) { unsetLock(Lock); } -void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); } +OMP_ATTRS void setCriticalLock(omp_lock_t *Lock) { setLock(Lock); } #pragma omp end declare variant ///} } // namespace impl -void synchronize::init(bool IsSPMD) { +OMP_ATTRS void synchronize::init(bool IsSPMD) { if (!IsSPMD) impl::namedBarrierInit(); } -void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); } +OMP_ATTRS void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); } -void synchronize::threads(atomic::OrderingTy Ordering) { +OMP_ATTRS void synchronize::threads(atomic::OrderingTy Ordering) { impl::syncThreads(Ordering); } -void synchronize::threadsAligned(atomic::OrderingTy Ordering) { +OMP_ATTRS void synchronize::threadsAligned(atomic::OrderingTy Ordering) { impl::syncThreadsAligned(Ordering); } -void fence::team(atomic::OrderingTy Ordering) { impl::fenceTeam(Ordering); } +OMP_ATTRS void fence::team(atomic::OrderingTy Ordering) { + impl::fenceTeam(Ordering); +} -void fence::kernel(atomic::OrderingTy Ordering) { impl::fenceKernel(Ordering); } +OMP_ATTRS void fence::kernel(atomic::OrderingTy Ordering) { + impl::fenceKernel(Ordering); +} -void fence::system(atomic::OrderingTy Ordering) { impl::fenceSystem(Ordering); } +OMP_ATTRS void fence::system(atomic::OrderingTy Ordering) { + impl::fenceSystem(Ordering); +} -uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering, - atomic::MemScopeTy MemScope) { +OMP_ATTRS uint32_t atomic::inc(uint32_t *Addr, uint32_t V, + atomic::OrderingTy Ordering, + atomic::MemScopeTy MemScope) { return impl::atomicInc(Addr, V, Ordering, MemScope); } -void unsetCriticalLock(omp_lock_t *Lock) { impl::unsetLock(Lock); } +OMP_ATTRS void unsetCriticalLock(omp_lock_t *Lock) { impl::unsetLock(Lock); } -void setCriticalLock(omp_lock_t *Lock) { impl::setLock(Lock); } +OMP_ATTRS void setCriticalLock(omp_lock_t *Lock) { impl::setLock(Lock); } extern "C" { -void __kmpc_ordered(IdentTy *Loc, int32_t TId) {} +OMP_ATTRS void __kmpc_ordered(IdentTy *Loc, int32_t TId) {} -void __kmpc_end_ordered(IdentTy *Loc, int32_t TId) {} +OMP_ATTRS void __kmpc_end_ordered(IdentTy *Loc, int32_t TId) {} -int32_t __kmpc_cancel_barrier(IdentTy *Loc, int32_t TId) { +OMP_ATTRS int32_t __kmpc_cancel_barrier(IdentTy *Loc, int32_t TId) { __kmpc_barrier(Loc, TId); return 0; } -void __kmpc_barrier(IdentTy *Loc, int32_t TId) { +OMP_ATTRS void __kmpc_barrier(IdentTy *Loc, int32_t TId) { if (mapping::isMainThreadInGenericMode()) return __kmpc_flush(Loc); @@ -334,66 +353,71 @@ void __kmpc_barrier(IdentTy *Loc, int32_t TId) { 
impl::namedBarrier(); } -[[clang::noinline]] void __kmpc_barrier_simple_spmd(IdentTy *Loc, int32_t TId) { +[[clang::noinline]] OMP_ATTRS void __kmpc_barrier_simple_spmd(IdentTy *Loc, + int32_t TId) { synchronize::threadsAligned(atomic::OrderingTy::seq_cst); } -[[clang::noinline]] void __kmpc_barrier_simple_generic(IdentTy *Loc, - int32_t TId) { +[[clang::noinline]] OMP_ATTRS void __kmpc_barrier_simple_generic(IdentTy *Loc, + int32_t TId) { synchronize::threads(atomic::OrderingTy::seq_cst); } -int32_t __kmpc_master(IdentTy *Loc, int32_t TId) { +OMP_ATTRS int32_t __kmpc_master(IdentTy *Loc, int32_t TId) { return omp_get_thread_num() == 0; } -void __kmpc_end_master(IdentTy *Loc, int32_t TId) {} +OMP_ATTRS void __kmpc_end_master(IdentTy *Loc, int32_t TId) {} -int32_t __kmpc_masked(IdentTy *Loc, int32_t TId, int32_t Filter) { +OMP_ATTRS int32_t __kmpc_masked(IdentTy *Loc, int32_t TId, int32_t Filter) { return omp_get_thread_num() == Filter; } -void __kmpc_end_masked(IdentTy *Loc, int32_t TId) {} +OMP_ATTRS void __kmpc_end_masked(IdentTy *Loc, int32_t TId) {} -int32_t __kmpc_single(IdentTy *Loc, int32_t TId) { +OMP_ATTRS int32_t __kmpc_single(IdentTy *Loc, int32_t TId) { return __kmpc_master(Loc, TId); } -void __kmpc_end_single(IdentTy *Loc, int32_t TId) { +OMP_ATTRS void __kmpc_end_single(IdentTy *Loc, int32_t TId) { // The barrier is explicitly called. } -void __kmpc_flush(IdentTy *Loc) { fence::kernel(atomic::seq_cst); } +OMP_ATTRS void __kmpc_flush(IdentTy *Loc) { fence::kernel(atomic::seq_cst); } -uint64_t __kmpc_warp_active_thread_mask(void) { return mapping::activemask(); } +OMP_ATTRS uint64_t __kmpc_warp_active_thread_mask(void) { + return mapping::activemask(); +} -void __kmpc_syncwarp(uint64_t Mask) { synchronize::warp(Mask); } +OMP_ATTRS void __kmpc_syncwarp(uint64_t Mask) { synchronize::warp(Mask); } -void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) { +OMP_ATTRS void __kmpc_critical(IdentTy *Loc, int32_t TId, + CriticalNameTy *Name) { impl::setCriticalLock(reinterpret_cast(Name)); } -void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) { +OMP_ATTRS void __kmpc_end_critical(IdentTy *Loc, int32_t TId, + CriticalNameTy *Name) { impl::unsetCriticalLock(reinterpret_cast(Name)); } -void omp_init_lock(omp_lock_t *Lock) { impl::initLock(Lock); } +OMP_ATTRS void omp_init_lock(omp_lock_t *Lock) { impl::initLock(Lock); } -void omp_destroy_lock(omp_lock_t *Lock) { impl::destroyLock(Lock); } +OMP_ATTRS void omp_destroy_lock(omp_lock_t *Lock) { impl::destroyLock(Lock); } -void omp_set_lock(omp_lock_t *Lock) { impl::setLock(Lock); } +OMP_ATTRS void omp_set_lock(omp_lock_t *Lock) { impl::setLock(Lock); } -void omp_unset_lock(omp_lock_t *Lock) { impl::unsetLock(Lock); } +OMP_ATTRS void omp_unset_lock(omp_lock_t *Lock) { impl::unsetLock(Lock); } -int omp_test_lock(omp_lock_t *Lock) { return impl::testLock(Lock); } +OMP_ATTRS int omp_test_lock(omp_lock_t *Lock) { return impl::testLock(Lock); } -void ompx_sync_block(int Ordering) { +OMP_ATTRS void ompx_sync_block(int Ordering) { impl::syncThreadsAligned(atomic::OrderingTy(Ordering)); } -void ompx_sync_block_acq_rel() { +OMP_ATTRS void ompx_sync_block_acq_rel() { impl::syncThreadsAligned(atomic::OrderingTy::acq_rel); } -void ompx_sync_block_divergent(int Ordering) { +OMP_ATTRS void ompx_sync_block_divergent(int Ordering) { impl::syncThreads(atomic::OrderingTy(Ordering)); } } // extern "C" diff --git a/offload/DeviceRTL/src/Tasking.cpp b/offload/DeviceRTL/src/Tasking.cpp index 23a967c1a337e..6bcf461103dd7 100644 
--- a/offload/DeviceRTL/src/Tasking.cpp +++ b/offload/DeviceRTL/src/Tasking.cpp @@ -24,10 +24,10 @@ using namespace ompx; extern "C" { -TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t, - size_t TaskSizeInclPrivateValues, - size_t SharedValuesSize, - TaskFnTy TaskFn) { +OMP_ATTRS TaskDescriptorTy * +__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t, + size_t TaskSizeInclPrivateValues, size_t SharedValuesSize, + TaskFnTy TaskFn) { auto TaskSizeInclPrivateValuesPadded = utils::roundUp(TaskSizeInclPrivateValues, uint64_t(sizeof(void *))); auto TaskSizeTotal = TaskSizeInclPrivateValuesPadded + SharedValuesSize; @@ -40,14 +40,14 @@ TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, int32_t, int32_t, return TaskDescriptor; } -int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor) { +OMP_ATTRS int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor) { return __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0); } -int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor, int32_t, - void *, int32_t, void *) { +OMP_ATTRS int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor, + int32_t, void *, int32_t, void *) { state::DateEnvironmentRAII DERAII(Loc); TaskDescriptor->TaskFn(0, TaskDescriptor); @@ -56,33 +56,35 @@ int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId, return 0; } -void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor) { +OMP_ATTRS void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor) { state::enterDataEnvironment(Loc); } -void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor) { +OMP_ATTRS void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor) { state::exitDataEnvironment(); memory::freeGlobal(TaskDescriptor, "explicit task descriptor"); } -void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t, - void *) {} +OMP_ATTRS void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, + int32_t, void *) {} -void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId) {} +OMP_ATTRS void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId) {} -void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId) {} +OMP_ATTRS void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId) {} -int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int) { return 0; } +OMP_ATTRS int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int) { + return 0; +} -int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId) { return 0; } +OMP_ATTRS int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId) { return 0; } -void __kmpc_taskloop(IdentTy *Loc, uint32_t TId, - TaskDescriptorTy *TaskDescriptor, int, - uint64_t *LowerBound, uint64_t *UpperBound, int64_t, int, - int32_t, uint64_t, void *) { +OMP_ATTRS void __kmpc_taskloop(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor, int, + uint64_t *LowerBound, uint64_t *UpperBound, + int64_t, int, int32_t, uint64_t, void *) { // Skip task entirely if empty iteration space. if (*LowerBound > *UpperBound) return; @@ -93,7 +95,7 @@ void __kmpc_taskloop(IdentTy *Loc, uint32_t TId, __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0); } -int omp_in_final(void) { +OMP_ATTRS int omp_in_final(void) { // treat all tasks as final... 
Specs may expect runtime to keep // track more precisely if a task was actively set by users... This // is not explicitly specified; will treat as if runtime can @@ -101,7 +103,7 @@ int omp_in_final(void) { return 1; } -int omp_get_max_task_priority(void) { return 0; } +OMP_ATTRS int omp_get_max_task_priority(void) { return 0; } } #pragma omp end declare target diff --git a/offload/DeviceRTL/src/Workshare.cpp b/offload/DeviceRTL/src/Workshare.cpp index ad60e66548be9..e7b5986b40d2e 100644 --- a/offload/DeviceRTL/src/Workshare.cpp +++ b/offload/DeviceRTL/src/Workshare.cpp @@ -70,8 +70,9 @@ template struct omptarget_nvptx_LoopSupport { */ // helper function for static chunk - static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk, - T entityId, T numberOfEntities) { + OMP_ATTRS static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, + ST chunk, T entityId, + T numberOfEntities) { // each thread executes multiple chunks all of the same size, except // the last one // distance between two successive chunks @@ -90,8 +91,9 @@ template struct omptarget_nvptx_LoopSupport { // Loop with static scheduling without chunk // helper function for static no chunk - static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk, - T entityId, T numberOfEntities) { + OMP_ATTRS static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, + ST &chunk, T entityId, + T numberOfEntities) { // No chunk size specified. Each thread or warp gets at most one // chunk; chunks are all almost of equal size T loopSize = ub - lb + 1; @@ -115,9 +117,10 @@ template struct omptarget_nvptx_LoopSupport { //////////////////////////////////////////////////////////////////////////////// // Support for Static Init - static void for_static_init(int32_t, int32_t schedtype, int32_t *plastiter, - T *plower, T *pupper, ST *pstride, ST chunk, - bool IsSPMDExecutionMode) { + OMP_ATTRS static void for_static_init(int32_t, int32_t schedtype, + int32_t *plastiter, T *plower, + T *pupper, ST *pstride, ST chunk, + bool IsSPMDExecutionMode) { int32_t gtid = omp_get_thread_num(); int numberOfActiveOMPThreads = omp_get_num_threads(); @@ -202,14 +205,14 @@ template struct omptarget_nvptx_LoopSupport { //////////////////////////////////////////////////////////////////////////////// // Support for dispatch Init - static int OrderedSchedule(kmp_sched_t schedule) { + OMP_ATTRS static int OrderedSchedule(kmp_sched_t schedule) { return schedule >= kmp_sched_ordered_first && schedule <= kmp_sched_ordered_last; } - static void dispatch_init(IdentTy *loc, int32_t threadId, - kmp_sched_t schedule, T lb, T ub, ST st, ST chunk, - DynamicScheduleTracker *DST) { + OMP_ATTRS static void dispatch_init(IdentTy *loc, int32_t threadId, + kmp_sched_t schedule, T lb, T ub, ST st, + ST chunk, DynamicScheduleTracker *DST) { int tid = mapping::getThreadIdInBlock(); T tnum = omp_get_num_threads(); T tripCount = ub - lb + 1; // +1 because ub is inclusive @@ -339,7 +342,7 @@ template struct omptarget_nvptx_LoopSupport { //////////////////////////////////////////////////////////////////////////////// // Support for dispatch next - static uint64_t NextIter() { + OMP_ATTRS static uint64_t NextIter() { __kmpc_impl_lanemask_t active = mapping::activemask(); uint32_t leader = utils::ffs(active) - 1; uint32_t change = utils::popc(active); @@ -353,8 +356,8 @@ template struct omptarget_nvptx_LoopSupport { return warp_res + rank; } - static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound, - T loopUpperBound) { + OMP_ATTRS 
static int DynamicNextChunk(T &lb, T &ub, T chunkSize, + T loopLowerBound, T loopUpperBound) { T N = NextIter(); lb = loopLowerBound + N * chunkSize; ub = lb + chunkSize - 1; // Clang uses i <= ub @@ -379,9 +382,9 @@ template struct omptarget_nvptx_LoopSupport { return FINISHED; } - static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast, - T *plower, T *pupper, ST *pstride, - DynamicScheduleTracker *DST) { + OMP_ATTRS static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast, + T *plower, T *pupper, ST *pstride, + DynamicScheduleTracker *DST) { // ID of a thread in its own warp // automatically selects thread or warp ID based on selected implementation @@ -432,7 +435,7 @@ template struct omptarget_nvptx_LoopSupport { return DISPATCH_NOTFINISHED; } - static void dispatch_fini() { + OMP_ATTRS static void dispatch_fini() { // nothing } @@ -462,7 +465,7 @@ template struct omptarget_nvptx_LoopSupport { static DynamicScheduleTracker **SHARED(ThreadDST); // Create a new DST, link the current one, and define the new as current. -static DynamicScheduleTracker *pushDST() { +OMP_ATTRS static DynamicScheduleTracker *pushDST() { int32_t ThreadIndex = mapping::getThreadIdInBlock(); // Each block will allocate an array of pointers to DST structs. The array is // equal in length to the number of threads in that block. @@ -491,12 +494,12 @@ static DynamicScheduleTracker *pushDST() { } // Return the current DST. -static DynamicScheduleTracker *peekDST() { +OMP_ATTRS static DynamicScheduleTracker *peekDST() { return ThreadDST[mapping::getThreadIdInBlock()]; } // Pop the current DST and restore the last one. -static void popDST() { +OMP_ATTRS static void popDST() { int32_t ThreadIndex = mapping::getThreadIdInBlock(); DynamicScheduleTracker *CurrentDST = ThreadDST[ThreadIndex]; DynamicScheduleTracker *OldDST = CurrentDST->NextDST; @@ -513,7 +516,7 @@ static void popDST() { synchronize::threads(atomic::seq_cst); } -void workshare::init(bool IsSPMD) { +OMP_ATTRS void workshare::init(bool IsSPMD) { if (mapping::isInitialThreadInLevel0(IsSPMD)) ThreadDST = nullptr; } @@ -521,168 +524,179 @@ void workshare::init(bool IsSPMD) { extern "C" { // init -void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule, - int32_t lb, int32_t ub, int32_t st, int32_t chunk) { +OMP_ATTRS void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, + int32_t schedule, int32_t lb, int32_t ub, + int32_t st, int32_t chunk) { DynamicScheduleTracker *DST = pushDST(); omptarget_nvptx_LoopSupport::dispatch_init( loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST); } -void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule, - uint32_t lb, uint32_t ub, int32_t st, - int32_t chunk) { +OMP_ATTRS void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, + int32_t schedule, uint32_t lb, + uint32_t ub, int32_t st, int32_t chunk) { DynamicScheduleTracker *DST = pushDST(); omptarget_nvptx_LoopSupport::dispatch_init( loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST); } -void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule, - int64_t lb, int64_t ub, int64_t st, int64_t chunk) { +OMP_ATTRS void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, + int32_t schedule, int64_t lb, int64_t ub, + int64_t st, int64_t chunk) { DynamicScheduleTracker *DST = pushDST(); omptarget_nvptx_LoopSupport::dispatch_init( loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST); } -void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule, - uint64_t lb, uint64_t ub, int64_t st, - 
int64_t chunk) { +OMP_ATTRS void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, + int32_t schedule, uint64_t lb, + uint64_t ub, int64_t st, int64_t chunk) { DynamicScheduleTracker *DST = pushDST(); omptarget_nvptx_LoopSupport::dispatch_init( loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST); } // next -int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last, - int32_t *p_lb, int32_t *p_ub, int32_t *p_st) { +OMP_ATTRS int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last, + int32_t *p_lb, int32_t *p_ub, + int32_t *p_st) { DynamicScheduleTracker *DST = peekDST(); return omptarget_nvptx_LoopSupport::dispatch_next( loc, tid, p_last, p_lb, p_ub, p_st, DST); } -int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last, - uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) { +OMP_ATTRS int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, + int32_t *p_last, uint32_t *p_lb, + uint32_t *p_ub, int32_t *p_st) { DynamicScheduleTracker *DST = peekDST(); return omptarget_nvptx_LoopSupport::dispatch_next( loc, tid, p_last, p_lb, p_ub, p_st, DST); } -int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last, - int64_t *p_lb, int64_t *p_ub, int64_t *p_st) { +OMP_ATTRS int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last, + int64_t *p_lb, int64_t *p_ub, + int64_t *p_st) { DynamicScheduleTracker *DST = peekDST(); return omptarget_nvptx_LoopSupport::dispatch_next( loc, tid, p_last, p_lb, p_ub, p_st, DST); } -int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last, - uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) { +OMP_ATTRS int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, + int32_t *p_last, uint64_t *p_lb, + uint64_t *p_ub, int64_t *p_st) { DynamicScheduleTracker *DST = peekDST(); return omptarget_nvptx_LoopSupport::dispatch_next( loc, tid, p_last, p_lb, p_ub, p_st, DST); } // fini -void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) { +OMP_ATTRS void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) { omptarget_nvptx_LoopSupport::dispatch_fini(); } -void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) { +OMP_ATTRS void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) { omptarget_nvptx_LoopSupport::dispatch_fini(); } -void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) { +OMP_ATTRS void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) { omptarget_nvptx_LoopSupport::dispatch_fini(); } -void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) { +OMP_ATTRS void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) { omptarget_nvptx_LoopSupport::dispatch_fini(); } // deinit -void __kmpc_dispatch_deinit(IdentTy *loc, int32_t tid) { popDST(); } +OMP_ATTRS void __kmpc_dispatch_deinit(IdentTy *loc, int32_t tid) { popDST(); } //////////////////////////////////////////////////////////////////////////////// // KMP interface implementation (static loops) //////////////////////////////////////////////////////////////////////////////// -void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t incr, int32_t chunk) { +OMP_ATTRS void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + int32_t *plower, int32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk) { omptarget_nvptx_LoopSupport::for_static_init( global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, mapping::isSPMDMode()); } -void __kmpc_for_static_init_4u(IdentTy *loc, int32_t 
global_tid, - int32_t schedtype, int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, - int32_t *pstride, int32_t incr, int32_t chunk) { +OMP_ATTRS void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + uint32_t *plower, uint32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk) { omptarget_nvptx_LoopSupport::for_static_init( global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, mapping::isSPMDMode()); } -void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, int64_t chunk) { +OMP_ATTRS void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk) { omptarget_nvptx_LoopSupport::for_static_init( global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, mapping::isSPMDMode()); } -void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - uint64_t *plower, uint64_t *pupper, - int64_t *pstride, int64_t incr, int64_t chunk) { +OMP_ATTRS void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + uint64_t *plower, uint64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk) { omptarget_nvptx_LoopSupport::for_static_init( global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, mapping::isSPMDMode()); } -void __kmpc_distribute_static_init_4(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk) { +OMP_ATTRS void __kmpc_distribute_static_init_4(IdentTy *loc, int32_t global_tid, + int32_t schedtype, + int32_t *plastiter, + int32_t *plower, int32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk) { omptarget_nvptx_LoopSupport::for_static_init( global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, mapping::isSPMDMode()); } -void __kmpc_distribute_static_init_4u(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk) { +OMP_ATTRS void __kmpc_distribute_static_init_4u( + IdentTy *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, + uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr, + int32_t chunk) { omptarget_nvptx_LoopSupport::for_static_init( global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, mapping::isSPMDMode()); } -void __kmpc_distribute_static_init_8(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk) { +OMP_ATTRS void __kmpc_distribute_static_init_8(IdentTy *loc, int32_t global_tid, + int32_t schedtype, + int32_t *plastiter, + int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk) { omptarget_nvptx_LoopSupport::for_static_init( global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, mapping::isSPMDMode()); } -void __kmpc_distribute_static_init_8u(IdentTy *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - uint64_t *plower, uint64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk) { +OMP_ATTRS void __kmpc_distribute_static_init_8u( + IdentTy *loc, int32_t global_tid, int32_t schedtype, int32_t 
*plastiter, + uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr, + int64_t chunk) { omptarget_nvptx_LoopSupport::for_static_init( global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, mapping::isSPMDMode()); } -void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {} +OMP_ATTRS void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {} -void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) {} +OMP_ATTRS void __kmpc_distribute_static_fini(IdentTy *loc, int32_t global_tid) { +} } namespace ompx { @@ -696,10 +710,11 @@ template class StaticLoopChunker { /// size equal to the number of threads in the block and a thread chunk size /// equal to one. In contrast to the chunked version we can get away with a /// single loop in this case - static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), void *Arg, - Ty NumBlocks, Ty BId, Ty NumThreads, - Ty TId, Ty NumIters, - bool OneIterationPerThread) { + OMP_ATTRS static void NormalizedLoopNestNoChunk(void (*LoopBody)(Ty, void *), + void *Arg, Ty NumBlocks, + Ty BId, Ty NumThreads, Ty TId, + Ty NumIters, + bool OneIterationPerThread) { Ty KernelIteration = NumBlocks * NumThreads; // Start index in the normalized space. @@ -726,11 +741,12 @@ template class StaticLoopChunker { /// Generic loop nest that handles block and/or thread distribution in the /// presence of user specified chunk sizes (for at least one of them). - static void NormalizedLoopNestChunked(void (*LoopBody)(Ty, void *), void *Arg, - Ty BlockChunk, Ty NumBlocks, Ty BId, - Ty ThreadChunk, Ty NumThreads, Ty TId, - Ty NumIters, - bool OneIterationPerThread) { + OMP_ATTRS static void NormalizedLoopNestChunked(void (*LoopBody)(Ty, void *), + void *Arg, Ty BlockChunk, + Ty NumBlocks, Ty BId, + Ty ThreadChunk, Ty NumThreads, + Ty TId, Ty NumIters, + bool OneIterationPerThread) { Ty KernelIteration = NumBlocks * BlockChunk; // Start index in the chunked space. @@ -768,8 +784,9 @@ template class StaticLoopChunker { public: /// Worksharing `for`-loop. - static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg, - Ty NumIters, Ty NumThreads, Ty ThreadChunk) { + OMP_ATTRS static void For(IdentTy *Loc, void (*LoopBody)(Ty, void *), + void *Arg, Ty NumIters, Ty NumThreads, + Ty ThreadChunk) { ASSERT(NumIters >= 0, "Bad iteration count"); ASSERT(ThreadChunk >= 0, "Bad thread count"); @@ -807,8 +824,8 @@ template class StaticLoopChunker { } /// Worksharing `distrbute`-loop. - static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), void *Arg, - Ty NumIters, Ty BlockChunk) { + OMP_ATTRS static void Distribute(IdentTy *Loc, void (*LoopBody)(Ty, void *), + void *Arg, Ty NumIters, Ty BlockChunk) { ASSERT(icv::Level == 0, "Bad distribute"); ASSERT(icv::ActiveLevel == 0, "Bad distribute"); ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute"); @@ -854,9 +871,10 @@ template class StaticLoopChunker { } /// Worksharing `distrbute parallel for`-loop. - static void DistributeFor(IdentTy *Loc, void (*LoopBody)(Ty, void *), - void *Arg, Ty NumIters, Ty NumThreads, - Ty BlockChunk, Ty ThreadChunk) { + OMP_ATTRS static void DistributeFor(IdentTy *Loc, + void (*LoopBody)(Ty, void *), void *Arg, + Ty NumIters, Ty NumThreads, Ty BlockChunk, + Ty ThreadChunk) { ASSERT(icv::Level == 1, "Bad distribute"); ASSERT(icv::ActiveLevel == 1, "Bad distribute"); ASSERT(state::ParallelRegionFn == nullptr, "Bad distribute");
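The normalized no-chunk scheme that the StaticLoopChunker comments above describe amounts to a block-cyclic sweep of the normalized iteration space: the thread with id TId in block BId starts at BId * NumThreads + TId and advances by one full kernel iteration (NumBlocks * NumThreads) until NumIters is exhausted. The following is a minimal host-side sketch of that mapping only, not the DeviceRTL implementation; the helper name visitNoChunk and the counting harness in main are hypothetical, while NumBlocks, NumThreads, and NumIters mirror the parameters above.

// Illustrative sketch (assumption): models the block-cyclic mapping sketched
// in the NormalizedLoopNestNoChunk comment, serially on the host.
#include <cstdint>
#include <cstdio>

template <typename Ty>
void visitNoChunk(void (*LoopBody)(Ty, void *), void *Arg, Ty NumBlocks,
                  Ty NumThreads, Ty NumIters) {
  // Each (block, thread) pair owns the iterations
  //   IV = BId * NumThreads + TId, IV + Stride, IV + 2 * Stride, ...
  // where Stride is one full sweep of the whole grid.
  Ty Stride = NumBlocks * NumThreads;
  for (Ty BId = 0; BId < NumBlocks; ++BId)
    for (Ty TId = 0; TId < NumThreads; ++TId)
      for (Ty IV = BId * NumThreads + TId; IV < NumIters; IV += Stride)
        LoopBody(IV, Arg);
}

int main() {
  // Count how often each iteration is visited; with this mapping every
  // iteration in [0, NumIters) is visited exactly once.
  constexpr int32_t NumIters = 10;
  int Hits[NumIters] = {};
  visitNoChunk<int32_t>(
      [](int32_t IV, void *Arg) { ++static_cast<int *>(Arg)[IV]; }, Hits,
      /*NumBlocks=*/2, /*NumThreads=*/3, NumIters);
  for (int I = 0; I < NumIters; ++I)
    std::printf("iteration %d visited %d time(s)\n", I, Hits[I]);
}

On a device the two outer loops disappear because BId and TId come from the hardware grid, leaving the single stride loop the comment refers to; the chunked variant differs only in that the start index and stride are derived from BlockChunk and ThreadChunk instead of one iteration per thread.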