18 changes: 7 additions & 11 deletions offload/DeviceRTL/CMakeLists.txt
@@ -95,11 +95,10 @@ set (LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL PREPEND "-I")

# Set flags for LLVM Bitcode compilation.
set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden
${clang_opt_flags} --offload-device-only
-nocudalib -nogpulib -nogpuinc -nostdlibinc
-fopenmp -fopenmp-cuda-mode
-Wno-unknown-cuda-version -Wno-openmp-target
set(bc_flags -c -flto -std=c++17 -fvisibility=hidden
${clang_opt_flags} -nogpulib -nostdlibinc
-fno-rtti -fno-exceptions -fconvergent-functions
-Wno-unknown-cuda-version
-DOMPTARGET_DEVICE_RUNTIME
-I${include_directory}
-I${devicertl_base_directory}/../include
@@ -123,8 +122,7 @@ function(compileDeviceRTLLibrary target_name target_triple)
add_custom_command(OUTPUT ${outfile}
COMMAND ${CLANG_TOOL}
${bc_flags}
-fopenmp-targets=${target_triple}
-Xopenmp-target=${target_triple} -march=
--target=${target_triple}
${target_bc_flags}
-MD -MF ${depfile}
${infile} -o ${outfile}
@@ -242,10 +240,8 @@ function(compileDeviceRTLLibrary target_name target_triple)
set(ide_target_name omptarget-ide-${target_name})
add_library(${ide_target_name} STATIC EXCLUDE_FROM_ALL ${src_files})
target_compile_options(${ide_target_name} PRIVATE
-fopenmp-targets=${target_triple} -Xopenmp-target=${target_triple} -march=
-fopenmp -fopenmp-cuda-mode -mllvm -openmp-opt-disable
-foffload-lto -fvisibility=hidden --offload-device-only
-nocudalib -nogpulib -nogpuinc -nostdlibinc -Wno-unknown-cuda-version
-fvisibility=hidden --target=${target_triple}
-nogpulib -nostdlibinc -Wno-unknown-cuda-version
)
target_compile_definitions(${ide_target_name} PRIVATE SHARED_SCRATCHPAD_SIZE=512)
target_include_directories(${ide_target_name} PRIVATE
4 changes: 0 additions & 4 deletions offload/DeviceRTL/include/Allocator.h
@@ -17,8 +17,6 @@
// Forward declaration.
struct KernelEnvironmentTy;

#pragma omp begin declare target device_type(nohost)

namespace ompx {

namespace allocator {
@@ -44,6 +42,4 @@ extern "C" {
[[gnu::weak]] void free(void *Ptr);
}

#pragma omp end declare target

#endif
10 changes: 1 addition & 9 deletions offload/DeviceRTL/include/DeviceTypes.h
@@ -99,14 +99,7 @@ struct TaskDescriptorTy {
TaskFnTy TaskFn;
};

#pragma omp begin declare variant match(device = {arch(amdgcn)})
using LaneMaskTy = uint64_t;
#pragma omp end declare variant

#pragma omp begin declare variant match( \
device = {arch(amdgcn)}, implementation = {extension(match_none)})
using LaneMaskTy = uint64_t;
#pragma omp end declare variant

namespace lanes {
enum : LaneMaskTy { All = ~(LaneMaskTy)0 };
@@ -163,8 +156,7 @@ typedef enum omp_allocator_handle_t {
#define OMP_PRAGMA(STR) __PRAGMA(omp STR)

#define SHARED(NAME) \
NAME [[clang::loader_uninitialized]]; \
OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))
[[clang::address_space(3)]] NAME [[clang::loader_uninitialized]];

// TODO: clang should use address space 5 for omp_thread_mem_alloc, but right
// now that's not the case.
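Illustrative sketch, not part of this patch: roughly what the rewritten SHARED macro does, assuming a clang build that targets a GPU triple (amdgcn or nvptx64). The variable name ScratchCounter is made up for the example.

#include <cstdint>

// New-style definition from DeviceTypes.h: place the object directly in
// address space 3 (LDS / CUDA __shared__) and leave it uninitialized.
#define SHARED(NAME) [[clang::address_space(3)]] NAME [[clang::loader_uninitialized]];

// A team-shared counter; this expands to roughly
//   static uint32_t [[clang::address_space(3)]] ScratchCounter
//       [[clang::loader_uninitialized]];
// so no `#pragma omp allocate(NAME) allocator(omp_pteam_mem_alloc)` is needed.
static uint32_t SHARED(ScratchCounter)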
4 changes: 0 additions & 4 deletions offload/DeviceRTL/include/DeviceUtils.h
@@ -15,8 +15,6 @@
#include "DeviceTypes.h"
#include "Shared/Utils.h"

#pragma omp begin declare target device_type(nohost)

namespace utils {

template <typename T> struct type_identity {
@@ -95,6 +93,4 @@ bool isThreadLocalMemPtr(void *Ptr);

} // namespace utils

#pragma omp end declare target

#endif
4 changes: 0 additions & 4 deletions offload/DeviceRTL/include/Mapping.h
@@ -24,12 +24,8 @@ enum {
DIM_Z = 2,
};

#pragma omp begin declare target device_type(nohost)

inline constexpr uint32_t MaxThreadsPerTeam = 1024;

#pragma omp end declare target

/// Initialize the mapping machinery.
void init(bool IsSPMD);

10 changes: 2 additions & 8 deletions offload/DeviceRTL/include/State.h
@@ -22,8 +22,6 @@
// Forward declaration.
struct KernelEnvironmentTy;

#pragma omp begin declare target device_type(nohost)

namespace ompx {

namespace memory {
@@ -88,8 +86,7 @@ struct TeamStateTy {
ParallelRegionFnTy ParallelRegionFnVar;
};

extern TeamStateTy TeamState;
#pragma omp allocate(TeamState) allocator(omp_pteam_mem_alloc)
extern TeamStateTy [[clang::address_space(3)]] TeamState;

struct ThreadStateTy {

@@ -115,8 +112,7 @@ struct ThreadStateTy {
}
};

extern ThreadStateTy **ThreadStates;
#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc)
extern ThreadStateTy **[[clang::address_space(3)]] ThreadStates;

/// Initialize the state machinery. Must be called by all threads.
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
@@ -378,6 +374,4 @@ inline state::Value<uint32_t, state::VK_RunSched> RunSched;

} // namespace ompx

#pragma omp end declare target

#endif
4 changes: 0 additions & 4 deletions offload/DeviceRTL/include/Synchronization.h
@@ -15,8 +15,6 @@
#include "DeviceTypes.h"
#include "DeviceUtils.h"

#pragma omp begin declare target device_type(nohost)

namespace ompx {
namespace atomic {

@@ -220,6 +218,4 @@ void system(atomic::OrderingTy Ordering);

} // namespace ompx

#pragma omp end declare target

#endif
4 changes: 0 additions & 4 deletions offload/DeviceRTL/include/Workshare.h
@@ -12,8 +12,6 @@
#ifndef OMPTARGET_WORKSHARE_H
#define OMPTARGET_WORKSHARE_H

#pragma omp begin declare target device_type(nohost)

namespace ompx {

namespace workshare {
@@ -25,6 +23,4 @@ void init(bool IsSPMD);

} // namespace ompx

#pragma omp end declare target

#endif
4 changes: 0 additions & 4 deletions offload/DeviceRTL/src/Allocator.cpp
@@ -19,8 +19,6 @@

using namespace ompx;

#pragma omp begin declare target device_type(nohost)

[[gnu::used, gnu::retain, gnu::weak,
gnu::visibility(
"protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
@@ -77,5 +75,3 @@ void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); }
void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); }

///}

#pragma omp end declare target
4 changes: 0 additions & 4 deletions offload/DeviceRTL/src/Configuration.cpp
@@ -17,8 +17,6 @@

using namespace ompx;

#pragma omp begin declare target device_type(nohost)

// Weak definitions will be overridden by CGOpenmpRuntimeGPU if enabled.
[[gnu::weak]] extern const uint32_t __omp_rtl_debug_kind = 0;
[[gnu::weak]] extern const uint32_t __omp_rtl_assume_no_thread_state = 0;
@@ -85,5 +83,3 @@ bool config::mayUseNestedParallelism() {
return false;
return state::getKernelEnvironment().Configuration.MayUseNestedParallelism;
}

#pragma omp end declare target
4 changes: 0 additions & 4 deletions offload/DeviceRTL/src/Debug.cpp
@@ -21,8 +21,6 @@

using namespace ompx;

#pragma omp begin declare target device_type(nohost)

extern "C" {
void __assert_assume(bool condition) { __builtin_assume(condition); }

@@ -44,5 +42,3 @@ void __assert_fail_internal(const char *expr, const char *msg, const char *file,
__builtin_trap();
}
}

#pragma omp end declare target
16 changes: 4 additions & 12 deletions offload/DeviceRTL/src/DeviceUtils.cpp
@@ -15,14 +15,10 @@
#include "Interface.h"
#include "Mapping.h"

#pragma omp begin declare target device_type(nohost)

using namespace ompx;

namespace impl {

bool isSharedMemPtr(const void *Ptr) { return false; }

void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) {
static_assert(sizeof(unsigned long) == 8, "");
*LowBits = static_cast<uint32_t>(Val & 0x00000000FFFFFFFFUL);
@@ -42,7 +38,7 @@ uint64_t ballotSync(uint64_t Mask, int32_t Pred);
/// AMDGCN Implementation
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})
#ifdef __AMDGPU__

int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
int Self = mapping::getThreadIdInWarp();
@@ -66,15 +62,13 @@ bool isSharedMemPtr(const void *Ptr) {
return __builtin_amdgcn_is_shared(
(const __attribute__((address_space(0))) void *)Ptr);
}
#pragma omp end declare variant
#endif
///}

/// NVPTX Implementation
///
///{
#pragma omp begin declare variant match( \
device = {arch(nvptx, nvptx64)}, \
implementation = {extension(match_any)})
#ifdef __NVPTX__

int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane, int32_t Width) {
return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, Width - 1);
@@ -91,7 +85,7 @@ uint64_t ballotSync(uint64_t Mask, int32_t Pred) {

bool isSharedMemPtr(const void *Ptr) { return __nvvm_isspacep_shared(Ptr); }

#pragma omp end declare variant
#endif
///}
} // namespace impl

@@ -137,5 +131,3 @@ int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
return utils::shuffleDown(lanes::All, Val, Delta, Width);
}
}

#pragma omp end declare target
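Illustrative sketch, not part of this patch: the per-architecture dispatch that previously used `#pragma omp begin declare variant match(device = {arch(...)})` now relies on the compiler's predefined target macros, as in the DeviceUtils.cpp hunks above. A minimal standalone example of the pattern; getWarpSize mirrors the real implementations in Mapping.cpp.

#include <cstdint>

namespace impl {
#ifdef __AMDGPU__
// AMD GPUs: the wavefront size comes from a target builtin.
uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }
#endif

#ifdef __NVPTX__
// NVIDIA GPUs: read the warp-size special register.
uint32_t getWarpSize() { return __nvvm_read_ptx_sreg_warpsize(); }
#endif
} // namespace impl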
4 changes: 0 additions & 4 deletions offload/DeviceRTL/src/Kernel.cpp
@@ -25,8 +25,6 @@

using namespace ompx;

#pragma omp begin declare target device_type(nohost)

static void
inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
@@ -155,5 +153,3 @@ void __kmpc_target_deinit() {

int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); }
}

#pragma omp end declare target
4 changes: 0 additions & 4 deletions offload/DeviceRTL/src/LibC.cpp
@@ -8,8 +8,6 @@

#include "LibC.h"

#pragma omp begin declare target device_type(nohost)

#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
extern "C" int vprintf(const char *format, __builtin_va_list) { return -1; }
#else
@@ -48,5 +46,3 @@ namespace ompx {
return ::vprintf(Format, vlist);
}
} // namespace ompx

#pragma omp end declare target
30 changes: 5 additions & 25 deletions offload/DeviceRTL/src/Mapping.cpp
@@ -15,33 +15,17 @@
#include "Interface.h"
#include "State.h"

#pragma omp begin declare target device_type(nohost)

#include "llvm/Frontend/OpenMP/OMPGridValues.h"

using namespace ompx;

namespace ompx {
namespace impl {

// Forward declarations defined to be defined for AMDGCN and NVPTX.
LaneMaskTy activemask();
LaneMaskTy lanemaskLT();
LaneMaskTy lanemaskGT();
uint32_t getThreadIdInWarp();
uint32_t getThreadIdInBlock(int32_t Dim);
uint32_t getNumberOfThreadsInBlock(int32_t Dim);
uint32_t getNumberOfThreadsInKernel();
uint32_t getBlockIdInKernel(int32_t Dim);
uint32_t getNumberOfBlocksInKernel(int32_t Dim);
uint32_t getWarpIdInBlock();
uint32_t getNumberOfWarpsInBlock();
uint32_t getWarpSize();

/// AMDGCN Implementation
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})
#ifdef __AMDGPU__

uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }

@@ -128,15 +112,13 @@ uint32_t getNumberOfWarpsInBlock() {
return mapping::getNumberOfThreadsInBlock() / mapping::getWarpSize();
}

#pragma omp end declare variant
#endif
///}

/// NVPTX Implementation
///
///{
#pragma omp begin declare variant match( \
device = {arch(nvptx, nvptx64)}, \
implementation = {extension(match_any)})
#ifdef __NVPTX__

uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
switch (Dim) {
@@ -214,7 +196,7 @@ uint32_t getNumberOfWarpsInBlock() {
mapping::getWarpSize();
}

#pragma omp end declare variant
#endif
///}

} // namespace impl
@@ -376,7 +358,7 @@ float ompx_shfl_down_sync_f(uint64_t mask, float var, unsigned delta,
}

long ompx_shfl_down_sync_l(uint64_t mask, long var, unsigned delta, int width) {
return utils::shuffleDown(mask, var, delta, width);
return utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width);
}

double ompx_shfl_down_sync_d(uint64_t mask, double var, unsigned delta,
@@ -385,5 +367,3 @@ double ompx_shfl_down_sync_d(uint64_t mask, double var, unsigned delta,
utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width));
}
}

#pragma omp end declare target
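One more illustrative sketch, not from the patch: why the long and double ompx_shfl_down_sync_* overloads route their payload through a 64-bit integer. The warp shuffle helpers move raw lane bits, so non-integer values are bit-cast rather than converted. shuffleDown64 and shflDownDouble below are hypothetical stand-ins for utils::shuffleDown and the real entry points.

#include <cstdint>

// Hypothetical stand-in for utils::shuffleDown(mask, val, delta, width); a real
// build would lower this to the target's shuffle intrinsics.
static int64_t shuffleDown64(uint64_t /*Mask*/, int64_t Val, unsigned /*Delta*/,
                             int /*Width*/) {
  return Val;
}

double shflDownDouble(uint64_t Mask, double Var, unsigned Delta, int Width) {
  // Reinterpret the 64-bit pattern; a value conversion double -> int64_t would
  // change the bits and corrupt the payload being exchanged between lanes.
  int64_t Bits = __builtin_bit_cast(int64_t, Var);
  int64_t Moved = shuffleDown64(Mask, Bits, Delta, Width);
  return __builtin_bit_cast(double, Moved);
}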