Skip to content

Commit 05b4e6d

Browse files
committed
reland patches xteamr/fence/atomic
b57c0ba [OpenMP] Update atomic helpers to just use headers (llvm#122185) f53cb84 [OpenMP] Use __builtin_bit_cast instead of UB type punning Change-Id: If054e7788a54c6b4a2550615cd5efc64d70e11c5
1 parent 5eb4e3f commit 05b4e6d

File tree

7 files changed

+146
-397
lines changed

7 files changed

+146
-397
lines changed

offload/DeviceRTL/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ set(bc_flags -c -foffload-lto -std=c++17 -fvisibility=hidden
107107
${clang_opt_flags} --offload-device-only
108108
-nocudalib -nogpulib -nogpuinc
109109
-fopenmp -fopenmp-cuda-mode
110-
-Wno-unknown-cuda-version
110+
-Wno-unknown-cuda-version -Wno-openmp-target
111111
-I${CMAKE_BINARY_DIR}/openmp/runtime/src # Need omp.h for LibM.
112112
-I${CMAKE_BINARY_DIR}/projects/openmp/runtime/src # Need omp.h for LibM.
113113
-I${CMAKE_BINARY_DIR}/runtime/src

offload/DeviceRTL/include/Synchronization.h

Lines changed: 129 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,11 @@
1313
#define OMPTARGET_DEVICERTL_SYNCHRONIZATION_H
1414

1515
#include "DeviceTypes.h"
16+
#include "DeviceUtils.h"
1617

17-
namespace ompx {
18+
#pragma omp begin declare target device_type(nohost)
1819

20+
namespace ompx {
1921
namespace atomic {
2022

2123
enum OrderingTy {
@@ -44,60 +46,134 @@ enum MemScopeTy {
4446
uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering,
4547
MemScopeTy MemScope = MemScopeTy::all);
4648

47-
template <typename Ty> Ty add(Ty *Address, Ty Val, atomic::OrderingTy Ordering);
48-
49-
template <typename Ty> Ty add_system(Ty *Address, Ty Val, atomic::OrderingTy Ordering);
50-
5149
/// Atomically perform <op> on \p V and \p *Addr with \p Ordering semantics. The
5250
/// result is stored in \p *Addr;
5351
/// {
5452

55-
#define ATOMIC_COMMON_OP(TY) \
56-
TY add(TY *Addr, TY V, OrderingTy Ordering); \
57-
TY add_system(TY *Addr, TY V, OrderingTy Ordering); \
58-
TY mul(TY *Addr, TY V, OrderingTy Ordering); \
59-
TY load(TY *Addr, OrderingTy Ordering); \
60-
void store(TY *Addr, TY V, OrderingTy Ordering); \
61-
bool cas(TY *Addr, TY ExpectedV, TY DesiredV, OrderingTy OrderingSucc, \
62-
OrderingTy OrderingFail);
63-
64-
#define ATOMIC_FP_ONLY_OP(TY) \
65-
TY min(TY *Addr, TY V, OrderingTy Ordering); \
66-
TY max(TY *Addr, TY V, OrderingTy Ordering);
67-
68-
#define ATOMIC_INT_ONLY_OP(TY) \
69-
TY min(TY *Addr, TY V, OrderingTy Ordering); \
70-
TY max(TY *Addr, TY V, OrderingTy Ordering); \
71-
TY bit_or(TY *Addr, TY V, OrderingTy Ordering); \
72-
TY bit_and(TY *Addr, TY V, OrderingTy Ordering); \
73-
TY bit_xor(TY *Addr, TY V, OrderingTy Ordering);
74-
75-
#define ATOMIC_FP_OP(TY) \
76-
ATOMIC_FP_ONLY_OP(TY) \
77-
ATOMIC_COMMON_OP(TY)
78-
79-
#define ATOMIC_INT_OP(TY) \
80-
ATOMIC_INT_ONLY_OP(TY) \
81-
ATOMIC_COMMON_OP(TY)
82-
83-
// This needs to be kept in sync with the header. Also the reason we don't use
84-
// templates here.
85-
ATOMIC_INT_OP(int8_t)
86-
ATOMIC_INT_OP(int16_t)
87-
ATOMIC_INT_OP(int32_t)
88-
ATOMIC_INT_OP(int64_t)
89-
ATOMIC_INT_OP(uint8_t)
90-
ATOMIC_INT_OP(uint16_t)
91-
ATOMIC_INT_OP(uint32_t)
92-
ATOMIC_INT_OP(uint64_t)
93-
ATOMIC_FP_OP(float)
94-
ATOMIC_FP_OP(double)
95-
96-
#undef ATOMIC_INT_ONLY_OP
97-
#undef ATOMIC_FP_ONLY_OP
98-
#undef ATOMIC_COMMON_OP
99-
#undef ATOMIC_INT_OP
100-
#undef ATOMIC_FP_OP
53+
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
54+
bool cas(Ty *Address, V ExpectedV, V DesiredV, atomic::OrderingTy OrderingSucc,
55+
atomic::OrderingTy OrderingFail,
56+
atomic::ScopeTy Scope = ScopeTy::device_) {
57+
return __scoped_atomic_compare_exchange(Address, &ExpectedV, &DesiredV, false,
58+
OrderingSucc, OrderingFail, Scope);
59+
}
60+
61+
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
62+
V add(Ty *Address, V Val, atomic::OrderingTy Ordering) {
63+
return __scoped_atomic_fetch_add(Address, Val, Ordering,
64+
__MEMORY_SCOPE_DEVICE);
65+
}
66+
67+
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
68+
V add_system(Ty *Address, V Val, atomic::OrderingTy Ordering) {
69+
return __scoped_atomic_fetch_add(Address, Val, Ordering,
70+
__MEMORY_SCOPE_SYSTEM);
71+
}
72+
73+
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
74+
V load(Ty *Address, atomic::OrderingTy Ordering) {
75+
return add(Address, Ty(0), Ordering);
76+
}
77+
78+
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
79+
void store(Ty *Address, V Val, atomic::OrderingTy Ordering) {
80+
__scoped_atomic_store_n(Address, Val, Ordering, __MEMORY_SCOPE_DEVICE);
81+
}
82+
83+
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
84+
V mul(Ty *Address, V Val, atomic::OrderingTy Ordering) {
85+
Ty TypedCurrentVal, TypedResultVal, TypedNewVal;
86+
bool Success;
87+
do {
88+
TypedCurrentVal = atomic::load(Address, Ordering);
89+
TypedNewVal = TypedCurrentVal * Val;
90+
Success = atomic::cas(Address, TypedCurrentVal, TypedNewVal, Ordering,
91+
atomic::relaxed);
92+
} while (!Success);
93+
return TypedResultVal;
94+
}
95+
96+
/// Atomic maximum for non-floating-point value types: combines \p Val with
/// \p *Address through __scoped_atomic_fetch_max at device scope
/// (__MEMORY_SCOPE_DEVICE) with \p Ordering semantics.
///
/// \returns the result of the builtin, i.e. the value it fetched from
/// \p *Address.
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
utils::enable_if_t<!utils::is_floating_point_v<V>, V>
max(Ty *Address, V Val, atomic::OrderingTy Ordering) {
  V Fetched =
      __scoped_atomic_fetch_max(Address, Val, Ordering, __MEMORY_SCOPE_DEVICE);
  return Fetched;
}
102+
103+
/// Atomic maximum for float, built on the integer min/max helpers.
///
/// The bits of \p Val are reinterpreted as an integer of the same width:
///  - when \p Val >= 0, a signed 32-bit max is performed;
///  - otherwise an unsigned 32-bit min is performed.
/// Presumably this relies on the ordering of IEEE-754 bit patterns; confirm
/// before depending on the behaviour for NaN or negative zero.
///
/// \returns the integer result of the helper, reinterpreted as float.
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
utils::enable_if_t<utils::is_same_v<V, float>, V>
max(Ty *Address, V Val, atomic::OrderingTy Ordering) {
  if (Val >= 0) {
    const auto SignedBits =
        max((int32_t *)Address, utils::bitCast<int32_t>(Val), Ordering);
    return utils::bitCast<float>(SignedBits);
  }
  // The min/max calls stay unqualified so the later-declared overloads are
  // found at instantiation time.
  const auto UnsignedBits =
      min((uint32_t *)Address, utils::bitCast<uint32_t>(Val), Ordering);
  return utils::bitCast<float>(UnsignedBits);
}
112+
113+
/// Atomic maximum for double, built on the integer min/max helpers.
///
/// The bits of \p Val are reinterpreted as an integer of the same width:
///  - when \p Val >= 0, a signed 64-bit max is performed;
///  - otherwise an unsigned 64-bit min is performed.
/// Presumably this relies on the ordering of IEEE-754 bit patterns; confirm
/// before depending on the behaviour for NaN or negative zero.
///
/// \returns the integer result of the helper, reinterpreted as double.
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
utils::enable_if_t<utils::is_same_v<V, double>, V>
max(Ty *Address, V Val, atomic::OrderingTy Ordering) {
  if (Val >= 0) {
    const auto SignedBits =
        max((int64_t *)Address, utils::bitCast<int64_t>(Val), Ordering);
    return utils::bitCast<double>(SignedBits);
  }
  // The min/max calls stay unqualified so the later-declared overloads are
  // found at instantiation time.
  const auto UnsignedBits =
      min((uint64_t *)Address, utils::bitCast<uint64_t>(Val), Ordering);
  return utils::bitCast<double>(UnsignedBits);
}
122+
123+
/// Atomic minimum for non-floating-point value types: combines \p Val with
/// \p *Address through __scoped_atomic_fetch_min at device scope
/// (__MEMORY_SCOPE_DEVICE) with \p Ordering semantics.
///
/// \returns the result of the builtin, i.e. the value it fetched from
/// \p *Address.
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
utils::enable_if_t<!utils::is_floating_point_v<V>, V>
min(Ty *Address, V Val, atomic::OrderingTy Ordering) {
  V Fetched =
      __scoped_atomic_fetch_min(Address, Val, Ordering, __MEMORY_SCOPE_DEVICE);
  return Fetched;
}
129+
130+
// TODO: Implement this with __atomic_fetch_max and remove the duplication.
131+
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
132+
utils::enable_if_t<utils::is_same_v<V, float>, V>
133+
min(Ty *Address, V Val, atomic::OrderingTy Ordering) {
134+
if (Val >= 0)
135+
return utils::bitCast<float>(
136+
min((int32_t *)Address, utils::bitCast<int32_t>(Val), Ordering));
137+
return utils::bitCast<float>(
138+
max((uint32_t *)Address, utils::bitCast<uint32_t>(Val), Ordering));
139+
}
140+
141+
// TODO: Implement this with __atomic_fetch_max and remove the duplication.
142+
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
143+
utils::enable_if_t<utils::is_same_v<V, double>, V>
144+
min(Ty *Address, utils::remove_addrspace_t<Ty> Val,
145+
atomic::OrderingTy Ordering) {
146+
if (Val >= 0)
147+
return utils::bitCast<double>(
148+
min((int64_t *)Address, utils::bitCast<int64_t>(Val), Ordering));
149+
return utils::bitCast<double>(
150+
max((uint64_t *)Address, utils::bitCast<uint64_t>(Val), Ordering));
151+
}
152+
153+
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
154+
V bit_or(Ty *Address, V Val, atomic::OrderingTy Ordering) {
155+
return __scoped_atomic_fetch_or(Address, Val, Ordering,
156+
__MEMORY_SCOPE_DEVICE);
157+
}
158+
159+
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
160+
V bit_and(Ty *Address, V Val, atomic::OrderingTy Ordering) {
161+
return __scoped_atomic_fetch_and(Address, Val, Ordering,
162+
__MEMORY_SCOPE_DEVICE);
163+
}
164+
165+
template <typename Ty, typename V = utils::remove_addrspace_t<Ty>>
166+
V bit_xor(Ty *Address, V Val, atomic::OrderingTy Ordering) {
167+
return __scoped_atomic_fetch_xor(Address, Val, Ordering,
168+
__MEMORY_SCOPE_DEVICE);
169+
}
170+
171+
static inline uint32_t atomicExchange(uint32_t *Address, uint32_t Val,
172+
atomic::OrderingTy Ordering) {
173+
uint32_t R;
174+
__scoped_atomic_exchange(Address, &Val, &R, Ordering, __MEMORY_SCOPE_DEVICE);
175+
return R;
176+
}
101177

102178
///}
103179

@@ -150,4 +226,6 @@ void system(atomic::OrderingTy Ordering);
150226

151227
} // namespace ompx
152228

229+
#pragma omp end declare target
230+
153231
#endif

offload/DeviceRTL/src/Mapping.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -381,8 +381,8 @@ int ompx_shfl_down_sync_i(uint64_t mask, int var, unsigned delta, int width) {
381381

382382
float ompx_shfl_down_sync_f(uint64_t mask, float var, unsigned delta,
383383
int width) {
384-
return utils::convertViaPun<float>(utils::shuffleDown(
385-
mask, utils::convertViaPun<int32_t>(var), delta, width));
384+
return utils::bitCast<float>(
385+
utils::shuffleDown(mask, utils::bitCast<int32_t>(var), delta, width));
386386
}
387387

388388
long ompx_shfl_down_sync_l(uint64_t mask, long var, unsigned delta, int width) {
@@ -391,8 +391,8 @@ long ompx_shfl_down_sync_l(uint64_t mask, long var, unsigned delta, int width) {
391391

392392
double ompx_shfl_down_sync_d(uint64_t mask, double var, unsigned delta,
393393
int width) {
394-
return utils::convertViaPun<double>(utils::shuffleDown(
395-
mask, utils::convertViaPun<int64_t>(var), delta, width));
394+
return utils::bitCast<double>(
395+
utils::shuffleDown(mask, utils::bitCast<int64_t>(var), delta, width));
396396
}
397397
}
398398

0 commit comments

Comments
 (0)