@@ -223,6 +223,41 @@ __device__ __forceinline__ void fastAtomicAdd(
  }
}

+
+#ifdef USE_ROCM
+// This function implements a committed store.
+// Upon returning, the store has been committed to global memory,
+// avoiding the need for a separate memory fence.
+template <typename T>
+__device__ inline void cmtdStore(void* address, T value) {
+  constexpr int num_long_per_val = sizeof(value) / sizeof(long);
+  constexpr int num_int_per_val = sizeof(value) / sizeof(int);
+  constexpr int num_short_per_val = sizeof(value) / sizeof(short);
+  constexpr int num_char_per_val = sizeof(value) / sizeof(char);
+  // View the value as an array of the widest integer type that evenly
+  // covers it, so it can be written with plain relaxed atomic stores.
+  union pnr {
+    T v;
+    long l[num_long_per_val];
+    int i[num_int_per_val];
+    short s[num_short_per_val];
+    char c[num_char_per_val];
+  } _pnr = {.v = value};
+  if constexpr (num_long_per_val * sizeof(long) == sizeof(value))
+    for (int i = 0; i < num_long_per_val; i++)
+      __hip_atomic_store(reinterpret_cast<long*>(address) + i, _pnr.l[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  else if constexpr (num_int_per_val * sizeof(int) == sizeof(value))
+    for (int i = 0; i < num_int_per_val; i++)
+      __hip_atomic_store(reinterpret_cast<int*>(address) + i, _pnr.i[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  else if constexpr (num_short_per_val * sizeof(short) == sizeof(value))
+    for (int i = 0; i < num_short_per_val; i++)
+      __hip_atomic_store(reinterpret_cast<short*>(address) + i, _pnr.s[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  else if constexpr (num_char_per_val * sizeof(char) == sizeof(value))
+    for (int i = 0; i < num_char_per_val; i++)
+      __hip_atomic_store(reinterpret_cast<char*>(address) + i, _pnr.c[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+  // Wait until every outstanding vector memory write from this wavefront
+  // has been committed; the signal fences stop the compiler from moving
+  // memory operations across the inline asm.
+  __atomic_signal_fence(__ATOMIC_SEQ_CST);
+  asm volatile("s_waitcnt vmcnt(0)" ::: "memory");
+  __atomic_signal_fence(__ATOMIC_SEQ_CST);
+}
+#endif
+
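
For illustration only, not part of this patch: a minimal sketch of how a kernel
could use cmtdStore to publish a payload and then raise a ready flag with a
relaxed atomic store. The Payload struct and produce kernel are hypothetical
names. Because cmtdStore only returns once its stores are committed (the
s_waitcnt vmcnt(0) above), a consumer that observes the flag also observes the
payload, with no __threadfence() in between.

// Hypothetical usage sketch, not part of this patch.
struct Payload {
  float data[4];  // 16 bytes, so cmtdStore issues two 8-byte relaxed stores
};

__global__ void produce(Payload* out, int* ready_flag, float seed) {
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    Payload p;
    for (int i = 0; i < 4; ++i) {
      p.data[i] = seed + i;
    }
    cmtdStore(out, p);  // committed to global memory on return
    // No fence needed between the payload store and the flag store.
    __hip_atomic_store(ready_flag, 1, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  }
}
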
#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__))
// This function implements warp-level opportunistic fastatomics
// To reduce contention on an atomicAdd, this replaces per-thread atomicAdd with a per-warp atomicAdd.
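
The body of that function falls outside this hunk. As a hedged sketch of the
general technique the comment describes (not necessarily this file's exact
implementation), a wavefront can reduce its per-lane addends with shuffles and
have a single lane issue one atomicAdd, assuming all 64 lanes are active and
target the same address:

// Hedged sketch of warp-aggregated atomicAdd; assumes a full wavefront
// where every lane passes the same addr. Not this file's implementation.
__device__ void wavefrontAtomicAdd(float* addr, float val) {
  // Butterfly reduction: every lane ends up holding the wavefront-wide sum.
  for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
    val += __shfl_xor(val, offset);
  }
  // A single lane performs one atomic on behalf of all 64 lanes.
  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicAdd(addr, val);
  }
}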