Skip to content

Commit 28f820a

Browse files
[AUTOGENERATED] [rocm7.1_internal_testing] [ROCm] No-fence global reduce (#2586)
Cherry-pick of #2584 Co-authored-by: Jerry Mannil <[email protected]>
1 parent 1455054 commit 28f820a

File tree

2 files changed

+43
-6
lines changed

2 files changed

+43
-6
lines changed

aten/src/ATen/native/cuda/KernelUtils.cuh

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,41 @@ __device__ __forceinline__ void fastAtomicAdd(
223223
}
224224
}
225225

226+
227+
#ifdef USE_ROCM
// This function implements a committed store.
// Upon returning, the store is committed to global memory.
// This is useful in avoiding the need for fences.
//
// Mechanism: the value is reinterpreted (via the union below) as an array
// of the widest integer type that exactly divides sizeof(T), and each
// piece is written with a relaxed __hip_atomic_store at agent (device)
// scope.  The trailing `s_waitcnt vmcnt(0)` then waits for the wave's
// outstanding vector-memory writes before returning.
template <typename T>
__device__ inline void cmtdStore(void* address, T value) {
  // How many chunks of each integer width cover sizeof(T).
  // NOTE(review): when T is narrower than a given type, the matching
  // union member below becomes a zero-length array — a GNU/Clang
  // extension rather than standard C++.  hipcc (clang) accepts it, but
  // confirm this is intentional.
  int constexpr num_long_per_val = sizeof(value)/sizeof(long);
  int constexpr num_int_per_val = sizeof(value)/sizeof(int);
  int constexpr num_short_per_val = sizeof(value)/sizeof(short);
  int constexpr num_char_per_val = sizeof(value)/sizeof(char);
  // Type-punning union: expose the raw bytes of `value` as lanes of
  // each candidate store width.
  union pnr { T v;
              long l[num_long_per_val];
              int i[num_int_per_val];
              short s[num_short_per_val];
              char c[num_char_per_val]; }
    _pnr = {.v = value };
  // Pick the widest type that exactly covers sizeof(T); the char branch
  // always matches (sizeof(char) == 1), so every T is handled.
  if constexpr (num_long_per_val*sizeof(long) == sizeof(value))
    for (int i=0; i<num_long_per_val; i++)
      __hip_atomic_store(reinterpret_cast<long *>(address)+i, _pnr.l[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  else if constexpr (num_int_per_val*sizeof(int) == sizeof(value))
    for (int i=0; i<num_int_per_val; i++)
      __hip_atomic_store(reinterpret_cast<int *>(address)+i, _pnr.i[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  else if constexpr (num_short_per_val*sizeof(short) == sizeof(value))
    for (int i=0; i<num_short_per_val; i++)
      __hip_atomic_store(reinterpret_cast<short *>(address)+i, _pnr.s[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  else if constexpr (num_char_per_val*sizeof(char) == sizeof(value))
    for (int i=0; i<num_char_per_val; i++)
      __hip_atomic_store(reinterpret_cast<char *>(address)+i, _pnr.c[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
  // The signal fences are compiler-only barriers: they prevent the
  // compiler from moving memory operations across the inline asm; they
  // emit no hardware fence themselves.
  __atomic_signal_fence(__ATOMIC_SEQ_CST);
  // s_waitcnt vmcnt(0): stall until the wave's vector-memory (global
  // store) counter drains to zero, i.e. the stores above are committed.
  asm volatile("s_waitcnt vmcnt(0)" ::: "memory");
  __atomic_signal_fence(__ATOMIC_SEQ_CST);
}
#endif
260+
226261
#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__))
227262
// This function implements warp-level opportunistic fastatomics
228263
// To reduce contention on an atomicAdd, this replaces per-thread atomicAdd with a per-warp atomicAdd.

aten/src/ATen/native/cuda/Reduce.cuh

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include <thrust/pair.h>
1919

2020
#include <ATen/native/cuda/jit_utils.h>
21+
#include <ATen/native/cuda/KernelUtils.cuh>
2122

2223
namespace at::native {
2324

@@ -796,22 +797,23 @@ struct ReduceOp {
796797
bool should_store = config.should_store(output_idx);
797798
if (should_store) {
798799
index_t offset = config.staging_memory_offset(blockIdx.y);
800+
#ifndef USE_ROCM
799801
reduce_buffer[offset] = value;
800-
#ifdef USE_ROCM
801-
__builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent"); // make sure writes are globally visible
802+
#else // [CMTSTRS]
803+
// In architectures with split caches, global fences are costly.
804+
// Here we preempt need for fences by committing stores to global memory.
805+
cmtdStore(&reduce_buffer[offset], value);
802806
#endif
803807
}
804808

805-
#ifndef USE_ROCM
809+
#ifndef USE_ROCM // skip fence if store are committed [CMTSTRS]
806810
__threadfence(); // make sure writes are globally visible
807811
#endif
808812
__syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done
809813
bool is_last_block_done = mark_block_finished();
810814

811815
if (is_last_block_done) {
812-
#ifdef USE_ROCM
813-
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent"); // complete the acquire pattern after release
814-
#else
816+
#ifndef USE_ROCM // skip fence if store are committed [CMTSTRS]
815817
__threadfence(); // complete the acquire pattern after atomic
816818
#endif
817819
for (auto &v : value) {

0 commit comments

Comments
 (0)