Skip to content

Commit c140442

Browse files
[AUTOGENERATED] [release/2.8] [ROCm] Use fine-grain fence in reduction (#2561)
Cherry-pick of #2553 Co-authored-by: Jerry Mannil <[email protected]>
1 parent fd4b1e7 commit c140442

File tree

1 file changed

+9
-0
lines changed

1 file changed

+9
-0
lines changed

aten/src/ATen/native/cuda/Reduce.cuh

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -797,14 +797,23 @@ struct ReduceOp {
797797
if (should_store) {
798798
index_t offset = config.staging_memory_offset(blockIdx.y);
799799
reduce_buffer[offset] = value;
800+
#ifdef USE_ROCM
801+
__builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent"); // make sure writes are globally visible
802+
#endif
800803
}
801804

805+
#ifndef USE_ROCM
802806
__threadfence(); // make sure writes are globally visible
807+
#endif
803808
__syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done
804809
bool is_last_block_done = mark_block_finished();
805810

806811
if (is_last_block_done) {
812+
#ifdef USE_ROCM
813+
__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent"); // complete the acquire pattern after release
814+
#else
807815
__threadfence(); // complete the acquire pattern after atomic
816+
#endif
808817
for (auto &v : value) {
809818
v = ident;
810819
}

0 commit comments

Comments
 (0)