File tree Expand file tree Collapse file tree 1 file changed +9
-0
lines changed
aten/src/ATen/native/cuda Expand file tree Collapse file tree 1 file changed +9
-0
lines changed Original file line number Diff line number Diff line change @@ -797,14 +797,23 @@ struct ReduceOp {
797797 if (should_store) {
798798 index_t offset = config.staging_memory_offset (blockIdx .y );
799799 reduce_buffer[offset] = value;
800+ #ifdef USE_ROCM
801+ __builtin_amdgcn_fence (__ATOMIC_RELEASE, " agent" ); // make sure writes are globally visible
802+ #endif
800803 }
801804
805+ #ifndef USE_ROCM
802806 __threadfence (); // make sure writes are globally visible
807+ #endif
803808 __syncthreads (); // if multiple warps in this block wrote to staging, make sure they're all done
804809 bool is_last_block_done = mark_block_finished ();
805810
806811 if (is_last_block_done) {
812+ #ifdef USE_ROCM
813+ __builtin_amdgcn_fence (__ATOMIC_ACQUIRE, " agent" ); // complete the acquire pattern after release
814+ #else
807815 __threadfence (); // complete the acquire pattern after atomic
816+ #endif
808817 for (auto &v : value) {
809818 v = ident;
810819 }
You can’t perform that action at this time.
0 commit comments