Skip to content

Commit 5606deb

Browse files
authored
SWDEV-491314 - Re-enable cross-lane sync builtins (#94)
* Enables warp sync builtins by default * Removes HIP_ENABLE_WARP_SYNC_BUILTINS; that macro will no longer have an effect. Instead, we will now be able to disable the builtins with the macro: HIP_DISABLE_WARP_SYNC_BUILTINS
1 parent 29df3ae commit 5606deb

File tree

5 files changed

+22
-15
lines changed

5 files changed

+22
-15
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ Full documentation for HIP is available at [rocm.docs.amd.com](https://rocm.docs
1616
- HIP Extensions APIs for microscaling formats, which are supported on AMD GPUs.
1717
* New `wptr` and `rptr` values in `ClPrint`, for better logging in dispatch barrier methods.
1818
* New debug mask, to print precise code object information for logging.
19+
* The `_sync()` version of crosslane builtins such as `shfl_sync()` and `__reduce_add_sync` are enabled by default. These can be
20+
disabled by setting the preprocessor macro `HIP_DISABLE_WARP_SYNC_BUILTINS`.
1921

2022
### Changed
2123

hipamd/include/hip/amd_detail/amd_hip_bf16.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -663,7 +663,7 @@ __hip_bfloat16 __shfl_xor(MAYBE_UNDEF __hip_bfloat16 var, int lane_mask, int wid
663663
return tmp.f;
664664
}
665665

666-
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
666+
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
667667
/**
668668
* \ingroup HIP_INTRINSIC_BFLOAT16_MOVE
669669
* \brief shfl down warp intrinsic for bfloat16
@@ -771,7 +771,7 @@ __BF16_DEVICE_STATIC__ __hip_bfloat162 __shfl_xor_sync(const unsigned long long
771771
u.ui = __shfl_xor_sync<unsigned long long, unsigned int>(mask, u.ui, delta, width);
772772
return u.bf162;
773773
}
774-
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
774+
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
775775

776776
/**
777777
* \ingroup HIP_INTRINSIC_BFLOAT16_ARITH

hipamd/include/hip/amd_detail/amd_hip_cooperative_groups.h

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -551,7 +551,7 @@ class coalesced_group : public thread_group {
551551

552552
return __shfl(var, lane, warpSize);
553553
}
554-
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
554+
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
555555

556556
/** \brief Ballot function on group level.
557557
*
@@ -617,7 +617,7 @@ class coalesced_group : public thread_group {
617617
__match_all_sync(static_cast<unsigned long long>(coalesced_info.member_mask), value,
618618
&pred));
619619
}
620-
#endif
620+
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
621621
};
622622

623623
/** \ingroup CooperativeGConstruct
@@ -819,14 +819,14 @@ template <unsigned int size> class thread_block_tile_base : public tile_base<siz
819819
friend __CG_QUALIFIER__ coalesced_group
820820
binary_partition(const thread_block_tile<fsize, fparent>& tgrp, bool pred);
821821

822-
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
822+
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
823823
__CG_QUALIFIER__ unsigned long long build_mask() const {
824824
unsigned long long mask = ~0ull >> (64 - numThreads);
825825
// thread_rank() gives thread id from 0..thread launch size.
826826
return mask << (((internal::workgroup::thread_rank() % warpSize) / numThreads) *
827827
numThreads);
828828
}
829-
#endif
829+
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
830830

831831
public:
832832

@@ -850,7 +850,7 @@ template <unsigned int size> class thread_block_tile_base : public tile_base<siz
850850
return (__shfl_xor(var, laneMask, numThreads));
851851
}
852852

853-
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
853+
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
854854
__CG_QUALIFIER__ unsigned long long ballot(int pred) const {
855855
const auto mask = build_mask();
856856
return internal::helper::adjust_mask(mask, __ballot_sync(mask, pred));
@@ -869,7 +869,7 @@ template <unsigned int size> class thread_block_tile_base : public tile_base<siz
869869
const auto mask = build_mask();
870870
return internal::helper::adjust_mask(mask, __match_all_sync(mask, value, &pred));
871871
}
872-
#endif
872+
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
873873
};
874874

875875
/** \brief User exposed API that captures the state of the parent group pre-partition
@@ -1197,7 +1197,7 @@ __CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const Paren
11971197
return impl::tiled_partition_internal<size, ParentCGTy>(g);
11981198
}
11991199

1200-
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
1200+
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
12011201

12021202
/** \ingroup CooperativeGConstruct
12031203
* \brief Binary partition.

hipamd/include/hip/amd_detail/amd_warp_functions.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -122,15 +122,15 @@ unsigned long long int __ballot64(int predicate) {
122122
}
123123

124124
// See amd_warp_sync_functions.h for an explanation of this preprocessor flag.
125-
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
125+
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
126126
// Since threads in a wave do not make independent progress, __activemask()
127127
// always returns the exact active mask, i.e, all active threads in the wave.
128128
__device__
129129
inline
130130
unsigned long long __activemask() {
131131
return __ballot(true);
132132
}
133-
#endif // HIP_ENABLE_WARP_SYNC_BUILTINS
133+
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS
134134

135135
__device__ static inline unsigned int __lane_id() {
136136
if (warpSize == 32) return __builtin_amdgcn_mbcnt_lo(-1, 0);

hipamd/include/hip/amd_detail/amd_warp_sync_functions.h

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,11 @@ THE SOFTWARE.
2424

2525
// Warp sync builtins (with explicit mask argument) introduced in ROCm 6.2 as a
2626
// preview to allow end-users to adapt to the new interface involving 64-bit
27-
// masks. These are disabled by default, and can be enabled by setting the macro
28-
// "HIP_ENABLE_WARP_SYNC_BUILTINS". This arrangement also applies to the
27+
// masks. These are enabled by default, and can be disabled by setting the macro
28+
// "HIP_DISABLE_WARP_SYNC_BUILTINS". This arrangement also applies to the
2929
// __activemask() builtin defined in amd_warp_functions.h.
30-
#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS
30+
#if !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
31+
3132
#if !defined(__HIPCC_RTC__)
3233
#include "amd_warp_functions.h"
3334
#include "amd_device_functions.h"
@@ -722,5 +723,9 @@ __device__ inline unsigned long long __reduce_xor_sync(MaskT mask, unsigned long
722723
return __reduce_op_sync(mask, val, op, wfReduce);
723724
}
724725

726+
#undef __hip_do_sync
727+
#undef __hip_check_mask
728+
#undef __hip_adjust_mask_for_wave32
729+
725730
#endif // HIP_ENABLE_EXTRA_WARP_SYNC_TYPES
726-
#endif // HIP_ENABLE_WARP_SYNC_BUILTINS
731+
#endif // HIP_DISABLE_WARP_SYNC_BUILTINS

0 commit comments

Comments
 (0)