@@ -116,6 +116,20 @@ __gpu_read_first_lane_u32(uint64_t __lane_mask, uint32_t __x) {
116116 return __nvvm_shfl_sync_idx_i32 (__mask , __x , __id , __gpu_num_lanes () - 1 );
117117}
118118
119+ // Copies the value from the first active thread in the warp to the rest.
120+ _DEFAULT_FN_ATTRS static __inline__ uint64_t
121+ __gpu_read_first_lane_u64 (uint64_t __lane_mask , uint64_t __x ) {
122+ uint32_t __hi = (uint32_t )(__x >> 32ull );
123+ uint32_t __lo = (uint32_t )(__x & 0xFFFFFFFF );
124+ uint32_t __mask = (uint32_t )__lane_mask ;
125+ uint32_t __id = __builtin_ffs (__mask ) - 1 ;
126+ return ((uint64_t )__nvvm_shfl_sync_idx_i32 (__mask , __hi , __id ,
127+ __gpu_num_lanes () - 1 )
128+ << 32ull ) |
129+ ((uint64_t )__nvvm_shfl_sync_idx_i32 (__mask , __lo , __id ,
130+ __gpu_num_lanes () - 1 ));
131+ }
132+
119133// Returns a bitmask of threads in the current warp for which \p x is true.
120134_DEFAULT_FN_ATTRS static __inline__ uint64_t __gpu_ballot (uint64_t __lane_mask ,
121135 bool __x ) {
@@ -142,6 +156,20 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x) {
142156 __nvvm_shfl_sync_idx_i32 (__mask , __x , __idx , __gpu_num_lanes () - 1u );
143157}
144158
159+ // Shuffles the the lanes inside the warp according to the given index.
160+ _DEFAULT_FN_ATTRS static __inline__ uint64_t
161+ __gpu_shuffle_idx_u64 (uint64_t __lane_mask , uint32_t __idx , uint64_t __x ) {
162+ uint32_t __hi = (uint32_t )(__x >> 32ull );
163+ uint32_t __lo = (uint32_t )(__x & 0xFFFFFFFF );
164+ uint32_t __mask = (uint32_t )__lane_mask ;
165+ uint64_t __bitmask = (__mask >> __idx ) & 1u ;
166+ return - __bitmask & ((uint64_t )__nvvm_shfl_sync_idx_i32 (
167+ __mask , __hi , __idx , __gpu_num_lanes () - 1u )
168+ << 32ull ) |
169+ ((uint64_t )__nvvm_shfl_sync_idx_i32 (__mask , __lo , __idx ,
170+ __gpu_num_lanes () - 1u ));
171+ }
172+
145173// Terminates execution of the calling thread.
146174_DEFAULT_FN_ATTRS [[noreturn ]] static __inline__ void __gpu_exit (void ) {
147175 __nvvm_exit ();
0 commit comments