@@ -111,26 +111,54 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
   return __gpu_lane_id() == __gpu_first_lane_id(__lane_mask);
 }
 
+// Shuffles the lanes inside the warp according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ float
+__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x) {
+  return __builtin_bit_cast(
+      float, __gpu_shuffle_idx_u32(__lane_mask, __idx,
+                                   __builtin_bit_cast(uint32_t, __x)));
+}
+
+// Shuffles the lanes inside the warp according to the given index.
+_DEFAULT_FN_ATTRS static __inline__ double
+__gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
+  return __builtin_bit_cast(
+      double, __gpu_shuffle_idx_u64(__lane_mask, __idx,
+                                    __builtin_bit_cast(uint64_t, __x)));
+}
+
 // Gets the sum of all lanes inside the warp or wavefront.
-_DEFAULT_FN_ATTRS static __inline__ uint32_t
-__gpu_lane_reduce_u32(uint64_t __lane_mask, uint32_t x) {
-  for (uint32_t step = __gpu_num_lanes() / 2; step > 0; step /= 2) {
-    uint32_t index = step + __gpu_lane_id();
-    x += __gpu_shuffle_idx_u32(__lane_mask, index, x);
+#define __DO_LANE_REDUCE(__type, __suffix) \
+  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_reduce_##__suffix( \
+      uint64_t __lane_mask, __type x) { \
+    for (uint32_t step = __gpu_num_lanes() / 2; step > 0; step /= 2) { \
+      uint32_t index = step + __gpu_lane_id(); \
+      x += __gpu_shuffle_idx_##__suffix(__lane_mask, index, x); \
+    } \
+    return __gpu_read_first_lane_##__suffix(__lane_mask, x); \
   }
-  return __gpu_read_first_lane_u32(__lane_mask, x);
-}
+__DO_LANE_REDUCE(uint32_t, u32);
+__DO_LANE_REDUCE(uint64_t, u64);
+__DO_LANE_REDUCE(float, f32);
+__DO_LANE_REDUCE(double, f64);
+#undef __DO_LANE_REDUCE
 
 // Gets the accumulator scan of the threads in the warp or wavefront.
-_DEFAULT_FN_ATTRS static __inline__ uint32_t
-__gpu_lane_scan_u32(uint64_t __lane_mask, uint32_t x) {
-  for (uint32_t step = 1; step < __gpu_num_lanes(); step *= 2) {
-    uint32_t index = __gpu_lane_id() - step;
-    uint32_t bitmask = __gpu_lane_id() >= step;
-    x += -bitmask & __gpu_shuffle_idx_u32(__lane_mask, index, x);
+#define __DO_LANE_SCAN(__type, __bitmask_type, __suffix) \
+  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_scan_##__suffix( \
+      uint64_t __lane_mask, __type x) { \
+    for (uint32_t step = 1; step < __gpu_num_lanes(); step *= 2) { \
+      uint32_t index = __gpu_lane_id() - step; \
+      __bitmask_type bitmask = __gpu_lane_id() >= step; \
+      x += __builtin_bit_cast( \
+          __type, -bitmask & __builtin_bit_cast( \
+                      __bitmask_type, \
+                      __gpu_shuffle_idx_##__suffix(__lane_mask, index, x))); \
+    } \
+    return x; \
   }
-  return x;
-}
+__DO_LANE_SCAN(uint32_t, uint32_t, u32);
+__DO_LANE_SCAN(uint64_t, uint64_t, u64);
+__DO_LANE_SCAN(float, uint32_t, f32);
+__DO_LANE_SCAN(double, uint64_t, f64);
+#undef __DO_LANE_SCAN
 
 _Pragma("omp end declare variant");
 _Pragma("omp end declare target");
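For reference, instantiating the new macro as __DO_LANE_REDUCE(uint32_t, u32) expands to essentially the same function the diff removes, so the u32 behavior is unchanged; the other suffixes only swap the element type and the matching shuffle/broadcast helpers:

_DEFAULT_FN_ATTRS static __inline__ uint32_t
__gpu_lane_reduce_u32(uint64_t __lane_mask, uint32_t x) {
  for (uint32_t step = __gpu_num_lanes() / 2; step > 0; step /= 2) {
    uint32_t index = step + __gpu_lane_id();
    x += __gpu_shuffle_idx_u32(__lane_mask, index, x);
  }
  return __gpu_read_first_lane_u32(__lane_mask, x);
}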
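The scan uses a branch-free select: bitmask is 1 for lanes with __gpu_lane_id() >= step and 0 otherwise, so -bitmask is either all ones or all zeros and the value shuffled in from the lower lane is either kept or masked to zero. The floating point instantiations round-trip the value through an unsigned integer of matching width (__bitmask_type) so the bitwise AND stays well defined. A standalone host-side sketch of the same idiom, with illustrative names only:

#include <stdint.h>
#include <stdio.h>

// Branch-free select: returns v when keep is 1 and 0 when keep is 0.
static uint32_t select_or_zero(uint32_t keep, uint32_t v) {
  return -keep & v; // -keep is 0xFFFFFFFF or 0x00000000
}

int main(void) {
  printf("%u %u\n", select_or_zero(1, 42), select_or_zero(0, 42)); // prints: 42 0
  return 0;
}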
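A minimal usage sketch of the new floating point entry points, not part of the commit: each active lane contributes one value, the reduction broadcasts the combined sum to every lane, and the scan yields an inclusive running sum. It assumes the header is included as <gpuintrin.h> and that __gpu_lane_mask() is provided by the target-specific headers; both sit outside this hunk.

#include <gpuintrin.h>

// Normalize this lane's value by the sum over all active lanes.
static inline float normalize_across_lanes(float v) {
  uint64_t mask = __gpu_lane_mask();            // currently active lanes
  float total = __gpu_lane_reduce_f32(mask, v); // combined sum, same in every lane
  float prefix = __gpu_lane_scan_f32(mask, v);  // inclusive running sum
  return prefix / total;
}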