@@ -113,18 +113,34 @@ __gpu_is_first_in_lane(uint64_t __lane_mask) {
113113
114114// Gets the first floating point value from the active lanes.
115115_DEFAULT_FN_ATTRS static __inline__ float
116- __gpu_shuffle_idx_f32 (uint64_t __lane_mask , float __x ) {
116+ __gpu_read_first_lane_f32 (uint64_t __lane_mask , float __x ) {
117117 return __builtin_bit_cast (
118- float ,
119- __gpu_shuffle_idx_u32 ( __lane_mask , __builtin_bit_cast (uint32_t , __x )));
118+ float , __gpu_read_first_lane_u32 ( __lane_mask ,
119+ __builtin_bit_cast (uint32_t , __x )));
120120}
121121
122122// Gets the first floating point value from the active lanes.
123123_DEFAULT_FN_ATTRS static __inline__ double
124- __gpu_shuffle_idx_f64 (uint64_t __lane_mask , double __x ) {
124+ __gpu_read_first_lane_f64 (uint64_t __lane_mask , double __x ) {
125125 return __builtin_bit_cast (
126- double ,
127- __gpu_shuffle_idx_u64 (__lane_mask , __builtin_bit_cast (uint64_t , __x )));
126+ double , __gpu_read_first_lane_u64 (__lane_mask ,
127+ __builtin_bit_cast (uint64_t , __x )));
128+ }
129+
130+ // Shuffles the floating point value according to the given index.
131+ _DEFAULT_FN_ATTRS static __inline__ float
132+ __gpu_shuffle_idx_f32 (uint64_t __lane_mask , uint32_t __idx , float __x ) {
133+ return __builtin_bit_cast (
134+ float , __gpu_shuffle_idx_u32 (__lane_mask , __idx ,
135+ __builtin_bit_cast (uint32_t , __x )));
136+ }
137+
138+ // Shuffles the floating point value according to the given index.
139+ _DEFAULT_FN_ATTRS static __inline__ double
140+ __gpu_shuffle_idx_f64 (uint64_t __lane_mask , uint32_t __idx , double __x ) {
141+ return __builtin_bit_cast (
142+ double , __gpu_shuffle_idx_u64 (__lane_mask , __idx ,
143+ __builtin_bit_cast (uint64_t , __x )));
128144}
129145
130146// Gets the sum of all lanes inside the warp or wavefront.
@@ -150,7 +166,10 @@ __DO_LANE_REDUCE(double, f64);
150166 for (uint32_t step = 1; step < __gpu_num_lanes(); step *= 2) { \
151167 uint32_t index = __gpu_lane_id() - step; \
152168 __bitmask_type bitmask = __gpu_lane_id() >= step; \
153- x += -bitmask & __gpu_shuffle_idx_##__suffix(__lane_mask, index, x); \
169+ x += __builtin_bit_cast( \
170+ __type, -bitmask & __builtin_bit_cast(__bitmask_type, \
171+ __gpu_shuffle_idx_##__suffix( \
172+ __lane_mask, index, x))); \
154173 } \
155174 return x; \
156175 }
0 commit comments