@@ -148,34 +148,35 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
 }
 
 // Gets the sum of all lanes inside the warp or wavefront.
-#define __DO_LANE_REDUCE(__type, __suffix)                                     \
-  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_reduce_##__suffix(     \
-      uint64_t __lane_mask, __type x) {                                        \
-    for (uint32_t step = __gpu_num_lanes() / 2; step > 0; step /= 2) {         \
-      uint32_t index = step + __gpu_lane_id();                                 \
-      x += __gpu_shuffle_idx_##__suffix(__lane_mask, index, x);                \
+#define __DO_LANE_SUM(__type, __suffix)                                        \
+  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix(        \
+      uint64_t __lane_mask, __type __x) {                                      \
+    for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2) {   \
+      uint32_t __index = __step + __gpu_lane_id();                             \
+      __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x);          \
     }                                                                          \
-    return __gpu_read_first_lane_##__suffix(__lane_mask, x);                   \
+    return __gpu_read_first_lane_##__suffix(__lane_mask, __x);                 \
   }
-__DO_LANE_REDUCE(uint32_t, u32); // uint32_t __gpu_lane_reduce_u32(m, x)
-__DO_LANE_REDUCE(uint64_t, u64); // uint64_t __gpu_lane_reduce_u64(m, x)
-__DO_LANE_REDUCE(float, f32);    // float __gpu_lane_reduce_f32(m, x)
-__DO_LANE_REDUCE(double, f64);   // double __gpu_lane_reduce_f64(m, x)
-#undef __DO_LANE_REDUCE
+__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
+__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
+__DO_LANE_SUM(float, f32);    // float __gpu_lane_sum_f32(m, x)
+__DO_LANE_SUM(double, f64);   // double __gpu_lane_sum_f64(m, x)
+#undef __DO_LANE_SUM
 
 // Gets the accumulator scan of the threads in the warp or wavefront.
 #define __DO_LANE_SCAN(__type, __bitmask_type, __suffix)                       \
   _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_scan_##__suffix(     \
-      uint64_t __lane_mask, uint32_t x) {                                      \
-    for (uint32_t step = 1; step < __gpu_num_lanes(); step *= 2) {             \
-      uint32_t index = __gpu_lane_id() - step;                                 \
-      __bitmask_type bitmask = __gpu_lane_id() >= step;                        \
-      x += __builtin_bit_cast(                                                 \
-          __type, -bitmask & __builtin_bit_cast(__bitmask_type,                \
-                                                __gpu_shuffle_idx_##__suffix(  \
-                                                    __lane_mask, index, x)));  \
+      uint64_t __lane_mask, uint32_t __x) {                                    \
+    for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) {       \
+      uint32_t __index = __gpu_lane_id() - __step;                             \
+      __bitmask_type bitmask = __gpu_lane_id() >= __step;                      \
+      __x += __builtin_bit_cast(                                               \
+          __type,                                                              \
+          -bitmask & __builtin_bit_cast(__bitmask_type,                        \
+                                        __gpu_shuffle_idx_##__suffix(          \
+                                            __lane_mask, __index, __x)));      \
     }                                                                          \
-    return x;                                                                  \
+    return __x;                                                                \
   }
 __DO_LANE_SCAN(uint32_t, uint32_t, u32); // uint32_t __gpu_lane_scan_u32(m, x)
 __DO_LANE_SCAN(uint64_t, uint64_t, u64); // uint64_t __gpu_lane_scan_u64(m, x)
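
For context on the renamed helper: __gpu_lane_sum_* repeatedly adds the value held by the lane __step positions above, then broadcasts the first lane's result, so every participating lane receives the warp/wavefront-wide total. A minimal usage sketch, assuming device compilation against this header (the wrapper name is hypothetical, and __gpu_lane_mask() is assumed to be defined elsewhere in gpuintrin.h):

#include <gpuintrin.h>
#include <stdint.h>

// Hypothetical wrapper: each lane passes in its own count and every lane in
// the current warp/wavefront gets back the combined total.
static inline uint32_t sum_counts_across_lanes(uint32_t my_count) {
  uint64_t mask = __gpu_lane_mask(); // assumed: mask of currently active lanes
  return __gpu_lane_sum_u32(mask, my_count);
}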
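In the scan macro, __gpu_lane_id() - __step wraps around for the first __step lanes, which have no partner lane to read from, so the shuffled value must be discarded rather than added. The -bitmask & ... expression does this without a divergent branch: negating a 0/1 flag yields an all-ones or all-zeros mask. A scalar sketch of that masking idea (the helper name is illustrative, not part of the header):

#include <stdint.h>

// -flag is 0xFFFFFFFF when flag == 1 and 0 when flag == 0, so the AND either
// keeps or zeroes the incoming value with no branch, mirroring the
// -bitmask & __builtin_bit_cast(...) pattern in __DO_LANE_SCAN.
static inline uint32_t masked_add_u32(uint32_t acc, uint32_t incoming,
                                      uint32_t lane_id, uint32_t step) {
  uint32_t flag = lane_id >= step; // 1 only if a valid partner lane exists
  return acc + (-flag & incoming); // add the partner's value, or add nothing
}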