@@ -153,10 +153,10 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x) {
153153 } \
154154 return __gpu_read_first_lane_##__suffix(__lane_mask, x); \
155155 }
156- __DO_LANE_REDUCE (uint32_t , u32 );
157- __DO_LANE_REDUCE (uint64_t , u64 );
158- __DO_LANE_REDUCE (float , f32 );
159- __DO_LANE_REDUCE (double , f64 );
156+ __DO_LANE_REDUCE (uint32_t , u32 ); // uint32_t __gpu_lane_reduce_u32(m, x)
157+ __DO_LANE_REDUCE (uint64_t , u64 ); // uint64_t __gpu_lane_reduce_u64(m, x)
158+ __DO_LANE_REDUCE (float , f32 ); // float __gpu_lane_reduce_f32(m, x)
159+ __DO_LANE_REDUCE (double , f64 ); // double __gpu_lane_reduce_f64(m, x)
160160#undef __DO_LANE_REDUCE
161161
162162// Gets the accumulator scan of the threads in the warp or wavefront.
@@ -173,10 +173,10 @@ __DO_LANE_REDUCE(double, f64);
173173 } \
174174 return x; \
175175 }
176- __DO_LANE_SCAN (uint32_t , uint32_t , u32 );
177- __DO_LANE_SCAN (uint64_t , uint64_t , u64 );
178- __DO_LANE_SCAN (float , uint32_t , f32 );
179- __DO_LANE_SCAN (double , uint64_t , f64 );
176+ __DO_LANE_SCAN (uint32_t , uint32_t , u32 ); // uint32_t __gpu_lane_scan_u32(m, x)
177+ __DO_LANE_SCAN (uint64_t , uint64_t , u64 ); // uint64_t __gpu_lane_scan_u64(m, x)
178+ __DO_LANE_SCAN (float , uint32_t , f32 ); // float __gpu_lane_scan_f32(m, x)
179+ __DO_LANE_SCAN (double , uint64_t , f64 ); // double __gpu_lane_scan_f64(m, x)
180180#undef __DO_LANE_SCAN
181181
182182_Pragma ("omp end declare variant" );
0 commit comments