@@ -48,140 +48,140 @@ _Pragma("omp begin declare variant match(device = {kind(gpu)})");
4848#define __GPU_Z_DIM 2
4949
5050// Returns the number of blocks in the requested dimension.
51- _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_blocks (int __dim ) {
51+ _DEFAULT_FN_ATTRS static __inline__ uint32_t _gpu_num_blocks (int __dim ) {
5252 switch (__dim ) {
5353 case 0 :
54- return __gpu_num_blocks_x ();
54+ return _gpu_num_blocks_x ();
5555 case 1 :
56- return __gpu_num_blocks_y ();
56+ return _gpu_num_blocks_y ();
5757 case 2 :
58- return __gpu_num_blocks_z ();
58+ return _gpu_num_blocks_z ();
5959 default :
6060 __builtin_unreachable ();
6161 }
6262}
6363
6464// Returns the block id in the requested dimension.
65- _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_block_id (int __dim ) {
65+ _DEFAULT_FN_ATTRS static __inline__ uint32_t _gpu_block_id (int __dim ) {
6666 switch (__dim ) {
6767 case 0 :
68- return __gpu_block_id_x ();
68+ return _gpu_block_id_x ();
6969 case 1 :
70- return __gpu_block_id_y ();
70+ return _gpu_block_id_y ();
7171 case 2 :
72- return __gpu_block_id_z ();
72+ return _gpu_block_id_z ();
7373 default :
7474 __builtin_unreachable ();
7575 }
7676}
7777
7878// Returns the number of threads in the requested dimension.
79- _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_num_threads (int __dim ) {
79+ _DEFAULT_FN_ATTRS static __inline__ uint32_t _gpu_num_threads (int __dim ) {
8080 switch (__dim ) {
8181 case 0 :
82- return __gpu_num_threads_x ();
82+ return _gpu_num_threads_x ();
8383 case 1 :
84- return __gpu_num_threads_y ();
84+ return _gpu_num_threads_y ();
8585 case 2 :
86- return __gpu_num_threads_z ();
86+ return _gpu_num_threads_z ();
8787 default :
8888 __builtin_unreachable ();
8989 }
9090}
9191
9292// Returns the thread id in the requested dimension.
93- _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_thread_id (int __dim ) {
93+ _DEFAULT_FN_ATTRS static __inline__ uint32_t _gpu_thread_id (int __dim ) {
9494 switch (__dim ) {
9595 case 0 :
96- return __gpu_thread_id_x ();
96+ return _gpu_thread_id_x ();
9797 case 1 :
98- return __gpu_thread_id_y ();
98+ return _gpu_thread_id_y ();
9999 case 2 :
100- return __gpu_thread_id_z ();
100+ return _gpu_thread_id_z ();
101101 default :
102102 __builtin_unreachable ();
103103 }
104104}
105105
106106// Gets the index of the lowest set bit in the lane mask, i.e. the first active lane.
107107_DEFAULT_FN_ATTRS static __inline__ uint64_t
108- __gpu_first_lane_id (uint64_t __lane_mask ) {
108+ _gpu_first_lane_id (uint64_t __lane_mask ) {
109109 return __builtin_ffsll (__lane_mask ) - 1 ;
110110}
111111
112112// Conditional that is only true for a single thread in a lane.
113113_DEFAULT_FN_ATTRS static __inline__ bool
114- __gpu_is_first_in_lane (uint64_t __lane_mask ) {
115- return __gpu_lane_id () == __gpu_first_lane_id (__lane_mask );
114+ _gpu_is_first_in_lane (uint64_t __lane_mask ) {
115+ return _gpu_lane_id () == _gpu_first_lane_id (__lane_mask );
116116}
117117
118118// Gets the first floating point value from the active lanes.
119119_DEFAULT_FN_ATTRS static __inline__ float
120- __gpu_read_first_lane_f32 (uint64_t __lane_mask , float __x ) {
120+ _gpu_read_first_lane_f32 (uint64_t __lane_mask , float __x ) {
121121 return __builtin_bit_cast (
122- float , __gpu_read_first_lane_u32 ( __lane_mask ,
123- __builtin_bit_cast (uint32_t , __x )));
122+ float ,
123+ _gpu_read_first_lane_u32 ( __lane_mask , __builtin_bit_cast (uint32_t , __x )));
124124}
125125
126126// Gets the first floating point value from the active lanes.
127127_DEFAULT_FN_ATTRS static __inline__ double
128- __gpu_read_first_lane_f64 (uint64_t __lane_mask , double __x ) {
128+ _gpu_read_first_lane_f64 (uint64_t __lane_mask , double __x ) {
129129 return __builtin_bit_cast (
130- double , __gpu_read_first_lane_u64 ( __lane_mask ,
131- __builtin_bit_cast (uint64_t , __x )));
130+ double ,
131+ _gpu_read_first_lane_u64 ( __lane_mask , __builtin_bit_cast (uint64_t , __x )));
132132}
133133
134134// Shuffles the lanes according to the given index.
135135_DEFAULT_FN_ATTRS static __inline__ float
136- __gpu_shuffle_idx_f32 (uint64_t __lane_mask , uint32_t __idx , float __x ) {
136+ _gpu_shuffle_idx_f32 (uint64_t __lane_mask , uint32_t __idx , float __x ) {
137137 return __builtin_bit_cast (
138- float , __gpu_shuffle_idx_u32 (__lane_mask , __idx ,
139- __builtin_bit_cast (uint32_t , __x )));
138+ float , _gpu_shuffle_idx_u32 (__lane_mask , __idx ,
139+ __builtin_bit_cast (uint32_t , __x )));
140140}
141141
142142// Shuffles the lanes according to the given index.
143143_DEFAULT_FN_ATTRS static __inline__ double
144- __gpu_shuffle_idx_f64 (uint64_t __lane_mask , uint32_t __idx , double __x ) {
144+ _gpu_shuffle_idx_f64 (uint64_t __lane_mask , uint32_t __idx , double __x ) {
145145 return __builtin_bit_cast (
146- double , __gpu_shuffle_idx_u64 (__lane_mask , __idx ,
147- __builtin_bit_cast (uint64_t , __x )));
146+ double , _gpu_shuffle_idx_u64 (__lane_mask , __idx ,
147+ __builtin_bit_cast (uint64_t , __x )));
148148}
149149
150150// Gets the sum of all lanes inside the warp or wavefront.
151151#define __DO_LANE_SUM (__type , __suffix ) \
152- _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_ ##__suffix( \
152+ _DEFAULT_FN_ATTRS static __inline__ __type _gpu_lane_sum_ ##__suffix( \
153153 uint64_t __lane_mask, __type __x) { \
154- for (uint32_t __step = __gpu_num_lanes () / 2; __step > 0; __step /= 2) { \
155- uint32_t __index = __step + __gpu_lane_id (); \
156- __x += __gpu_shuffle_idx_ ##__suffix(__lane_mask, __index, __x); \
154+ for (uint32_t __step = _gpu_num_lanes () / 2; __step > 0; __step /= 2) { \
155+ uint32_t __index = __step + _gpu_lane_id (); \
156+ __x += _gpu_shuffle_idx_ ##__suffix(__lane_mask, __index, __x); \
157157 } \
158- return __gpu_read_first_lane_ ##__suffix(__lane_mask, __x); \
158+ return _gpu_read_first_lane_ ##__suffix(__lane_mask, __x); \
159159 }
160- __DO_LANE_SUM (uint32_t , u32 ); // uint32_t __gpu_lane_sum_u32 (m, x)
161- __DO_LANE_SUM (uint64_t , u64 ); // uint64_t __gpu_lane_sum_u64 (m, x)
162- __DO_LANE_SUM (float , f32 ); // float __gpu_lane_sum_f32 (m, x)
163- __DO_LANE_SUM (double , f64 ); // double __gpu_lane_sum_f64 (m, x)
160+ __DO_LANE_SUM (uint32_t , u32 ); // uint32_t _gpu_lane_sum_u32 (m, x)
161+ __DO_LANE_SUM (uint64_t , u64 ); // uint64_t _gpu_lane_sum_u64 (m, x)
162+ __DO_LANE_SUM (float , f32 ); // float _gpu_lane_sum_f32 (m, x)
163+ __DO_LANE_SUM (double , f64 ); // double _gpu_lane_sum_f64 (m, x)
164164#undef __DO_LANE_SUM
165165
166166// Gets the accumulator scan of the threads in the warp or wavefront.
167167#define __DO_LANE_SCAN (__type , __bitmask_type , __suffix ) \
168- _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_scan_ ##__suffix( \
168+ _DEFAULT_FN_ATTRS static __inline__ uint32_t _gpu_lane_scan_ ##__suffix( \
169169 uint64_t __lane_mask, uint32_t __x) { \
170- for (uint32_t __step = 1; __step < __gpu_num_lanes (); __step *= 2) { \
171- uint32_t __index = __gpu_lane_id () - __step; \
172- __bitmask_type bitmask = __gpu_lane_id () >= __step; \
170+ for (uint32_t __step = 1; __step < _gpu_num_lanes (); __step *= 2) { \
171+ uint32_t __index = _gpu_lane_id () - __step; \
172+ __bitmask_type bitmask = _gpu_lane_id () >= __step; \
173173 __x += __builtin_bit_cast( \
174174 __type, \
175175 -bitmask & __builtin_bit_cast(__bitmask_type, \
176- __gpu_shuffle_idx_ ##__suffix( \
176+ _gpu_shuffle_idx_ ##__suffix( \
177177 __lane_mask, __index, __x))); \
178178 } \
179179 return __x; \
180180 }
181- __DO_LANE_SCAN (uint32_t , uint32_t , u32 ); // uint32_t __gpu_lane_scan_u32 (m, x)
182- __DO_LANE_SCAN (uint64_t , uint64_t , u64 ); // uint64_t __gpu_lane_scan_u64 (m, x)
183- __DO_LANE_SCAN (float , uint32_t , f32 ); // float __gpu_lane_scan_f32 (m, x)
184- __DO_LANE_SCAN (double , uint64_t , f64 ); // double __gpu_lane_scan_f64 (m, x)
181+ __DO_LANE_SCAN (uint32_t , uint32_t , u32 ); // uint32_t _gpu_lane_scan_u32 (m, x)
182+ __DO_LANE_SCAN (uint64_t , uint64_t , u64 ); // uint64_t _gpu_lane_scan_u64 (m, x)
183+ __DO_LANE_SCAN (float , uint32_t , f32 ); // float _gpu_lane_scan_f32 (m, x)
184+ __DO_LANE_SCAN (double , uint64_t , f64 ); // double _gpu_lane_scan_f64 (m, x)
185185#undef __DO_LANE_SCAN
186186
187187_Pragma ("omp end declare variant" );
0 commit comments