Skip to content

Commit 94280b1

Browse files
author
Raghuveer Devulapalli
authored
Merge pull request #68 from r-devulap/gcc-specific
Use global Macros for GCC specific keywords
2 parents f97b484 + dd79993 commit 94280b1

File tree

3 files changed

+38
-27
lines changed

3 files changed

+38
-27
lines changed

src/avx512-64bit-argsort.hpp

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ X86_SIMD_SORT_INLINE void argsort_32_64bit(type_t *arr, int64_t *arg, int32_t N)
108108
zmm_t arrzmm[4];
109109
argzmm_t argzmm[4];
110110

111-
#pragma GCC unroll 2
111+
#pragma X86_SIMD_SORT_UNROLL_LOOP(2)
112112
for (int ii = 0; ii < 2; ++ii) {
113113
argzmm[ii] = argtype::loadu(arg + 8 * ii);
114114
arrzmm[ii] = vtype::template i64gather<sizeof(type_t)>(argzmm[ii], arr);
@@ -117,7 +117,7 @@ X86_SIMD_SORT_INLINE void argsort_32_64bit(type_t *arr, int64_t *arg, int32_t N)
117117

118118
uint64_t combined_mask = (0x1ull << (N - 16)) - 0x1ull;
119119
opmask_t load_mask[2] = {0xFF, 0xFF};
120-
#pragma GCC unroll 2
120+
#pragma X86_SIMD_SORT_UNROLL_LOOP(2)
121121
for (int ii = 0; ii < 2; ++ii) {
122122
load_mask[ii] = (combined_mask >> (ii * 8)) & 0xFF;
123123
argzmm[ii + 2] = argtype::maskz_loadu(load_mask[ii], arg + 16 + 8 * ii);
@@ -151,7 +151,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
151151
zmm_t arrzmm[8];
152152
argzmm_t argzmm[8];
153153

154-
#pragma GCC unroll 4
154+
#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
155155
for (int ii = 0; ii < 4; ++ii) {
156156
argzmm[ii] = argtype::loadu(arg + 8 * ii);
157157
arrzmm[ii] = vtype::template i64gather<sizeof(type_t)>(argzmm[ii], arr);
@@ -160,7 +160,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
160160

161161
opmask_t load_mask[4] = {0xFF, 0xFF, 0xFF, 0xFF};
162162
uint64_t combined_mask = (0x1ull << (N - 32)) - 0x1ull;
163-
#pragma GCC unroll 4
163+
#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
164164
for (int ii = 0; ii < 4; ++ii) {
165165
load_mask[ii] = (combined_mask >> (ii * 8)) & 0xFF;
166166
argzmm[ii + 4] = argtype::maskz_loadu(load_mask[ii], arg + 32 + 8 * ii);
@@ -170,7 +170,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
170170
argzmm[ii + 4]);
171171
}
172172

173-
#pragma GCC unroll 4
173+
#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
174174
for (int ii = 0; ii < 8; ii = ii + 2) {
175175
bitonic_merge_two_zmm_64bit<vtype, argtype>(
176176
arrzmm[ii], arrzmm[ii + 1], argzmm[ii], argzmm[ii + 1]);
@@ -179,11 +179,11 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
179179
bitonic_merge_four_zmm_64bit<vtype, argtype>(arrzmm + 4, argzmm + 4);
180180
bitonic_merge_eight_zmm_64bit<vtype, argtype>(arrzmm, argzmm);
181181

182-
#pragma GCC unroll 4
182+
#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
183183
for (int ii = 0; ii < 4; ++ii) {
184184
argtype::storeu(arg + 8 * ii, argzmm[ii]);
185185
}
186-
#pragma GCC unroll 4
186+
#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
187187
for (int ii = 0; ii < 4; ++ii) {
188188
argtype::mask_storeu(arg + 32 + 8 * ii, load_mask[ii], argzmm[ii + 4]);
189189
}
@@ -203,7 +203,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
203203
// zmm_t arrzmm[16];
204204
// argzmm_t argzmm[16];
205205
//
206-
//#pragma GCC unroll 8
206+
//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
207207
// for (int ii = 0; ii < 8; ++ii) {
208208
// argzmm[ii] = argtype::loadu(arg + 8*ii);
209209
// arrzmm[ii] = vtype::template i64gather<sizeof(type_t)>(argzmm[ii], arr);
@@ -213,19 +213,19 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
213213
// opmask_t load_mask[8] = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
214214
// if (N != 128) {
215215
// uint64_t combined_mask = (0x1ull << (N - 64)) - 0x1ull;
216-
//#pragma GCC unroll 8
216+
//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
217217
// for (int ii = 0; ii < 8; ++ii) {
218218
// load_mask[ii] = (combined_mask >> (ii*8)) & 0xFF;
219219
// }
220220
// }
221-
//#pragma GCC unroll 8
221+
//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
222222
// for (int ii = 0; ii < 8; ++ii) {
223223
// argzmm[ii+8] = argtype::maskz_loadu(load_mask[ii], arg + 64 + 8*ii);
224224
// arrzmm[ii+8] = vtype::template mask_i64gather<sizeof(type_t)>(vtype::zmm_max(), load_mask[ii], argzmm[ii+8], arr);
225225
// arrzmm[ii+8] = sort_zmm_64bit<vtype, argtype>(arrzmm[ii+8], argzmm[ii+8]);
226226
// }
227227
//
228-
//#pragma GCC unroll 8
228+
//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
229229
// for (int ii = 0; ii < 16; ii = ii + 2) {
230230
// bitonic_merge_two_zmm_64bit<vtype, argtype>(arrzmm[ii], arrzmm[ii + 1], argzmm[ii], argzmm[ii + 1]);
231231
// }
@@ -237,11 +237,11 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
237237
// bitonic_merge_eight_zmm_64bit<vtype, argtype>(arrzmm+8, argzmm+8);
238238
// bitonic_merge_sixteen_zmm_64bit<vtype, argtype>(arrzmm, argzmm);
239239
//
240-
//#pragma GCC unroll 8
240+
//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
241241
// for (int ii = 0; ii < 8; ++ii) {
242242
// argtype::storeu(arg + 8*ii, argzmm[ii]);
243243
// }
244-
//#pragma GCC unroll 8
244+
//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
245245
// for (int ii = 0; ii < 8; ++ii) {
246246
// argtype::mask_storeu(arg + 64 + 8*ii, load_mask[ii], argzmm[ii + 8]);
247247
// }

src/avx512-common-argsort.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
198198
// first and last vtype::numlanes values are partitioned at the end
199199
zmm_t vec_left[num_unroll], vec_right[num_unroll];
200200
argzmm_t argvec_left[num_unroll], argvec_right[num_unroll];
201-
#pragma GCC unroll 8
201+
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
202202
for (int ii = 0; ii < num_unroll; ++ii) {
203203
argvec_left[ii] = argtype::loadu(arg + left + vtype::numlanes * ii);
204204
vec_left[ii] = vtype::template i64gather<sizeof(type_t)>(
@@ -224,7 +224,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
224224
*/
225225
if ((r_store + vtype::numlanes) - right < left - l_store) {
226226
right -= num_unroll * vtype::numlanes;
227-
#pragma GCC unroll 8
227+
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
228228
for (int ii = 0; ii < num_unroll; ++ii) {
229229
arg_vec[ii]
230230
= argtype::loadu(arg + right + ii * vtype::numlanes);
@@ -233,7 +233,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
233233
}
234234
}
235235
else {
236-
#pragma GCC unroll 8
236+
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
237237
for (int ii = 0; ii < num_unroll; ++ii) {
238238
arg_vec[ii] = argtype::loadu(arg + left + ii * vtype::numlanes);
239239
curr_vec[ii] = vtype::template i64gather<sizeof(type_t)>(
@@ -242,7 +242,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
242242
left += num_unroll * vtype::numlanes;
243243
}
244244
// partition the current vector and save it on both sides of the array
245-
#pragma GCC unroll 8
245+
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
246246
for (int ii = 0; ii < num_unroll; ++ii) {
247247
int32_t amount_gt_pivot
248248
= partition_vec<vtype>(arg,
@@ -259,7 +259,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
259259
}
260260

261261
/* partition and save vec_left and vec_right */
262-
#pragma GCC unroll 8
262+
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
263263
for (int ii = 0; ii < num_unroll; ++ii) {
264264
int32_t amount_gt_pivot
265265
= partition_vec<vtype>(arg,
@@ -273,7 +273,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
273273
l_store += (vtype::numlanes - amount_gt_pivot);
274274
r_store -= amount_gt_pivot;
275275
}
276-
#pragma GCC unroll 8
276+
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
277277
for (int ii = 0; ii < num_unroll; ++ii) {
278278
int32_t amount_gt_pivot
279279
= partition_vec<vtype>(arg,

src/avx512-common-qsort.h

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -67,9 +67,12 @@
6767
#define ZMM_MAX_INT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_INT16)
6868
#define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d
6969

70+
/* Compiler-specific macros */
7071
#ifdef _MSC_VER
7172
#define X86_SIMD_SORT_INLINE static inline
7273
#define X86_SIMD_SORT_FINLINE static __forceinline
74+
#define LIKELY(x)
75+
#define UNLIKELY(x)
7376
#elif defined(__CYGWIN__)
7477
/*
7578
* Force inline in cygwin to work around a compiler bug. See
@@ -80,13 +83,21 @@
8083
#elif defined(__GNUC__)
8184
#define X86_SIMD_SORT_INLINE static inline
8285
#define X86_SIMD_SORT_FINLINE static __attribute__((always_inline))
86+
#define LIKELY(x) __builtin_expect((x), 1)
87+
#define UNLIKELY(x) __builtin_expect((x), 0)
8388
#else
8489
#define X86_SIMD_SORT_INLINE static
8590
#define X86_SIMD_SORT_FINLINE static
91+
#define LIKELY(x)
92+
#define UNLIKELY(x)
8693
#endif
8794

88-
#define LIKELY(x) __builtin_expect((x), 1)
89-
#define UNLIKELY(x) __builtin_expect((x), 0)
95+
#if __GNUC__ >= 8
96+
#define X86_SIMD_SORT_UNROLL_LOOP(num)\
97+
GCC unroll num
98+
#else
99+
#define X86_SIMD_SORT_UNROLL_LOOP(num)
100+
#endif
90101

91102
template <typename type>
92103
struct zmm_vector;
@@ -382,7 +393,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
382393
// We will now have atleast 16 registers worth of data to process:
383394
// left and right vtype::numlanes values are partitioned at the end
384395
zmm_t vec_left[num_unroll], vec_right[num_unroll];
385-
#pragma GCC unroll 8
396+
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
386397
for (int ii = 0; ii < num_unroll; ++ii) {
387398
vec_left[ii] = vtype::loadu(arr + left + vtype::numlanes * ii);
388399
vec_right[ii] = vtype::loadu(
@@ -403,20 +414,20 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
403414
*/
404415
if ((r_store + vtype::numlanes) - right < left - l_store) {
405416
right -= num_unroll * vtype::numlanes;
406-
#pragma GCC unroll 8
417+
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
407418
for (int ii = 0; ii < num_unroll; ++ii) {
408419
curr_vec[ii] = vtype::loadu(arr + right + ii * vtype::numlanes);
409420
}
410421
}
411422
else {
412-
#pragma GCC unroll 8
423+
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
413424
for (int ii = 0; ii < num_unroll; ++ii) {
414425
curr_vec[ii] = vtype::loadu(arr + left + ii * vtype::numlanes);
415426
}
416427
left += num_unroll * vtype::numlanes;
417428
}
418429
// partition the current vector and save it on both sides of the array
419-
#pragma GCC unroll 8
430+
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
420431
for (int ii = 0; ii < num_unroll; ++ii) {
421432
int32_t amount_ge_pivot
422433
= partition_vec<vtype>(arr,
@@ -432,7 +443,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
432443
}
433444

434445
/* partition and save vec_left[8] and vec_right[8] */
435-
#pragma GCC unroll 8
446+
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
436447
for (int ii = 0; ii < num_unroll; ++ii) {
437448
int32_t amount_ge_pivot
438449
= partition_vec<vtype>(arr,
@@ -445,7 +456,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
445456
l_store += (vtype::numlanes - amount_ge_pivot);
446457
r_store -= amount_ge_pivot;
447458
}
448-
#pragma GCC unroll 8
459+
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
449460
for (int ii = 0; ii < num_unroll; ++ii) {
450461
int32_t amount_ge_pivot
451462
= partition_vec<vtype>(arr,

0 commit comments

Comments
 (0)