@@ -108,7 +108,7 @@ X86_SIMD_SORT_INLINE void argsort_32_64bit(type_t *arr, int64_t *arg, int32_t N)
    zmm_t arrzmm[4];
    argzmm_t argzmm[4];

- #pragma GCC unroll 2
+ X86_SIMD_SORT_UNROLL_LOOP(2)
    for (int ii = 0; ii < 2; ++ii) {
        argzmm[ii] = argtype::loadu(arg + 8 * ii);
        arrzmm[ii] = vtype::template i64gather<sizeof(type_t)>(argzmm[ii], arr);
@@ -117,7 +117,7 @@ X86_SIMD_SORT_INLINE void argsort_32_64bit(type_t *arr, int64_t *arg, int32_t N)
    uint64_t combined_mask = (0x1ull << (N - 16)) - 0x1ull;
    opmask_t load_mask[2] = {0xFF, 0xFF};
- #pragma GCC unroll 2
+ X86_SIMD_SORT_UNROLL_LOOP(2)
    for (int ii = 0; ii < 2; ++ii) {
        load_mask[ii] = (combined_mask >> (ii * 8)) & 0xFF;
        argzmm[ii + 2] = argtype::maskz_loadu(load_mask[ii], arg + 16 + 8 * ii);
@@ -151,7 +151,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
    zmm_t arrzmm[8];
    argzmm_t argzmm[8];

- #pragma GCC unroll 4
+ X86_SIMD_SORT_UNROLL_LOOP(4)
    for (int ii = 0; ii < 4; ++ii) {
        argzmm[ii] = argtype::loadu(arg + 8 * ii);
        arrzmm[ii] = vtype::template i64gather<sizeof(type_t)>(argzmm[ii], arr);
@@ -160,7 +160,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
    opmask_t load_mask[4] = {0xFF, 0xFF, 0xFF, 0xFF};
    uint64_t combined_mask = (0x1ull << (N - 32)) - 0x1ull;
- #pragma GCC unroll 4
+ X86_SIMD_SORT_UNROLL_LOOP(4)
    for (int ii = 0; ii < 4; ++ii) {
        load_mask[ii] = (combined_mask >> (ii * 8)) & 0xFF;
        argzmm[ii + 4] = argtype::maskz_loadu(load_mask[ii], arg + 32 + 8 * ii);
@@ -170,7 +170,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
                argzmm[ii + 4]);
    }

- #pragma GCC unroll 4
+ X86_SIMD_SORT_UNROLL_LOOP(4)
    for (int ii = 0; ii < 8; ii = ii + 2) {
        bitonic_merge_two_zmm_64bit<vtype, argtype>(
                arrzmm[ii], arrzmm[ii + 1], argzmm[ii], argzmm[ii + 1]);
@@ -179,11 +179,11 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
    bitonic_merge_four_zmm_64bit<vtype, argtype>(arrzmm + 4, argzmm + 4);
    bitonic_merge_eight_zmm_64bit<vtype, argtype>(arrzmm, argzmm);

- #pragma GCC unroll 4
+ X86_SIMD_SORT_UNROLL_LOOP(4)
    for (int ii = 0; ii < 4; ++ii) {
        argtype::storeu(arg + 8 * ii, argzmm[ii]);
    }
- #pragma GCC unroll 4
+ X86_SIMD_SORT_UNROLL_LOOP(4)
    for (int ii = 0; ii < 4; ++ii) {
        argtype::mask_storeu(arg + 32 + 8 * ii, load_mask[ii], argzmm[ii + 4]);
    }
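
The tail handling in these hunks is the one subtle piece: for 32 < N <= 64, combined_mask holds one bit per element beyond the first 32, and each of the four masked registers consumes one byte of it. Below is a self-contained scalar sketch of that decomposition (illustrative only; it is not code from the library):

// Standalone illustration, not library code, of how combined_mask
// splits into per-register 8-bit load masks for the N-element tail
// of argsort_64_64bit.
#include <cstdint>
#include <cstdio>
#include <initializer_list>

int main()
{
    for (int N : {33, 48, 64}) {
        // One bit per element past the first 32 fully loaded lanes.
        uint64_t combined_mask = (0x1ull << (N - 32)) - 0x1ull;
        for (int ii = 0; ii < 4; ++ii) {
            // Byte ii of the combined mask gates the 8 lanes of
            // register ii in the masked loads, gathers, and stores.
            uint8_t load_mask = (combined_mask >> (ii * 8)) & 0xFF;
            printf("N=%2d reg=%d mask=0x%02X\n", N, ii, load_mask);
        }
    }
    return 0;
}

For N = 33 only the lowest lane of the first tail register is enabled; for N = 64 all four masks come out 0xFF, matching the {0xFF, 0xFF, 0xFF, 0xFF} initializer above.
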
@@ -203,7 +203,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
    // zmm_t arrzmm[16];
    // argzmm_t argzmm[16];
    //
- // #pragma GCC unroll 8
+ // X86_SIMD_SORT_UNROLL_LOOP(8)
    // for (int ii = 0; ii < 8; ++ii) {
    //     argzmm[ii] = argtype::loadu(arg + 8*ii);
    //     arrzmm[ii] = vtype::template i64gather<sizeof(type_t)>(argzmm[ii], arr);
@@ -213,19 +213,19 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
    // opmask_t load_mask[8] = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
    // if (N != 128) {
    //     uint64_t combined_mask = (0x1ull << (N - 64)) - 0x1ull;
- //     #pragma GCC unroll 8
+ //     X86_SIMD_SORT_UNROLL_LOOP(8)
    //     for (int ii = 0; ii < 8; ++ii) {
    //         load_mask[ii] = (combined_mask >> (ii*8)) & 0xFF;
    //     }
    // }
- // #pragma GCC unroll 8
+ // X86_SIMD_SORT_UNROLL_LOOP(8)
    // for (int ii = 0; ii < 8; ++ii) {
    //     argzmm[ii+8] = argtype::maskz_loadu(load_mask[ii], arg + 64 + 8*ii);
    //     arrzmm[ii+8] = vtype::template mask_i64gather<sizeof(type_t)>(vtype::zmm_max(), load_mask[ii], argzmm[ii+8], arr);
    //     arrzmm[ii+8] = sort_zmm_64bit<vtype, argtype>(arrzmm[ii+8], argzmm[ii+8]);
    // }
    //
- // #pragma GCC unroll 8
+ // X86_SIMD_SORT_UNROLL_LOOP(8)
    // for (int ii = 0; ii < 16; ii = ii + 2) {
    //     bitonic_merge_two_zmm_64bit<vtype, argtype>(arrzmm[ii], arrzmm[ii + 1], argzmm[ii], argzmm[ii + 1]);
    // }
@@ -237,11 +237,11 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
    // bitonic_merge_eight_zmm_64bit<vtype, argtype>(arrzmm+8, argzmm+8);
    // bitonic_merge_sixteen_zmm_64bit<vtype, argtype>(arrzmm, argzmm);
    //
- // #pragma GCC unroll 8
+ // X86_SIMD_SORT_UNROLL_LOOP(8)
    // for (int ii = 0; ii < 8; ++ii) {
    //     argtype::storeu(arg + 8*ii, argzmm[ii]);
    // }
- // #pragma GCC unroll 8
+ // X86_SIMD_SORT_UNROLL_LOOP(8)
    // for (int ii = 0; ii < 8; ++ii) {
    //     argtype::mask_storeu(arg + 64 + 8*ii, load_mask[ii], argzmm[ii + 8]);
    // }
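
For context on the change itself: the diff does not show the definition of X86_SIMD_SORT_UNROLL_LOOP, so the following is a hedged sketch of a plausible definition, not the library's actual one; the helper names XSS_STR and XSS_XSTR and the exact compiler dispatch are assumptions. The point of such a macro is that _Pragma, unlike a literal #pragma directive, can be produced by macro expansion, so one call site can emit the right unroll hint per compiler, or nothing at all.

// Hedged sketch of a plausible X86_SIMD_SORT_UNROLL_LOOP definition.
// The helper names and the compiler dispatch are assumptions; only the
// outer macro name appears in the diff.
#define XSS_STR(x) #x
#define XSS_XSTR(x) XSS_STR(x) // expand x first, then stringify it

#if defined(__clang__)
// Clang spells the unroll hint differently from GCC.
#define X86_SIMD_SORT_UNROLL_LOOP(num) \
    _Pragma(XSS_XSTR(clang loop unroll_count(num)))
#elif defined(__GNUC__)
#define X86_SIMD_SORT_UNROLL_LOOP(num) \
    _Pragma(XSS_XSTR(GCC unroll num))
#else
// Compilers without an unroll pragma: expand to nothing.
#define X86_SIMD_SORT_UNROLL_LOOP(num)
#endif

A call site then reads X86_SIMD_SORT_UNROLL_LOOP(2) immediately before the loop, as in the hunks above, and expands to #pragma GCC unroll 2 under GCC.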