@@ -57,7 +57,7 @@ struct avx2_vector<int64_t> {
     static const uint8_t numlanes = 4;
     static constexpr int network_sort_threshold = 64;
     static constexpr int partition_unroll_factor = 4;
-
+
     using swizzle_ops = avx2_64bit_swizzle_ops;
 
     static type_t type_max()
@@ -85,14 +85,6 @@ struct avx2_vector<int64_t> {
     {
         return _mm256_xor_si256(x, y);
     }
-    static opmask_t knot_opmask(opmask_t x)
-    {
-        return ~x;
-    }
-    static opmask_t le(reg_t x, reg_t y)
-    {
-        return ~_mm256_cmpgt_epi64(x, y);
-    }
     static opmask_t ge(reg_t x, reg_t y)
    {
         opmask_t equal = eq(x, y);
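A note on the removed helpers: AVX2 only provides equal and greater-than compares for 64-bit integer lanes, which is why the deleted le() was written as the complement of _mm256_cmpgt_epi64, and why ge() starts from eq(). A minimal standalone sketch of that composition follows; the or-combination is an assumption based on the visible eq() line, since the rest of ge() is outside this hunk.

#include <immintrin.h>

// Hedged sketch: x >= y per signed 64-bit lane, composed as
// (x == y) | (x > y), because AVX2 has no direct compare-ge
// intrinsic for epi64.
static inline __m256i ge_epi64_sketch(__m256i x, __m256i y)
{
    __m256i equal = _mm256_cmpeq_epi64(x, y);
    __m256i greater = _mm256_cmpgt_epi64(x, y);
    return _mm256_or_si256(equal, greater);
}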
@@ -113,8 +105,7 @@ struct avx2_vector<int64_t> {
     template <int scale>
     static reg_t i64gather(__m256i index, void const *base)
     {
-        return _mm256_i64gather_epi64(
-                (long long int const *)base, index, scale);
+        return _mm256_i64gather_epi64((int64_t const *)base, index, scale);
     }
     static reg_t loadu(void const *mem)
     {
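For reference, _mm256_i64gather_epi64 takes a base pointer, a vector of four 64-bit indices, and a compile-time scale of 1, 2, 4, or 8 bytes; the hunk above only changes the pointer cast. A minimal usage sketch, with an illustrative function name and index pattern:

#include <immintrin.h>
#include <cstdint>

// Hedged sketch: gather base[1], base[2], base[0], base[3] into one
// ymm register; scale = 8 turns element indices into byte offsets.
static inline __m256i gather4_sketch(const int64_t *base)
{
    __m256i idx = _mm256_set_epi64x(3, 0, 2, 1);
    return _mm256_i64gather_epi64((long long const *)base, idx, 8);
}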
@@ -205,10 +196,12 @@ struct avx2_vector<int64_t> {
     {
         return sort_ymm_64bit<avx2_vector<type_t>>(x);
     }
-    static reg_t cast_from(__m256i v){
+    static reg_t cast_from(__m256i v)
+    {
         return v;
     }
-    static __m256i cast_to(reg_t v){
+    static __m256i cast_to(reg_t v)
+    {
         return v;
     }
 };
@@ -221,7 +214,7 @@ struct avx2_vector<uint64_t> {
     static const uint8_t numlanes = 4;
     static constexpr int network_sort_threshold = 64;
     static constexpr int partition_unroll_factor = 4;
-
+
     using swizzle_ops = avx2_64bit_swizzle_ops;
 
     static type_t type_max()
@@ -257,10 +250,6 @@ struct avx2_vector<uint64_t> {
         return _mm256_i64gather_epi64(
                 (long long int const *)base, index, scale);
     }
-    static opmask_t knot_opmask(opmask_t x)
-    {
-        return ~x;
-    }
     static opmask_t ge(reg_t x, reg_t y)
     {
         opmask_t equal = eq(x, y);
@@ -362,10 +351,12 @@ struct avx2_vector<uint64_t> {
     {
         return sort_ymm_64bit<avx2_vector<type_t>>(x);
     }
-    static reg_t cast_from(__m256i v){
+    static reg_t cast_from(__m256i v)
+    {
         return v;
     }
-    static __m256i cast_to(reg_t v){
+    static __m256i cast_to(reg_t v)
+    {
         return v;
     }
 };
@@ -378,7 +369,7 @@ struct avx2_vector<double> {
     static const uint8_t numlanes = 4;
     static constexpr int network_sort_threshold = 64;
     static constexpr int partition_unroll_factor = 4;
-
+
     using swizzle_ops = avx2_64bit_swizzle_ops;
 
     static type_t type_max()
@@ -421,10 +412,6 @@ struct avx2_vector<double> {
     {
         return _mm256_maskload_pd((const double *)mem, mask);
     }
-    static opmask_t knot_opmask(opmask_t x)
-    {
-        return ~x;
-    }
     static opmask_t ge(reg_t x, reg_t y)
     {
         return _mm256_castpd_si256(_mm256_cmp_pd(x, y, _CMP_GE_OQ));
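Unlike the integer specializations, the double variant gets ge() directly from _mm256_cmp_pd; _CMP_GE_OQ is the ordered, quiet predicate, so any lane containing NaN compares false. A small illustrative check, with a hypothetical helper name:

#include <immintrin.h>

// Hedged sketch: movemask collects one bit per lane of the compare
// mask; a NaN lane yields 0 under the ordered (_OQ) predicate.
static inline int ge_mask_bits_sketch(__m256d x, __m256d y)
{
    return _mm256_movemask_pd(_mm256_cmp_pd(x, y, _CMP_GE_OQ));
}
// e.g. x = {0.0, 2.0, 3.0, NAN}, y = {1.0, 2.0, 1.0, 0.0}
// -> 0b0110: lanes 1 and 2 are >=, lane 3 is false because of NaN.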
@@ -531,55 +518,64 @@ struct avx2_vector<double> {
     {
         return sort_ymm_64bit<avx2_vector<type_t>>(x);
     }
-    static reg_t cast_from(__m256i v){
+    static reg_t cast_from(__m256i v)
+    {
         return _mm256_castsi256_pd(v);
     }
-    static __m256i cast_to(reg_t v){
+    static __m256i cast_to(reg_t v)
+    {
         return _mm256_castpd_si256(v);
     }
 };
 
-struct avx2_64bit_swizzle_ops{
+struct avx2_64bit_swizzle_ops {
     template <typename vtype, int scale>
-    X86_SIMD_SORT_INLINE typename vtype::reg_t swap_n(typename vtype::reg_t reg){
+    X86_SIMD_SORT_INLINE typename vtype::reg_t swap_n(typename vtype::reg_t reg)
+    {
         __m256i v = vtype::cast_to(reg);
 
-        if constexpr (scale == 2){
+        if constexpr (scale == 2) {
             v = _mm256_permute4x64_epi64(v, 0b10110001);
-        }else if constexpr (scale == 4){
+        }
+        else if constexpr (scale == 4) {
             v = _mm256_permute4x64_epi64(v, 0b01001110);
-        }else{
+        }
+        else {
             static_assert(scale == -1, "should not be reached");
         }
 
         return vtype::cast_from(v);
     }
 
     template <typename vtype, int scale>
-    X86_SIMD_SORT_INLINE typename vtype::reg_t reverse_n(typename vtype::reg_t reg){
+    X86_SIMD_SORT_INLINE typename vtype::reg_t
+    reverse_n(typename vtype::reg_t reg)
+    {
         __m256i v = vtype::cast_to(reg);
 
-        if constexpr (scale == 2){
-            return swap_n<vtype, 2>(reg);
-        }else if constexpr (scale == 4){
+        if constexpr (scale == 2) { return swap_n<vtype, 2>(reg); }
+        else if constexpr (scale == 4) {
             return vtype::reverse(reg);
-        }else{
+        }
+        else {
             static_assert(scale == -1, "should not be reached");
         }
 
         return vtype::cast_from(v);
     }
 
     template <typename vtype, int scale>
-    X86_SIMD_SORT_INLINE typename vtype::reg_t merge_n(typename vtype::reg_t reg, typename vtype::reg_t other){
+    X86_SIMD_SORT_INLINE typename vtype::reg_t
+    merge_n(typename vtype::reg_t reg, typename vtype::reg_t other)
+    {
         __m256d v1 = _mm256_castsi256_pd(vtype::cast_to(reg));
         __m256d v2 = _mm256_castsi256_pd(vtype::cast_to(other));
 
-        if constexpr (scale == 2){
-            v1 = _mm256_blend_pd(v1, v2, 0b0101);
-        }else if constexpr (scale == 4){
+        if constexpr (scale == 2) { v1 = _mm256_blend_pd(v1, v2, 0b0101); }
+        else if constexpr (scale == 4) {
            v1 = _mm256_blend_pd(v1, v2, 0b0011);
-        }else{
+        }
+        else {
             static_assert(scale == -1, "should not be reached");
         }
 
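The permute and blend immediates in this hunk encode the lane movements directly: 0b10110001 selects lanes [1,0,3,2] (swap adjacent 64-bit lanes, scale 2), 0b01001110 selects [2,3,0,1] (swap 128-bit halves, scale 4), and the blend masks 0b0101 and 0b0011 take the lower lane of each pair, or the lower half, from other. A self-contained sketch that prints the two swap_n patterns; names and data are illustrative, compile with -mavx2:

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main()
{
    // Lanes 0..3 hold 0,1,2,3 (_mm256_set_epi64x lists the high lane first).
    __m256i v = _mm256_set_epi64x(3, 2, 1, 0);

    // scale == 2: adjacent lanes swapped -> 1 0 3 2
    __m256i s2 = _mm256_permute4x64_epi64(v, 0b10110001);
    // scale == 4: 128-bit halves swapped -> 2 3 0 1
    __m256i s4 = _mm256_permute4x64_epi64(v, 0b01001110);

    alignas(32) int64_t out[4];
    _mm256_store_si256((__m256i *)out, s2);
    printf("scale 2: %lld %lld %lld %lld\n",
           (long long)out[0], (long long)out[1],
           (long long)out[2], (long long)out[3]);
    _mm256_store_si256((__m256i *)out, s4);
    printf("scale 4: %lld %lld %lld %lld\n",
           (long long)out[0], (long long)out[1],
           (long long)out[2], (long long)out[3]);
    return 0;
}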