@@ -40,12 +40,16 @@ X86_SIMD_SORT_INLINE reg_t sort_ymm_32bit(reg_t ymm)
40
40
ymm = cmp_merge<vtype>(
41
41
ymm, vtype::template shuffle<SHUFFLE_MASK (2 , 3 , 0 , 1 )>(ymm), oxAA);
42
42
ymm = cmp_merge<vtype>(
43
- ymm, vtype::permutexvar (vtype::seti (NETWORK_32BIT_AVX2_1), ymm), oxCC);
43
+ ymm,
44
+ vtype::permutexvar (vtype::seti (NETWORK_32BIT_AVX2_1), ymm),
45
+ oxCC);
44
46
ymm = cmp_merge<vtype>(
45
47
ymm, vtype::template shuffle<SHUFFLE_MASK (2 , 3 , 0 , 1 )>(ymm), oxAA);
46
48
ymm = cmp_merge<vtype>(ymm, vtype::permutexvar (rev_index, ymm), oxF0);
47
49
ymm = cmp_merge<vtype>(
48
- ymm, vtype::permutexvar (vtype::seti (NETWORK_32BIT_AVX2_3), ymm), oxCC);
50
+ ymm,
51
+ vtype::permutexvar (vtype::seti (NETWORK_32BIT_AVX2_3), ymm),
52
+ oxCC);
49
53
ymm = cmp_merge<vtype>(
50
54
ymm, vtype::template shuffle<SHUFFLE_MASK (2 , 3 , 0 , 1 )>(ymm), oxAA);
51
55
return ymm;
@@ -200,18 +204,21 @@ struct avx2_vector<int32_t> {
200
204
{
201
205
return sort_ymm_32bit<avx2_vector<type_t >>(x);
202
206
}
203
- static reg_t cast_from (__m256i v){
207
+ static reg_t cast_from (__m256i v)
208
+ {
204
209
return v;
205
210
}
206
- static __m256i cast_to (reg_t v){
211
+ static __m256i cast_to (reg_t v)
212
+ {
207
213
return v;
208
214
}
209
215
static int double_compressstore (type_t *left_addr,
210
216
type_t *right_addr,
211
217
opmask_t k,
212
218
reg_t reg)
213
219
{
214
- return avx2_double_compressstore32<type_t >(left_addr, right_addr, k, reg);
220
+ return avx2_double_compressstore32<type_t >(
221
+ left_addr, right_addr, k, reg);
215
222
}
216
223
};
217
224
template <>
@@ -346,18 +353,21 @@ struct avx2_vector<uint32_t> {
346
353
{
347
354
return sort_ymm_32bit<avx2_vector<type_t >>(x);
348
355
}
349
- static reg_t cast_from (__m256i v){
356
+ static reg_t cast_from (__m256i v)
357
+ {
350
358
return v;
351
359
}
352
- static __m256i cast_to (reg_t v){
360
+ static __m256i cast_to (reg_t v)
361
+ {
353
362
return v;
354
363
}
355
364
static int double_compressstore (type_t *left_addr,
356
365
type_t *right_addr,
357
366
opmask_t k,
358
367
reg_t reg)
359
368
{
360
- return avx2_double_compressstore32<type_t >(left_addr, right_addr, k, reg);
369
+ return avx2_double_compressstore32<type_t >(
370
+ left_addr, right_addr, k, reg);
361
371
}
362
372
};
363
373
template <>
@@ -419,9 +429,10 @@ struct avx2_vector<float> {
419
429
template <int type>
420
430
static opmask_t fpclass (reg_t x)
421
431
{
422
- if constexpr (type == (0x01 | 0x80 )){
432
+ if constexpr (type == (0x01 | 0x80 )) {
423
433
return _mm256_castps_si256 (_mm256_cmp_ps (x, x, _CMP_UNORD_Q));
424
- }else {
434
+ }
435
+ else {
425
436
static_assert (type == (0x01 | 0x80 ), " should not reach here" );
426
437
}
427
438
}
@@ -514,75 +525,90 @@ struct avx2_vector<float> {
514
525
{
515
526
return sort_ymm_32bit<avx2_vector<type_t >>(x);
516
527
}
517
- static reg_t cast_from (__m256i v){
528
+ static reg_t cast_from (__m256i v)
529
+ {
518
530
return _mm256_castsi256_ps (v);
519
531
}
520
- static __m256i cast_to (reg_t v){
532
+ static __m256i cast_to (reg_t v)
533
+ {
521
534
return _mm256_castps_si256 (v);
522
535
}
523
536
static int double_compressstore (type_t *left_addr,
524
537
type_t *right_addr,
525
538
opmask_t k,
526
539
reg_t reg)
527
540
{
528
- return avx2_double_compressstore32<type_t >(left_addr, right_addr, k, reg);
541
+ return avx2_double_compressstore32<type_t >(
542
+ left_addr, right_addr, k, reg);
529
543
}
530
544
};
531
545
532
- struct avx2_32bit_swizzle_ops {
546
+ struct avx2_32bit_swizzle_ops {
533
547
template <typename vtype, int scale>
534
- X86_SIMD_SORT_INLINE typename vtype::reg_t swap_n (typename vtype::reg_t reg){
548
+ X86_SIMD_SORT_INLINE typename vtype::reg_t swap_n (typename vtype::reg_t reg)
549
+ {
535
550
__m256i v = vtype::cast_to (reg);
536
551
537
- if constexpr (scale == 2 ){
552
+ if constexpr (scale == 2 ) {
538
553
__m256 vf = _mm256_castsi256_ps (v);
539
554
vf = _mm256_permute_ps (vf, 0b10110001 );
540
555
v = _mm256_castps_si256 (vf);
541
- }else if constexpr (scale == 4 ){
556
+ }
557
+ else if constexpr (scale == 4 ) {
542
558
__m256 vf = _mm256_castsi256_ps (v);
543
559
vf = _mm256_permute_ps (vf, 0b01001110 );
544
560
v = _mm256_castps_si256 (vf);
545
- }else if constexpr (scale == 8 ){
561
+ }
562
+ else if constexpr (scale == 8 ) {
546
563
v = _mm256_permute2x128_si256 (v, v, 0b00000001 );
547
- }else {
564
+ }
565
+ else {
548
566
static_assert (scale == -1 , " should not be reached" );
549
567
}
550
568
551
569
return vtype::cast_from (v);
552
570
}
553
571
554
572
template <typename vtype, int scale>
555
- X86_SIMD_SORT_INLINE typename vtype::reg_t reverse_n (typename vtype::reg_t reg){
573
+ X86_SIMD_SORT_INLINE typename vtype::reg_t
574
+ reverse_n (typename vtype::reg_t reg)
575
+ {
556
576
__m256i v = vtype::cast_to (reg);
557
577
558
- if constexpr (scale == 2 ){
559
- return swap_n<vtype, 2 >(reg);
560
- }else if constexpr (scale == 4 ){
578
+ if constexpr (scale == 2 ) { return swap_n<vtype, 2 >(reg); }
579
+ else if constexpr (scale == 4 ) {
561
580
constexpr uint64_t mask = 0b00011011 ;
562
581
__m256 vf = _mm256_castsi256_ps (v);
563
582
vf = _mm256_permute_ps (vf, mask);
564
583
v = _mm256_castps_si256 (vf);
565
- }else if constexpr (scale == 8 ){
584
+ }
585
+ else if constexpr (scale == 8 ) {
566
586
return vtype::reverse (reg);
567
- }else {
587
+ }
588
+ else {
568
589
static_assert (scale == -1 , " should not be reached" );
569
590
}
570
591
571
592
return vtype::cast_from (v);
572
593
}
573
594
574
595
template <typename vtype, int scale>
575
- X86_SIMD_SORT_INLINE typename vtype::reg_t merge_n (typename vtype::reg_t reg, typename vtype::reg_t other){
596
+ X86_SIMD_SORT_INLINE typename vtype::reg_t
597
+ merge_n (typename vtype::reg_t reg, typename vtype::reg_t other)
598
+ {
576
599
__m256i v1 = vtype::cast_to (reg);
577
600
__m256i v2 = vtype::cast_to (other);
578
601
579
- if constexpr (scale == 2 ){
602
+ if constexpr (scale == 2 ) {
580
603
v1 = _mm256_blend_epi32 (v1, v2, 0b01010101 );
581
- }else if constexpr (scale == 4 ){
604
+ }
605
+ else if constexpr (scale == 4 ) {
582
606
v1 = _mm256_blend_epi32 (v1, v2, 0b00110011 );
583
- }else if constexpr (scale == 8 ){
607
+ }
608
+ else if constexpr (scale == 8 ) {
584
609
v1 = _mm256_blend_epi32 (v1, v2, 0b00001111 );
585
- }else {
610
+ }
611
+ else {
586
612
static_assert (scale == -1 , " should not be reached" );
587
613
}
588
614
0 commit comments