@@ -30,24 +30,13 @@ template <typename vtype, typename reg_t = typename vtype::reg_t>
30
30
X86_SIMD_SORT_INLINE reg_t sort_ymm_32bit_half (reg_t ymm)
31
31
{
32
32
using swizzle = typename vtype::swizzle_ops;
33
-
34
- const typename vtype::opmask_t oxAA
35
- = vtype::seti (-1 , 0 , -1 , 0 );
36
- const typename vtype::opmask_t oxCC
37
- = vtype::seti (-1 , -1 , 0 , 0 );
38
-
39
- ymm = cmp_merge<vtype>(
40
- ymm,
41
- swizzle::template swap_n<vtype, 2 >(ymm),
42
- oxAA);
43
- ymm = cmp_merge<vtype>(
44
- ymm,
45
- vtype::reverse (ymm),
46
- oxCC);
47
- ymm = cmp_merge<vtype>(
48
- ymm,
49
- swizzle::template swap_n<vtype, 2 >(ymm),
50
- oxAA);
33
+
34
+ const typename vtype::opmask_t oxAA = vtype::seti (-1 , 0 , -1 , 0 );
35
+ const typename vtype::opmask_t oxCC = vtype::seti (-1 , -1 , 0 , 0 );
36
+
37
+ ymm = cmp_merge<vtype>(ymm, swizzle::template swap_n<vtype, 2 >(ymm), oxAA);
38
+ ymm = cmp_merge<vtype>(ymm, vtype::reverse (ymm), oxCC);
39
+ ymm = cmp_merge<vtype>(ymm, swizzle::template swap_n<vtype, 2 >(ymm), oxAA);
51
40
return ymm;
52
41
}
53
42
@@ -61,7 +50,7 @@ struct avx2_half_vector<int32_t> {
61
50
using opmask_t = __m128i;
62
51
static const uint8_t numlanes = 4 ;
63
52
static constexpr simd_type vec_type = simd_type::AVX2;
64
-
53
+
65
54
using swizzle_ops = avx2_32bit_half_swizzle_ops;
66
55
67
56
static type_t type_max ()
@@ -81,13 +70,11 @@ struct avx2_half_vector<int32_t> {
81
70
auto mask = ((0x1ull << num_to_read) - 0x1ull );
82
71
return convert_int_to_avx2_mask_half (mask);
83
72
}
84
- static ymmi_t
85
- seti (int v1, int v2, int v3, int v4)
73
+ static ymmi_t seti (int v1, int v2, int v3, int v4)
86
74
{
87
75
return _mm_set_epi32 (v1, v2, v3, v4);
88
76
}
89
- static reg_t
90
- set (int v1, int v2, int v3, int v4)
77
+ static reg_t set (int v1, int v2, int v3, int v4)
91
78
{
92
79
return _mm_set_epi32 (v1, v2, v3, v4);
93
80
}
@@ -99,8 +86,8 @@ struct avx2_half_vector<int32_t> {
99
86
{
100
87
opmask_t equal = eq (x, y);
101
88
opmask_t greater = _mm_cmpgt_epi32 (x, y);
102
- return _mm_castps_si128 (_mm_or_ps ( _mm_castsi128_ps (equal),
103
- _mm_castsi128_ps (greater)));
89
+ return _mm_castps_si128 (
90
+ _mm_or_ps ( _mm_castsi128_ps (equal), _mm_castsi128_ps (greater)));
104
91
}
105
92
static opmask_t eq (reg_t x, reg_t y)
106
93
{
@@ -110,14 +97,12 @@ struct avx2_half_vector<int32_t> {
110
97
static reg_t
111
98
mask_i64gather (reg_t src, opmask_t mask, __m256i index, void const *base)
112
99
{
113
- return _mm256_mask_i64gather_epi32 (src, (const int *) base, index, mask, scale);
100
+ return _mm256_mask_i64gather_epi32 (
101
+ src, (const int *)base, index, mask, scale);
114
102
}
115
103
static reg_t i64gather (type_t *arr, arrsize_t *ind)
116
104
{
117
- return set (arr[ind[3 ]],
118
- arr[ind[2 ]],
119
- arr[ind[1 ]],
120
- arr[ind[0 ]]);
105
+ return set (arr[ind[3 ]], arr[ind[2 ]], arr[ind[1 ]], arr[ind[0 ]]);
121
106
}
122
107
static reg_t loadu (void const *mem)
123
108
{
@@ -143,8 +128,8 @@ struct avx2_half_vector<int32_t> {
143
128
static reg_t mask_mov (reg_t x, opmask_t mask, reg_t y)
144
129
{
145
130
return _mm_castps_si128 (_mm_blendv_ps (_mm_castsi128_ps (x),
146
- _mm_castsi128_ps (y),
147
- _mm_castsi128_ps (mask)));
131
+ _mm_castsi128_ps (y),
132
+ _mm_castsi128_ps (mask)));
148
133
}
149
134
static void mask_storeu (void *mem, opmask_t mask, reg_t x)
150
135
{
@@ -217,7 +202,7 @@ struct avx2_half_vector<uint32_t> {
217
202
using opmask_t = __m128i;
218
203
static const uint8_t numlanes = 4 ;
219
204
static constexpr simd_type vec_type = simd_type::AVX2;
220
-
205
+
221
206
using swizzle_ops = avx2_32bit_half_swizzle_ops;
222
207
223
208
static type_t type_max ()
@@ -237,28 +222,24 @@ struct avx2_half_vector<uint32_t> {
237
222
auto mask = ((0x1ull << num_to_read) - 0x1ull );
238
223
return convert_int_to_avx2_mask_half (mask);
239
224
}
240
- static ymmi_t
241
- seti (int v1, int v2, int v3, int v4)
225
+ static ymmi_t seti (int v1, int v2, int v3, int v4)
242
226
{
243
227
return _mm_set_epi32 (v1, v2, v3, v4);
244
228
}
245
- static reg_t
246
- set (int v1, int v2, int v3, int v4)
229
+ static reg_t set (int v1, int v2, int v3, int v4)
247
230
{
248
231
return _mm_set_epi32 (v1, v2, v3, v4);
249
232
}
250
233
template <int scale>
251
234
static reg_t
252
235
mask_i64gather (reg_t src, opmask_t mask, __m256i index, void const *base)
253
236
{
254
- return _mm256_mask_i64gather_epi32 (src, (const int *) base, index, mask, scale);
237
+ return _mm256_mask_i64gather_epi32 (
238
+ src, (const int *)base, index, mask, scale);
255
239
}
256
240
static reg_t i64gather (type_t *arr, arrsize_t *ind)
257
241
{
258
- return set (arr[ind[3 ]],
259
- arr[ind[2 ]],
260
- arr[ind[1 ]],
261
- arr[ind[0 ]]);
242
+ return set (arr[ind[3 ]], arr[ind[2 ]], arr[ind[1 ]], arr[ind[0 ]]);
262
243
}
263
244
static opmask_t ge (reg_t x, reg_t y)
264
245
{
@@ -289,8 +270,8 @@ struct avx2_half_vector<uint32_t> {
289
270
static reg_t mask_mov (reg_t x, opmask_t mask, reg_t y)
290
271
{
291
272
return _mm_castps_si128 (_mm_blendv_ps (_mm_castsi128_ps (x),
292
- _mm_castsi128_ps (y),
293
- _mm_castsi128_ps (mask)));
273
+ _mm_castsi128_ps (y),
274
+ _mm_castsi128_ps (mask)));
294
275
}
295
276
static void mask_storeu (void *mem, opmask_t mask, reg_t x)
296
277
{
@@ -363,7 +344,7 @@ struct avx2_half_vector<float> {
363
344
using opmask_t = __m128i;
364
345
static const uint8_t numlanes = 4 ;
365
346
static constexpr simd_type vec_type = simd_type::AVX2;
366
-
347
+
367
348
using swizzle_ops = avx2_32bit_half_swizzle_ops;
368
349
369
350
static type_t type_max ()
@@ -379,13 +360,11 @@ struct avx2_half_vector<float> {
379
360
return _mm_set1_ps (type_max ());
380
361
}
381
362
382
- static ymmi_t
383
- seti (int v1, int v2, int v3, int v4)
363
+ static ymmi_t seti (int v1, int v2, int v3, int v4)
384
364
{
385
365
return _mm_set_epi32 (v1, v2, v3, v4);
386
366
}
387
- static reg_t
388
- set (float v1, float v2, float v3, float v4)
367
+ static reg_t set (float v1, float v2, float v3, float v4)
389
368
{
390
369
return _mm_set_ps (v1, v2, v3, v4);
391
370
}
@@ -424,14 +403,12 @@ struct avx2_half_vector<float> {
424
403
static reg_t
425
404
mask_i64gather (reg_t src, opmask_t mask, __m256i index, void const *base)
426
405
{
427
- return _mm256_mask_i64gather_ps (src, (const float *) base, index, _mm_castsi128_ps (mask), scale);
406
+ return _mm256_mask_i64gather_ps (
407
+ src, (const float *)base, index, _mm_castsi128_ps (mask), scale);
428
408
}
429
409
static reg_t i64gather (type_t *arr, arrsize_t *ind)
430
410
{
431
- return set (arr[ind[3 ]],
432
- arr[ind[2 ]],
433
- arr[ind[1 ]],
434
- arr[ind[0 ]]);
411
+ return set (arr[ind[3 ]], arr[ind[2 ]], arr[ind[1 ]], arr[ind[0 ]]);
435
412
}
436
413
static reg_t loadu (void const *mem)
437
414
{
@@ -490,8 +467,7 @@ struct avx2_half_vector<float> {
490
467
template <uint8_t mask>
491
468
static reg_t shuffle (reg_t ymm)
492
469
{
493
- return _mm_castsi128_ps (
494
- _mm_shuffle_epi32 (_mm_castps_si128 (ymm), mask));
470
+ return _mm_castsi128_ps (_mm_shuffle_epi32 (_mm_castps_si128 (ymm), mask));
495
471
}
496
472
static void storeu (void *mem, reg_t x)
497
473
{
@@ -566,9 +542,7 @@ struct avx2_32bit_half_swizzle_ops {
566
542
__m128i v1 = vtype::cast_to (reg);
567
543
__m128i v2 = vtype::cast_to (other);
568
544
569
- if constexpr (scale == 2 ) {
570
- v1 = _mm_blend_epi32 (v1, v2, 0b0101 );
571
- }
545
+ if constexpr (scale == 2 ) { v1 = _mm_blend_epi32 (v1, v2, 0b0101 ); }
572
546
else if constexpr (scale == 4 ) {
573
547
v1 = _mm_blend_epi32 (v1, v2, 0b0011 );
574
548
}
0 commit comments