@@ -40,14 +40,8 @@ struct ymm_vector<float> {
40
40
return _mm256_set1_ps (type_max ());
41
41
}
42
42
43
- static zmmi_t seti (int v1,
44
- int v2,
45
- int v3,
46
- int v4,
47
- int v5,
48
- int v6,
49
- int v7,
50
- int v8)
43
+ static zmmi_t
44
+ seti (int v1, int v2, int v3, int v4, int v5, int v6, int v7, int v8)
51
45
{
52
46
return _mm256_set_epi32 (v1, v2, v3, v4, v5, v6, v7, v8);
53
47
}
@@ -93,7 +87,7 @@ struct ymm_vector<float> {
93
87
}
94
88
static zmm_t loadu (void const *mem)
95
89
{
96
- return _mm256_loadu_ps ((float *) mem);
90
+ return _mm256_loadu_ps ((float *) mem);
97
91
}
98
92
static zmm_t max (zmm_t x, zmm_t y)
99
93
{
@@ -129,16 +123,22 @@ struct ymm_vector<float> {
129
123
}
130
124
static type_t reducemax (zmm_t v)
131
125
{
132
- __m128 v128 = _mm_max_ps (_mm256_castps256_ps128 (v), _mm256_extractf32x4_ps (v, 1 ));
133
- __m128 v64 = _mm_max_ps (v128, _mm_shuffle_ps (v128, v128, _MM_SHUFFLE (1 , 0 , 3 , 2 )));
134
- __m128 v32 = _mm_max_ps (v64, _mm_shuffle_ps (v64, v64, _MM_SHUFFLE (0 , 0 , 0 , 1 )));
126
+ __m128 v128 = _mm_max_ps (_mm256_castps256_ps128 (v),
127
+ _mm256_extractf32x4_ps (v, 1 ));
128
+ __m128 v64 = _mm_max_ps (
129
+ v128, _mm_shuffle_ps (v128, v128, _MM_SHUFFLE (1 , 0 , 3 , 2 )));
130
+ __m128 v32 = _mm_max_ps (
131
+ v64, _mm_shuffle_ps (v64, v64, _MM_SHUFFLE (0 , 0 , 0 , 1 )));
135
132
return _mm_cvtss_f32 (v32);
136
133
}
137
134
static type_t reducemin (zmm_t v)
138
135
{
139
- __m128 v128 = _mm_min_ps (_mm256_castps256_ps128 (v), _mm256_extractf32x4_ps (v, 1 ));
140
- __m128 v64 = _mm_min_ps (v128, _mm_shuffle_ps (v128, v128,_MM_SHUFFLE (1 , 0 , 3 , 2 )));
141
- __m128 v32 = _mm_min_ps (v64, _mm_shuffle_ps (v64, v64,_MM_SHUFFLE (0 , 0 , 0 , 1 )));
136
+ __m128 v128 = _mm_min_ps (_mm256_castps256_ps128 (v),
137
+ _mm256_extractf32x4_ps (v, 1 ));
138
+ __m128 v64 = _mm_min_ps (
139
+ v128, _mm_shuffle_ps (v128, v128, _MM_SHUFFLE (1 , 0 , 3 , 2 )));
140
+ __m128 v32 = _mm_min_ps (
141
+ v64, _mm_shuffle_ps (v64, v64, _MM_SHUFFLE (0 , 0 , 0 , 1 )));
142
142
return _mm_cvtss_f32 (v32);
143
143
}
144
144
static zmm_t set1 (type_t v)
@@ -160,7 +160,7 @@ struct ymm_vector<float> {
160
160
}
161
161
static void storeu (void *mem, zmm_t x)
162
162
{
163
- _mm256_storeu_ps ((float *)mem, x);
163
+ _mm256_storeu_ps ((float *)mem, x);
164
164
}
165
165
};
166
166
template <>
@@ -184,14 +184,8 @@ struct ymm_vector<uint32_t> {
184
184
return _mm256_set1_epi32 (type_max ());
185
185
}
186
186
187
- static zmmi_t seti (int v1,
188
- int v2,
189
- int v3,
190
- int v4,
191
- int v5,
192
- int v6,
193
- int v7,
194
- int v8)
187
+ static zmmi_t
188
+ seti (int v1, int v2, int v3, int v4, int v5, int v6, int v7, int v8)
195
189
{
196
190
return _mm256_set_epi32 (v1, v2, v3, v4, v5, v6, v7, v8);
197
191
}
@@ -228,7 +222,7 @@ struct ymm_vector<uint32_t> {
228
222
}
229
223
static zmm_t loadu (void const *mem)
230
224
{
231
- return _mm256_loadu_si256 ((__m256i*) mem);
225
+ return _mm256_loadu_si256 ((__m256i *) mem);
232
226
}
233
227
static zmm_t max (zmm_t x, zmm_t y)
234
228
{
@@ -264,16 +258,22 @@ struct ymm_vector<uint32_t> {
264
258
}
265
259
static type_t reducemax (zmm_t v)
266
260
{
267
- __m128i v128 = _mm_max_epu32 (_mm256_castsi256_si128 (v), _mm256_extracti128_si256 (v, 1 ));
268
- __m128i v64 = _mm_max_epu32 (v128, _mm_shuffle_epi32 (v128, _MM_SHUFFLE (1 , 0 , 3 , 2 )));
269
- __m128i v32 = _mm_max_epu32 (v64, _mm_shuffle_epi32 (v64, _MM_SHUFFLE (0 , 0 , 0 , 1 )));
261
+ __m128i v128 = _mm_max_epu32 (_mm256_castsi256_si128 (v),
262
+ _mm256_extracti128_si256 (v, 1 ));
263
+ __m128i v64 = _mm_max_epu32 (
264
+ v128, _mm_shuffle_epi32 (v128, _MM_SHUFFLE (1 , 0 , 3 , 2 )));
265
+ __m128i v32 = _mm_max_epu32 (
266
+ v64, _mm_shuffle_epi32 (v64, _MM_SHUFFLE (0 , 0 , 0 , 1 )));
270
267
return (type_t )_mm_cvtsi128_si32 (v32);
271
268
}
272
269
static type_t reducemin (zmm_t v)
273
270
{
274
- __m128i v128 = _mm_min_epu32 (_mm256_castsi256_si128 (v), _mm256_extracti128_si256 (v, 1 ));
275
- __m128i v64 = _mm_min_epu32 (v128, _mm_shuffle_epi32 (v128, _MM_SHUFFLE (1 , 0 , 3 , 2 )));
276
- __m128i v32 = _mm_min_epu32 (v64, _mm_shuffle_epi32 (v64, _MM_SHUFFLE (0 , 0 , 0 , 1 )));
271
+ __m128i v128 = _mm_min_epu32 (_mm256_castsi256_si128 (v),
272
+ _mm256_extracti128_si256 (v, 1 ));
273
+ __m128i v64 = _mm_min_epu32 (
274
+ v128, _mm_shuffle_epi32 (v128, _MM_SHUFFLE (1 , 0 , 3 , 2 )));
275
+ __m128i v32 = _mm_min_epu32 (
276
+ v64, _mm_shuffle_epi32 (v64, _MM_SHUFFLE (0 , 0 , 0 , 1 )));
277
277
return (type_t )_mm_cvtsi128_si32 (v32);
278
278
}
279
279
static zmm_t set1 (type_t v)
@@ -289,7 +289,7 @@ struct ymm_vector<uint32_t> {
289
289
}
290
290
static void storeu (void *mem, zmm_t x)
291
291
{
292
- _mm256_storeu_si256 ((__m256i*) mem, x);
292
+ _mm256_storeu_si256 ((__m256i *) mem, x);
293
293
}
294
294
};
295
295
template <>
@@ -313,14 +313,8 @@ struct ymm_vector<int32_t> {
313
313
return _mm256_set1_epi32 (type_max ());
314
314
} // TODO: this should broadcast bits as is?
315
315
316
- static zmmi_t seti (int v1,
317
- int v2,
318
- int v3,
319
- int v4,
320
- int v5,
321
- int v6,
322
- int v7,
323
- int v8)
316
+ static zmmi_t
317
+ seti (int v1, int v2, int v3, int v4, int v5, int v6, int v7, int v8)
324
318
{
325
319
return _mm256_set_epi32 (v1, v2, v3, v4, v5, v6, v7, v8);
326
320
}
@@ -357,7 +351,7 @@ struct ymm_vector<int32_t> {
357
351
}
358
352
static zmm_t loadu (void const *mem)
359
353
{
360
- return _mm256_loadu_si256 ((__m256i*) mem);
354
+ return _mm256_loadu_si256 ((__m256i *) mem);
361
355
}
362
356
static zmm_t max (zmm_t x, zmm_t y)
363
357
{
@@ -393,16 +387,22 @@ struct ymm_vector<int32_t> {
393
387
}
394
388
static type_t reducemax (zmm_t v)
395
389
{
396
- __m128i v128 = _mm_max_epi32 (_mm256_castsi256_si128 (v), _mm256_extracti128_si256 (v, 1 ));
397
- __m128i v64 = _mm_max_epi32 (v128, _mm_shuffle_epi32 (v128, _MM_SHUFFLE (1 , 0 , 3 , 2 )));
398
- __m128i v32 = _mm_max_epi32 (v64, _mm_shuffle_epi32 (v64, _MM_SHUFFLE (0 , 0 , 0 , 1 )));
390
+ __m128i v128 = _mm_max_epi32 (_mm256_castsi256_si128 (v),
391
+ _mm256_extracti128_si256 (v, 1 ));
392
+ __m128i v64 = _mm_max_epi32 (
393
+ v128, _mm_shuffle_epi32 (v128, _MM_SHUFFLE (1 , 0 , 3 , 2 )));
394
+ __m128i v32 = _mm_max_epi32 (
395
+ v64, _mm_shuffle_epi32 (v64, _MM_SHUFFLE (0 , 0 , 0 , 1 )));
399
396
return (type_t )_mm_cvtsi128_si32 (v32);
400
397
}
401
398
static type_t reducemin (zmm_t v)
402
399
{
403
- __m128i v128 = _mm_min_epi32 (_mm256_castsi256_si128 (v), _mm256_extracti128_si256 (v, 1 ));
404
- __m128i v64 = _mm_min_epi32 (v128, _mm_shuffle_epi32 (v128, _MM_SHUFFLE (1 , 0 , 3 , 2 )));
405
- __m128i v32 = _mm_min_epi32 (v64, _mm_shuffle_epi32 (v64, _MM_SHUFFLE (0 , 0 , 0 , 1 )));
400
+ __m128i v128 = _mm_min_epi32 (_mm256_castsi256_si128 (v),
401
+ _mm256_extracti128_si256 (v, 1 ));
402
+ __m128i v64 = _mm_min_epi32 (
403
+ v128, _mm_shuffle_epi32 (v128, _MM_SHUFFLE (1 , 0 , 3 , 2 )));
404
+ __m128i v32 = _mm_min_epi32 (
405
+ v64, _mm_shuffle_epi32 (v64, _MM_SHUFFLE (0 , 0 , 0 , 1 )));
406
406
return (type_t )_mm_cvtsi128_si32 (v32);
407
407
}
408
408
static zmm_t set1 (type_t v)
@@ -418,7 +418,7 @@ struct ymm_vector<int32_t> {
418
418
}
419
419
static void storeu (void *mem, zmm_t x)
420
420
{
421
- _mm256_storeu_si256 ((__m256i*) mem, x);
421
+ _mm256_storeu_si256 ((__m256i *) mem, x);
422
422
}
423
423
};
424
424
template <>
@@ -443,14 +443,8 @@ struct zmm_vector<int64_t> {
443
443
return _mm512_set1_epi64 (type_max ());
444
444
} // TODO: this should broadcast bits as is?
445
445
446
- static zmmi_t seti (int v1,
447
- int v2,
448
- int v3,
449
- int v4,
450
- int v5,
451
- int v6,
452
- int v7,
453
- int v8)
446
+ static zmmi_t
447
+ seti (int v1, int v2, int v3, int v4, int v5, int v6, int v7, int v8)
454
448
{
455
449
return _mm512_set_epi64 (v1, v2, v3, v4, v5, v6, v7, v8);
456
450
}
@@ -567,14 +561,8 @@ struct zmm_vector<uint64_t> {
567
561
return _mm512_set1_epi64 (type_max ());
568
562
}
569
563
570
- static zmmi_t seti (int v1,
571
- int v2,
572
- int v3,
573
- int v4,
574
- int v5,
575
- int v6,
576
- int v7,
577
- int v8)
564
+ static zmmi_t
565
+ seti (int v1, int v2, int v3, int v4, int v5, int v6, int v7, int v8)
578
566
{
579
567
return _mm512_set_epi64 (v1, v2, v3, v4, v5, v6, v7, v8);
580
568
}
@@ -679,14 +667,8 @@ struct zmm_vector<double> {
679
667
return _mm512_set1_pd (type_max ());
680
668
}
681
669
682
- static zmmi_t seti (int v1,
683
- int v2,
684
- int v3,
685
- int v4,
686
- int v5,
687
- int v6,
688
- int v7,
689
- int v8)
670
+ static zmmi_t
671
+ seti (int v1, int v2, int v3, int v4, int v5, int v6, int v7, int v8)
690
672
{
691
673
return _mm512_set_epi64 (v1, v2, v3, v4, v5, v6, v7, v8);
692
674
}
@@ -793,16 +775,12 @@ X86_SIMD_SORT_INLINE zmm_t sort_zmm_64bit(zmm_t zmm)
793
775
zmm = cmp_merge<vtype>(
794
776
zmm, vtype::template shuffle<SHUFFLE_MASK (1 , 1 , 1 , 1 )>(zmm), 0xAA );
795
777
zmm = cmp_merge<vtype>(
796
- zmm,
797
- vtype::permutexvar (vtype::seti (NETWORK_64BIT_1), zmm),
798
- 0xCC );
778
+ zmm, vtype::permutexvar (vtype::seti (NETWORK_64BIT_1), zmm), 0xCC );
799
779
zmm = cmp_merge<vtype>(
800
780
zmm, vtype::template shuffle<SHUFFLE_MASK (1 , 1 , 1 , 1 )>(zmm), 0xAA );
801
781
zmm = cmp_merge<vtype>(zmm, vtype::permutexvar (rev_index, zmm), 0xF0 );
802
782
zmm = cmp_merge<vtype>(
803
- zmm,
804
- vtype::permutexvar (vtype::seti (NETWORK_64BIT_3), zmm),
805
- 0xCC );
783
+ zmm, vtype::permutexvar (vtype::seti (NETWORK_64BIT_3), zmm), 0xCC );
806
784
zmm = cmp_merge<vtype>(
807
785
zmm, vtype::template shuffle<SHUFFLE_MASK (1 , 1 , 1 , 1 )>(zmm), 0xAA );
808
786
return zmm;
0 commit comments