numpy
diff --git a/src/avx2-32bit-half.hpp b/src/avx2-32bit-half.hpp
Lines changed: 18 additions & 36 deletions
diff --git a/src/avx2-32bit-qsort.hpp b/src/avx2-32bit-qsort.hpp
Lines changed: 6 additions & 51 deletions
diff --git a/src/avx2-64bit-qsort.hpp b/src/avx2-64bit-qsort.hpp
Lines changed: 15 additions & 29 deletions
@@ -9,36 +9,6 @@
 
 #include "avx2-emu-funcs.hpp"
 
-/*
- * Constants used in sorting 8 elements in a ymm registers. Based on Bitonic
- * sorting network (see
- * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg)
- */
-
-// ymm                  7, 6, 5, 4, 3, 2, 1, 0
-#define NETWORK_32BIT_AVX2_1 4, 5, 6, 7, 0, 1, 2, 3
-#define NETWORK_32BIT_AVX2_2 0, 1, 2, 3, 4, 5, 6, 7
-#define NETWORK_32BIT_AVX2_3 5, 4, 7, 6, 1, 0, 3, 2
-#define NETWORK_32BIT_AVX2_4 3, 2, 1, 0, 7, 6, 5, 4
-
-/*
- * Assumes ymm is random and performs a full sorting network defined in
- * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
- */
-template <typename vtype, typename reg_t = typename vtype::reg_t>
-X86_SIMD_SORT_INLINE reg_t sort_ymm_32bit_half(reg_t ymm)
-{
-    using swizzle = typename vtype::swizzle_ops;
-
-    const typename vtype::opmask_t oxAA = vtype::seti(-1, 0, -1, 0);
-    const typename vtype::opmask_t oxCC = vtype::seti(-1, -1, 0, 0);
-
-    ymm = cmp_merge<vtype>(ymm, swizzle::template swap_n<vtype, 2>(ymm), oxAA);
-    ymm = cmp_merge<vtype>(ymm, vtype::reverse(ymm), oxCC);
-    ymm = cmp_merge<vtype>(ymm, swizzle::template swap_n<vtype, 2>(ymm), oxAA);
-    return ymm;
-}
-
 struct avx2_32bit_half_swizzle_ops;
 
 template <>
@@ -74,6 +44,10 @@ struct avx2_half_vector<int32_t> {
         auto mask = ((0x1ull << num_to_read) - 0x1ull);
         return convert_int_to_avx2_mask_half(mask);
     }
+    static opmask_t convert_int_to_mask(uint64_t intMask)
+    {
+        return convert_int_to_avx2_mask_half(intMask);
+    }
     static regi_t seti(int v1, int v2, int v3, int v4)
     {
         return _mm_set_epi32(v1, v2, v3, v4);
@@ -155,7 +129,7 @@ struct avx2_half_vector<int32_t> {
     }
     static reg_t reverse(reg_t ymm)
     {
-        const __m128i rev_index = _mm_set_epi32(0, 1, 2, 3);
+        const __m128i rev_index = _mm_set_epi32(NETWORK_REVERSE_4LANES);
         return permutexvar(rev_index, ymm);
     }
     static type_t reducemax(reg_t v)
@@ -181,7 +155,7 @@ struct avx2_half_vector<int32_t> {
     }
     static reg_t sort_vec(reg_t x)
     {
-        return sort_ymm_32bit_half<avx2_half_vector<type_t>>(x);
+        return sort_reg_4lanes<avx2_half_vector<type_t>>(x);
     }
     static reg_t cast_from(__m128i v)
     {
@@ -237,6 +211,10 @@ struct avx2_half_vector<uint32_t> {
         auto mask = ((0x1ull << num_to_read) - 0x1ull);
         return convert_int_to_avx2_mask_half(mask);
     }
+    static opmask_t convert_int_to_mask(uint64_t intMask)
+    {
+        return convert_int_to_avx2_mask_half(intMask);
+    }
     static regi_t seti(int v1, int v2, int v3, int v4)
     {
         return _mm_set_epi32(v1, v2, v3, v4);
@@ -309,7 +287,7 @@ struct avx2_half_vector<uint32_t> {
     }
     static reg_t reverse(reg_t ymm)
     {
-        const __m128i rev_index = _mm_set_epi32(0, 1, 2, 3);
+        const __m128i rev_index = _mm_set_epi32(NETWORK_REVERSE_4LANES);
         return permutexvar(rev_index, ymm);
     }
     static type_t reducemax(reg_t v)
@@ -335,7 +313,7 @@ struct avx2_half_vector<uint32_t> {
     }
     static reg_t sort_vec(reg_t x)
     {
-        return sort_ymm_32bit_half<avx2_half_vector<type_t>>(x);
+        return sort_reg_4lanes<avx2_half_vector<type_t>>(x);
     }
     static reg_t cast_from(__m128i v)
     {
@@ -411,6 +389,10 @@ struct avx2_half_vector<float> {
         auto mask = ((0x1ull << num_to_read) - 0x1ull);
         return convert_int_to_avx2_mask_half(mask);
     }
+    static opmask_t convert_int_to_mask(uint64_t intMask)
+    {
+        return convert_int_to_avx2_mask_half(intMask);
+    }
     static int32_t convert_mask_to_int(opmask_t mask)
     {
         return convert_avx2_mask_to_int_half(mask);
@@ -478,7 +460,7 @@ struct avx2_half_vector<float> {
     }
     static reg_t reverse(reg_t ymm)
     {
-        const __m128i rev_index = _mm_set_epi32(0, 1, 2, 3);
+        const __m128i rev_index = _mm_set_epi32(NETWORK_REVERSE_4LANES);
         return permutexvar(rev_index, ymm);
     }
     static type_t reducemax(reg_t v)
@@ -504,7 +486,7 @@ struct avx2_half_vector<float> {
     }
     static reg_t sort_vec(reg_t x)
     {
-        return sort_ymm_32bit_half<avx2_half_vector<type_t>>(x);
+        return sort_reg_4lanes<avx2_half_vector<type_t>>(x);
     }
     static reg_t cast_from(__m128i v)
     {
 
@@ -9,51 +9,6 @@
 
 #include "avx2-emu-funcs.hpp"
 
-/*
- * Constants used in sorting 8 elements in a ymm registers. Based on Bitonic
- * sorting network (see
- * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg)
- */
-
-// ymm                  7, 6, 5, 4, 3, 2, 1, 0
-#define NETWORK_32BIT_AVX2_1 4, 5, 6, 7, 0, 1, 2, 3
-#define NETWORK_32BIT_AVX2_2 0, 1, 2, 3, 4, 5, 6, 7
-#define NETWORK_32BIT_AVX2_3 5, 4, 7, 6, 1, 0, 3, 2
-#define NETWORK_32BIT_AVX2_4 3, 2, 1, 0, 7, 6, 5, 4
-
-/*
- * Assumes ymm is random and performs a full sorting network defined in
- * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
- */
-template <typename vtype, typename reg_t = typename vtype::reg_t>
-X86_SIMD_SORT_INLINE reg_t sort_ymm_32bit(reg_t ymm)
-{
-    const typename vtype::opmask_t oxAA = _mm256_set_epi32(
-            0xFFFFFFFF, 0, 0xFFFFFFFF, 0, 0xFFFFFFFF, 0, 0xFFFFFFFF, 0);
-    const typename vtype::opmask_t oxCC = _mm256_set_epi32(
-            0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0);
-    const typename vtype::opmask_t oxF0 = _mm256_set_epi32(
-            0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0, 0);
-
-    const typename vtype::ymmi_t rev_index = vtype::seti(NETWORK_32BIT_AVX2_2);
-    ymm = cmp_merge<vtype>(
-            ymm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(ymm), oxAA);
-    ymm = cmp_merge<vtype>(
-            ymm,
-            vtype::permutexvar(vtype::seti(NETWORK_32BIT_AVX2_1), ymm),
-            oxCC);
-    ymm = cmp_merge<vtype>(
-            ymm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(ymm), oxAA);
-    ymm = cmp_merge<vtype>(ymm, vtype::permutexvar(rev_index, ymm), oxF0);
-    ymm = cmp_merge<vtype>(
-            ymm,
-            vtype::permutexvar(vtype::seti(NETWORK_32BIT_AVX2_3), ymm),
-            oxCC);
-    ymm = cmp_merge<vtype>(
-            ymm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(ymm), oxAA);
-    return ymm;
-}
-
 struct avx2_32bit_swizzle_ops;
 
 template <>
@@ -180,7 +135,7 @@ struct avx2_vector<int32_t> {
     }
     static reg_t reverse(reg_t ymm)
     {
-        const __m256i rev_index = _mm256_set_epi32(NETWORK_32BIT_AVX2_2);
+        const __m256i rev_index = _mm256_set_epi32(NETWORK_REVERSE_8LANES);
         return permutexvar(rev_index, ymm);
     }
     static type_t reducemax(reg_t v)
@@ -206,7 +161,7 @@ struct avx2_vector<int32_t> {
     }
     static reg_t sort_vec(reg_t x)
     {
-        return sort_ymm_32bit<avx2_vector<type_t>>(x);
+        return sort_reg_8lanes<avx2_vector<type_t>>(x);
     }
     static reg_t cast_from(__m256i v)
     {
@@ -342,7 +297,7 @@ struct avx2_vector<uint32_t> {
     }
     static reg_t reverse(reg_t ymm)
     {
-        const __m256i rev_index = _mm256_set_epi32(NETWORK_32BIT_AVX2_2);
+        const __m256i rev_index = _mm256_set_epi32(NETWORK_REVERSE_8LANES);
         return permutexvar(rev_index, ymm);
     }
     static type_t reducemax(reg_t v)
@@ -368,7 +323,7 @@ struct avx2_vector<uint32_t> {
     }
     static reg_t sort_vec(reg_t x)
     {
-        return sort_ymm_32bit<avx2_vector<type_t>>(x);
+        return sort_reg_8lanes<avx2_vector<type_t>>(x);
     }
     static reg_t cast_from(__m256i v)
     {
@@ -520,7 +475,7 @@ struct avx2_vector<float> {
     }
     static reg_t reverse(reg_t ymm)
     {
-        const __m256i rev_index = _mm256_set_epi32(NETWORK_32BIT_AVX2_2);
+        const __m256i rev_index = _mm256_set_epi32(NETWORK_REVERSE_8LANES);
         return permutexvar(rev_index, ymm);
     }
     static type_t reducemax(reg_t v)
@@ -547,7 +502,7 @@ struct avx2_vector<float> {
     }
     static reg_t sort_vec(reg_t x)
     {
-        return sort_ymm_32bit<avx2_vector<type_t>>(x);
+        return sort_reg_8lanes<avx2_vector<type_t>>(x);
     }
     static reg_t cast_from(__m256i v)
     {
 
@@ -10,32 +10,6 @@
 
 #include "avx2-emu-funcs.hpp"
 
-/*
- * Assumes ymm is random and performs a full sorting network defined in
- * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
- */
-template <typename vtype, typename reg_t = typename vtype::reg_t>
-X86_SIMD_SORT_INLINE reg_t sort_ymm_64bit(reg_t ymm)
-{
-    const typename vtype::opmask_t oxAA
-            = _mm256_set_epi64x(0xFFFFFFFFFFFFFFFF, 0, 0xFFFFFFFFFFFFFFFF, 0);
-    const typename vtype::opmask_t oxCC
-            = _mm256_set_epi64x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0, 0);
-    ymm = cmp_merge<vtype>(
-            ymm,
-            vtype::template permutexvar<SHUFFLE_MASK(2, 3, 0, 1)>(ymm),
-            oxAA);
-    ymm = cmp_merge<vtype>(
-            ymm,
-            vtype::template permutexvar<SHUFFLE_MASK(0, 1, 2, 3)>(ymm),
-            oxCC);
-    ymm = cmp_merge<vtype>(
-            ymm,
-            vtype::template permutexvar<SHUFFLE_MASK(2, 3, 0, 1)>(ymm),
-            oxAA);
-    return ymm;
-}
-
 struct avx2_64bit_swizzle_ops;
 
 template <>
@@ -81,6 +55,10 @@ struct avx2_vector<int64_t> {
         auto mask = ((0x1ull << num_to_read) - 0x1ull);
         return convert_int_to_avx2_mask_64bit(mask);
     }
+    static opmask_t convert_int_to_mask(uint64_t intMask)
+    {
+        return convert_int_to_avx2_mask_64bit(intMask);
+    }
     static ymmi_t seti(int64_t v1, int64_t v2, int64_t v3, int64_t v4)
     {
         return _mm256_set_epi64x(v1, v2, v3, v4);
@@ -207,7 +185,7 @@ struct avx2_vector<int64_t> {
     }
     static reg_t sort_vec(reg_t x)
     {
-        return sort_ymm_64bit<avx2_vector<type_t>>(x);
+        return sort_reg_4lanes<avx2_vector<type_t>>(x);
     }
     static reg_t cast_from(__m256i v)
     {
@@ -265,6 +243,10 @@ struct avx2_vector<uint64_t> {
         auto mask = ((0x1ull << num_to_read) - 0x1ull);
         return convert_int_to_avx2_mask_64bit(mask);
     }
+    static opmask_t convert_int_to_mask(uint64_t intMask)
+    {
+        return convert_int_to_avx2_mask_64bit(intMask);
+    }
     static ymmi_t seti(int64_t v1, int64_t v2, int64_t v3, int64_t v4)
     {
         return _mm256_set_epi64x(v1, v2, v3, v4);
@@ -389,7 +371,7 @@ struct avx2_vector<uint64_t> {
     }
     static reg_t sort_vec(reg_t x)
     {
-        return sort_ymm_64bit<avx2_vector<type_t>>(x);
+        return sort_reg_4lanes<avx2_vector<type_t>>(x);
     }
     static reg_t cast_from(__m256i v)
     {
@@ -460,6 +442,10 @@ struct avx2_vector<double> {
         auto mask = ((0x1ull << num_to_read) - 0x1ull);
         return convert_int_to_avx2_mask_64bit(mask);
     }
+    static opmask_t convert_int_to_mask(uint64_t intMask)
+    {
+        return convert_int_to_avx2_mask_64bit(intMask);
+    }
     static int32_t convert_mask_to_int(opmask_t mask)
     {
         return convert_avx2_mask_to_int_64bit(mask);
@@ -593,7 +579,7 @@ struct avx2_vector<double> {
     }
     static reg_t sort_vec(reg_t x)
     {
-        return sort_ymm_64bit<avx2_vector<type_t>>(x);
+        return sort_reg_4lanes<avx2_vector<type_t>>(x);
     }
     static reg_t cast_from(__m256i v)
     {
Original file line number	Diff line number	Diff line change
`@@ -9,36 +9,6 @@`
`9`	`9`
`10`	`10`	`#include "avx2-emu-funcs.hpp"`
`11`	`11`
`12`		`-/*`
`13`		`- * Constants used in sorting 8 elements in a ymm registers. Based on Bitonic`
`14`		`- * sorting network (see`
`15`		`- * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg)`
`16`		`- */`
`17`		`-`
`18`		`-// ymm 7, 6, 5, 4, 3, 2, 1, 0`
`19`		`-#define NETWORK_32BIT_AVX2_1 4, 5, 6, 7, 0, 1, 2, 3`
`20`		`-#define NETWORK_32BIT_AVX2_2 0, 1, 2, 3, 4, 5, 6, 7`
`21`		`-#define NETWORK_32BIT_AVX2_3 5, 4, 7, 6, 1, 0, 3, 2`
`22`		`-#define NETWORK_32BIT_AVX2_4 3, 2, 1, 0, 7, 6, 5, 4`
`23`		`-`
`24`		`-/*`
`25`		`- * Assumes ymm is random and performs a full sorting network defined in`
`26`		`- * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg`
`27`		`- */`
`28`		`-template <typename vtype, typename reg_t = typename vtype::reg_t>`
`29`		`-X86_SIMD_SORT_INLINE reg_t sort_ymm_32bit_half(reg_t ymm)`
`30`		`-{`
`31`		`- using swizzle = typename vtype::swizzle_ops;`
`32`		`-`
`33`		`- const typename vtype::opmask_t oxAA = vtype::seti(-1, 0, -1, 0);`
`34`		`- const typename vtype::opmask_t oxCC = vtype::seti(-1, -1, 0, 0);`
`35`		`-`
`36`		`- ymm = cmp_merge<vtype>(ymm, swizzle::template swap_n<vtype, 2>(ymm), oxAA);`
`37`		`- ymm = cmp_merge<vtype>(ymm, vtype::reverse(ymm), oxCC);`
`38`		`- ymm = cmp_merge<vtype>(ymm, swizzle::template swap_n<vtype, 2>(ymm), oxAA);`
`39`		`- return ymm;`
`40`		`-}`
`41`		`-`
`42`	`12`	`struct avx2_32bit_half_swizzle_ops;`
`43`	`13`
`44`	`14`	`template <>`
`@@ -74,6 +44,10 @@ struct avx2_half_vector<int32_t> {`
`74`	`44`	`auto mask = ((0x1ull << num_to_read) - 0x1ull);`
`75`	`45`	`return convert_int_to_avx2_mask_half(mask);`
`76`	`46`	`}`
	`47`	`+ static opmask_t convert_int_to_mask(uint64_t intMask)`
	`48`	`+ {`
	`49`	`+ return convert_int_to_avx2_mask_half(intMask);`
	`50`	`+ }`
`77`	`51`	`static regi_t seti(int v1, int v2, int v3, int v4)`
`78`	`52`	`{`
`79`	`53`	`return _mm_set_epi32(v1, v2, v3, v4);`
`@@ -155,7 +129,7 @@ struct avx2_half_vector<int32_t> {`
`155`	`129`	`}`
`156`	`130`	`static reg_t reverse(reg_t ymm)`
`157`	`131`	`{`
`158`		`- const __m128i rev_index = _mm_set_epi32(0, 1, 2, 3);`
	`132`	`+ const __m128i rev_index = _mm_set_epi32(NETWORK_REVERSE_4LANES);`
`159`	`133`	`return permutexvar(rev_index, ymm);`
`160`	`134`	`}`
`161`	`135`	`static type_t reducemax(reg_t v)`
`@@ -181,7 +155,7 @@ struct avx2_half_vector<int32_t> {`
`181`	`155`	`}`
`182`	`156`	`static reg_t sort_vec(reg_t x)`
`183`	`157`	`{`
`184`		`- return sort_ymm_32bit_half<avx2_half_vector<type_t>>(x);`
	`158`	`+ return sort_reg_4lanes<avx2_half_vector<type_t>>(x);`
`185`	`159`	`}`
`186`	`160`	`static reg_t cast_from(__m128i v)`
`187`	`161`	`{`
`@@ -237,6 +211,10 @@ struct avx2_half_vector<uint32_t> {`
`237`	`211`	`auto mask = ((0x1ull << num_to_read) - 0x1ull);`
`238`	`212`	`return convert_int_to_avx2_mask_half(mask);`
`239`	`213`	`}`
	`214`	`+ static opmask_t convert_int_to_mask(uint64_t intMask)`
	`215`	`+ {`
	`216`	`+ return convert_int_to_avx2_mask_half(intMask);`
	`217`	`+ }`
`240`	`218`	`static regi_t seti(int v1, int v2, int v3, int v4)`
`241`	`219`	`{`
`242`	`220`	`return _mm_set_epi32(v1, v2, v3, v4);`
`@@ -309,7 +287,7 @@ struct avx2_half_vector<uint32_t> {`
`309`	`287`	`}`
`310`	`288`	`static reg_t reverse(reg_t ymm)`
`311`	`289`	`{`
`312`		`- const __m128i rev_index = _mm_set_epi32(0, 1, 2, 3);`
	`290`	`+ const __m128i rev_index = _mm_set_epi32(NETWORK_REVERSE_4LANES);`
`313`	`291`	`return permutexvar(rev_index, ymm);`
`314`	`292`	`}`
`315`	`293`	`static type_t reducemax(reg_t v)`
`@@ -335,7 +313,7 @@ struct avx2_half_vector<uint32_t> {`
`335`	`313`	`}`
`336`	`314`	`static reg_t sort_vec(reg_t x)`
`337`	`315`	`{`
`338`		`- return sort_ymm_32bit_half<avx2_half_vector<type_t>>(x);`
	`316`	`+ return sort_reg_4lanes<avx2_half_vector<type_t>>(x);`
`339`	`317`	`}`
`340`	`318`	`static reg_t cast_from(__m128i v)`
`341`	`319`	`{`
`@@ -411,6 +389,10 @@ struct avx2_half_vector<float> {`
`411`	`389`	`auto mask = ((0x1ull << num_to_read) - 0x1ull);`
`412`	`390`	`return convert_int_to_avx2_mask_half(mask);`
`413`	`391`	`}`
	`392`	`+ static opmask_t convert_int_to_mask(uint64_t intMask)`
	`393`	`+ {`
	`394`	`+ return convert_int_to_avx2_mask_half(intMask);`
	`395`	`+ }`
`414`	`396`	`static int32_t convert_mask_to_int(opmask_t mask)`
`415`	`397`	`{`
`416`	`398`	`return convert_avx2_mask_to_int_half(mask);`
`@@ -478,7 +460,7 @@ struct avx2_half_vector<float> {`
`478`	`460`	`}`
`479`	`461`	`static reg_t reverse(reg_t ymm)`
`480`	`462`	`{`
`481`		`- const __m128i rev_index = _mm_set_epi32(0, 1, 2, 3);`
	`463`	`+ const __m128i rev_index = _mm_set_epi32(NETWORK_REVERSE_4LANES);`
`482`	`464`	`return permutexvar(rev_index, ymm);`
`483`	`465`	`}`
`484`	`466`	`static type_t reducemax(reg_t v)`
`@@ -504,7 +486,7 @@ struct avx2_half_vector<float> {`
`504`	`486`	`}`
`505`	`487`	`static reg_t sort_vec(reg_t x)`
`506`	`488`	`{`
`507`		`- return sort_ymm_32bit_half<avx2_half_vector<type_t>>(x);`
	`489`	`+ return sort_reg_4lanes<avx2_half_vector<type_t>>(x);`
`508`	`490`	`}`
`509`	`491`	`static reg_t cast_from(__m128i v)`
`510`	`492`	`{`
Original file line number	Diff line number	Diff line change
`@@ -9,51 +9,6 @@`
`9`	`9`
`10`	`10`	`#include "avx2-emu-funcs.hpp"`
`11`	`11`
`12`		`-/*`
`13`		`- * Constants used in sorting 8 elements in a ymm registers. Based on Bitonic`
`14`		`- * sorting network (see`
`15`		`- * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg)`
`16`		`- */`
`17`		`-`
`18`		`-// ymm 7, 6, 5, 4, 3, 2, 1, 0`
`19`		`-#define NETWORK_32BIT_AVX2_1 4, 5, 6, 7, 0, 1, 2, 3`
`20`		`-#define NETWORK_32BIT_AVX2_2 0, 1, 2, 3, 4, 5, 6, 7`
`21`		`-#define NETWORK_32BIT_AVX2_3 5, 4, 7, 6, 1, 0, 3, 2`
`22`		`-#define NETWORK_32BIT_AVX2_4 3, 2, 1, 0, 7, 6, 5, 4`
`23`		`-`
`24`		`-/*`
`25`		`- * Assumes ymm is random and performs a full sorting network defined in`
`26`		`- * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg`
`27`		`- */`
`28`		`-template <typename vtype, typename reg_t = typename vtype::reg_t>`
`29`		`-X86_SIMD_SORT_INLINE reg_t sort_ymm_32bit(reg_t ymm)`
`30`		`-{`
`31`		`- const typename vtype::opmask_t oxAA = _mm256_set_epi32(`
`32`		`- 0xFFFFFFFF, 0, 0xFFFFFFFF, 0, 0xFFFFFFFF, 0, 0xFFFFFFFF, 0);`
`33`		`- const typename vtype::opmask_t oxCC = _mm256_set_epi32(`
`34`		`- 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0);`
`35`		`- const typename vtype::opmask_t oxF0 = _mm256_set_epi32(`
`36`		`- 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0, 0, 0, 0);`
`37`		`-`
`38`		`- const typename vtype::ymmi_t rev_index = vtype::seti(NETWORK_32BIT_AVX2_2);`
`39`		`- ymm = cmp_merge<vtype>(`
`40`		`- ymm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(ymm), oxAA);`
`41`		`- ymm = cmp_merge<vtype>(`
`42`		`- ymm,`
`43`		`- vtype::permutexvar(vtype::seti(NETWORK_32BIT_AVX2_1), ymm),`
`44`		`- oxCC);`
`45`		`- ymm = cmp_merge<vtype>(`
`46`		`- ymm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(ymm), oxAA);`
`47`		`- ymm = cmp_merge<vtype>(ymm, vtype::permutexvar(rev_index, ymm), oxF0);`
`48`		`- ymm = cmp_merge<vtype>(`
`49`		`- ymm,`
`50`		`- vtype::permutexvar(vtype::seti(NETWORK_32BIT_AVX2_3), ymm),`
`51`		`- oxCC);`
`52`		`- ymm = cmp_merge<vtype>(`
`53`		`- ymm, vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(ymm), oxAA);`
`54`		`- return ymm;`
`55`		`-}`
`56`		`-`
`57`	`12`	`struct avx2_32bit_swizzle_ops;`
`58`	`13`
`59`	`14`	`template <>`
`@@ -180,7 +135,7 @@ struct avx2_vector<int32_t> {`
`180`	`135`	`}`
`181`	`136`	`static reg_t reverse(reg_t ymm)`
`182`	`137`	`{`
`183`		`- const __m256i rev_index = _mm256_set_epi32(NETWORK_32BIT_AVX2_2);`
	`138`	`+ const __m256i rev_index = _mm256_set_epi32(NETWORK_REVERSE_8LANES);`
`184`	`139`	`return permutexvar(rev_index, ymm);`
`185`	`140`	`}`
`186`	`141`	`static type_t reducemax(reg_t v)`
`@@ -206,7 +161,7 @@ struct avx2_vector<int32_t> {`
`206`	`161`	`}`
`207`	`162`	`static reg_t sort_vec(reg_t x)`
`208`	`163`	`{`
`209`		`- return sort_ymm_32bit<avx2_vector<type_t>>(x);`
	`164`	`+ return sort_reg_8lanes<avx2_vector<type_t>>(x);`
`210`	`165`	`}`
`211`	`166`	`static reg_t cast_from(__m256i v)`
`212`	`167`	`{`
`@@ -342,7 +297,7 @@ struct avx2_vector<uint32_t> {`
`342`	`297`	`}`
`343`	`298`	`static reg_t reverse(reg_t ymm)`
`344`	`299`	`{`
`345`		`- const __m256i rev_index = _mm256_set_epi32(NETWORK_32BIT_AVX2_2);`
	`300`	`+ const __m256i rev_index = _mm256_set_epi32(NETWORK_REVERSE_8LANES);`
`346`	`301`	`return permutexvar(rev_index, ymm);`
`347`	`302`	`}`
`348`	`303`	`static type_t reducemax(reg_t v)`
`@@ -368,7 +323,7 @@ struct avx2_vector<uint32_t> {`
`368`	`323`	`}`
`369`	`324`	`static reg_t sort_vec(reg_t x)`
`370`	`325`	`{`
`371`		`- return sort_ymm_32bit<avx2_vector<type_t>>(x);`
	`326`	`+ return sort_reg_8lanes<avx2_vector<type_t>>(x);`
`372`	`327`	`}`
`373`	`328`	`static reg_t cast_from(__m256i v)`
`374`	`329`	`{`
`@@ -520,7 +475,7 @@ struct avx2_vector<float> {`
`520`	`475`	`}`
`521`	`476`	`static reg_t reverse(reg_t ymm)`
`522`	`477`	`{`
`523`		`- const __m256i rev_index = _mm256_set_epi32(NETWORK_32BIT_AVX2_2);`
	`478`	`+ const __m256i rev_index = _mm256_set_epi32(NETWORK_REVERSE_8LANES);`
`524`	`479`	`return permutexvar(rev_index, ymm);`
`525`	`480`	`}`
`526`	`481`	`static type_t reducemax(reg_t v)`
`@@ -547,7 +502,7 @@ struct avx2_vector<float> {`
`547`	`502`	`}`
`548`	`503`	`static reg_t sort_vec(reg_t x)`
`549`	`504`	`{`
`550`		`- return sort_ymm_32bit<avx2_vector<type_t>>(x);`
	`505`	`+ return sort_reg_8lanes<avx2_vector<type_t>>(x);`
`551`	`506`	`}`
`552`	`507`	`static reg_t cast_from(__m256i v)`
`553`	`508`	`{`
Original file line number	Diff line number	Diff line change
`@@ -10,32 +10,6 @@`
`10`	`10`
`11`	`11`	`#include "avx2-emu-funcs.hpp"`
`12`	`12`
`13`		`-/*`
`14`		`- * Assumes ymm is random and performs a full sorting network defined in`
`15`		`- * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg`
`16`		`- */`
`17`		`-template <typename vtype, typename reg_t = typename vtype::reg_t>`
`18`		`-X86_SIMD_SORT_INLINE reg_t sort_ymm_64bit(reg_t ymm)`
`19`		`-{`
`20`		`- const typename vtype::opmask_t oxAA`
`21`		`- = _mm256_set_epi64x(0xFFFFFFFFFFFFFFFF, 0, 0xFFFFFFFFFFFFFFFF, 0);`
`22`		`- const typename vtype::opmask_t oxCC`
`23`		`- = _mm256_set_epi64x(0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0, 0);`
`24`		`- ymm = cmp_merge<vtype>(`
`25`		`- ymm,`
`26`		`- vtype::template permutexvar<SHUFFLE_MASK(2, 3, 0, 1)>(ymm),`
`27`		`- oxAA);`
`28`		`- ymm = cmp_merge<vtype>(`
`29`		`- ymm,`
`30`		`- vtype::template permutexvar<SHUFFLE_MASK(0, 1, 2, 3)>(ymm),`
`31`		`- oxCC);`
`32`		`- ymm = cmp_merge<vtype>(`
`33`		`- ymm,`
`34`		`- vtype::template permutexvar<SHUFFLE_MASK(2, 3, 0, 1)>(ymm),`
`35`		`- oxAA);`
`36`		`- return ymm;`
`37`		`-}`
`38`		`-`
`39`	`13`	`struct avx2_64bit_swizzle_ops;`
`40`	`14`
`41`	`15`	`template <>`
`@@ -81,6 +55,10 @@ struct avx2_vector<int64_t> {`
`81`	`55`	`auto mask = ((0x1ull << num_to_read) - 0x1ull);`
`82`	`56`	`return convert_int_to_avx2_mask_64bit(mask);`
`83`	`57`	`}`
	`58`	`+ static opmask_t convert_int_to_mask(uint64_t intMask)`
	`59`	`+ {`
	`60`	`+ return convert_int_to_avx2_mask_64bit(intMask);`
	`61`	`+ }`
`84`	`62`	`static ymmi_t seti(int64_t v1, int64_t v2, int64_t v3, int64_t v4)`
`85`	`63`	`{`
`86`	`64`	`return _mm256_set_epi64x(v1, v2, v3, v4);`
`@@ -207,7 +185,7 @@ struct avx2_vector<int64_t> {`
`207`	`185`	`}`
`208`	`186`	`static reg_t sort_vec(reg_t x)`
`209`	`187`	`{`
`210`		`- return sort_ymm_64bit<avx2_vector<type_t>>(x);`
	`188`	`+ return sort_reg_4lanes<avx2_vector<type_t>>(x);`
`211`	`189`	`}`
`212`	`190`	`static reg_t cast_from(__m256i v)`
`213`	`191`	`{`
`@@ -265,6 +243,10 @@ struct avx2_vector<uint64_t> {`
`265`	`243`	`auto mask = ((0x1ull << num_to_read) - 0x1ull);`
`266`	`244`	`return convert_int_to_avx2_mask_64bit(mask);`
`267`	`245`	`}`
	`246`	`+ static opmask_t convert_int_to_mask(uint64_t intMask)`
	`247`	`+ {`
	`248`	`+ return convert_int_to_avx2_mask_64bit(intMask);`
	`249`	`+ }`
`268`	`250`	`static ymmi_t seti(int64_t v1, int64_t v2, int64_t v3, int64_t v4)`
`269`	`251`	`{`
`270`	`252`	`return _mm256_set_epi64x(v1, v2, v3, v4);`
`@@ -389,7 +371,7 @@ struct avx2_vector<uint64_t> {`
`389`	`371`	`}`
`390`	`372`	`static reg_t sort_vec(reg_t x)`
`391`	`373`	`{`
`392`		`- return sort_ymm_64bit<avx2_vector<type_t>>(x);`
	`374`	`+ return sort_reg_4lanes<avx2_vector<type_t>>(x);`
`393`	`375`	`}`
`394`	`376`	`static reg_t cast_from(__m256i v)`
`395`	`377`	`{`
`@@ -460,6 +442,10 @@ struct avx2_vector<double> {`
`460`	`442`	`auto mask = ((0x1ull << num_to_read) - 0x1ull);`
`461`	`443`	`return convert_int_to_avx2_mask_64bit(mask);`
`462`	`444`	`}`
	`445`	`+ static opmask_t convert_int_to_mask(uint64_t intMask)`
	`446`	`+ {`
	`447`	`+ return convert_int_to_avx2_mask_64bit(intMask);`
	`448`	`+ }`
`463`	`449`	`static int32_t convert_mask_to_int(opmask_t mask)`
`464`	`450`	`{`
`465`	`451`	`return convert_avx2_mask_to_int_64bit(mask);`
`@@ -593,7 +579,7 @@ struct avx2_vector<double> {`
`593`	`579`	`}`
`594`	`580`	`static reg_t sort_vec(reg_t x)`
`595`	`581`	`{`
`596`		`- return sort_ymm_64bit<avx2_vector<type_t>>(x);`
	`582`	`+ return sort_reg_4lanes<avx2_vector<type_t>>(x);`
`597`	`583`	`}`
`598`	`584`	`static reg_t cast_from(__m256i v)`
`599`	`585`	`{`