From 1d249f5837a2efa204583463bb425883245d0c8e Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Mon, 26 Feb 2024 14:46:27 -0800
Subject: [PATCH 1/5] Use smart pivot in key-value

---
 src/avx512-64bit-common.h         | 86 +++++++++++++++++++++++++++++++
 src/avx512-64bit-keyvaluesort.hpp |  5 +-
 2 files changed, 90 insertions(+), 1 deletion(-)
diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h
index 1cd4ca1c..c529fae6 100644
--- a/src/avx512-64bit-common.h
+++ b/src/avx512-64bit-common.h
@@ -25,6 +25,7 @@ template <typename vtype, typename reg_t>
 X86_SIMD_SORT_INLINE reg_t sort_zmm_64bit(reg_t zmm);
 
 struct avx512_64bit_swizzle_ops;
+struct avx512_ymm_64bit_swizzle_ops;
 
 template <>
 struct ymm_vector<float> {
@@ -34,6 +35,7 @@ struct ymm_vector<float> {
     using opmask_t = __mmask8;
     static const uint8_t numlanes = 8;
     static constexpr simd_type vec_type = simd_type::AVX512;
+    using swizzle_ops = avx512_ymm_64bit_swizzle_ops;
 
     static type_t type_max()
     {
@@ -208,6 +210,9 @@ struct ymm_vector<float> {
     {
         return _mm256_castps_si256(v);
     }
+    static bool all_false(opmask_t k){
+        return k == 0;
+    }
     static reg_t reverse(reg_t ymm)
     {
         const __m256i rev_index = _mm256_set_epi32(NETWORK_32BIT_AVX2_2);
@@ -222,6 +227,7 @@ struct ymm_vector<uint32_t> {
     using opmask_t = __mmask8;
     static const uint8_t numlanes = 8;
     static constexpr simd_type vec_type = simd_type::AVX512;
+    using swizzle_ops = avx512_ymm_64bit_swizzle_ops;
 
     static type_t type_max()
     {
@@ -382,6 +388,9 @@ struct ymm_vector<uint32_t> {
     {
         return v;
     }
+    static bool all_false(opmask_t k){
+        return k == 0;
+    }
     static reg_t reverse(reg_t ymm)
     {
         const __m256i rev_index = _mm256_set_epi32(NETWORK_32BIT_AVX2_2);
@@ -396,6 +405,7 @@ struct ymm_vector<int32_t> {
     using opmask_t = __mmask8;
     static const uint8_t numlanes = 8;
     static constexpr simd_type vec_type = simd_type::AVX512;
+    using swizzle_ops = avx512_ymm_64bit_swizzle_ops;
 
     static type_t type_max()
     {
@@ -556,6 +566,9 @@ struct ymm_vector<int32_t> {
     {
         return v;
     }
+    static bool all_false(opmask_t k){
+        return k == 0;
+    }
     static reg_t reverse(reg_t ymm)
     {
         const __m256i rev_index = _mm256_set_epi32(NETWORK_32BIT_AVX2_2);
@@ -1204,4 +1217,77 @@ struct avx512_64bit_swizzle_ops {
     }
 };
 
+struct avx512_ymm_64bit_swizzle_ops {
+    template <typename vtype, int scale>
+    X86_SIMD_SORT_INLINE typename vtype::reg_t swap_n(typename vtype::reg_t reg)
+    {
+        __m256i v = vtype::cast_to(reg);
+
+        if constexpr (scale == 2) {
+            __m256 vf = _mm256_castsi256_ps(v);
+            vf = _mm256_permute_ps(vf, 0b10110001);
+            v = _mm256_castps_si256(vf);
+        }
+        else if constexpr (scale == 4) {
+            __m256 vf = _mm256_castsi256_ps(v);
+            vf = _mm256_permute_ps(vf, 0b01001110);
+            v = _mm256_castps_si256(vf);
+        }
+        else if constexpr (scale == 8) {
+            v = _mm256_permute2x128_si256(v, v, 0b00000001);
+        }
+        else {
+            static_assert(scale == -1, "should not be reached");
+        }
+
+        return vtype::cast_from(v);
+    }
+
+    template <typename vtype, int scale>
+    X86_SIMD_SORT_INLINE typename vtype::reg_t
+    reverse_n(typename vtype::reg_t reg)
+    {
+        __m256i v = vtype::cast_to(reg);
+
+        if constexpr (scale == 2) { return swap_n<vtype, 2>(reg); }
+        else if constexpr (scale == 4) {
+            constexpr uint64_t mask = 0b00011011;
+            __m256 vf = _mm256_castsi256_ps(v);
+            vf = _mm256_permute_ps(vf, mask);
+            v = _mm256_castps_si256(vf);
+        }
+        else if constexpr (scale == 8) {
+            return vtype::reverse(reg);
+        }
+        else {
+            static_assert(scale == -1, "should not be reached");
+        }
+
+        return vtype::cast_from(v);
+    }
+
+    template <typename vtype, int scale>
+    X86_SIMD_SORT_INLINE typename vtype::reg_t
+    merge_n(typename vtype::reg_t reg, typename vtype::reg_t other)
+    {
+        __m256i v1 = vtype::cast_to(reg);
+        __m256i v2 = vtype::cast_to(other);
+
+        if constexpr (scale == 2) {
+            v1 = _mm256_blend_epi32(v1, v2, 0b01010101);
+        }
+        else if constexpr (scale == 4) {
+            v1 = _mm256_blend_epi32(v1, v2, 0b00110011);
+        }
+        else if constexpr (scale == 8) {
+            v1 = _mm256_blend_epi32(v1, v2, 0b00001111);
+        }
+        else {
+            static_assert(scale == -1, "should not be reached");
+        }
+
+        return vtype::cast_from(v1);
+    }
+};
+
 #endif
diff --git a/src/avx512-64bit-keyvaluesort.hpp b/src/avx512-64bit-keyvaluesort.hpp
index 9acdbd71..48b28ad7 100644
--- a/src/avx512-64bit-keyvaluesort.hpp
+++ b/src/avx512-64bit-keyvaluesort.hpp
@@ -388,7 +388,10 @@ X86_SIMD_SORT_INLINE void qsort_64bit_(type1_t *keys,
         return;
     }
 
-    type1_t pivot = get_pivot_blocks<vtype1>(keys, left, right);
+    type1_t pivot;
+    auto pivot_result = get_pivot_smart<vtype1, type1_t>(keys, left, right);
+    pivot = pivot_result.pivot;
+
     type1_t smallest = vtype1::type_max();
     type1_t biggest = vtype1::type_min();
     arrsize_t pivot_index = partition_avx512_unrolled<vtype1, vtype2, 4>(

From bc77ec988979654d216288285353bf7828d2e702 Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Tue, 27 Feb 2024 09:20:48 -0800
Subject: [PATCH 2/5] format fixes

---
 src/avx512-64bit-common.h | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h
index c529fae6..cbd38a32 100644
--- a/src/avx512-64bit-common.h
+++ b/src/avx512-64bit-common.h
@@ -210,7 +210,8 @@ struct ymm_vector<float> {
     {
         return _mm256_castps_si256(v);
     }
-    static bool all_false(opmask_t k){
+    static bool all_false(opmask_t k)
+    {
         return k == 0;
     }
     static reg_t reverse(reg_t ymm)
@@ -388,7 +389,8 @@ struct ymm_vector<uint32_t> {
     {
         return v;
     }
-    static bool all_false(opmask_t k){
+    static bool all_false(opmask_t k)
+    {
         return k == 0;
     }
     static reg_t reverse(reg_t ymm)
@@ -566,7 +568,8 @@ struct ymm_vector<int32_t> {
     {
         return v;
     }
-    static bool all_false(opmask_t k){
+    static bool all_false(opmask_t k)
+    {
         return k == 0;
     }
     static reg_t reverse(reg_t ymm)

From 8d16dd6b02cae91d3330c75369e3e5e07662af01 Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Tue, 27 Feb 2024 14:57:14 -0800
Subject: [PATCH 3/5] Bug in rand_array.h

---
 utils/rand_array.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/rand_array.h b/utils/rand_array.h
index a9703551..dc20dbb9 100644
--- a/utils/rand_array.h
+++ b/utils/rand_array.h
@@ -137,7 +137,7 @@ static std::vector<T> get_array(std::string arrtype,
             val = std::numeric_limits<T>::max();
         }
         for (size_t ii = 1; ii <= arrsize; ++ii) {
-            if (rand() % 0x1) { arr[ii] = val; }
+            if (rand() & 0x1) { arr[ii] = val; }
         }
     }
     else {

From 24e8e3db9ab565de28a9b5514f5e9a7458da5831 Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Tue, 27 Feb 2024 14:57:59 -0800
Subject: [PATCH 4/5] update to 5.0

---
 meson.build | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/meson.build b/meson.build
index 873094ba..1e2a913a 100644
--- a/meson.build
+++ b/meson.build
@@ -1,5 +1,5 @@
 project('x86-simd-sort', 'cpp',
-        version : '4.0.0',
+        version : '5.0.0',
         license : 'BSD 3-clause',
         default_options : ['cpp_std=c++17'])
 fs = import('fs')

From 3d47ffc1c1fb7d2231ac87881f89aa017c39b1dd Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Tue, 27 Feb 2024 15:04:07 -0800
Subject: [PATCH 5/5] Remove dead code

---
 src/xss-pivot-selection.hpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/xss-pivot-selection.hpp b/src/xss-pivot-selection.hpp
index 59dc0489..13fed026 100644
--- a/src/xss-pivot-selection.hpp
+++ b/src/xss-pivot-selection.hpp
@@ -157,12 +157,7 @@ get_pivot_smart(type_t *arr, const arrsize_t left, const arrsize_t right)
         // Thus, median probably is a fine pivot, since it will move all of this common value into its own partition
         return pivot_results<type_t>(median);
     }
-    else {
-        // Should be unreachable
-        return pivot_results<type_t>(median);
-    }
 
-    // Should be unreachable
     return pivot_results<type_t>(median);
 }