From e29d303c6bbf5a4e6b6afd3543c222577070e968 Mon Sep 17 00:00:00 2001
From: Matthew Sterrett <matthew.sterrett@intel.com>
Date: Tue, 7 Jan 2025 15:15:01 -0800
Subject: [PATCH 1/7] Add OpenMP acceleration for normal quicksort

---
 src/avx512-16bit-qsort.hpp      |  5 ++-
 src/avx512-64bit-common.h       |  3 +-
 src/xss-common-includes.h       |  5 +++
 src/xss-common-keyvaluesort.hpp |  5 ---
 src/xss-common-qsort.h          | 76 ++++++++++++++++++++++++++++++---
 tests/test-qsort.cpp            |  4 ++
 6 files changed, 85 insertions(+), 13 deletions(-)
diff --git a/src/avx512-16bit-qsort.hpp b/src/avx512-16bit-qsort.hpp
index 3b18b98..1ed829b 100644
--- a/src/avx512-16bit-qsort.hpp
+++ b/src/avx512-16bit-qsort.hpp
@@ -556,6 +556,7 @@ avx512_qsort_fp16(uint16_t *arr,
 {
     using vtype = zmm_vector<float16>;
 
+    // TODO multithreading support here
     if (arrsize > 1) {
         arrsize_t nan_count = 0;
         if (UNLIKELY(hasnan)) {
@@ -564,11 +565,11 @@ avx512_qsort_fp16(uint16_t *arr,
         }
         if (descending) {
             qsort_<vtype, Comparator<vtype, true>, uint16_t>(
-                    arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize));
+                    arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), 0);
         }
         else {
             qsort_<vtype, Comparator<vtype, false>, uint16_t>(
-                    arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize));
+                    arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), 0);
         }
         replace_inf_with_nan(arr, arrsize, nan_count, descending);
     }
diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h
index f27a31f..5d55196 100644
--- a/src/avx512-64bit-common.h
+++ b/src/avx512-64bit-common.h
@@ -968,7 +968,8 @@ struct zmm_vector<uint64_t> {
 static_assert(sizeof(size_t) == sizeof(uint64_t),
               "Size of size_t and uint64_t are not the same");
 template <>
-struct zmm_vector<size_t> : public zmm_vector<uint64_t> {};
+struct zmm_vector<size_t> : public zmm_vector<uint64_t> {
+};
 #endif
 
 template <>
diff --git a/src/xss-common-includes.h b/src/xss-common-includes.h
index 386ca86..27d6c36 100644
--- a/src/xss-common-includes.h
+++ b/src/xss-common-includes.h
@@ -82,6 +82,11 @@
     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, \
             21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
 
+#if defined(XSS_USE_OPENMP) && defined(_OPENMP)
+#define XSS_COMPILE_OPENMP
+#include <omp.h>
+#endif
+
 template <class... T>
 constexpr bool always_false = false;
 
diff --git a/src/xss-common-keyvaluesort.hpp b/src/xss-common-keyvaluesort.hpp
index df85861..a607b62 100644
--- a/src/xss-common-keyvaluesort.hpp
+++ b/src/xss-common-keyvaluesort.hpp
@@ -11,11 +11,6 @@
 #include "xss-common-qsort.h"
 #include "xss-network-keyvaluesort.hpp"
 
-#if defined(XSS_USE_OPENMP) && defined(_OPENMP)
-#define XSS_COMPILE_OPENMP
-#include <omp.h>
-#endif
-
 /*
  * Sort all the NAN's to end of the array and return the index of the last elem
  * in the array which is not a nan
diff --git a/src/xss-common-qsort.h b/src/xss-common-qsort.h
index 0fb6263..96e1b5c 100644
--- a/src/xss-common-qsort.h
+++ b/src/xss-common-qsort.h
@@ -521,8 +521,11 @@ template <typename vtype, int maxN>
 void sort_n(typename vtype::type_t *arr, int N);
 
 template <typename vtype, typename comparator, typename type_t>
-static void
-qsort_(type_t *arr, arrsize_t left, arrsize_t right, arrsize_t max_iters)
+static void qsort_(type_t *arr,
+                   arrsize_t left,
+                   arrsize_t right,
+                   arrsize_t max_iters,
+                   arrsize_t task_threshold)
 {
     /*
      * Resort to std::sort if quicksort isnt making any progress
@@ -559,10 +562,40 @@ qsort_(type_t *arr, arrsize_t left, arrsize_t right, arrsize_t max_iters)
     type_t leftmostValue = comparator::leftmost(smallest, biggest);
     type_t rightmostValue = comparator::rightmost(smallest, biggest);
 
+#ifdef XSS_COMPILE_OPENMP
+    if (pivot != leftmostValue) {
+        bool parallel_left = (pivot_index - left) > task_threshold;
+        if (parallel_left) {
+#pragma omp task
+            qsort_<vtype, comparator>(
+                    arr, left, pivot_index - 1, max_iters - 1, task_threshold);
+        }
+        else {
+            qsort_<vtype, comparator>(
+                    arr, left, pivot_index - 1, max_iters - 1, task_threshold);
+        }
+    }
+    if (pivot != rightmostValue) {
+        bool parallel_right = (right - pivot_index) > task_threshold;
+
+        if (parallel_right) {
+#pragma omp task
+            qsort_<vtype, comparator>(
+                    arr, pivot_index, right, max_iters - 1, task_threshold);
+        }
+        else {
+            qsort_<vtype, comparator>(
+                    arr, pivot_index, right, max_iters - 1, task_threshold);
+        }
+    }
+#else
+    UNUSED(task_threshold);
+
     if (pivot != leftmostValue)
-        qsort_<vtype, comparator>(arr, left, pivot_index - 1, max_iters - 1);
+        qsort_<vtype, comparator>(arr, left, pivot_index - 1, max_iters - 1, 0);
     if (pivot != rightmostValue)
-        qsort_<vtype, comparator>(arr, pivot_index, right, max_iters - 1);
+        qsort_<vtype, comparator>(arr, pivot_index, right, max_iters - 1, 0);
+#endif
 }
 
 template <typename vtype, typename comparator, typename type_t>
@@ -627,8 +660,41 @@ X86_SIMD_SORT_INLINE void xss_qsort(T *arr, arrsize_t arrsize, bool hasnan)
         }
 
         UNUSED(hasnan);
+
+#ifdef XSS_COMPILE_OPENMP
+
+        bool use_parallel = arrsize > 10000;
+
+        if (use_parallel) {
+            // This thread limit was determined experimentally; it may be better for it to be the number of physical cores on the system
+            constexpr int thread_limit = 8;
+            int thread_count = std::min(thread_limit, omp_get_max_threads());
+            arrsize_t task_threshold
+                    = std::max((arrsize_t)10000, arrsize / 100);
+
+            // We use omp parallel and then omp single to setup the threads that will run the omp task calls in qsort_
+            // The omp single prevents multiple threads from running the initial qsort_ simultaneously and causing problems
+            // Note that we do not use the if(...) clause built into OpenMP, because it causes a performance regression for small arrays
+#pragma omp parallel num_threads(thread_count)
+#pragma omp single
+            qsort_<vtype, comparator, T>(arr,
+                                         0,
+                                         arrsize - 1,
+                                         2 * (arrsize_t)log2(arrsize),
+                                         task_threshold);
+        }
+        else {
+            qsort_<vtype, comparator, T>(arr,
+                                         0,
+                                         arrsize - 1,
+                                         2 * (arrsize_t)log2(arrsize),
+                                         std::numeric_limits<arrsize_t>::max());
+        }
+#pragma omp taskwait
+#else
         qsort_<vtype, comparator, T>(
-                arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize));
+                arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), 0);
+#endif
 
         replace_inf_with_nan(arr, arrsize, nan_count, descending);
     }
diff --git a/tests/test-qsort.cpp b/tests/test-qsort.cpp
index 7eef83e..c404967 100644
--- a/tests/test-qsort.cpp
+++ b/tests/test-qsort.cpp
@@ -11,6 +11,10 @@ class simdsort : public ::testing::Test {
     simdsort()
     {
         std::iota(arrsize.begin(), arrsize.end(), 1);
+        arrsize.push_back(10'000);
+        arrsize.push_back(100'000);
+        arrsize.push_back(1'000'000);
+
         arrtype = {"random",
                    "constant",
                    "sorted",

From dbbf2debae185c85f0d83df84f7f592924ba3918 Mon Sep 17 00:00:00 2001
From: Matthew Sterrett <matthew.sterrett@intel.com>
Date: Thu, 9 Jan 2025 15:31:20 -0800
Subject: [PATCH 2/7] Speed up tests, in particular when OpenMP is disabled

---
 tests/meson.build       |  7 +++++++
 tests/test-keyvalue.cpp | 15 ++++++++++-----
 tests/test-qsort.cpp    | 15 ++++++++++-----
 3 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/tests/meson.build b/tests/meson.build
index 0583c55..92c689b 100644
--- a/tests/meson.build
+++ b/tests/meson.build
@@ -1,19 +1,26 @@
 libtests = []
 
+if get_option('use_openmp')
+  openmpflags = ['-DXSS_USE_OPENMP=true']
+endif
+
 libtests += static_library('tests_qsort',
   files('test-qsort.cpp', ),
   dependencies: gtest_dep,
   include_directories : [src, lib, utils],
+  cpp_args : [openmpflags],
   )
 
 libtests += static_library('tests_kvsort',
   files('test-keyvalue.cpp', ),
   dependencies: gtest_dep,
   include_directories : [src, lib, utils],
+  cpp_args : [openmpflags],
   )
 
 libtests += static_library('tests_objsort',
   files('test-objqsort.cpp', ),
   dependencies: gtest_dep,
   include_directories : [src, lib, utils],
+  cpp_args : [openmpflags],
   )
diff --git a/tests/test-keyvalue.cpp b/tests/test-keyvalue.cpp
index c0e683c..c1386af 100644
--- a/tests/test-keyvalue.cpp
+++ b/tests/test-keyvalue.cpp
@@ -15,9 +15,13 @@ class simdkvsort : public ::testing::Test {
     simdkvsort()
     {
         std::iota(arrsize.begin(), arrsize.end(), 1);
-        arrsize.push_back(10'000);
-        arrsize.push_back(100'000);
-        arrsize.push_back(1'000'000);
+        std::iota(arrsize_long.begin(), arrsize_long.end(), 1);
+#ifdef XSS_USE_OPENMP
+        // These extended tests are only needed for the OpenMP logic
+        arrsize_long.push_back(10'000);
+        arrsize_long.push_back(100'000);
+        arrsize_long.push_back(1'000'000);
+#endif
 
         arrtype = {"random",
                    "constant",
@@ -32,6 +36,7 @@ class simdkvsort : public ::testing::Test {
     }
     std::vector<std::string> arrtype;
     std::vector<size_t> arrsize = std::vector<size_t>(1024);
+    std::vector<size_t> arrsize_long = std::vector<size_t>(1024);
 };
 
 TYPED_TEST_SUITE_P(simdkvsort);
@@ -168,7 +173,7 @@ TYPED_TEST_P(simdkvsort, test_kvsort_ascending)
     using T2 = typename std::tuple_element<1, decltype(TypeParam())>::type;
     for (auto type : this->arrtype) {
         bool hasnan = is_nan_test(type);
-        for (auto size : this->arrsize) {
+        for (auto size : this->arrsize_long) {
             std::vector<T1> key = get_array<T1>(type, size);
             std::vector<T2> val = get_array<T2>(type, size);
             std::vector<T1> key_bckp = key;
@@ -199,7 +204,7 @@ TYPED_TEST_P(simdkvsort, test_kvsort_descending)
     using T2 = typename std::tuple_element<1, decltype(TypeParam())>::type;
     for (auto type : this->arrtype) {
         bool hasnan = is_nan_test(type);
-        for (auto size : this->arrsize) {
+        for (auto size : this->arrsize_long) {
             std::vector<T1> key = get_array<T1>(type, size);
             std::vector<T2> val = get_array<T2>(type, size);
             std::vector<T1> key_bckp = key;
diff --git a/tests/test-qsort.cpp b/tests/test-qsort.cpp
index c404967..8a48207 100644
--- a/tests/test-qsort.cpp
+++ b/tests/test-qsort.cpp
@@ -11,9 +11,13 @@ class simdsort : public ::testing::Test {
     simdsort()
     {
         std::iota(arrsize.begin(), arrsize.end(), 1);
-        arrsize.push_back(10'000);
-        arrsize.push_back(100'000);
-        arrsize.push_back(1'000'000);
+        std::iota(arrsize_long.begin(), arrsize_long.end(), 1);
+#ifdef XSS_USE_OPENMP
+        // These extended tests are only needed for the OpenMP logic
+        arrsize_long.push_back(10'000);
+        arrsize_long.push_back(100'000);
+        arrsize_long.push_back(1'000'000);
+#endif
 
         arrtype = {"random",
                    "constant",
@@ -28,6 +32,7 @@ class simdsort : public ::testing::Test {
     }
     std::vector<std::string> arrtype;
     std::vector<size_t> arrsize = std::vector<size_t>(1024);
+    std::vector<size_t> arrsize_long = std::vector<size_t>(1024);
 };
 
 TYPED_TEST_SUITE_P(simdsort);
@@ -36,7 +41,7 @@ TYPED_TEST_P(simdsort, test_qsort_ascending)
 {
     for (auto type : this->arrtype) {
         bool hasnan = is_nan_test(type);
-        for (auto size : this->arrsize) {
+        for (auto size : this->arrsize_long) {
             std::vector<TypeParam> basearr = get_array<TypeParam>(type, size);
 
             // Ascending order
@@ -58,7 +63,7 @@ TYPED_TEST_P(simdsort, test_qsort_descending)
 {
     for (auto type : this->arrtype) {
         bool hasnan = is_nan_test(type);
-        for (auto size : this->arrsize) {
+        for (auto size : this->arrsize_long) {
             std::vector<TypeParam> basearr = get_array<TypeParam>(type, size);
 
             // Descending order

From d77d0e35582ecb9816c3ebde25c439cef594e610 Mon Sep 17 00:00:00 2001
From: Matthew Sterrett <matthew.sterrett@intel.com>
Date: Thu, 16 Jan 2025 14:57:17 -0800
Subject: [PATCH 3/7] Change threshold for OpenMP/tasks from 10k to 100k

---
 src/xss-common-qsort.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/xss-common-qsort.h b/src/xss-common-qsort.h
index 96e1b5c..801ec72 100644
--- a/src/xss-common-qsort.h
+++ b/src/xss-common-qsort.h
@@ -663,14 +663,14 @@ X86_SIMD_SORT_INLINE void xss_qsort(T *arr, arrsize_t arrsize, bool hasnan)
 
 #ifdef XSS_COMPILE_OPENMP
 
-        bool use_parallel = arrsize > 10000;
+        bool use_parallel = arrsize > 100000;
 
         if (use_parallel) {
             // This thread limit was determined experimentally; it may be better for it to be the number of physical cores on the system
             constexpr int thread_limit = 8;
             int thread_count = std::min(thread_limit, omp_get_max_threads());
             arrsize_t task_threshold
-                    = std::max((arrsize_t)10000, arrsize / 100);
+                    = std::max((arrsize_t)100000, arrsize / 100);
 
             // We use omp parallel and then omp single to setup the threads that will run the omp task calls in qsort_
             // The omp single prevents multiple threads from running the initial qsort_ simultaneously and causing problems

From e30d8b6710deaca61f06e61848d1e16b9b718a64 Mon Sep 17 00:00:00 2001
From: Matthew Sterrett <matthew.sterrett@intel.com>
Date: Tue, 21 Jan 2025 10:52:04 -0800
Subject: [PATCH 4/7] Fix that weird formatting issue

---
 src/avx512-64bit-common.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h
index 5d55196..f27a31f 100644
--- a/src/avx512-64bit-common.h
+++ b/src/avx512-64bit-common.h
@@ -968,8 +968,7 @@ struct zmm_vector<uint64_t> {
 static_assert(sizeof(size_t) == sizeof(uint64_t),
               "Size of size_t and uint64_t are not the same");
 template <>
-struct zmm_vector<size_t> : public zmm_vector<uint64_t> {
-};
+struct zmm_vector<size_t> : public zmm_vector<uint64_t> {};
 #endif
 
 template <>

From 1c5c19bdf0c202a49625a7a3af681bc9300e5498 Mon Sep 17 00:00:00 2001
From: Matthew Sterrett <matthew.sterrett@intel.com>
Date: Wed, 26 Mar 2025 10:35:09 -0700
Subject: [PATCH 5/7] Fix missing openmp flags

---
 lib/meson.build | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/meson.build b/lib/meson.build
index 5cbc105..48046b3 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -34,7 +34,7 @@ if cpp.has_argument('-march=icelake-client')
       'x86simdsort-icl.cpp',
       ),
     include_directories : [src],
-    cpp_args : ['-march=icelake-client'],
+    cpp_args : ['-march=icelake-client', openmpflags],
     gnu_symbol_visibility : 'inlineshidden',
     )
 endif
@@ -45,7 +45,7 @@ if cancompilefp16
       'x86simdsort-spr.cpp',
       ),
     include_directories : [src],
-    cpp_args : ['-march=sapphirerapids'],
+    cpp_args : ['-march=sapphirerapids', openmpflags],
     gnu_symbol_visibility : 'inlineshidden',
     )
 endif

From dba705f75720ed9211e835baba52f19cd44000d6 Mon Sep 17 00:00:00 2001
From: Matthew Sterrett <matthew.sterrett@intel.com>
Date: Wed, 26 Mar 2025 11:41:03 -0700
Subject: [PATCH 6/7] Use unrolled partition for _Float16

---
 src/avx512fp16-16bit-qsort.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/avx512fp16-16bit-qsort.hpp b/src/avx512fp16-16bit-qsort.hpp
index 7de26a0..f93cf68 100644
--- a/src/avx512fp16-16bit-qsort.hpp
+++ b/src/avx512fp16-16bit-qsort.hpp
@@ -22,7 +22,7 @@ struct zmm_vector<_Float16> {
     using opmask_t = __mmask32;
     static const uint8_t numlanes = 32;
     static constexpr int network_sort_threshold = 128;
-    static constexpr int partition_unroll_factor = 0;
+    static constexpr int partition_unroll_factor = 8;
     static constexpr simd_type vec_type = simd_type::AVX512;
 
     using swizzle_ops = avx512_16bit_swizzle_ops;

From e01e79fe371c39735bcec7c051bbcb252e6185bb Mon Sep 17 00:00:00 2001
From: Matthew Sterrett <matthew.sterrett@intel.com>
Date: Thu, 27 Mar 2025 08:54:21 -0700
Subject: [PATCH 7/7] Change 16-bit swizzle from vector to C arrays

---
 src/avx512-16bit-common.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/avx512-16bit-common.h b/src/avx512-16bit-common.h
index 524ce7a..e1a76d3 100644
--- a/src/avx512-16bit-common.h
+++ b/src/avx512-16bit-common.h
@@ -14,11 +14,11 @@ struct avx512_16bit_swizzle_ops {
         __m512i v = vtype::cast_to(reg);
 
         if constexpr (scale == 2) {
-            std::vector<uint16_t> arr
+            constexpr static uint16_t arr[]
                     = {1,  0,  3,  2,  5,  4,  7,  6,  9,  8,  11,
                        10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20,
                        23, 22, 25, 24, 27, 26, 29, 28, 31, 30};
-            __m512i mask = _mm512_loadu_si512(arr.data());
+            __m512i mask = _mm512_loadu_si512(arr);
             v = _mm512_permutexvar_epi16(mask, v);
         }
         else if constexpr (scale == 4) {
@@ -48,27 +48,27 @@ struct avx512_16bit_swizzle_ops {
 
         if constexpr (scale == 2) { return swap_n<vtype, 2>(reg); }
         else if constexpr (scale == 4) {
-            std::vector<uint16_t> arr
+            constexpr static uint16_t arr[]
                     = {3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,
                        8,  15, 14, 13, 12, 19, 18, 17, 16, 23, 22,
                        21, 20, 27, 26, 25, 24, 31, 30, 29, 28};
-            __m512i mask = _mm512_loadu_si512(arr.data());
+            __m512i mask = _mm512_loadu_si512(arr);
             v = _mm512_permutexvar_epi16(mask, v);
         }
         else if constexpr (scale == 8) {
-            std::vector<uint16_t> arr
+            constexpr static int16_t arr[]
                     = {7,  6,  5,  4,  3,  2,  1,  0,  15, 14, 13,
                        12, 11, 10, 9,  8,  23, 22, 21, 20, 19, 18,
                        17, 16, 31, 30, 29, 28, 27, 26, 25, 24};
-            __m512i mask = _mm512_loadu_si512(arr.data());
+            __m512i mask = _mm512_loadu_si512(arr);
             v = _mm512_permutexvar_epi16(mask, v);
         }
         else if constexpr (scale == 16) {
-            std::vector<uint16_t> arr
+            constexpr static uint16_t arr[]
                     = {15, 14, 13, 12, 11, 10, 9,  8,  7,  6,  5,
                        4,  3,  2,  1,  0,  31, 30, 29, 28, 27, 26,
                        25, 24, 23, 22, 21, 20, 19, 18, 17, 16};
-            __m512i mask = _mm512_loadu_si512(arr.data());
+            __m512i mask = _mm512_loadu_si512(arr);
             v = _mm512_permutexvar_epi16(mask, v);
         }
         else if constexpr (scale == 32) {