From 72eb422c01a8508d50441a431947cb68aeb8517c Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Fri, 4 Apr 2025 12:14:33 -0700
Subject: [PATCH] Switch to std::thread from OpenMP

---
 Makefile                   |  4 +-
 scripts/bench-compare.sh   |  2 +-
 src/avx512-16bit-qsort.hpp |  9 ++--
 src/xss-common-includes.h  | 29 +++++++++++++
 src/xss-common-qsort.h     | 86 ++++++++++++++------------------------
 5 files changed, 67 insertions(+), 63 deletions(-)
diff --git a/Makefile b/Makefile
index 023ace6d..ae86f591 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 test:
-	meson setup -Dbuild_tests=true -Duse_openmp=false --warnlevel 2 --werror --buildtype release builddir
+	meson setup -Dbuild_tests=true -Duse_openmp=true --warnlevel 2 --werror --buildtype release builddir
 	cd builddir && ninja
 
 test_openmp:
@@ -7,7 +7,7 @@ test_openmp:
 	cd builddir && ninja
 
 bench:
-	meson setup -Dbuild_benchmarks=true --warnlevel 2 --werror --buildtype release builddir
+	meson setup -Dbuild_benchmarks=true -Duse_openmp=true --warnlevel 2 --werror --buildtype release builddir
 	cd builddir && ninja
 
 debug:
diff --git a/scripts/bench-compare.sh b/scripts/bench-compare.sh
index a224acd0..f4a9197e 100755
--- a/scripts/bench-compare.sh
+++ b/scripts/bench-compare.sh
@@ -11,7 +11,7 @@ if [ ! -d .bench/google-benchmark ]; then
 fi
 compare=$(realpath .bench/google-benchmark/tools/compare.py)
 
-meson setup -Dbuild_benchmarks=true -Dbuild_ippbench=true --warnlevel 0 --buildtype release builddir-${branch}
+meson setup -Dbuild_benchmarks=true -Duse_openmp=true --warnlevel 0 --buildtype release builddir-${branch}
 cd builddir-${branch}
 ninja
 $compare filters ./benchexe $1 $2 --benchmark_repetitions=$3
diff --git a/src/avx512-16bit-qsort.hpp b/src/avx512-16bit-qsort.hpp
index 1ed829b3..11d65b46 100644
--- a/src/avx512-16bit-qsort.hpp
+++ b/src/avx512-16bit-qsort.hpp
@@ -9,10 +9,6 @@
 
 #include "avx512-16bit-common.h"
 
-struct float16 {
-    uint16_t val;
-};
-
 template <>
 struct zmm_vector<float16> {
     using type_t = uint16_t;
@@ -555,6 +551,7 @@ avx512_qsort_fp16(uint16_t *arr,
                   bool descending = false)
 {
     using vtype = zmm_vector<float16>;
+    struct threadmanager tm;
 
     // TODO multithreading support here
     if (arrsize > 1) {
@@ -565,11 +562,11 @@ avx512_qsort_fp16(uint16_t *arr,
         }
         if (descending) {
             qsort_<vtype, Comparator<vtype, true>, uint16_t>(
-                    arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), 0);
+                    arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), tm);
         }
         else {
             qsort_<vtype, Comparator<vtype, false>, uint16_t>(
-                    arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), 0);
+                    arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), tm);
         }
         replace_inf_with_nan(arr, arrsize, nan_count, descending);
     }
diff --git a/src/xss-common-includes.h b/src/xss-common-includes.h
index 27d6c360..abcd2951 100644
--- a/src/xss-common-includes.h
+++ b/src/xss-common-includes.h
@@ -7,6 +7,9 @@
 #include <immintrin.h>
 #include <limits>
 #include <vector>
+#include <atomic>
+#include <mutex>
+#include <thread>
 #include "xss-custom-float.h"
 
 #define X86_SIMD_SORT_INFINITY std::numeric_limits<double>::infinity()
@@ -87,6 +89,10 @@
 #include <omp.h>
 #endif
 
+struct float16 { // type used to specialize zmm_vector<float16>
+    uint16_t val; // 16-bit storage for the value
+};
+
 template <class... T>
 constexpr bool always_false = false;
 
@@ -109,4 +115,26 @@ enum class simd_type : int { AVX2, AVX512 };
 template <typename vtype, typename T = typename vtype::type_t>
 X86_SIMD_SORT_INLINE bool comparison_func(const T &a, const T &b);
 
+// Thread budget shared by the qsort_ calls of one sort; passed by reference.
+struct threadmanager {
+    int max_thread_count; // spawn limit checked by qsort_; 0 => never spawn
+    std::mutex mymutex; // kept for source compatibility; no longer guards sharedCount
+    // Threads currently counted; atomic because qsort_ reads it without a lock.
+    std::atomic<int> sharedCount;
+    arrsize_t task_threshold; // partition size above which qsort_ may spawn
+
+    threadmanager()
+#ifdef XSS_COMPILE_OPENMP
+        : max_thread_count(8)
+#else
+        : max_thread_count(0)
+#endif
+        , sharedCount(0)
+        , task_threshold(100000)
+    {
+    }
+    // Atomically adds ii to sharedCount; pass a negative ii after a join.
+    void incrementCount(int ii) { sharedCount += ii; }
+};
+
 #endif // XSS_COMMON_INCLUDES
diff --git a/src/xss-common-qsort.h b/src/xss-common-qsort.h
index 801ec72c..673a9150 100644
--- a/src/xss-common-qsort.h
+++ b/src/xss-common-qsort.h
@@ -525,7 +525,7 @@ static void qsort_(type_t *arr,
                    arrsize_t left,
                    arrsize_t right,
                    arrsize_t max_iters,
-                   arrsize_t task_threshold)
+                   struct threadmanager &tm)
 {
     /*
      * Resort to std::sort if quicksort isnt making any progress
@@ -562,40 +562,49 @@ static void qsort_(type_t *arr,
     type_t leftmostValue = comparator::leftmost(smallest, biggest);
     type_t rightmostValue = comparator::rightmost(smallest, biggest);
 
-#ifdef XSS_COMPILE_OPENMP
+    std::thread t1, t2;
+    bool parallel_left = ((pivot_index - left) > tm.task_threshold)
+            && (tm.sharedCount < tm.max_thread_count);
     if (pivot != leftmostValue) {
-        bool parallel_left = (pivot_index - left) > task_threshold;
         if (parallel_left) {
-#pragma omp task
-            qsort_<vtype, comparator>(
-                    arr, left, pivot_index - 1, max_iters - 1, task_threshold);
+            tm.incrementCount(1);
+            t1 = std::thread(qsort_<vtype, comparator, type_t>,
+                             arr,
+                             left,
+                             pivot_index - 1,
+                             max_iters - 1,
+                             std::ref(tm));
         }
         else {
             qsort_<vtype, comparator>(
-                    arr, left, pivot_index - 1, max_iters - 1, task_threshold);
+                    arr, left, pivot_index - 1, max_iters - 1, tm);
         }
     }
+    bool parallel_right = ((right - pivot_index) > tm.task_threshold)
+            && (tm.sharedCount < tm.max_thread_count);
     if (pivot != rightmostValue) {
-        bool parallel_right = (right - pivot_index) > task_threshold;
-
         if (parallel_right) {
-#pragma omp task
-            qsort_<vtype, comparator>(
-                    arr, pivot_index, right, max_iters - 1, task_threshold);
+            tm.incrementCount(1);
+            t2 = std::thread(qsort_<vtype, comparator, type_t>,
+                             arr,
+                             pivot_index,
+                             right,
+                             max_iters - 1,
+                             std::ref(tm));
         }
         else {
             qsort_<vtype, comparator>(
-                    arr, pivot_index, right, max_iters - 1, task_threshold);
+                    arr, pivot_index, right, max_iters - 1, tm);
         }
     }
-#else
-    UNUSED(task_threshold);
-
-    if (pivot != leftmostValue)
-        qsort_<vtype, comparator>(arr, left, pivot_index - 1, max_iters - 1, 0);
-    if (pivot != rightmostValue)
-        qsort_<vtype, comparator>(arr, pivot_index, right, max_iters - 1, 0);
-#endif
+    if (t1.joinable()) {
+        t1.join();
+        tm.incrementCount(-1);
+    }
+    if (t2.joinable()) {
+        t2.join();
+        tm.incrementCount(-1);
+    }
 }
 
 template <typename vtype, typename comparator, typename type_t>
@@ -661,40 +670,9 @@ X86_SIMD_SORT_INLINE void xss_qsort(T *arr, arrsize_t arrsize, bool hasnan)
 
         UNUSED(hasnan);
 
-#ifdef XSS_COMPILE_OPENMP
-
-        bool use_parallel = arrsize > 100000;
-
-        if (use_parallel) {
-            // This thread limit was determined experimentally; it may be better for it to be the number of physical cores on the system
-            constexpr int thread_limit = 8;
-            int thread_count = std::min(thread_limit, omp_get_max_threads());
-            arrsize_t task_threshold
-                    = std::max((arrsize_t)100000, arrsize / 100);
-
-            // We use omp parallel and then omp single to setup the threads that will run the omp task calls in qsort_
-            // The omp single prevents multiple threads from running the initial qsort_ simultaneously and causing problems
-            // Note that we do not use the if(...) clause built into OpenMP, because it causes a performance regression for small arrays
-#pragma omp parallel num_threads(thread_count)
-#pragma omp single
-            qsort_<vtype, comparator, T>(arr,
-                                         0,
-                                         arrsize - 1,
-                                         2 * (arrsize_t)log2(arrsize),
-                                         task_threshold);
-        }
-        else {
-            qsort_<vtype, comparator, T>(arr,
-                                         0,
-                                         arrsize - 1,
-                                         2 * (arrsize_t)log2(arrsize),
-                                         std::numeric_limits<arrsize_t>::max());
-        }
-#pragma omp taskwait
-#else
+        struct threadmanager tm;
         qsort_<vtype, comparator, T>(
-                arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), 0);
-#endif
+                arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), tm);
 
         replace_inf_with_nan(arr, arrsize, nan_count, descending);
     }