From 72eb422c01a8508d50441a431947cb68aeb8517c Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Fri, 4 Apr 2025 12:14:33 -0700 Subject: [PATCH] Switch to std::threads from openmp --- Makefile | 4 +- scripts/bench-compare.sh | 2 +- src/avx512-16bit-qsort.hpp | 9 ++-- src/xss-common-includes.h | 28 +++++++++++++ src/xss-common-qsort.h | 86 ++++++++++++++------------------------ 5 files changed, 66 insertions(+), 63 deletions(-) diff --git a/Makefile b/Makefile index 023ace6d..ae86f591 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ test: - meson setup -Dbuild_tests=true -Duse_openmp=false --warnlevel 2 --werror --buildtype release builddir + meson setup -Dbuild_tests=true -Duse_openmp=true --warnlevel 2 --werror --buildtype release builddir cd builddir && ninja test_openmp: @@ -7,7 +7,7 @@ test_openmp: cd builddir && ninja bench: - meson setup -Dbuild_benchmarks=true --warnlevel 2 --werror --buildtype release builddir + meson setup -Dbuild_benchmarks=true -Duse_openmp=true --warnlevel 2 --werror --buildtype release builddir cd builddir && ninja debug: diff --git a/scripts/bench-compare.sh b/scripts/bench-compare.sh index a224acd0..f4a9197e 100755 --- a/scripts/bench-compare.sh +++ b/scripts/bench-compare.sh @@ -11,7 +11,7 @@ if [ ! -d .bench/google-benchmark ]; then fi compare=$(realpath .bench/google-benchmark/tools/compare.py) -meson setup -Dbuild_benchmarks=true -Dbuild_ippbench=true --warnlevel 0 --buildtype release builddir-${branch} +meson setup -Dbuild_benchmarks=true -Duse_openmp=true --warnlevel 0 --buildtype release builddir-${branch} cd builddir-${branch} ninja $compare filters ./benchexe $1 $2 --benchmark_repetitions=$3 diff --git a/src/avx512-16bit-qsort.hpp b/src/avx512-16bit-qsort.hpp index 1ed829b3..11d65b46 100644 --- a/src/avx512-16bit-qsort.hpp +++ b/src/avx512-16bit-qsort.hpp @@ -9,10 +9,6 @@ #include "avx512-16bit-common.h" -struct float16 { - uint16_t val; -}; - template <> struct zmm_vector { using type_t = uint16_t; @@ -555,6 +551,7 @@ avx512_qsort_fp16(uint16_t *arr, bool descending = false) { using vtype = zmm_vector; + struct threadmanager tm; // TODO multithreading support here if (arrsize > 1) { @@ -565,11 +562,11 @@ avx512_qsort_fp16(uint16_t *arr, } if (descending) { qsort_, uint16_t>( - arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), 0); + arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), tm); } else { qsort_, uint16_t>( - arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), 0); + arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), tm); } replace_inf_with_nan(arr, arrsize, nan_count, descending); } diff --git a/src/xss-common-includes.h b/src/xss-common-includes.h index 27d6c360..abcd2951 100644 --- a/src/xss-common-includes.h +++ b/src/xss-common-includes.h @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include "xss-custom-float.h" #define X86_SIMD_SORT_INFINITY std::numeric_limits::infinity() @@ -87,6 +89,10 @@ #include #endif +struct float16 { + uint16_t val; +}; + template constexpr bool always_false = false; @@ -109,4 +115,26 @@ enum class simd_type : int { AVX2, AVX512 }; template X86_SIMD_SORT_INLINE bool comparison_func(const T &a, const T &b); +struct threadmanager { + int max_thread_count; + std::mutex mymutex; + int sharedCount; + arrsize_t task_threshold; + + threadmanager() { +#ifdef XSS_COMPILE_OPENMP + max_thread_count = 8; +#else + max_thread_count = 0; +#endif + sharedCount = 0; + task_threshold = 100000; + }; + void incrementCount(int ii) { + mymutex.lock(); + sharedCount += ii; + mymutex.unlock(); + } +}; + #endif // XSS_COMMON_INCLUDES diff --git a/src/xss-common-qsort.h b/src/xss-common-qsort.h index 801ec72c..673a9150 100644 --- a/src/xss-common-qsort.h +++ b/src/xss-common-qsort.h @@ -525,7 +525,7 @@ static void qsort_(type_t *arr, arrsize_t left, arrsize_t right, arrsize_t max_iters, - arrsize_t task_threshold) + struct threadmanager &tm) { /* * Resort to std::sort if quicksort isnt making any progress @@ -562,40 +562,49 @@ static void qsort_(type_t *arr, type_t leftmostValue = comparator::leftmost(smallest, biggest); type_t rightmostValue = comparator::rightmost(smallest, biggest); -#ifdef XSS_COMPILE_OPENMP + std::thread t1, t2; + bool parallel_left = ((pivot_index - left) > tm.task_threshold) + && (tm.sharedCount < tm.max_thread_count); if (pivot != leftmostValue) { - bool parallel_left = (pivot_index - left) > task_threshold; if (parallel_left) { -#pragma omp task - qsort_( - arr, left, pivot_index - 1, max_iters - 1, task_threshold); + tm.incrementCount(1); + t1 = std::thread(qsort_, + arr, + left, + pivot_index - 1, + max_iters - 1, + std::ref(tm)); } else { qsort_( - arr, left, pivot_index - 1, max_iters - 1, task_threshold); + arr, left, pivot_index - 1, max_iters - 1, tm); } } + bool parallel_right = ((right - pivot_index) > tm.task_threshold) + && (tm.sharedCount < tm.max_thread_count); if (pivot != rightmostValue) { - bool parallel_right = (right - pivot_index) > task_threshold; - if (parallel_right) { -#pragma omp task - qsort_( - arr, pivot_index, right, max_iters - 1, task_threshold); + tm.incrementCount(1); + t2 = std::thread(qsort_, + arr, + pivot_index, + right, + max_iters - 1, + std::ref(tm)); } else { qsort_( - arr, pivot_index, right, max_iters - 1, task_threshold); + arr, pivot_index, right, max_iters - 1, tm); } } -#else - UNUSED(task_threshold); - - if (pivot != leftmostValue) - qsort_(arr, left, pivot_index - 1, max_iters - 1, 0); - if (pivot != rightmostValue) - qsort_(arr, pivot_index, right, max_iters - 1, 0); -#endif + if (t1.joinable()) { + t1.join(); + tm.incrementCount(-1); + } + if (t2.joinable()) { + t2.join(); + tm.incrementCount(-1); + } } template @@ -661,40 +670,9 @@ X86_SIMD_SORT_INLINE void xss_qsort(T *arr, arrsize_t arrsize, bool hasnan) UNUSED(hasnan); -#ifdef XSS_COMPILE_OPENMP - - bool use_parallel = arrsize > 100000; - - if (use_parallel) { - // This thread limit was determined experimentally; it may be better for it to be the number of physical cores on the system - constexpr int thread_limit = 8; - int thread_count = std::min(thread_limit, omp_get_max_threads()); - arrsize_t task_threshold - = std::max((arrsize_t)100000, arrsize / 100); - - // We use omp parallel and then omp single to setup the threads that will run the omp task calls in qsort_ - // The omp single prevents multiple threads from running the initial qsort_ simultaneously and causing problems - // Note that we do not use the if(...) clause built into OpenMP, because it causes a performance regression for small arrays -#pragma omp parallel num_threads(thread_count) -#pragma omp single - qsort_(arr, - 0, - arrsize - 1, - 2 * (arrsize_t)log2(arrsize), - task_threshold); - } - else { - qsort_(arr, - 0, - arrsize - 1, - 2 * (arrsize_t)log2(arrsize), - std::numeric_limits::max()); - } -#pragma omp taskwait -#else + struct threadmanager tm; qsort_( - arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), 0); -#endif + arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), tm); replace_inf_with_nan(arr, arrsize, nan_count, descending); }