diff --git a/src/avx512-16bit-qsort.hpp b/src/avx512-16bit-qsort.hpp index 6dbe24d..fbe1856 100644 --- a/src/avx512-16bit-qsort.hpp +++ b/src/avx512-16bit-qsort.hpp @@ -552,9 +552,7 @@ avx512_qsort_fp16_helper(uint16_t *arr, arrsize_t arrsize) bool use_parallel = arrsize > 100000; if (use_parallel) { - // This thread limit was determined experimentally; it may be better for it to be the number of physical cores on the system - constexpr int thread_limit = 8; - int thread_count = std::min(thread_limit, omp_get_max_threads()); + int thread_count = xss_get_num_threads(); arrsize_t task_threshold = std::max((arrsize_t)100000, arrsize / 100); // We use omp parallel and then omp single to setup the threads that will run the omp task calls in qsort_ diff --git a/src/xss-common-argsort.h b/src/xss-common-argsort.h index 4f5c30d..6c071c2 100644 --- a/src/xss-common-argsort.h +++ b/src/xss-common-argsort.h @@ -629,9 +629,7 @@ X86_SIMD_SORT_INLINE void xss_argsort(T *arr, bool use_parallel = arrsize > 10000; if (use_parallel) { - // This thread limit was determined experimentally; it may be better for it to be the number of physical cores on the system - constexpr int thread_limit = 8; - int thread_count = std::min(thread_limit, omp_get_max_threads()); + int thread_count = xss_get_num_threads(); arrsize_t task_threshold = std::max((arrsize_t)10000, arrsize / 100); diff --git a/src/xss-common-includes.h b/src/xss-common-includes.h index a7c34c1..c36f7db 100644 --- a/src/xss-common-includes.h +++ b/src/xss-common-includes.h @@ -85,6 +85,13 @@ #if defined(XSS_USE_OPENMP) && defined(_OPENMP) #define XSS_COMPILE_OPENMP #include + +// Limit the number of threads to 16: empirically determined, can probably be +// better tuned at a later stage +X86_SIMD_SORT_INLINE int xss_get_num_threads() +{ + return std::min(16, (int)omp_get_max_threads()); +} #endif template diff --git a/src/xss-common-keyvaluesort.hpp b/src/xss-common-keyvaluesort.hpp index 1a15de7..3a07e01 100644 --- 
a/src/xss-common-keyvaluesort.hpp +++ b/src/xss-common-keyvaluesort.hpp @@ -610,9 +610,7 @@ X86_SIMD_SORT_INLINE void xss_qsort_kv( bool use_parallel = arrsize > 10000; if (use_parallel) { - // This thread limit was determined experimentally; it may be better for it to be the number of physical cores on the system - constexpr int thread_limit = 8; - int thread_count = std::min(thread_limit, omp_get_max_threads()); + int thread_count = xss_get_num_threads(); arrsize_t task_threshold = std::max((arrsize_t)10000, arrsize / 100); diff --git a/src/xss-common-qsort.h b/src/xss-common-qsort.h index cf4a34a..73b947a 100644 --- a/src/xss-common-qsort.h +++ b/src/xss-common-qsort.h @@ -672,9 +672,7 @@ X86_SIMD_SORT_INLINE void xss_qsort(T *arr, arrsize_t arrsize, bool hasnan) bool use_parallel = arrsize > 100000; if (use_parallel) { - // This thread limit was determined experimentally; it may be better for it to be the number of physical cores on the system - constexpr int thread_limit = 8; - int thread_count = std::min(thread_limit, omp_get_max_threads()); + int thread_count = xss_get_num_threads(); arrsize_t task_threshold = std::max((arrsize_t)100000, arrsize / 100);