diff --git a/benchmarks/bench-objsort.hpp b/benchmarks/bench-objsort.hpp index 4f5719a..b61364a 100644 --- a/benchmarks/bench-objsort.hpp +++ b/benchmarks/bench-objsort.hpp @@ -13,9 +13,9 @@ struct Point3D { static constexpr std::string_view name {val}; Point3D() { - x = (T)rand() / RAND_MAX; - y = (T)rand() / RAND_MAX; - z = (T)rand() / RAND_MAX; + x = (T)rand() / (T)RAND_MAX; + y = (T)rand() / (T)RAND_MAX; + z = (T)rand() / (T)RAND_MAX; } T distance() { diff --git a/src/xss-common-argsort.h b/src/xss-common-argsort.h index eb46bbe..70eec0b 100644 --- a/src/xss-common-argsort.h +++ b/src/xss-common-argsort.h @@ -472,7 +472,8 @@ X86_SIMD_SORT_INLINE void argsort_(type_t *arr, arrsize_t *arg, arrsize_t left, arrsize_t right, - arrsize_t max_iters) + arrsize_t max_iters, + arrsize_t task_threshold) { /* * Resort to std::sort if quicksort isnt making any progress @@ -494,11 +495,57 @@ X86_SIMD_SORT_INLINE void argsort_(type_t *arr, type_t biggest = vtype::type_min(); arrsize_t pivot_index = argpartition_unrolled( arr, arg, left, right + 1, pivot, &smallest, &biggest); +#ifdef XSS_COMPILE_OPENMP + if (pivot != smallest) { + bool parallel_left = (pivot_index - left) > task_threshold; + if (parallel_left) { +#pragma omp task + argsort_(arr, + arg, + left, + pivot_index - 1, + max_iters - 1, + task_threshold); + } + else { + argsort_(arr, + arg, + left, + pivot_index - 1, + max_iters - 1, + task_threshold); + } + } + if (pivot != biggest) { + bool parallel_right = (right - pivot_index) > task_threshold; + + if (parallel_right) { +#pragma omp task + argsort_(arr, + arg, + pivot_index, + right, + max_iters - 1, + task_threshold); + } + else { + argsort_(arr, + arg, + pivot_index, + right, + max_iters - 1, + task_threshold); + } + } +#else + UNUSED(task_threshold); if (pivot != smallest) argsort_( - arr, arg, left, pivot_index - 1, max_iters - 1); + arr, arg, left, pivot_index - 1, max_iters - 1, 0); if (pivot != biggest) - argsort_(arr, arg, pivot_index, right, max_iters - 1); + argsort_( + arr, arg, pivot_index, right, max_iters - 1, 0); +#endif } template @@ -570,8 +617,43 @@ X86_SIMD_SORT_INLINE void xss_argsort(T *arr, } } UNUSED(hasnan); + +#ifdef XSS_COMPILE_OPENMP + + bool use_parallel = arrsize > 10000; + + if (use_parallel) { + // This thread limit was determined experimentally; it may be better for it to be the number of physical cores on the system + constexpr int thread_limit = 8; + int thread_count = std::min(thread_limit, omp_get_max_threads()); + arrsize_t task_threshold + = std::max((arrsize_t)10000, arrsize / 100); + + // We use omp parallel and then omp single to setup the threads that will run the omp task calls in qsort_ + // The omp single prevents multiple threads from running the initial qsort_ simultaneously and causing problems + // Note that we do not use the if(...) clause built into OpenMP, because it causes a performance regression for small arrays +#pragma omp parallel num_threads(thread_count) +#pragma omp single + argsort_(arr, + arg, + 0, + arrsize - 1, + 2 * (arrsize_t)log2(arrsize), + task_threshold); +#pragma omp taskwait + } + else { + argsort_(arr, + arg, + 0, + arrsize - 1, + 2 * (arrsize_t)log2(arrsize), + std::numeric_limits::max()); + } +#else argsort_( - arr, arg, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize)); + arr, arg, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), 0); +#endif if (descending) { std::reverse(arg, arg + arrsize); } } diff --git a/src/xss-common-keyvaluesort.hpp b/src/xss-common-keyvaluesort.hpp index a607b62..1a15de7 100644 --- a/src/xss-common-keyvaluesort.hpp +++ b/src/xss-common-keyvaluesort.hpp @@ -627,6 +627,7 @@ X86_SIMD_SORT_INLINE void xss_qsort_kv( index_last_elem, maxiters, task_threshold); +#pragma omp taskwait } else { kvsort_(keys, @@ -636,7 +637,6 @@ X86_SIMD_SORT_INLINE void xss_qsort_kv( maxiters, std::numeric_limits::max()); } -#pragma omp taskwait #else kvsort_( keys, indexes, 0, index_last_elem, maxiters, 0); diff --git a/src/xss-common-qsort.h b/src/xss-common-qsort.h index 5108e38..e3bf019 100644 --- a/src/xss-common-qsort.h +++ b/src/xss-common-qsort.h @@ -682,6 +682,7 @@ X86_SIMD_SORT_INLINE void xss_qsort(T *arr, arrsize_t arrsize, bool hasnan) arrsize - 1, 2 * (arrsize_t)log2(arrsize), task_threshold); +#pragma omp taskwait } else { qsort_(arr, @@ -690,7 +691,6 @@ X86_SIMD_SORT_INLINE void xss_qsort(T *arr, arrsize_t arrsize, bool hasnan) 2 * (arrsize_t)log2(arrsize), std::numeric_limits::max()); } -#pragma omp taskwait #else qsort_( arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), 0); diff --git a/tests/test-qsort.cpp b/tests/test-qsort.cpp index 3083f72..869ecea 100644 --- a/tests/test-qsort.cpp +++ b/tests/test-qsort.cpp @@ -89,7 +89,7 @@ TYPED_TEST_P(simdsort, test_argsort_ascending) { for (auto type : this->arrtype) { bool hasnan = is_nan_test(type); - for (auto size : this->arrsize) { + for (auto size : this->arrsize_long) { std::vector arr = get_array(type, size); std::vector sortedarr = arr; @@ -110,7 +110,7 @@ TYPED_TEST_P(simdsort, test_argsort_descending) { for (auto type : this->arrtype) { bool hasnan = is_nan_test(type); - for (auto size : this->arrsize) { + for (auto size : this->arrsize_long) { std::vector arr = get_array(type, size); std::vector sortedarr = arr;