From e29d303c6bbf5a4e6b6afd3543c222577070e968 Mon Sep 17 00:00:00 2001 From: Matthew Sterrett Date: Tue, 7 Jan 2025 15:15:01 -0800 Subject: [PATCH 1/7] Add OpenMP acceleration for normal quicksort --- src/avx512-16bit-qsort.hpp | 5 ++- src/avx512-64bit-common.h | 3 +- src/xss-common-includes.h | 5 +++ src/xss-common-keyvaluesort.hpp | 5 --- src/xss-common-qsort.h | 76 ++++++++++++++++++++++++++++++--- tests/test-qsort.cpp | 4 ++ 6 files changed, 85 insertions(+), 13 deletions(-) diff --git a/src/avx512-16bit-qsort.hpp b/src/avx512-16bit-qsort.hpp index 3b18b98..1ed829b 100644 --- a/src/avx512-16bit-qsort.hpp +++ b/src/avx512-16bit-qsort.hpp @@ -556,6 +556,7 @@ avx512_qsort_fp16(uint16_t *arr, { using vtype = zmm_vector; + // TODO multithreading support here if (arrsize > 1) { arrsize_t nan_count = 0; if (UNLIKELY(hasnan)) { @@ -564,11 +565,11 @@ avx512_qsort_fp16(uint16_t *arr, } if (descending) { qsort_, uint16_t>( - arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize)); + arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), 0); } else { qsort_, uint16_t>( - arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize)); + arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), 0); } replace_inf_with_nan(arr, arrsize, nan_count, descending); } diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h index f27a31f..5d55196 100644 --- a/src/avx512-64bit-common.h +++ b/src/avx512-64bit-common.h @@ -968,7 +968,8 @@ struct zmm_vector { static_assert(sizeof(size_t) == sizeof(uint64_t), "Size of size_t and uint64_t are not the same"); template <> -struct zmm_vector : public zmm_vector {}; +struct zmm_vector : public zmm_vector { +}; #endif template <> diff --git a/src/xss-common-includes.h b/src/xss-common-includes.h index 386ca86..27d6c36 100644 --- a/src/xss-common-includes.h +++ b/src/xss-common-includes.h @@ -82,6 +82,11 @@ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, \ 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 +#if defined(XSS_USE_OPENMP) && defined(_OPENMP) +#define XSS_COMPILE_OPENMP +#include +#endif + template constexpr bool always_false = false; diff --git a/src/xss-common-keyvaluesort.hpp b/src/xss-common-keyvaluesort.hpp index df85861..a607b62 100644 --- a/src/xss-common-keyvaluesort.hpp +++ b/src/xss-common-keyvaluesort.hpp @@ -11,11 +11,6 @@ #include "xss-common-qsort.h" #include "xss-network-keyvaluesort.hpp" -#if defined(XSS_USE_OPENMP) && defined(_OPENMP) -#define XSS_COMPILE_OPENMP -#include -#endif - /* * Sort all the NAN's to end of the array and return the index of the last elem * in the array which is not a nan diff --git a/src/xss-common-qsort.h b/src/xss-common-qsort.h index 0fb6263..96e1b5c 100644 --- a/src/xss-common-qsort.h +++ b/src/xss-common-qsort.h @@ -521,8 +521,11 @@ template void sort_n(typename vtype::type_t *arr, int N); template -static void -qsort_(type_t *arr, arrsize_t left, arrsize_t right, arrsize_t max_iters) +static void qsort_(type_t *arr, + arrsize_t left, + arrsize_t right, + arrsize_t max_iters, + arrsize_t task_threshold) { /* * Resort to std::sort if quicksort isnt making any progress @@ -559,10 +562,40 @@ qsort_(type_t *arr, arrsize_t left, arrsize_t right, arrsize_t max_iters) type_t leftmostValue = comparator::leftmost(smallest, biggest); type_t rightmostValue = comparator::rightmost(smallest, biggest); +#ifdef XSS_COMPILE_OPENMP + if (pivot != leftmostValue) { + bool parallel_left = (pivot_index - left) > task_threshold; + if (parallel_left) { +#pragma omp task + qsort_( + arr, left, pivot_index - 1, max_iters - 1, task_threshold); + } + else { + qsort_( + arr, left, pivot_index - 1, max_iters - 1, task_threshold); + } + } + if (pivot != rightmostValue) { + bool parallel_right = (right - pivot_index) > task_threshold; + + if (parallel_right) { +#pragma omp task + qsort_( + arr, pivot_index, right, max_iters - 1, task_threshold); + } + else { + qsort_( + arr, pivot_index, right, max_iters - 1, task_threshold); + } + } +#else + UNUSED(task_threshold); + if (pivot != leftmostValue) - qsort_(arr, left, pivot_index - 1, max_iters - 1); + qsort_(arr, left, pivot_index - 1, max_iters - 1, 0); if (pivot != rightmostValue) - qsort_(arr, pivot_index, right, max_iters - 1); + qsort_(arr, pivot_index, right, max_iters - 1, 0); +#endif } template @@ -627,8 +660,41 @@ X86_SIMD_SORT_INLINE void xss_qsort(T *arr, arrsize_t arrsize, bool hasnan) } UNUSED(hasnan); + +#ifdef XSS_COMPILE_OPENMP + + bool use_parallel = arrsize > 10000; + + if (use_parallel) { + // This thread limit was determined experimentally; it may be better for it to be the number of physical cores on the system + constexpr int thread_limit = 8; + int thread_count = std::min(thread_limit, omp_get_max_threads()); + arrsize_t task_threshold + = std::max((arrsize_t)10000, arrsize / 100); + + // We use omp parallel and then omp single to setup the threads that will run the omp task calls in qsort_ + // The omp single prevents multiple threads from running the initial qsort_ simultaneously and causing problems + // Note that we do not use the if(...) clause built into OpenMP, because it causes a performance regression for small arrays +#pragma omp parallel num_threads(thread_count) +#pragma omp single + qsort_(arr, + 0, + arrsize - 1, + 2 * (arrsize_t)log2(arrsize), + task_threshold); + } + else { + qsort_(arr, + 0, + arrsize - 1, + 2 * (arrsize_t)log2(arrsize), + std::numeric_limits::max()); + } +#pragma omp taskwait +#else qsort_( - arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize)); + arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), 0); +#endif replace_inf_with_nan(arr, arrsize, nan_count, descending); } diff --git a/tests/test-qsort.cpp b/tests/test-qsort.cpp index 7eef83e..c404967 100644 --- a/tests/test-qsort.cpp +++ b/tests/test-qsort.cpp @@ -11,6 +11,10 @@ class simdsort : public ::testing::Test { simdsort() { std::iota(arrsize.begin(), arrsize.end(), 1); + arrsize.push_back(10'000); + arrsize.push_back(100'000); + arrsize.push_back(1'000'000); + arrtype = {"random", "constant", "sorted", From dbbf2debae185c85f0d83df84f7f592924ba3918 Mon Sep 17 00:00:00 2001 From: Matthew Sterrett Date: Thu, 9 Jan 2025 15:31:20 -0800 Subject: [PATCH 2/7] Speed up tests, in particular when OpenMP is disabled --- tests/meson.build | 7 +++++++ tests/test-keyvalue.cpp | 15 ++++++++++----- tests/test-qsort.cpp | 15 ++++++++++----- 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/tests/meson.build b/tests/meson.build index 0583c55..92c689b 100644 --- a/tests/meson.build +++ b/tests/meson.build @@ -1,19 +1,26 @@ libtests = [] +if get_option('use_openmp') + openmpflags = ['-DXSS_USE_OPENMP=true'] +endif + libtests += static_library('tests_qsort', files('test-qsort.cpp', ), dependencies: gtest_dep, include_directories : [src, lib, utils], + cpp_args : [openmpflags], ) libtests += static_library('tests_kvsort', files('test-keyvalue.cpp', ), dependencies: gtest_dep, include_directories : [src, lib, utils], + cpp_args : [openmpflags], ) libtests += static_library('tests_objsort', files('test-objqsort.cpp', ), dependencies: gtest_dep, include_directories : [src, lib, utils], + cpp_args : [openmpflags], ) diff --git a/tests/test-keyvalue.cpp b/tests/test-keyvalue.cpp index c0e683c..c1386af 100644 --- a/tests/test-keyvalue.cpp +++ b/tests/test-keyvalue.cpp @@ -15,9 +15,13 @@ class simdkvsort : public ::testing::Test { simdkvsort() { std::iota(arrsize.begin(), arrsize.end(), 1); - arrsize.push_back(10'000); - arrsize.push_back(100'000); - arrsize.push_back(1'000'000); + std::iota(arrsize_long.begin(), arrsize_long.end(), 1); +#ifdef XSS_USE_OPENMP + // These extended tests are only needed for the OpenMP logic + arrsize_long.push_back(10'000); + arrsize_long.push_back(100'000); + arrsize_long.push_back(1'000'000); +#endif arrtype = {"random", "constant", @@ -32,6 +36,7 @@ class simdkvsort : public ::testing::Test { } std::vector arrtype; std::vector arrsize = std::vector(1024); + std::vector arrsize_long = std::vector(1024); }; TYPED_TEST_SUITE_P(simdkvsort); @@ -168,7 +173,7 @@ TYPED_TEST_P(simdkvsort, test_kvsort_ascending) using T2 = typename std::tuple_element<1, decltype(TypeParam())>::type; for (auto type : this->arrtype) { bool hasnan = is_nan_test(type); - for (auto size : this->arrsize) { + for (auto size : this->arrsize_long) { std::vector key = get_array(type, size); std::vector val = get_array(type, size); std::vector key_bckp = key; @@ -199,7 +204,7 @@ TYPED_TEST_P(simdkvsort, test_kvsort_descending) using T2 = typename std::tuple_element<1, decltype(TypeParam())>::type; for (auto type : this->arrtype) { bool hasnan = is_nan_test(type); - for (auto size : this->arrsize) { + for (auto size : this->arrsize_long) { std::vector key = get_array(type, size); std::vector val = get_array(type, size); std::vector key_bckp = key; diff --git a/tests/test-qsort.cpp b/tests/test-qsort.cpp index c404967..8a48207 100644 --- a/tests/test-qsort.cpp +++ b/tests/test-qsort.cpp @@ -11,9 +11,13 @@ class simdsort : public ::testing::Test { simdsort() { std::iota(arrsize.begin(), arrsize.end(), 1); - arrsize.push_back(10'000); - arrsize.push_back(100'000); - arrsize.push_back(1'000'000); + std::iota(arrsize_long.begin(), arrsize_long.end(), 1); +#ifdef XSS_USE_OPENMP + // These extended tests are only needed for the OpenMP logic + arrsize_long.push_back(10'000); + arrsize_long.push_back(100'000); + arrsize_long.push_back(1'000'000); +#endif arrtype = {"random", "constant", @@ -28,6 +32,7 @@ class simdsort : public ::testing::Test { } std::vector arrtype; std::vector arrsize = std::vector(1024); + std::vector arrsize_long = std::vector(1024); }; TYPED_TEST_SUITE_P(simdsort); @@ -36,7 +41,7 @@ TYPED_TEST_P(simdsort, test_qsort_ascending) { for (auto type : this->arrtype) { bool hasnan = is_nan_test(type); - for (auto size : this->arrsize) { + for (auto size : this->arrsize_long) { std::vector basearr = get_array(type, size); // Ascending order @@ -58,7 +63,7 @@ TYPED_TEST_P(simdsort, test_qsort_descending) { for (auto type : this->arrtype) { bool hasnan = is_nan_test(type); - for (auto size : this->arrsize) { + for (auto size : this->arrsize_long) { std::vector basearr = get_array(type, size); // Descending order From d77d0e35582ecb9816c3ebde25c439cef594e610 Mon Sep 17 00:00:00 2001 From: Matthew Sterrett Date: Thu, 16 Jan 2025 14:57:17 -0800 Subject: [PATCH 3/7] Change threshold for OpenMP/tasks from 10k to 100k --- src/xss-common-qsort.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/xss-common-qsort.h b/src/xss-common-qsort.h index 96e1b5c..801ec72 100644 --- a/src/xss-common-qsort.h +++ b/src/xss-common-qsort.h @@ -663,14 +663,14 @@ X86_SIMD_SORT_INLINE void xss_qsort(T *arr, arrsize_t arrsize, bool hasnan) #ifdef XSS_COMPILE_OPENMP - bool use_parallel = arrsize > 10000; + bool use_parallel = arrsize > 100000; if (use_parallel) { // This thread limit was determined experimentally; it may be better for it to be the number of physical cores on the system constexpr int thread_limit = 8; int thread_count = std::min(thread_limit, omp_get_max_threads()); arrsize_t task_threshold - = std::max((arrsize_t)10000, arrsize / 100); + = std::max((arrsize_t)100000, arrsize / 100); // We use omp parallel and then omp single to setup the threads that will run the omp task calls in qsort_ // The omp single prevents multiple threads from running the initial qsort_ simultaneously and causing problems From e30d8b6710deaca61f06e61848d1e16b9b718a64 Mon Sep 17 00:00:00 2001 From: Matthew Sterrett Date: Tue, 21 Jan 2025 10:52:04 -0800 Subject: [PATCH 4/7] Fix that weird formatting issue --- src/avx512-64bit-common.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h index 5d55196..f27a31f 100644 --- a/src/avx512-64bit-common.h +++ b/src/avx512-64bit-common.h @@ -968,8 +968,7 @@ struct zmm_vector { static_assert(sizeof(size_t) == sizeof(uint64_t), "Size of size_t and uint64_t are not the same"); template <> -struct zmm_vector : public zmm_vector { -}; +struct zmm_vector : public zmm_vector {}; #endif template <> From 1c5c19bdf0c202a49625a7a3af681bc9300e5498 Mon Sep 17 00:00:00 2001 From: Matthew Sterrett Date: Wed, 26 Mar 2025 10:35:09 -0700 Subject: [PATCH 5/7] Fix missing openmp flags --- lib/meson.build | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/meson.build b/lib/meson.build index 5cbc105..48046b3 100644 --- a/lib/meson.build +++ b/lib/meson.build @@ -34,7 +34,7 @@ if cpp.has_argument('-march=icelake-client') 'x86simdsort-icl.cpp', ), include_directories : [src], - cpp_args : ['-march=icelake-client'], + cpp_args : ['-march=icelake-client', openmpflags], gnu_symbol_visibility : 'inlineshidden', ) endif @@ -45,7 +45,7 @@ if cancompilefp16 'x86simdsort-spr.cpp', ), include_directories : [src], - cpp_args : ['-march=sapphirerapids'], + cpp_args : ['-march=sapphirerapids', openmpflags], gnu_symbol_visibility : 'inlineshidden', ) endif From dba705f75720ed9211e835baba52f19cd44000d6 Mon Sep 17 00:00:00 2001 From: Matthew Sterrett Date: Wed, 26 Mar 2025 11:41:03 -0700 Subject: [PATCH 6/7] Use unrolled partition for _Float16 --- src/avx512fp16-16bit-qsort.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/avx512fp16-16bit-qsort.hpp b/src/avx512fp16-16bit-qsort.hpp index 7de26a0..f93cf68 100644 --- a/src/avx512fp16-16bit-qsort.hpp +++ b/src/avx512fp16-16bit-qsort.hpp @@ -22,7 +22,7 @@ struct zmm_vector<_Float16> { using opmask_t = __mmask32; static const uint8_t numlanes = 32; static constexpr int network_sort_threshold = 128; - static constexpr int partition_unroll_factor = 0; + static constexpr int partition_unroll_factor = 8; static constexpr simd_type vec_type = simd_type::AVX512; using swizzle_ops = avx512_16bit_swizzle_ops; From e01e79fe371c39735bcec7c051bbcb252e6185bb Mon Sep 17 00:00:00 2001 From: Matthew Sterrett Date: Thu, 27 Mar 2025 08:54:21 -0700 Subject: [PATCH 7/7] Change 16-bit swizzle from vector to C arrays --- src/avx512-16bit-common.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/avx512-16bit-common.h b/src/avx512-16bit-common.h index 524ce7a..e1a76d3 100644 --- a/src/avx512-16bit-common.h +++ b/src/avx512-16bit-common.h @@ -14,11 +14,11 @@ struct avx512_16bit_swizzle_ops { __m512i v = vtype::cast_to(reg); if constexpr (scale == 2) { - std::vector arr + constexpr static uint16_t arr[] = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30}; - __m512i mask = _mm512_loadu_si512(arr.data()); + __m512i mask = _mm512_loadu_si512(arr); v = _mm512_permutexvar_epi16(mask, v); } else if constexpr (scale == 4) { @@ -48,27 +48,27 @@ struct avx512_16bit_swizzle_ops { if constexpr (scale == 2) { return swap_n(reg); } else if constexpr (scale == 4) { - std::vector arr + constexpr static uint16_t arr[] = {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28}; - __m512i mask = _mm512_loadu_si512(arr.data()); + __m512i mask = _mm512_loadu_si512(arr); v = _mm512_permutexvar_epi16(mask, v); } else if constexpr (scale == 8) { - std::vector arr + constexpr static int16_t arr[] = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24}; - __m512i mask = _mm512_loadu_si512(arr.data()); + __m512i mask = _mm512_loadu_si512(arr); v = _mm512_permutexvar_epi16(mask, v); } else if constexpr (scale == 16) { - std::vector arr + constexpr static uint16_t arr[] = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16}; - __m512i mask = _mm512_loadu_si512(arr.data()); + __m512i mask = _mm512_loadu_si512(arr); v = _mm512_permutexvar_epi16(mask, v); } else if constexpr (scale == 32) {