Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
test:
meson setup -Dbuild_tests=true -Duse_openmp=false --warnlevel 2 --werror --buildtype release builddir
meson setup -Dbuild_tests=true -Duse_openmp=true --warnlevel 2 --werror --buildtype release builddir
cd builddir && ninja

test_openmp:
meson setup -Dbuild_tests=true -Duse_openmp=true --warnlevel 2 --werror --buildtype release builddir
cd builddir && ninja

bench:
meson setup -Dbuild_benchmarks=true --warnlevel 2 --werror --buildtype release builddir
meson setup -Dbuild_benchmarks=true -Duse_openmp=true --warnlevel 2 --werror --buildtype release builddir
cd builddir && ninja

debug:
Expand Down
2 changes: 1 addition & 1 deletion scripts/bench-compare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ if [ ! -d .bench/google-benchmark ]; then
fi
compare=$(realpath .bench/google-benchmark/tools/compare.py)

meson setup -Dbuild_benchmarks=true -Dbuild_ippbench=true --warnlevel 0 --buildtype release builddir-${branch}
meson setup -Dbuild_benchmarks=true -Duse_openmp=true --warnlevel 0 --buildtype release builddir-${branch}
cd builddir-${branch}
ninja
$compare filters ./benchexe $1 $2 --benchmark_repetitions=$3
9 changes: 3 additions & 6 deletions src/avx512-16bit-qsort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,6 @@

#include "avx512-16bit-common.h"

struct float16 {
uint16_t val;
};

template <>
struct zmm_vector<float16> {
using type_t = uint16_t;
Expand Down Expand Up @@ -555,6 +551,7 @@ avx512_qsort_fp16(uint16_t *arr,
bool descending = false)
{
using vtype = zmm_vector<float16>;
struct threadmanager tm;

// TODO multithreading support here
if (arrsize > 1) {
Expand All @@ -565,11 +562,11 @@ avx512_qsort_fp16(uint16_t *arr,
}
if (descending) {
qsort_<vtype, Comparator<vtype, true>, uint16_t>(
arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), 0);
arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), tm);
}
else {
qsort_<vtype, Comparator<vtype, false>, uint16_t>(
arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), 0);
arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), tm);
}
replace_inf_with_nan(arr, arrsize, nan_count, descending);
}
Expand Down
28 changes: 28 additions & 0 deletions src/xss-common-includes.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
#include <immintrin.h>
#include <limits>
#include <vector>
#include <thread>
#include <mutex>
#include "xss-custom-float.h"

#define X86_SIMD_SORT_INFINITY std::numeric_limits<double>::infinity()
Expand Down Expand Up @@ -87,6 +89,10 @@
#include <omp.h>
#endif

// Distinct wrapper type around a single uint16_t. It is used as a template
// argument (e.g. zmm_vector<float16>, whose type_t is uint16_t) so that the
// 16-bit float code paths can be selected separately from plain uint16_t.
// NOTE(review): presumably `val` holds the raw half-precision bit pattern --
// confirm against the callers.
struct float16 {
uint16_t val;
};

// Variable template that is `false` for every set of type arguments.
// NOTE(review): presumably exists so a static_assert can be made dependent on
// a template parameter -- confirm at the use sites.
template <class... T>
constexpr bool always_false = false;

Expand All @@ -109,4 +115,26 @@ enum class simd_type : int { AVX2, AVX512 };
template <typename vtype, typename T = typename vtype::type_t>
X86_SIMD_SORT_INLINE bool comparison_func(const T &a, const T &b);

struct threadmanager {
int max_thread_count;
std::mutex mymutex;
int sharedCount;
arrsize_t task_threshold;

threadmanager() {
#ifdef XSS_COMPILE_OPENMP
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know, this needs a new macro.

max_thread_count = 8;
#else
max_thread_count = 0;
Copy link
Preview

Copilot AI May 5, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Setting max_thread_count to 0 in non-OpenMP builds disables parallel thread spawning. Consider initializing it with a non-zero value (e.g., based on std::thread::hardware_concurrency()) to enable proper multithreading.

Copilot uses AI. Check for mistakes.

#endif
sharedCount = 0;
task_threshold = 100000;
};
// Adds `ii` to sharedCount while holding mymutex; callers pass +1 when a
// worker thread is spawned and -1 after it has been joined.
//
// The mutex is acquired through std::lock_guard so it is released on every
// exit path, instead of relying on a hand-paired lock()/unlock().
//
// NOTE(review): qsort_ reads sharedCount without taking this mutex when it
// decides whether to spawn a thread, so that read is not synchronized with
// the update performed here.
void incrementCount(int ii) {
    const std::lock_guard<std::mutex> guard(mymutex);
    sharedCount += ii;
}
};

#endif // XSS_COMMON_INCLUDES
86 changes: 32 additions & 54 deletions src/xss-common-qsort.h
Original file line number Diff line number Diff line change
Expand Up @@ -525,7 +525,7 @@ static void qsort_(type_t *arr,
arrsize_t left,
arrsize_t right,
arrsize_t max_iters,
arrsize_t task_threshold)
struct threadmanager &tm)
{
/*
* Resort to std::sort if quicksort isn't making any progress
Expand Down Expand Up @@ -562,40 +562,49 @@ static void qsort_(type_t *arr,
type_t leftmostValue = comparator::leftmost(smallest, biggest);
type_t rightmostValue = comparator::rightmost(smallest, biggest);

#ifdef XSS_COMPILE_OPENMP
std::thread t1, t2;
bool parallel_left = ((pivot_index - left) > tm.task_threshold)
&& (tm.sharedCount < tm.max_thread_count);
if (pivot != leftmostValue) {
bool parallel_left = (pivot_index - left) > task_threshold;
if (parallel_left) {
#pragma omp task
qsort_<vtype, comparator>(
arr, left, pivot_index - 1, max_iters - 1, task_threshold);
tm.incrementCount(1);
t1 = std::thread(qsort_<vtype, comparator, type_t>,
arr,
left,
pivot_index - 1,
max_iters - 1,
std::ref(tm));
}
else {
qsort_<vtype, comparator>(
arr, left, pivot_index - 1, max_iters - 1, task_threshold);
arr, left, pivot_index - 1, max_iters - 1, tm);
}
}
bool parallel_right = ((right - pivot_index) > tm.task_threshold)
&& (tm.sharedCount < tm.max_thread_count);
if (pivot != rightmostValue) {
bool parallel_right = (right - pivot_index) > task_threshold;

if (parallel_right) {
#pragma omp task
qsort_<vtype, comparator>(
arr, pivot_index, right, max_iters - 1, task_threshold);
tm.incrementCount(1);
t2 = std::thread(qsort_<vtype, comparator, type_t>,
arr,
pivot_index,
right,
max_iters - 1,
std::ref(tm));
}
else {
qsort_<vtype, comparator>(
arr, pivot_index, right, max_iters - 1, task_threshold);
arr, pivot_index, right, max_iters - 1, tm);
}
}
#else
UNUSED(task_threshold);

if (pivot != leftmostValue)
qsort_<vtype, comparator>(arr, left, pivot_index - 1, max_iters - 1, 0);
if (pivot != rightmostValue)
qsort_<vtype, comparator>(arr, pivot_index, right, max_iters - 1, 0);
#endif
if (t1.joinable()) {
t1.join();
tm.incrementCount(-1);
}
if (t2.joinable()) {
t2.join();
tm.incrementCount(-1);
}
}

template <typename vtype, typename comparator, typename type_t>
Expand Down Expand Up @@ -661,40 +670,9 @@ X86_SIMD_SORT_INLINE void xss_qsort(T *arr, arrsize_t arrsize, bool hasnan)

UNUSED(hasnan);

#ifdef XSS_COMPILE_OPENMP

bool use_parallel = arrsize > 100000;

if (use_parallel) {
// This thread limit was determined experimentally; it may be better for it to be the number of physical cores on the system
constexpr int thread_limit = 8;
int thread_count = std::min(thread_limit, omp_get_max_threads());
arrsize_t task_threshold
= std::max((arrsize_t)100000, arrsize / 100);

// We use omp parallel and then omp single to setup the threads that will run the omp task calls in qsort_
// The omp single prevents multiple threads from running the initial qsort_ simultaneously and causing problems
// Note that we do not use the if(...) clause built into OpenMP, because it causes a performance regression for small arrays
#pragma omp parallel num_threads(thread_count)
#pragma omp single
qsort_<vtype, comparator, T>(arr,
0,
arrsize - 1,
2 * (arrsize_t)log2(arrsize),
task_threshold);
}
else {
qsort_<vtype, comparator, T>(arr,
0,
arrsize - 1,
2 * (arrsize_t)log2(arrsize),
std::numeric_limits<arrsize_t>::max());
}
#pragma omp taskwait
#else
struct threadmanager tm;
qsort_<vtype, comparator, T>(
arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), 0);
#endif
arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), tm);

replace_inf_with_nan(arr, arrsize, nan_count, descending);
}
Expand Down
Loading