Skip to content
Closed
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/c-cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ jobs:
CXX: g++-10
run: |
make clean
meson setup -Dbuild_tests=true -Duse_openmp=true --warnlevel 2 --werror --buildtype release builddir
meson setup -Dbuild_tests=true -Duse_openmp=true --Duse_stdthreads=true --warnlevel 2 --werror --buildtype release builddir
cd builddir
ninja

Expand Down
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ test_openmp:
meson setup -Dbuild_tests=true -Duse_openmp=true --warnlevel 2 --werror --buildtype release builddir
cd builddir && ninja

test_stdthreads:
meson setup -Dbuild_tests=true -Duse_stdthreads=true --warnlevel 2 --werror --buildtype release builddir
cd builddir && ninja

test_asan:
meson setup -Dbuild_tests=true -Duse_openmp=true -Db_sanitize=address,undefined -Dfatal_sanitizers=true -Db_lundef=false -Dasan_ci_dont_validate=true --warnlevel 0 --buildtype debugoptimized builddir
cd builddir && ninja
Expand Down
18 changes: 8 additions & 10 deletions lib/meson.build
Original file line number Diff line number Diff line change
@@ -1,18 +1,13 @@
libtargets = []

# Add compile flags for OpenMP if enabled
openmpflags = []
if get_option('use_openmp')
openmpflags = ['-DXSS_USE_OPENMP=true', '-fopenmp']
endif

if cpp.has_argument('-march=haswell')
libtargets += static_library('libavx',
files(
'x86simdsort-avx2.cpp',
),
include_directories : [src],
cpp_args : ['-march=haswell', openmpflags],
cpp_args : ['-march=haswell', stdthreadsflag],
dependencies: [omp_dep],
gnu_symbol_visibility : 'inlineshidden',
)
endif
Expand All @@ -23,7 +18,8 @@ if cpp.has_argument('-march=skylake-avx512')
'x86simdsort-skx.cpp',
),
include_directories : [src],
cpp_args : ['-march=skylake-avx512', openmpflags],
cpp_args : ['-march=skylake-avx512', stdthreadsflag],
dependencies: [omp_dep],
gnu_symbol_visibility : 'inlineshidden',
)
endif
Expand All @@ -34,7 +30,8 @@ if cpp.has_argument('-march=icelake-client')
'x86simdsort-icl.cpp',
),
include_directories : [src],
cpp_args : ['-march=icelake-client', openmpflags],
cpp_args : ['-march=icelake-client', stdthreadsflag],
dependencies: [omp_dep],
gnu_symbol_visibility : 'inlineshidden',
)
endif
Expand All @@ -45,7 +42,8 @@ if cancompilefp16
'x86simdsort-spr.cpp',
),
include_directories : [src],
cpp_args : ['-march=sapphirerapids', openmpflags],
cpp_args : ['-march=sapphirerapids', stdthreadsflag],
dependencies: [omp_dep],
gnu_symbol_visibility : 'inlineshidden',
)
endif
Expand Down
18 changes: 16 additions & 2 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,20 @@ if get_option('build_vqsortbench')
benchvq = true
endif

# build with openmp
omp = []
omp_dep = []
if get_option('use_openmp')
omp = dependency('openmp', required : true)
omp_dep = declare_dependency(dependencies: omp, compile_args: ['-DXSS_USE_OPENMP'])
endif

# build with std::threads
stdthreadsflag = []
if get_option('use_stdthreads')
stdthreadsflag += ['-DXSS_BUILD_WITH_STD_THREADS']
endif

fp16code = '''#include<immintrin.h>
int main() {
__m512h temp = _mm512_set1_ph(1.0f);
Expand All @@ -43,8 +57,8 @@ if get_option('lib_type') == 'shared'
libsimdsort = shared_library('x86simdsortcpp',
'lib/x86simdsort.cpp',
include_directories : [src, utils, lib],
link_args : [openmpflags],
link_with : [libtargets],
dependencies: [omp],
gnu_symbol_visibility : 'inlineshidden',
install : true,
soversion : 1,
Expand All @@ -53,7 +67,7 @@ else
libsimdsort = static_library('x86simdsortcpp',
'lib/x86simdsort.cpp',
include_directories : [src, utils, lib],
link_args : [openmpflags],
dependencies: [omp],
link_with : [libtargets],
gnu_symbol_visibility : 'inlineshidden',
install : true,
Expand Down
2 changes: 2 additions & 0 deletions meson_options.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ option('build_vqsortbench', type : 'boolean', value : true,
description : 'Add google vqsort to benchmarks (default: "true").')
option('use_openmp', type : 'boolean', value : false,
description : 'Use OpenMP to accelerate key-value sort (default: "false").')
option('use_stdthreads', type : 'boolean', value : false,
description : 'Use std::threads to accelerate qsort (default: "false").')
option('lib_type', type : 'string', value : 'shared',
description : 'Library type: shared or static (default: "shared").')
option('fatal_sanitizers', type : 'boolean', value : 'false',
Expand Down
2 changes: 1 addition & 1 deletion scripts/bench-compare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ if [ ! -d .bench/google-benchmark ]; then
fi
compare=$(realpath .bench/google-benchmark/tools/compare.py)

meson setup -Dbuild_benchmarks=true -Dbuild_ippbench=true --warnlevel 0 --buildtype release builddir-${branch}
meson setup -Dbuild_benchmarks=true -Duse_stdthreads=true -Duse_openmp=true --warnlevel 0 --buildtype release builddir-${branch}
cd builddir-${branch}
ninja
$compare filters ./benchexe $1 $2 --benchmark_repetitions=$3
2 changes: 1 addition & 1 deletion scripts/branch-compare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ build_branch() {
fi
fi
cd $dir_name
meson setup -Dbuild_benchmarks=true -Duse_openmp=true --warnlevel 0 --buildtype release builddir
meson setup -Dbuild_benchmarks=true -Duse_openmp=true -Duse_stdthreads=true --warnlevel 0 --buildtype release builddir
cd builddir
ninja
cd ../../
Expand Down
41 changes: 23 additions & 18 deletions src/avx512-16bit-qsort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -548,25 +548,35 @@ avx512_qsort_fp16_helper(uint16_t *arr, arrsize_t arrsize)
using T = uint16_t;
using vtype = zmm_vector<float16>;

#ifdef XSS_COMPILE_OPENMP
#ifdef XSS_BUILD_WITH_STD_THREADS
bool use_parallel = arrsize > 100000;
#else
bool use_parallel = false;
#endif

if (use_parallel) {
// This thread limit was determined experimentally; it may be better for it to be the number of physical cores on the system
#ifdef XSS_BUILD_WITH_STD_THREADS

// This thread limit was determined experimentally
constexpr int thread_limit = 8;
int thread_count = std::min(thread_limit, omp_get_max_threads());
int thread_count = std::min(thread_limit,
(int)std::thread::hardware_concurrency());
Comment on lines 561 to +563

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is 8 threads in a system with a lot of cores still better? Maybe you need to partition the data better to avoid locality problems.

arrsize_t task_threshold = std::max((arrsize_t)100000, arrsize / 100);

// We use omp parallel and then omp single to setup the threads that will run the omp task calls in qsort_
// The omp single prevents multiple threads from running the initial qsort_ simultaneously and causing problems
// Note that we do not use the if(...) clause built into OpenMP, because it causes a performance regression for small arrays
#pragma omp parallel num_threads(thread_count)
#pragma omp single
qsort_<vtype, comparator, T>(arr,
0,
arrsize - 1,
2 * (arrsize_t)log2(arrsize),
task_threshold);
// Create a thread pool
ThreadPool pool(thread_count);

// Initial sort task
qsort_threads<vtype, comparator, T>(arr,
0,
arrsize - 1,
2 * (arrsize_t)log2(arrsize),
task_threshold,
pool);

// Wait for all tasks to complete
pool.wait_all();
#endif
}
else {
qsort_<vtype, comparator, T>(arr,
Expand All @@ -575,11 +585,6 @@ avx512_qsort_fp16_helper(uint16_t *arr, arrsize_t arrsize)
2 * (arrsize_t)log2(arrsize),
std::numeric_limits<arrsize_t>::max());
}
#pragma omp taskwait
#else
qsort_<vtype, comparator, T>(
arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), 0);
#endif
}

[[maybe_unused]] X86_SIMD_SORT_INLINE void
Expand Down
Loading
Loading