Skip to content

Commit c9763ee

Browse files
author
Raghuveer Devulapalli
authored
Merge branch 'intel:main' into postgres
2 parents 7a49843 + d62f656 commit c9763ee

29 files changed

+1112
-177
lines changed

.github/workflows/build-numpy.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ jobs:
1414
np-multiarray-tgl:
1515

1616
if: github.repository == 'intel/x86-simd-sort'
17-
runs-on: intel-ubuntu-latest
17+
runs-on: intel-ubuntu-24.04
1818

1919
steps:
2020
- name: Checkout x86-simd-sort
@@ -80,7 +80,7 @@ jobs:
8080
np-multiarray-spr:
8181

8282
if: github.repository == 'intel/x86-simd-sort'
83-
runs-on: intel-ubuntu-latest
83+
runs-on: intel-ubuntu-24.04
8484

8585
steps:
8686
- name: Checkout x86-simd-sort

.github/workflows/c-cpp.yml

Lines changed: 40 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ permissions: read-all
1111
jobs:
1212
SKL-gcc9:
1313

14-
runs-on: intel-ubuntu-latest
14+
runs-on: intel-ubuntu-24.04
1515

1616
steps:
1717
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
@@ -41,7 +41,7 @@ jobs:
4141

4242
SKX-gcc10:
4343

44-
runs-on: intel-ubuntu-latest
44+
runs-on: intel-ubuntu-24.04
4545

4646
steps:
4747
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
@@ -71,7 +71,7 @@ jobs:
7171

7272
TGL-gcc11:
7373

74-
runs-on: intel-ubuntu-latest
74+
runs-on: intel-ubuntu-24.04
7575

7676
steps:
7777
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
@@ -100,7 +100,7 @@ jobs:
100100

101101
SPR-gcc13:
102102

103-
runs-on: intel-ubuntu-latest
103+
runs-on: intel-ubuntu-24.04
104104

105105
steps:
106106
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
@@ -135,9 +135,41 @@ jobs:
135135
- name: Run test suite on SPR
136136
run: sde -spr -- ./builddir/testexe
137137

138+
SKX-SKL-openmp:
139+
140+
runs-on: intel-ubuntu-24.04
141+
142+
steps:
143+
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
144+
145+
- name: Install dependencies
146+
run: |
147+
sudo apt update
148+
sudo apt -y install g++-10 libgtest-dev meson curl git
149+
150+
- name: Install Intel SDE
151+
run: |
152+
curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/784319/sde-external-9.24.0-2023-07-13-lin.tar.xz
153+
mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/
154+
sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde
155+
156+
- name: Build
157+
env:
158+
CXX: g++-10
159+
run: |
160+
make clean
161+
meson setup -Dbuild_tests=true -Duse_openmp=true --warnlevel 2 --werror --buildtype release builddir
162+
cd builddir
163+
ninja
164+
165+
- name: Run test suite on SKX and SKL
166+
run: |
167+
sde -skx -- ./builddir/testexe
168+
sde -skl -- ./builddir/testexe
169+
138170
SPR-gcc13-special-cases:
139171

140-
runs-on: intel-ubuntu-latest
172+
runs-on: intel-ubuntu-24.04
141173

142174
steps:
143175
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
@@ -172,7 +204,7 @@ jobs:
172204

173205
manylinux-32bit:
174206

175-
runs-on: intel-ubuntu-latest
207+
runs-on: intel-ubuntu-24.04
176208

177209
steps:
178210
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
@@ -184,7 +216,7 @@ jobs:
184216
185217
SPR-icpx:
186218

187-
runs-on: intel-ubuntu-latest
219+
runs-on: intel-ubuntu-24.04
188220

189221
steps:
190222
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
@@ -194,8 +226,7 @@ jobs:
194226
echo "deb https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
195227
sudo add-apt-repository -y "deb https://apt.repos.intel.com/oneapi all main"
196228
sudo apt update --allow-insecure-repositories
197-
sudo apt --allow-unauthenticated -y install intel-oneapi-compiler-dpcpp-cpp libgtest-dev curl git python3-pip
198-
sudo pip3 install meson ninja
229+
sudo apt --allow-unauthenticated -y install intel-oneapi-compiler-dpcpp-cpp libgtest-dev curl git python3-pip meson
199230
200231
- name: Install Intel SDE
201232
run: |

.github/workflows/linting.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ permissions: read-all
1111
jobs:
1212
clang-format:
1313

14-
runs-on: intel-ubuntu-latest
14+
runs-on: intel-ubuntu-24.04
1515

1616
steps:
1717
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1

.github/workflows/scorecard.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ jobs:
2222

2323
name: Scorecard analysis
2424
if: github.repository == 'intel/x86-simd-sort'
25-
runs-on: ubuntu-latest
25+
runs-on: intel-ubuntu-24.04
2626
permissions:
2727
# Needed to upload the results to code-scanning dashboard.
2828
security-events: write

Makefile

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,30 @@
1-
meson:
2-
meson setup -Dbuild_tests=true -Dbuild_benchmarks=true --warnlevel 2 --werror --buildtype release builddir
1+
test:
2+
meson setup -Dbuild_tests=true -Duse_openmp=false --warnlevel 2 --werror --buildtype release builddir
33
cd builddir && ninja
44

5-
mesondebug:
6-
meson setup -Dbuild_tests=true -Dbuild_benchmarks=true --warnlevel 2 --werror --buildtype debug debug
5+
test_openmp:
6+
meson setup -Dbuild_tests=true -Duse_openmp=true --warnlevel 2 --werror --buildtype release builddir
7+
cd builddir && ninja
8+
9+
bench:
10+
meson setup -Dbuild_benchmarks=true --warnlevel 2 --werror --buildtype release builddir
11+
cd builddir && ninja
12+
13+
debug:
14+
meson setup -Dbuild_tests=true --warnlevel 2 --werror --buildtype debug debug
715
cd debug && ninja
816

17+
sharedlib:
18+
meson setup --warnlevel 2 --werror --buildtype release builddir
19+
cd builddir && ninja
20+
21+
staticlib:
22+
meson setup -Dlib_type=static --warnlevel 2 --werror --buildtype release builddir
23+
cd builddir && ninja
24+
25+
install:
26+
meson setup --warnlevel 2 --werror --buildtype release builddir
27+
cd builddir && meson install
28+
929
clean:
1030
$(RM) -rf $(TESTOBJS) $(BENCHOBJS) $(UTILOBJS) testexe benchexe builddir debug

README.md

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,9 @@ int32_t, double, uint64_t, int64_t]`
4545
4646
## Key-value sort routines on pairs of arrays
4747
```cpp
48-
void x86simdsort::keyvalue_qsort(T1* key, T2* val, size_t size, bool hasnan);
48+
void x86simdsort::keyvalue_qsort(T1* key, T2* val, size_t size, bool hasnan, bool descending);
49+
void x86simdsort::keyvalue_select(T1* key, T2* val, size_t k, size_t size, bool hasnan, bool descending);
50+
void x86simdsort::keyvalue_partial_sort(T1* key, T2* val, size_t k, size_t size, bool hasnan, bool descending);
4951
```
5052
Supported datatypes: `T1`, `T2` $\in$ `[float, uint32_t, int32_t, double,
5153
uint64_t, int64_t]` Note that keyvalue sort is not yet supported for 16-bit
@@ -78,6 +80,16 @@ benchmark](https://github.com/google/benchmark) frameworks respectively. You
7880
can configure meson to build them both by using `-Dbuild_tests=true` and
7981
`-Dbuild_benchmarks=true`.
8082

83+
### Note about building with avx512 by g++ v9 and v10
84+
85+
There is a risk when compiling with AVX-512 using g++ v9 and v10,
86+
as some `MMX Technology` instructions are used by g++ v9/v10
87+
without clearing the FPU state.
88+
Check [issue 154](https://github.com/intel/x86-simd-sort/issues/154)
89+
for more details.
90+
91+
Adding the `g++` option `-mno-mmx`, which disables `MMX Technology` instructions, is a possible workaround.
92+
8193
## Example usage
8294

8395
#### Sort an array of floats
@@ -159,13 +171,11 @@ different metrics:
159171
160172
The performance data (shown in the plot below) can be collected by building the
161173
benchmarks suite and running `./builddir/benchexe --benchmark_filter==*obj*`.
162-
The data plot shown below was collected on a processor with AVX-512 because
163-
`object_qsort` is currently accelerated only on AVX-512 (we plan to add the
164-
AVX2 version soon). For the simplest of cases where we want to sort an array of
165-
struct by one of its members, `object_qsort` can be up-to 5x faster for 32-bit
166-
data type and about 4x for 64-bit data type. It tends to do even better when
167-
the metric to sort by gets more complicated. Sorting by Euclidean distance can
168-
be up-to 10x faster.
174+
The data plot shown below was collected on a processor with AVX-512. For the
175+
simplest of cases where we want to sort an array of struct by one of its
176+
members, `object_qsort` can be up-to 5x faster for 32-bit data type and about
177+
4x for 64-bit data type. It tends to do even better when the metric to sort by
178+
gets more complicated. Sorting by Euclidean distance can be up-to 10x faster.
169179
170180
![alt text](./misc/object_qsort-perf.jpg?raw=true)
171181

benchmarks/bench-keyvalue.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ static void scalarkvsort(benchmark::State &state, Args &&...args)
1313
std::vector<T> key_bkp = key;
1414
// benchmark
1515
for (auto _ : state) {
16-
xss::scalar::keyvalue_qsort(key.data(), val.data(), arrsize, false);
16+
xss::scalar::keyvalue_qsort(
17+
key.data(), val.data(), arrsize, false, false);
1718
state.PauseTiming();
1819
key = key_bkp;
1920
state.ResumeTiming();

lib/meson.build

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,18 @@
11
libtargets = []
22

3+
# Add compile flags for OpenMP if enabled
4+
openmpflags = []
5+
if get_option('use_openmp')
6+
openmpflags = ['-DXSS_USE_OPENMP=true', '-fopenmp']
7+
endif
8+
39
if cpp.has_argument('-march=haswell')
410
libtargets += static_library('libavx',
511
files(
612
'x86simdsort-avx2.cpp',
713
),
814
include_directories : [src],
9-
cpp_args : ['-march=haswell'],
15+
cpp_args : ['-march=haswell', openmpflags],
1016
gnu_symbol_visibility : 'inlineshidden',
1117
)
1218
endif
@@ -17,7 +23,7 @@ if cpp.has_argument('-march=skylake-avx512')
1723
'x86simdsort-skx.cpp',
1824
),
1925
include_directories : [src],
20-
cpp_args : ['-march=skylake-avx512'],
26+
cpp_args : ['-march=skylake-avx512', openmpflags],
2127
gnu_symbol_visibility : 'inlineshidden',
2228
)
2329
endif

lib/x86simdsort-avx2.cpp

Lines changed: 32 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -34,38 +34,48 @@
3434
return x86simdsortStatic::argselect(arr, k, arrsize, hasnan); \
3535
}
3636

37-
#define DEFINE_KEYVALUE_METHODS(type) \
38-
template <> \
39-
void keyvalue_qsort(type *key, uint64_t *val, size_t arrsize, bool hasnan) \
40-
{ \
41-
x86simdsortStatic::keyvalue_qsort(key, val, arrsize, hasnan); \
42-
} \
43-
template <> \
44-
void keyvalue_qsort(type *key, int64_t *val, size_t arrsize, bool hasnan) \
45-
{ \
46-
x86simdsortStatic::keyvalue_qsort(key, val, arrsize, hasnan); \
47-
} \
48-
template <> \
49-
void keyvalue_qsort(type *key, double *val, size_t arrsize, bool hasnan) \
50-
{ \
51-
x86simdsortStatic::keyvalue_qsort(key, val, arrsize, hasnan); \
52-
} \
37+
#define DEFINE_KEYVALUE_METHODS_BASE(type1, type2) \
5338
template <> \
54-
void keyvalue_qsort(type *key, uint32_t *val, size_t arrsize, bool hasnan) \
39+
void keyvalue_qsort(type1 *key, \
40+
type2 *val, \
41+
size_t arrsize, \
42+
bool hasnan, \
43+
bool descending) \
5544
{ \
56-
x86simdsortStatic::keyvalue_qsort(key, val, arrsize, hasnan); \
45+
x86simdsortStatic::keyvalue_qsort( \
46+
key, val, arrsize, hasnan, descending); \
5747
} \
5848
template <> \
59-
void keyvalue_qsort(type *key, int32_t *val, size_t arrsize, bool hasnan) \
49+
void keyvalue_select(type1 *key, \
50+
type2 *val, \
51+
size_t k, \
52+
size_t arrsize, \
53+
bool hasnan, \
54+
bool descending) \
6055
{ \
61-
x86simdsortStatic::keyvalue_qsort(key, val, arrsize, hasnan); \
56+
x86simdsortStatic::keyvalue_select( \
57+
key, val, k, arrsize, hasnan, descending); \
6258
} \
6359
template <> \
64-
void keyvalue_qsort(type *key, float *val, size_t arrsize, bool hasnan) \
60+
void keyvalue_partial_sort(type1 *key, \
61+
type2 *val, \
62+
size_t k, \
63+
size_t arrsize, \
64+
bool hasnan, \
65+
bool descending) \
6566
{ \
66-
x86simdsortStatic::keyvalue_qsort(key, val, arrsize, hasnan); \
67+
x86simdsortStatic::keyvalue_partial_sort( \
68+
key, val, k, arrsize, hasnan, descending); \
6769
}
6870

71+
#define DEFINE_KEYVALUE_METHODS(type) \
72+
DEFINE_KEYVALUE_METHODS_BASE(type, uint64_t) \
73+
DEFINE_KEYVALUE_METHODS_BASE(type, int64_t) \
74+
DEFINE_KEYVALUE_METHODS_BASE(type, double) \
75+
DEFINE_KEYVALUE_METHODS_BASE(type, uint32_t) \
76+
DEFINE_KEYVALUE_METHODS_BASE(type, int32_t) \
77+
DEFINE_KEYVALUE_METHODS_BASE(type, float)
78+
6979
namespace xss {
7080
namespace avx2 {
7181
DEFINE_ALL_METHODS(uint32_t)

0 commit comments

Comments
 (0)