Skip to content

Commit ac6c10c

Browse files
author
Raghuveer Devulapalli
authored
Merge pull request #33 from r-devulap/256-network
Improve qsort
2 parents 6709593 + 74986fa commit ac6c10c

10 files changed

+649
-34
lines changed

src/avx512-16bit-common.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -290,10 +290,11 @@ qsort_16bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters)
290290
}
291291

292292
template <typename vtype, typename type_t>
293-
static void
294-
qselect_16bit_(type_t *arr, int64_t pos,
295-
int64_t left, int64_t right,
296-
int64_t max_iters)
293+
static void qselect_16bit_(type_t *arr,
294+
int64_t pos,
295+
int64_t left,
296+
int64_t right,
297+
int64_t max_iters)
297298
{
298299
/*
299300
* Resort to std::sort if quicksort isn't making any progress

src/avx512-32bit-qsort.hpp

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -648,7 +648,7 @@ qsort_32bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters)
648648
type_t pivot = get_pivot_32bit<vtype>(arr, left, right);
649649
type_t smallest = vtype::type_max();
650650
type_t biggest = vtype::type_min();
651-
int64_t pivot_index = partition_avx512<vtype>(
651+
int64_t pivot_index = partition_avx512_unrolled<vtype, 2>(
652652
arr, left, right + 1, pivot, &smallest, &biggest);
653653
if (pivot != smallest)
654654
qsort_32bit_<vtype>(arr, left, pivot_index - 1, max_iters - 1);
@@ -657,10 +657,11 @@ qsort_32bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters)
657657
}
658658

659659
template <typename vtype, typename type_t>
660-
static void
661-
qselect_32bit_(type_t *arr, int64_t pos,
662-
int64_t left, int64_t right,
663-
int64_t max_iters)
660+
static void qselect_32bit_(type_t *arr,
661+
int64_t pos,
662+
int64_t left,
663+
int64_t right,
664+
int64_t max_iters)
664665
{
665666
/*
666667
* Resort to std::sort if quicksort isn't making any progress
@@ -680,7 +681,7 @@ qselect_32bit_(type_t *arr, int64_t pos,
680681
type_t pivot = get_pivot_32bit<vtype>(arr, left, right);
681682
type_t smallest = vtype::type_max();
682683
type_t biggest = vtype::type_min();
683-
int64_t pivot_index = partition_avx512<vtype>(
684+
int64_t pivot_index = partition_avx512_unrolled<vtype, 2>(
684685
arr, left, right + 1, pivot, &smallest, &biggest);
685686
if ((pivot != smallest) && (pos < pivot_index))
686687
qselect_32bit_<vtype>(arr, pos, left, pivot_index - 1, max_iters - 1);

src/avx512-64bit-qsort.hpp

Lines changed: 358 additions & 8 deletions
Large diffs are not rendered by default.

src/avx512-common-qsort.h

Lines changed: 121 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,8 @@ void avx512_qselect(T *arr, int64_t k, int64_t arrsize);
9595
void avx512_qselect_fp16(uint16_t *arr, int64_t k, int64_t arrsize);
9696

9797
template <typename T>
98-
inline void avx512_partial_qsort(T *arr, int64_t k, int64_t arrsize) {
98+
inline void avx512_partial_qsort(T *arr, int64_t k, int64_t arrsize)
99+
{
99100
avx512_qselect<T>(arr, k - 1, arrsize);
100101
avx512_qsort<T>(arr, k - 1);
101102
}
@@ -259,4 +260,123 @@ static inline int64_t partition_avx512(type_t *arr,
259260
*biggest = vtype::reducemax(max_vec);
260261
return l_store;
261262
}
263+
264+
template <typename vtype,
265+
int num_unroll,
266+
typename type_t = typename vtype::type_t>
267+
static inline int64_t partition_avx512_unrolled(type_t *arr,
268+
int64_t left,
269+
int64_t right,
270+
type_t pivot,
271+
type_t *smallest,
272+
type_t *biggest)
273+
{
274+
if (right - left <= 2 * num_unroll * vtype::numlanes) {
275+
return partition_avx512<vtype>(
276+
arr, left, right, pivot, smallest, biggest);
277+
}
278+
/* make array length divisible by num_unroll * vtype::numlanes, shortening the array */
279+
for (int32_t i = ((right - left) % (num_unroll * vtype::numlanes)); i > 0;
280+
--i) {
281+
*smallest = std::min(*smallest, arr[left], comparison_func<vtype>);
282+
*biggest = std::max(*biggest, arr[left], comparison_func<vtype>);
283+
if (!comparison_func<vtype>(arr[left], pivot)) {
284+
std::swap(arr[left], arr[--right]);
285+
}
286+
else {
287+
++left;
288+
}
289+
}
290+
291+
if (left == right)
292+
return left; /* no elements left for the vectorized pass */
293+
294+
using zmm_t = typename vtype::zmm_t;
295+
zmm_t pivot_vec = vtype::set1(pivot);
296+
zmm_t min_vec = vtype::set1(*smallest);
297+
zmm_t max_vec = vtype::set1(*biggest);
298+
299+
// We will now have at least 2 * num_unroll registers worth of data to process:
300+
// the leftmost and rightmost num_unroll * vtype::numlanes values are partitioned at the end
301+
zmm_t vec_left[num_unroll], vec_right[num_unroll];
302+
#pragma GCC unroll 8
303+
for (int ii = 0; ii < num_unroll; ++ii) {
304+
vec_left[ii] = vtype::loadu(arr + left + vtype::numlanes * ii);
305+
vec_right[ii] = vtype::loadu(
306+
arr + (right - vtype::numlanes * (num_unroll - ii)));
307+
}
308+
// store points of the vectors
309+
int64_t r_store = right - vtype::numlanes;
310+
int64_t l_store = left;
311+
// indices for loading the elements
312+
left += num_unroll * vtype::numlanes;
313+
right -= num_unroll * vtype::numlanes;
314+
while (right - left != 0) {
315+
zmm_t curr_vec[num_unroll];
316+
/*
317+
* if fewer elements are stored on the right side of the array,
318+
* then next elements are loaded from the right side,
319+
* otherwise from the left side
320+
*/
321+
if ((r_store + vtype::numlanes) - right < left - l_store) {
322+
right -= num_unroll * vtype::numlanes;
323+
#pragma GCC unroll 8
324+
for (int ii = 0; ii < num_unroll; ++ii) {
325+
curr_vec[ii] = vtype::loadu(arr + right + ii * vtype::numlanes);
326+
}
327+
}
328+
else {
329+
#pragma GCC unroll 8
330+
for (int ii = 0; ii < num_unroll; ++ii) {
331+
curr_vec[ii] = vtype::loadu(arr + left + ii * vtype::numlanes);
332+
}
333+
left += num_unroll * vtype::numlanes;
334+
}
335+
// partition the current vector and save it on both sides of the array
336+
#pragma GCC unroll 8
337+
for (int ii = 0; ii < num_unroll; ++ii) {
338+
int32_t amount_ge_pivot
339+
= partition_vec<vtype>(arr,
340+
l_store,
341+
r_store + vtype::numlanes,
342+
curr_vec[ii],
343+
pivot_vec,
344+
&min_vec,
345+
&max_vec);
346+
l_store += (vtype::numlanes - amount_ge_pivot);
347+
r_store -= amount_ge_pivot;
348+
}
349+
}
350+
351+
/* partition and save vec_left[num_unroll] and vec_right[num_unroll] */
352+
#pragma GCC unroll 8
353+
for (int ii = 0; ii < num_unroll; ++ii) {
354+
int32_t amount_ge_pivot
355+
= partition_vec<vtype>(arr,
356+
l_store,
357+
r_store + vtype::numlanes,
358+
vec_left[ii],
359+
pivot_vec,
360+
&min_vec,
361+
&max_vec);
362+
l_store += (vtype::numlanes - amount_ge_pivot);
363+
r_store -= amount_ge_pivot;
364+
}
365+
#pragma GCC unroll 8
366+
for (int ii = 0; ii < num_unroll; ++ii) {
367+
int32_t amount_ge_pivot
368+
= partition_vec<vtype>(arr,
369+
l_store,
370+
r_store + vtype::numlanes,
371+
vec_right[ii],
372+
pivot_vec,
373+
&min_vec,
374+
&max_vec);
375+
l_store += (vtype::numlanes - amount_ge_pivot);
376+
r_store -= amount_ge_pivot;
377+
}
378+
*smallest = vtype::reducemin(min_vec);
379+
*biggest = vtype::reducemax(max_vec);
380+
return l_store;
381+
}
262382
#endif // AVX512_QSORT_COMMON

tests/test_keyvalue.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
* *******************************************/
55

66
#include "avx512-64bit-keyvaluesort.hpp"
7-
#include "rand_array.h"
87
#include "cpuinfo.h"
8+
#include "rand_array.h"
99
#include <gtest/gtest.h>
1010
#include <vector>
1111

tests/test_partial_qsort.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ TYPED_TEST_P(avx512_partial_sort, test_ranges)
3030
int k = get_uniform_rand_array<int64_t>(1, arrsize, 1).front();
3131

3232
/* Sort the range and verify all the required elements match the presorted set */
33-
avx512_partial_qsort<TypeParam>(psortedarr.data(), k, psortedarr.size());
33+
avx512_partial_qsort<TypeParam>(
34+
psortedarr.data(), k, psortedarr.size());
3435
for (size_t jj = 0; jj < k; jj++) {
3536
ASSERT_EQ(sortedarr[jj], psortedarr[jj]);
3637
}

tests/test_qselect.hpp

Lines changed: 49 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ class avx512_select : public ::testing::Test {
55
};
66
TYPED_TEST_SUITE_P(avx512_select);
77

8-
TYPED_TEST_P(avx512_select, test_arrsizes)
8+
TYPED_TEST_P(avx512_select, test_random)
99
{
1010
if (cpu_has_avx512bw()) {
1111
if ((sizeof(TypeParam) == 2) && (!cpu_has_avx512_vbmi2())) {
@@ -26,15 +26,16 @@ TYPED_TEST_P(avx512_select, test_arrsizes)
2626
std::sort(sortedarr.begin(), sortedarr.end());
2727
for (size_t k = 0; k < arr.size(); ++k) {
2828
psortedarr = arr;
29-
avx512_qselect<TypeParam>(psortedarr.data(), k, psortedarr.size());
29+
avx512_qselect<TypeParam>(
30+
psortedarr.data(), k, psortedarr.size());
3031
/* index k is correct */
3132
ASSERT_EQ(sortedarr[k], psortedarr[k]);
3233
/* Check left partition */
3334
for (size_t jj = 0; jj < k; jj++) {
3435
ASSERT_LE(psortedarr[jj], psortedarr[k]);
3536
}
3637
/* Check right partition */
37-
for (size_t jj = k+1; jj < arr.size(); jj++) {
38+
for (size_t jj = k + 1; jj < arr.size(); jj++) {
3839
ASSERT_GE(psortedarr[jj], psortedarr[k]);
3940
}
4041
psortedarr.clear();
@@ -48,4 +49,48 @@ TYPED_TEST_P(avx512_select, test_arrsizes)
4849
}
4950
}
5051

51-
REGISTER_TYPED_TEST_SUITE_P(avx512_select, test_arrsizes);
52+
TYPED_TEST_P(avx512_select, test_small_range)
53+
{
54+
if (cpu_has_avx512bw()) {
55+
if ((sizeof(TypeParam) == 2) && (!cpu_has_avx512_vbmi2())) {
56+
GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2";
57+
}
58+
std::vector<int64_t> arrsizes;
59+
for (int64_t ii = 0; ii < 1024; ++ii) {
60+
arrsizes.push_back(ii);
61+
}
62+
std::vector<TypeParam> arr;
63+
std::vector<TypeParam> sortedarr;
64+
std::vector<TypeParam> psortedarr;
65+
for (size_t ii = 0; ii < arrsizes.size(); ++ii) {
66+
/* Random array */
67+
arr = get_uniform_rand_array<TypeParam>(arrsizes[ii], 20, 1);
68+
sortedarr = arr;
69+
/* Sort with std::sort for comparison */
70+
std::sort(sortedarr.begin(), sortedarr.end());
71+
for (size_t k = 0; k < arr.size(); ++k) {
72+
psortedarr = arr;
73+
avx512_qselect<TypeParam>(
74+
psortedarr.data(), k, psortedarr.size());
75+
/* index k is correct */
76+
ASSERT_EQ(sortedarr[k], psortedarr[k]);
77+
/* Check left partition */
78+
for (size_t jj = 0; jj < k; jj++) {
79+
ASSERT_LE(psortedarr[jj], psortedarr[k]);
80+
}
81+
/* Check right partition */
82+
for (size_t jj = k + 1; jj < arr.size(); jj++) {
83+
ASSERT_GE(psortedarr[jj], psortedarr[k]);
84+
}
85+
psortedarr.clear();
86+
}
87+
arr.clear();
88+
sortedarr.clear();
89+
}
90+
}
91+
else {
92+
GTEST_SKIP() << "Skipping this test, it requires avx512bw";
93+
}
94+
}
95+
96+
REGISTER_TYPED_TEST_SUITE_P(avx512_select, test_random, test_small_range);

tests/test_qsort.hpp

Lines changed: 96 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ class avx512_sort : public ::testing::Test {
1010
};
1111
TYPED_TEST_SUITE_P(avx512_sort);
1212

13-
TYPED_TEST_P(avx512_sort, test_arrsizes)
13+
TYPED_TEST_P(avx512_sort, test_random)
1414
{
1515
if (cpu_has_avx512bw()) {
1616
if ((sizeof(TypeParam) == 2) && (!cpu_has_avx512_vbmi2())) {
@@ -29,7 +29,7 @@ TYPED_TEST_P(avx512_sort, test_arrsizes)
2929
/* Sort with std::sort for comparison */
3030
std::sort(sortedarr.begin(), sortedarr.end());
3131
avx512_qsort<TypeParam>(arr.data(), arr.size());
32-
ASSERT_EQ(sortedarr, arr);
32+
ASSERT_EQ(sortedarr, arr) << "Array size = " << arrsizes[ii];
3333
arr.clear();
3434
sortedarr.clear();
3535
}
@@ -39,4 +39,97 @@ TYPED_TEST_P(avx512_sort, test_arrsizes)
3939
}
4040
}
4141

42-
REGISTER_TYPED_TEST_SUITE_P(avx512_sort, test_arrsizes);
42+
TYPED_TEST_P(avx512_sort, test_reverse)
43+
{
44+
if (cpu_has_avx512bw()) {
45+
if ((sizeof(TypeParam) == 2) && (!cpu_has_avx512_vbmi2())) {
46+
GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2";
47+
}
48+
std::vector<int64_t> arrsizes;
49+
for (int64_t ii = 0; ii < 1024; ++ii) {
50+
arrsizes.push_back((TypeParam)(ii + 1));
51+
}
52+
std::vector<TypeParam> arr;
53+
std::vector<TypeParam> sortedarr;
54+
for (size_t ii = 0; ii < arrsizes.size(); ++ii) {
55+
/* reverse array */
56+
for (int jj = 0; jj < arrsizes[ii]; ++jj) {
57+
arr.push_back((TypeParam)(arrsizes[ii] - jj));
58+
}
59+
sortedarr = arr;
60+
/* Sort with std::sort for comparison */
61+
std::sort(sortedarr.begin(), sortedarr.end());
62+
avx512_qsort<TypeParam>(arr.data(), arr.size());
63+
ASSERT_EQ(sortedarr, arr) << "Array size = " << arrsizes[ii];
64+
arr.clear();
65+
sortedarr.clear();
66+
}
67+
}
68+
else {
69+
GTEST_SKIP() << "Skipping this test, it requires avx512bw";
70+
}
71+
}
72+
73+
TYPED_TEST_P(avx512_sort, test_constant)
74+
{
75+
if (cpu_has_avx512bw()) {
76+
if ((sizeof(TypeParam) == 2) && (!cpu_has_avx512_vbmi2())) {
77+
GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2";
78+
}
79+
std::vector<int64_t> arrsizes;
80+
for (int64_t ii = 0; ii < 1024; ++ii) {
81+
arrsizes.push_back((TypeParam)(ii + 1));
82+
}
83+
std::vector<TypeParam> arr;
84+
std::vector<TypeParam> sortedarr;
85+
for (size_t ii = 0; ii < arrsizes.size(); ++ii) {
86+
/* constant array */
87+
for (int jj = 0; jj < arrsizes[ii]; ++jj) {
88+
arr.push_back(ii);
89+
}
90+
sortedarr = arr;
91+
/* Sort with std::sort for comparison */
92+
std::sort(sortedarr.begin(), sortedarr.end());
93+
avx512_qsort<TypeParam>(arr.data(), arr.size());
94+
ASSERT_EQ(sortedarr, arr) << "Array size = " << arrsizes[ii];
95+
arr.clear();
96+
sortedarr.clear();
97+
}
98+
}
99+
else {
100+
GTEST_SKIP() << "Skipping this test, it requires avx512bw";
101+
}
102+
}
103+
104+
TYPED_TEST_P(avx512_sort, test_small_range)
105+
{
106+
if (cpu_has_avx512bw()) {
107+
if ((sizeof(TypeParam) == 2) && (!cpu_has_avx512_vbmi2())) {
108+
GTEST_SKIP() << "Skipping this test, it requires avx512_vbmi2";
109+
}
110+
std::vector<int64_t> arrsizes;
111+
for (int64_t ii = 0; ii < 1024; ++ii) {
112+
arrsizes.push_back((TypeParam)(ii + 1));
113+
}
114+
std::vector<TypeParam> arr;
115+
std::vector<TypeParam> sortedarr;
116+
for (size_t ii = 0; ii < arrsizes.size(); ++ii) {
117+
arr = get_uniform_rand_array<TypeParam>(arrsizes[ii], 20, 1);
118+
sortedarr = arr;
119+
/* Sort with std::sort for comparison */
120+
std::sort(sortedarr.begin(), sortedarr.end());
121+
avx512_qsort<TypeParam>(arr.data(), arr.size());
122+
ASSERT_EQ(sortedarr, arr) << "Array size = " << arrsizes[ii];
123+
arr.clear();
124+
sortedarr.clear();
125+
}
126+
}
127+
else {
128+
GTEST_SKIP() << "Skipping this test, it requires avx512bw";
129+
}
130+
}
131+
REGISTER_TYPED_TEST_SUITE_P(avx512_sort,
132+
test_random,
133+
test_reverse,
134+
test_constant,
135+
test_small_range);

0 commit comments

Comments
 (0)