Skip to content

Commit 85e1008

Browse files
author
Raghuveer Devulapalli
committed
Reorg keyvalue and argsort method routines
1 parent 0d253a7 commit 85e1008

8 files changed

+371
-381
lines changed

src/avx512-64bit-argsort.hpp

Lines changed: 291 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,10 @@
77
#ifndef AVX512_ARGSORT_64BIT
88
#define AVX512_ARGSORT_64BIT
99

10+
#include "avx512-common-qsort.h"
1011
#include "avx512-64bit-common.h"
11-
#include "avx512-64bit-keyvalue-networks.hpp"
12-
#include "avx512-common-argsort.h"
12+
#include "xss-network-keyvaluesort.hpp"
13+
#include <numeric>
1314

1415
template <typename T>
1516
X86_SIMD_SORT_INLINE void std_argselect_withnan(
@@ -64,6 +65,294 @@ std_argsort(T *arr, arrsize_t *arg, arrsize_t left, arrsize_t right)
6465
});
6566
}
6667

68+
69+
/* Workaround for NumPy failed build on macOS x86_64: implicit instantiation of
 * undefined template 'zmm_vector<unsigned long>'*/
// argtype is the SIMD vector type used for loading/storing the index (arg)
// array: ymm_vector with 32-bit lanes when arrsize_t is 32-bit, zmm_vector
// with 64-bit lanes otherwise. On Apple the lane types are spelled with
// fixed-width uint32_t/uint64_t (instead of arrsize_t directly) so that
// zmm_vector<unsigned long> is never instantiated — see the note above.
#ifdef __APPLE__
using argtype = typename std::conditional<sizeof(arrsize_t) == sizeof(int32_t),
                                          ymm_vector<uint32_t>,
                                          zmm_vector<uint64_t>>::type;
#else
using argtype = typename std::conditional<sizeof(arrsize_t) == sizeof(int32_t),
                                          ymm_vector<arrsize_t>,
                                          zmm_vector<arrsize_t>>::type;
#endif
// Raw SIMD register type that holds one vector's worth of indices.
using argreg_t = typename argtype::reg_t;
81+
82+
/*
83+
* Parition one ZMM register based on the pivot and returns the index of the
84+
* last element that is less than equal to the pivot.
85+
*/
86+
template <typename vtype, typename type_t, typename reg_t>
87+
X86_SIMD_SORT_INLINE int32_t partition_vec(type_t *arg,
88+
arrsize_t left,
89+
arrsize_t right,
90+
const argreg_t arg_vec,
91+
const reg_t curr_vec,
92+
const reg_t pivot_vec,
93+
reg_t *smallest_vec,
94+
reg_t *biggest_vec)
95+
{
96+
/* which elements are larger than the pivot */
97+
typename vtype::opmask_t gt_mask = vtype::ge(curr_vec, pivot_vec);
98+
int32_t amount_gt_pivot = _mm_popcnt_u32((int32_t)gt_mask);
99+
argtype::mask_compressstoreu(
100+
arg + left, vtype::knot_opmask(gt_mask), arg_vec);
101+
argtype::mask_compressstoreu(
102+
arg + right - amount_gt_pivot, gt_mask, arg_vec);
103+
*smallest_vec = vtype::min(curr_vec, *smallest_vec);
104+
*biggest_vec = vtype::max(curr_vec, *biggest_vec);
105+
return amount_gt_pivot;
106+
}
107+
/*
108+
* Parition an array based on the pivot and returns the index of the
109+
* last element that is less than equal to the pivot.
110+
*/
111+
template <typename vtype, typename type_t>
112+
X86_SIMD_SORT_INLINE arrsize_t partition_avx512(type_t *arr,
113+
arrsize_t *arg,
114+
arrsize_t left,
115+
arrsize_t right,
116+
type_t pivot,
117+
type_t *smallest,
118+
type_t *biggest)
119+
{
120+
/* make array length divisible by vtype::numlanes , shortening the array */
121+
for (int32_t i = (right - left) % vtype::numlanes; i > 0; --i) {
122+
*smallest = std::min(*smallest, arr[arg[left]], comparison_func<vtype>);
123+
*biggest = std::max(*biggest, arr[arg[left]], comparison_func<vtype>);
124+
if (!comparison_func<vtype>(arr[arg[left]], pivot)) {
125+
std::swap(arg[left], arg[--right]);
126+
}
127+
else {
128+
++left;
129+
}
130+
}
131+
132+
if (left == right)
133+
return left; /* less than vtype::numlanes elements in the array */
134+
135+
using reg_t = typename vtype::reg_t;
136+
reg_t pivot_vec = vtype::set1(pivot);
137+
reg_t min_vec = vtype::set1(*smallest);
138+
reg_t max_vec = vtype::set1(*biggest);
139+
140+
if (right - left == vtype::numlanes) {
141+
argreg_t argvec = argtype::loadu(arg + left);
142+
reg_t vec = vtype::i64gather(arr, arg + left);
143+
int32_t amount_gt_pivot = partition_vec<vtype>(arg,
144+
left,
145+
left + vtype::numlanes,
146+
argvec,
147+
vec,
148+
pivot_vec,
149+
&min_vec,
150+
&max_vec);
151+
*smallest = vtype::reducemin(min_vec);
152+
*biggest = vtype::reducemax(max_vec);
153+
return left + (vtype::numlanes - amount_gt_pivot);
154+
}
155+
156+
// first and last vtype::numlanes values are partitioned at the end
157+
argreg_t argvec_left = argtype::loadu(arg + left);
158+
reg_t vec_left = vtype::i64gather(arr, arg + left);
159+
argreg_t argvec_right = argtype::loadu(arg + (right - vtype::numlanes));
160+
reg_t vec_right = vtype::i64gather(arr, arg + (right - vtype::numlanes));
161+
// store points of the vectors
162+
arrsize_t r_store = right - vtype::numlanes;
163+
arrsize_t l_store = left;
164+
// indices for loading the elements
165+
left += vtype::numlanes;
166+
right -= vtype::numlanes;
167+
while (right - left != 0) {
168+
argreg_t arg_vec;
169+
reg_t curr_vec;
170+
/*
171+
* if fewer elements are stored on the right side of the array,
172+
* then next elements are loaded from the right side,
173+
* otherwise from the left side
174+
*/
175+
if ((r_store + vtype::numlanes) - right < left - l_store) {
176+
right -= vtype::numlanes;
177+
arg_vec = argtype::loadu(arg + right);
178+
curr_vec = vtype::i64gather(arr, arg + right);
179+
}
180+
else {
181+
arg_vec = argtype::loadu(arg + left);
182+
curr_vec = vtype::i64gather(arr, arg + left);
183+
left += vtype::numlanes;
184+
}
185+
// partition the current vector and save it on both sides of the array
186+
int32_t amount_gt_pivot
187+
= partition_vec<vtype>(arg,
188+
l_store,
189+
r_store + vtype::numlanes,
190+
arg_vec,
191+
curr_vec,
192+
pivot_vec,
193+
&min_vec,
194+
&max_vec);
195+
;
196+
r_store -= amount_gt_pivot;
197+
l_store += (vtype::numlanes - amount_gt_pivot);
198+
}
199+
200+
/* partition and save vec_left and vec_right */
201+
int32_t amount_gt_pivot = partition_vec<vtype>(arg,
202+
l_store,
203+
r_store + vtype::numlanes,
204+
argvec_left,
205+
vec_left,
206+
pivot_vec,
207+
&min_vec,
208+
&max_vec);
209+
l_store += (vtype::numlanes - amount_gt_pivot);
210+
amount_gt_pivot = partition_vec<vtype>(arg,
211+
l_store,
212+
l_store + vtype::numlanes,
213+
argvec_right,
214+
vec_right,
215+
pivot_vec,
216+
&min_vec,
217+
&max_vec);
218+
l_store += (vtype::numlanes - amount_gt_pivot);
219+
*smallest = vtype::reducemin(min_vec);
220+
*biggest = vtype::reducemax(max_vec);
221+
return l_store;
222+
}
223+
224+
// Unrolled variant of partition_avx512: processes num_unroll vectors per
// iteration of the main loop. Falls back to the plain version for small
// ranges. Same contract: partitions arg around the pivot key, updates
// *smallest / *biggest, returns the left-store boundary index.
template <typename vtype,
          int num_unroll,
          typename type_t = typename vtype::type_t>
X86_SIMD_SORT_INLINE arrsize_t partition_avx512_unrolled(type_t *arr,
                                                         arrsize_t *arg,
                                                         arrsize_t left,
                                                         arrsize_t right,
                                                         type_t pivot,
                                                         type_t *smallest,
                                                         type_t *biggest)
{
    // Unrolling only pays off on larger ranges; delegate small ones.
    if (right - left <= 8 * num_unroll * vtype::numlanes) {
        return partition_avx512<vtype>(
                arr, arg, left, right, pivot, smallest, biggest);
    }
    /* make array length divisible by num_unroll*numlanes, shortening the
     * array via a scalar peel loop (same scheme as partition_avx512) */
    for (int32_t i = ((right - left) % (num_unroll * vtype::numlanes)); i > 0;
         --i) {
        *smallest = std::min(*smallest, arr[arg[left]], comparison_func<vtype>);
        *biggest = std::max(*biggest, arr[arg[left]], comparison_func<vtype>);
        if (!comparison_func<vtype>(arr[arg[left]], pivot)) {
            // Key is not < pivot: move its index to the right end.
            std::swap(arg[left], arg[--right]);
        }
        else {
            ++left;
        }
    }

    if (left == right)
        return left; /* fewer than one full block of elements remained */

    using reg_t = typename vtype::reg_t;
    reg_t pivot_vec = vtype::set1(pivot);
    reg_t min_vec = vtype::set1(*smallest);
    reg_t max_vec = vtype::set1(*biggest);

    // first and last num_unroll blocks are held in registers and
    // partitioned at the end, freeing room for the compressed stores
    reg_t vec_left[num_unroll], vec_right[num_unroll];
    argreg_t argvec_left[num_unroll], argvec_right[num_unroll];
    X86_SIMD_SORT_UNROLL_LOOP(8)
    for (int ii = 0; ii < num_unroll; ++ii) {
        argvec_left[ii] = argtype::loadu(arg + left + vtype::numlanes * ii);
        vec_left[ii] = vtype::i64gather(arr, arg + left + vtype::numlanes * ii);
        argvec_right[ii] = argtype::loadu(
                arg + (right - vtype::numlanes * (num_unroll - ii)));
        vec_right[ii] = vtype::i64gather(
                arr, arg + (right - vtype::numlanes * (num_unroll - ii)));
    }
    // store points of the vectors (one vector is stored at a time)
    arrsize_t r_store = right - vtype::numlanes;
    arrsize_t l_store = left;
    // indices for loading the elements
    left += num_unroll * vtype::numlanes;
    right -= num_unroll * vtype::numlanes;
    while (right - left != 0) {
        argreg_t arg_vec[num_unroll];
        reg_t curr_vec[num_unroll];
        /*
         * if fewer elements are stored on the right side of the array,
         * then next elements are loaded from the right side,
         * otherwise from the left side
         */
        if ((r_store + vtype::numlanes) - right < left - l_store) {
            right -= num_unroll * vtype::numlanes;
            X86_SIMD_SORT_UNROLL_LOOP(8)
            for (int ii = 0; ii < num_unroll; ++ii) {
                arg_vec[ii]
                        = argtype::loadu(arg + right + ii * vtype::numlanes);
                curr_vec[ii] = vtype::i64gather(
                        arr, arg + right + ii * vtype::numlanes);
            }
        }
        else {
            X86_SIMD_SORT_UNROLL_LOOP(8)
            for (int ii = 0; ii < num_unroll; ++ii) {
                arg_vec[ii] = argtype::loadu(arg + left + ii * vtype::numlanes);
                curr_vec[ii] = vtype::i64gather(
                        arr, arg + left + ii * vtype::numlanes);
            }
            left += num_unroll * vtype::numlanes;
        }
        // partition the current vector and save it on both sides of the array
        X86_SIMD_SORT_UNROLL_LOOP(8)
        for (int ii = 0; ii < num_unroll; ++ii) {
            int32_t amount_gt_pivot
                    = partition_vec<vtype>(arg,
                                           l_store,
                                           r_store + vtype::numlanes,
                                           arg_vec[ii],
                                           curr_vec[ii],
                                           pivot_vec,
                                           &min_vec,
                                           &max_vec);
            l_store += (vtype::numlanes - amount_gt_pivot);
            r_store -= amount_gt_pivot;
        }
    }

    /* partition and save vec_left and vec_right */
    X86_SIMD_SORT_UNROLL_LOOP(8)
    for (int ii = 0; ii < num_unroll; ++ii) {
        int32_t amount_gt_pivot
                = partition_vec<vtype>(arg,
                                       l_store,
                                       r_store + vtype::numlanes,
                                       argvec_left[ii],
                                       vec_left[ii],
                                       pivot_vec,
                                       &min_vec,
                                       &max_vec);
        l_store += (vtype::numlanes - amount_gt_pivot);
        r_store -= amount_gt_pivot;
    }
    X86_SIMD_SORT_UNROLL_LOOP(8)
    for (int ii = 0; ii < num_unroll; ++ii) {
        int32_t amount_gt_pivot
                = partition_vec<vtype>(arg,
                                       l_store,
                                       r_store + vtype::numlanes,
                                       argvec_right[ii],
                                       vec_right[ii],
                                       pivot_vec,
                                       &min_vec,
                                       &max_vec);
        l_store += (vtype::numlanes - amount_gt_pivot);
        r_store -= amount_gt_pivot;
    }
    // Collapse vector min/max accumulators back to scalars for the caller.
    *smallest = vtype::reducemin(min_vec);
    *biggest = vtype::reducemax(max_vec);
    return l_store;
}
355+
67356
template <typename vtype, typename type_t>
68357
X86_SIMD_SORT_INLINE void
69358
argsort_8_64bit(type_t *arr, arrsize_t *arg, int32_t N)

src/avx512-64bit-common.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66

77
#ifndef AVX512_64BIT_COMMON
88
#define AVX512_64BIT_COMMON
9-
#include "avx512-common-qsort.h"
9+
10+
#include "xss-common-includes.h"
1011

1112
/*
1213
* Constants used in sorting 8 elements in a ZMM registers. Based on Bitonic

src/avx512-64bit-keyvaluesort.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,9 @@
88
#ifndef AVX512_QSORT_64BIT_KV
99
#define AVX512_QSORT_64BIT_KV
1010

11+
#include "avx512-common-qsort.h"
1112
#include "avx512-64bit-common.h"
12-
#include "avx512-64bit-keyvalue-networks.hpp"
13+
#include "xss-network-keyvaluesort.hpp"
1314

1415
/*
1516
* Partition one ZMM register based on the pivot and returns the index of the

src/avx512-64bit-qsort.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#ifndef AVX512_QSORT_64BIT
88
#define AVX512_QSORT_64BIT
99

10+
#include "avx512-common-qsort.h"
1011
#include "avx512-64bit-common.h"
1112

1213
#endif // AVX512_QSORT_64BIT

0 commit comments

Comments
 (0)