Skip to content

Commit 0d253a7

Browse files
author
Raghuveer Devulapalli
committed
Move key-value specific routines to its own file
1 parent f0d35bd commit 0d253a7

File tree

3 files changed

+208
-207
lines changed

3 files changed

+208
-207
lines changed

src/avx512-64bit-keyvalue-networks.hpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,43 @@
11
#ifndef AVX512_KEYVALUE_NETWORKS
22
#define AVX512_KEYVALUE_NETWORKS
3+
4+
template <typename vtype1,
5+
typename vtype2,
6+
typename reg_t1 = typename vtype1::reg_t,
7+
typename reg_t2 = typename vtype2::reg_t>
8+
X86_SIMD_SORT_INLINE void
9+
COEX(reg_t1 &key1, reg_t1 &key2, reg_t2 &index1, reg_t2 &index2)
10+
{
11+
reg_t1 key_t1 = vtype1::min(key1, key2);
12+
reg_t1 key_t2 = vtype1::max(key1, key2);
13+
14+
reg_t2 index_t1
15+
= vtype2::mask_mov(index2, vtype1::eq(key_t1, key1), index1);
16+
reg_t2 index_t2
17+
= vtype2::mask_mov(index1, vtype1::eq(key_t1, key1), index2);
18+
19+
key1 = key_t1;
20+
key2 = key_t2;
21+
index1 = index_t1;
22+
index2 = index_t2;
23+
}
24+
25+
template <typename vtype1,
26+
typename vtype2,
27+
typename reg_t1 = typename vtype1::reg_t,
28+
typename reg_t2 = typename vtype2::reg_t,
29+
typename opmask_t = typename vtype1::opmask_t>
30+
X86_SIMD_SORT_INLINE reg_t1 cmp_merge(reg_t1 in1,
31+
reg_t1 in2,
32+
reg_t2 &indexes1,
33+
reg_t2 indexes2,
34+
opmask_t mask)
35+
{
36+
reg_t1 tmp_keys = cmp_merge<vtype1>(in1, in2, mask);
37+
indexes1 = vtype2::mask_mov(indexes2, vtype1::eq(tmp_keys, in1), indexes1);
38+
return tmp_keys; // 0 -> min, 1 -> max
39+
}
40+
341
template <typename vtype1,
442
typename vtype2,
543
typename reg_t = typename vtype1::reg_t,

src/avx512-64bit-keyvaluesort.hpp

Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,176 @@
1111
#include "avx512-64bit-common.h"
1212
#include "avx512-64bit-keyvalue-networks.hpp"
1313

14+
/*
15+
* Parition one ZMM register based on the pivot and returns the index of the
16+
* last element that is less than equal to the pivot.
17+
*/
18+
template <typename vtype1,
19+
typename vtype2,
20+
typename type_t1 = typename vtype1::type_t,
21+
typename type_t2 = typename vtype2::type_t,
22+
typename reg_t1 = typename vtype1::reg_t,
23+
typename reg_t2 = typename vtype2::reg_t>
24+
X86_SIMD_SORT_INLINE int32_t partition_vec(type_t1 *keys,
25+
type_t2 *indexes,
26+
arrsize_t left,
27+
arrsize_t right,
28+
const reg_t1 keys_vec,
29+
const reg_t2 indexes_vec,
30+
const reg_t1 pivot_vec,
31+
reg_t1 *smallest_vec,
32+
reg_t1 *biggest_vec)
33+
{
34+
/* which elements are larger than the pivot */
35+
typename vtype1::opmask_t gt_mask = vtype1::ge(keys_vec, pivot_vec);
36+
int32_t amount_gt_pivot = _mm_popcnt_u32((int32_t)gt_mask);
37+
vtype1::mask_compressstoreu(
38+
keys + left, vtype1::knot_opmask(gt_mask), keys_vec);
39+
vtype1::mask_compressstoreu(
40+
keys + right - amount_gt_pivot, gt_mask, keys_vec);
41+
vtype2::mask_compressstoreu(
42+
indexes + left, vtype2::knot_opmask(gt_mask), indexes_vec);
43+
vtype2::mask_compressstoreu(
44+
indexes + right - amount_gt_pivot, gt_mask, indexes_vec);
45+
*smallest_vec = vtype1::min(keys_vec, *smallest_vec);
46+
*biggest_vec = vtype1::max(keys_vec, *biggest_vec);
47+
return amount_gt_pivot;
48+
}
49+
/*
50+
* Parition an array based on the pivot and returns the index of the
51+
* last element that is less than equal to the pivot.
52+
*/
53+
template <typename vtype1,
54+
typename vtype2,
55+
typename type_t1 = typename vtype1::type_t,
56+
typename type_t2 = typename vtype2::type_t,
57+
typename reg_t1 = typename vtype1::reg_t,
58+
typename reg_t2 = typename vtype2::reg_t>
59+
X86_SIMD_SORT_INLINE arrsize_t partition_avx512(type_t1 *keys,
60+
type_t2 *indexes,
61+
arrsize_t left,
62+
arrsize_t right,
63+
type_t1 pivot,
64+
type_t1 *smallest,
65+
type_t1 *biggest)
66+
{
67+
/* make array length divisible by vtype1::numlanes , shortening the array */
68+
for (int32_t i = (right - left) % vtype1::numlanes; i > 0; --i) {
69+
*smallest = std::min(*smallest, keys[left]);
70+
*biggest = std::max(*biggest, keys[left]);
71+
if (keys[left] > pivot) {
72+
right--;
73+
std::swap(keys[left], keys[right]);
74+
std::swap(indexes[left], indexes[right]);
75+
}
76+
else {
77+
++left;
78+
}
79+
}
80+
81+
if (left == right)
82+
return left; /* less than vtype1::numlanes elements in the array */
83+
84+
reg_t1 pivot_vec = vtype1::set1(pivot);
85+
reg_t1 min_vec = vtype1::set1(*smallest);
86+
reg_t1 max_vec = vtype1::set1(*biggest);
87+
88+
if (right - left == vtype1::numlanes) {
89+
reg_t1 keys_vec = vtype1::loadu(keys + left);
90+
int32_t amount_gt_pivot;
91+
92+
reg_t2 indexes_vec = vtype2::loadu(indexes + left);
93+
amount_gt_pivot = partition_vec<vtype1, vtype2>(keys,
94+
indexes,
95+
left,
96+
left + vtype1::numlanes,
97+
keys_vec,
98+
indexes_vec,
99+
pivot_vec,
100+
&min_vec,
101+
&max_vec);
102+
103+
*smallest = vtype1::reducemin(min_vec);
104+
*biggest = vtype1::reducemax(max_vec);
105+
return left + (vtype1::numlanes - amount_gt_pivot);
106+
}
107+
108+
// first and last vtype1::numlanes values are partitioned at the end
109+
reg_t1 keys_vec_left = vtype1::loadu(keys + left);
110+
reg_t1 keys_vec_right = vtype1::loadu(keys + (right - vtype1::numlanes));
111+
reg_t2 indexes_vec_left;
112+
reg_t2 indexes_vec_right;
113+
indexes_vec_left = vtype2::loadu(indexes + left);
114+
indexes_vec_right = vtype2::loadu(indexes + (right - vtype1::numlanes));
115+
116+
// store points of the vectors
117+
arrsize_t r_store = right - vtype1::numlanes;
118+
arrsize_t l_store = left;
119+
// indices for loading the elements
120+
left += vtype1::numlanes;
121+
right -= vtype1::numlanes;
122+
while (right - left != 0) {
123+
reg_t1 keys_vec;
124+
reg_t2 indexes_vec;
125+
/*
126+
* if fewer elements are stored on the right side of the array,
127+
* then next elements are loaded from the right side,
128+
* otherwise from the left side
129+
*/
130+
if ((r_store + vtype1::numlanes) - right < left - l_store) {
131+
right -= vtype1::numlanes;
132+
keys_vec = vtype1::loadu(keys + right);
133+
indexes_vec = vtype2::loadu(indexes + right);
134+
}
135+
else {
136+
keys_vec = vtype1::loadu(keys + left);
137+
indexes_vec = vtype2::loadu(indexes + left);
138+
left += vtype1::numlanes;
139+
}
140+
// partition the current vector and save it on both sides of the array
141+
int32_t amount_gt_pivot;
142+
143+
amount_gt_pivot
144+
= partition_vec<vtype1, vtype2>(keys,
145+
indexes,
146+
l_store,
147+
r_store + vtype1::numlanes,
148+
keys_vec,
149+
indexes_vec,
150+
pivot_vec,
151+
&min_vec,
152+
&max_vec);
153+
r_store -= amount_gt_pivot;
154+
l_store += (vtype1::numlanes - amount_gt_pivot);
155+
}
156+
157+
/* partition and save vec_left and vec_right */
158+
int32_t amount_gt_pivot;
159+
amount_gt_pivot = partition_vec<vtype1, vtype2>(keys,
160+
indexes,
161+
l_store,
162+
r_store + vtype1::numlanes,
163+
keys_vec_left,
164+
indexes_vec_left,
165+
pivot_vec,
166+
&min_vec,
167+
&max_vec);
168+
l_store += (vtype1::numlanes - amount_gt_pivot);
169+
amount_gt_pivot = partition_vec<vtype1, vtype2>(keys,
170+
indexes,
171+
l_store,
172+
l_store + vtype1::numlanes,
173+
keys_vec_right,
174+
indexes_vec_right,
175+
pivot_vec,
176+
&min_vec,
177+
&max_vec);
178+
l_store += (vtype1::numlanes - amount_gt_pivot);
179+
*smallest = vtype1::reducemin(min_vec);
180+
*biggest = vtype1::reducemax(max_vec);
181+
return l_store;
182+
}
183+
14184
template <typename vtype1,
15185
typename vtype2,
16186
typename type1_t = typename vtype1::type_t,

0 commit comments

Comments
 (0)