|
11 | 11 | #include "avx512-64bit-common.h"
|
12 | 12 | #include "avx512-64bit-keyvalue-networks.hpp"
|
13 | 13 |
|
| 14 | +/* |
| 15 | + * Parition one ZMM register based on the pivot and returns the index of the |
| 16 | + * last element that is less than equal to the pivot. |
| 17 | + */ |
| 18 | +template <typename vtype1, |
| 19 | + typename vtype2, |
| 20 | + typename type_t1 = typename vtype1::type_t, |
| 21 | + typename type_t2 = typename vtype2::type_t, |
| 22 | + typename reg_t1 = typename vtype1::reg_t, |
| 23 | + typename reg_t2 = typename vtype2::reg_t> |
| 24 | +X86_SIMD_SORT_INLINE int32_t partition_vec(type_t1 *keys, |
| 25 | + type_t2 *indexes, |
| 26 | + arrsize_t left, |
| 27 | + arrsize_t right, |
| 28 | + const reg_t1 keys_vec, |
| 29 | + const reg_t2 indexes_vec, |
| 30 | + const reg_t1 pivot_vec, |
| 31 | + reg_t1 *smallest_vec, |
| 32 | + reg_t1 *biggest_vec) |
| 33 | +{ |
| 34 | + /* which elements are larger than the pivot */ |
| 35 | + typename vtype1::opmask_t gt_mask = vtype1::ge(keys_vec, pivot_vec); |
| 36 | + int32_t amount_gt_pivot = _mm_popcnt_u32((int32_t)gt_mask); |
| 37 | + vtype1::mask_compressstoreu( |
| 38 | + keys + left, vtype1::knot_opmask(gt_mask), keys_vec); |
| 39 | + vtype1::mask_compressstoreu( |
| 40 | + keys + right - amount_gt_pivot, gt_mask, keys_vec); |
| 41 | + vtype2::mask_compressstoreu( |
| 42 | + indexes + left, vtype2::knot_opmask(gt_mask), indexes_vec); |
| 43 | + vtype2::mask_compressstoreu( |
| 44 | + indexes + right - amount_gt_pivot, gt_mask, indexes_vec); |
| 45 | + *smallest_vec = vtype1::min(keys_vec, *smallest_vec); |
| 46 | + *biggest_vec = vtype1::max(keys_vec, *biggest_vec); |
| 47 | + return amount_gt_pivot; |
| 48 | +} |
| 49 | +/* |
| 50 | + * Parition an array based on the pivot and returns the index of the |
| 51 | + * last element that is less than equal to the pivot. |
| 52 | + */ |
| 53 | +template <typename vtype1, |
| 54 | + typename vtype2, |
| 55 | + typename type_t1 = typename vtype1::type_t, |
| 56 | + typename type_t2 = typename vtype2::type_t, |
| 57 | + typename reg_t1 = typename vtype1::reg_t, |
| 58 | + typename reg_t2 = typename vtype2::reg_t> |
| 59 | +X86_SIMD_SORT_INLINE arrsize_t partition_avx512(type_t1 *keys, |
| 60 | + type_t2 *indexes, |
| 61 | + arrsize_t left, |
| 62 | + arrsize_t right, |
| 63 | + type_t1 pivot, |
| 64 | + type_t1 *smallest, |
| 65 | + type_t1 *biggest) |
| 66 | +{ |
| 67 | + /* make array length divisible by vtype1::numlanes , shortening the array */ |
| 68 | + for (int32_t i = (right - left) % vtype1::numlanes; i > 0; --i) { |
| 69 | + *smallest = std::min(*smallest, keys[left]); |
| 70 | + *biggest = std::max(*biggest, keys[left]); |
| 71 | + if (keys[left] > pivot) { |
| 72 | + right--; |
| 73 | + std::swap(keys[left], keys[right]); |
| 74 | + std::swap(indexes[left], indexes[right]); |
| 75 | + } |
| 76 | + else { |
| 77 | + ++left; |
| 78 | + } |
| 79 | + } |
| 80 | + |
| 81 | + if (left == right) |
| 82 | + return left; /* less than vtype1::numlanes elements in the array */ |
| 83 | + |
| 84 | + reg_t1 pivot_vec = vtype1::set1(pivot); |
| 85 | + reg_t1 min_vec = vtype1::set1(*smallest); |
| 86 | + reg_t1 max_vec = vtype1::set1(*biggest); |
| 87 | + |
| 88 | + if (right - left == vtype1::numlanes) { |
| 89 | + reg_t1 keys_vec = vtype1::loadu(keys + left); |
| 90 | + int32_t amount_gt_pivot; |
| 91 | + |
| 92 | + reg_t2 indexes_vec = vtype2::loadu(indexes + left); |
| 93 | + amount_gt_pivot = partition_vec<vtype1, vtype2>(keys, |
| 94 | + indexes, |
| 95 | + left, |
| 96 | + left + vtype1::numlanes, |
| 97 | + keys_vec, |
| 98 | + indexes_vec, |
| 99 | + pivot_vec, |
| 100 | + &min_vec, |
| 101 | + &max_vec); |
| 102 | + |
| 103 | + *smallest = vtype1::reducemin(min_vec); |
| 104 | + *biggest = vtype1::reducemax(max_vec); |
| 105 | + return left + (vtype1::numlanes - amount_gt_pivot); |
| 106 | + } |
| 107 | + |
| 108 | + // first and last vtype1::numlanes values are partitioned at the end |
| 109 | + reg_t1 keys_vec_left = vtype1::loadu(keys + left); |
| 110 | + reg_t1 keys_vec_right = vtype1::loadu(keys + (right - vtype1::numlanes)); |
| 111 | + reg_t2 indexes_vec_left; |
| 112 | + reg_t2 indexes_vec_right; |
| 113 | + indexes_vec_left = vtype2::loadu(indexes + left); |
| 114 | + indexes_vec_right = vtype2::loadu(indexes + (right - vtype1::numlanes)); |
| 115 | + |
| 116 | + // store points of the vectors |
| 117 | + arrsize_t r_store = right - vtype1::numlanes; |
| 118 | + arrsize_t l_store = left; |
| 119 | + // indices for loading the elements |
| 120 | + left += vtype1::numlanes; |
| 121 | + right -= vtype1::numlanes; |
| 122 | + while (right - left != 0) { |
| 123 | + reg_t1 keys_vec; |
| 124 | + reg_t2 indexes_vec; |
| 125 | + /* |
| 126 | + * if fewer elements are stored on the right side of the array, |
| 127 | + * then next elements are loaded from the right side, |
| 128 | + * otherwise from the left side |
| 129 | + */ |
| 130 | + if ((r_store + vtype1::numlanes) - right < left - l_store) { |
| 131 | + right -= vtype1::numlanes; |
| 132 | + keys_vec = vtype1::loadu(keys + right); |
| 133 | + indexes_vec = vtype2::loadu(indexes + right); |
| 134 | + } |
| 135 | + else { |
| 136 | + keys_vec = vtype1::loadu(keys + left); |
| 137 | + indexes_vec = vtype2::loadu(indexes + left); |
| 138 | + left += vtype1::numlanes; |
| 139 | + } |
| 140 | + // partition the current vector and save it on both sides of the array |
| 141 | + int32_t amount_gt_pivot; |
| 142 | + |
| 143 | + amount_gt_pivot |
| 144 | + = partition_vec<vtype1, vtype2>(keys, |
| 145 | + indexes, |
| 146 | + l_store, |
| 147 | + r_store + vtype1::numlanes, |
| 148 | + keys_vec, |
| 149 | + indexes_vec, |
| 150 | + pivot_vec, |
| 151 | + &min_vec, |
| 152 | + &max_vec); |
| 153 | + r_store -= amount_gt_pivot; |
| 154 | + l_store += (vtype1::numlanes - amount_gt_pivot); |
| 155 | + } |
| 156 | + |
| 157 | + /* partition and save vec_left and vec_right */ |
| 158 | + int32_t amount_gt_pivot; |
| 159 | + amount_gt_pivot = partition_vec<vtype1, vtype2>(keys, |
| 160 | + indexes, |
| 161 | + l_store, |
| 162 | + r_store + vtype1::numlanes, |
| 163 | + keys_vec_left, |
| 164 | + indexes_vec_left, |
| 165 | + pivot_vec, |
| 166 | + &min_vec, |
| 167 | + &max_vec); |
| 168 | + l_store += (vtype1::numlanes - amount_gt_pivot); |
| 169 | + amount_gt_pivot = partition_vec<vtype1, vtype2>(keys, |
| 170 | + indexes, |
| 171 | + l_store, |
| 172 | + l_store + vtype1::numlanes, |
| 173 | + keys_vec_right, |
| 174 | + indexes_vec_right, |
| 175 | + pivot_vec, |
| 176 | + &min_vec, |
| 177 | + &max_vec); |
| 178 | + l_store += (vtype1::numlanes - amount_gt_pivot); |
| 179 | + *smallest = vtype1::reducemin(min_vec); |
| 180 | + *biggest = vtype1::reducemax(max_vec); |
| 181 | + return l_store; |
| 182 | +} |
| 183 | + |
14 | 184 | template <typename vtype1,
|
15 | 185 | typename vtype2,
|
16 | 186 | typename type1_t = typename vtype1::type_t,
|
|
0 commit comments