@@ -30,6 +30,25 @@ X86_SIMD_SORT_FINLINE void bitonic_sort_n_vec(reg_t *regs)
30
30
}
31
31
}
32
32
33
+ /*
34
+ * Swizzle ops explained:
35
+ * swap_n<scale>: swap neighbouring blocks of size <scale/2> within each block of size <scale>
36
+ * reg i = [7,6,5,4,3,2,1,0]
37
+ * swap_n<2>: = [[6,7],[4,5],[2,3],[0,1]]
38
+ * swap_n<4>: = [[5,4,7,6],[1,0,3,2]]
39
+ * swap_n<8>: = [[3,2,1,0,7,6,5,4]]
40
+ * reverse_n<scale>: reverse the elements within each block of size <scale>
41
+ * reg i = [7,6,5,4,3,2,1,0]
42
+ * reverse_n<2>: = [[6,7],[4,5],[2,3],[0,1]]
42
+ * reverse_n<4>: = [[4,5,6,7],[0,1,2,3]]
43
+ * reverse_n<8>: = [[0,1,2,3,4,5,6,7]]
45
+ * merge_n<scale>: merge blocks of <scale/2> elements from two regs
46
+ * reg a,b = [a,a,a,a,a,a,a,a], [b,b,b,b,b,b,b,b]
47
+ * merge_n<2> = [a,b,a,b,a,b,a,b]
48
+ * merge_n<4> = [a,a,b,b,a,a,b,b]
49
+ * merge_n<8> = [a,a,a,a,b,b,b,b]
50
+ */
51
+
33
52
template <typename vtype, int numVecs, int scale, bool first = true >
34
53
X86_SIMD_SORT_FINLINE void internal_merge_n_vec (typename vtype::reg_t *reg)
35
54
{
@@ -155,10 +174,12 @@ X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int N)
155
174
vtype::zmm_max (), ioMasks[j], arr + i * vtype::numlanes);
156
175
}
157
176
158
- // Run the initial sorting network
177
+ /* Run the initial sorting network to sort the columns of the [numVecs x
178
+ * numlanes] matrix
179
+ */
159
180
bitonic_sort_n_vec<vtype, numVecs>(vecs);
160
181
161
- // Merge vectors together
182
+ // Merge the vectors using bitonic merging networks
162
183
merge_n_vec<vtype, numVecs>(vecs);
163
184
164
185
// Unmasked part of the store
0 commit comments