Skip to content

Commit fceccc3

Browse files
author
Raghuveer Devulapalli
committed
Add comments
1 parent cbd6179 commit fceccc3

File tree

2 files changed

+24
-3
lines changed

2 files changed

+24
-3
lines changed

src/xss-network-qsort.hpp

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,25 @@ X86_SIMD_SORT_FINLINE void bitonic_sort_n_vec(reg_t *regs)
3030
}
3131
}
3232

33+
/*
34+
* Swizzle ops explained:
35+
* swap_n<scale>: swap neighbouring blocks of size <scale/2> within block of size <scale>
36+
* reg i = [7,6,5,4,3,2,1,0]
37+
* swap_n<2>: = [[6,7],[4,5],[2,3],[0,1]]
38+
* swap_n<4>: = [[5,4,7,6],[1,0,3,2]]
39+
* swap_n<8>: = [[3,2,1,0,7,6,5,4]]
40+
* reverse_n<scale>: reverse elements within block of size <scale>
41+
* reg i = [7,6,5,4,3,2,1,0]
42+
* reverse_n<2>: = [[6,7],[4,5],[2,3],[0,1]]
43+
* reverse_n<4>: = [[4,5,6,7],[0,1,2,3]]
44+
* reverse_n<8>: = [[0,1,2,3,4,5,6,7]]
45+
* merge_n<scale>: merge blocks of <scale/2> elements from two regs
46+
* reg a,b = [a,a,a,a,a,a,a,a], [b,b,b,b,b,b,b,b]
47+
* merge_n<2> = [a,b,a,b,a,b,a,b]
48+
* merge_n<4> = [a,a,b,b,a,a,b,b]
49+
* merge_n<8> = [a,a,a,a,b,b,b,b]
50+
*/
51+
3352
template <typename vtype, int numVecs, int scale, bool first = true>
3453
X86_SIMD_SORT_FINLINE void internal_merge_n_vec(typename vtype::reg_t *reg)
3554
{
@@ -155,10 +174,12 @@ X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int N)
155174
vtype::zmm_max(), ioMasks[j], arr + i * vtype::numlanes);
156175
}
157176

158-
// Run the initial sorting network
177+
/* Run the initial sorting network to sort the columns of the [numVecs x
178+
* vtype::numlanes] matrix
179+
*/
159180
bitonic_sort_n_vec<vtype, numVecs>(vecs);
160181

161-
// Merge vectors together
182+
// Merge the vectors using bitonic merging networks
162183
merge_n_vec<vtype, numVecs>(vecs);
163184

164185
// Unmasked part of the store

src/xss-optimal-networks.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -317,4 +317,4 @@ X86_SIMD_SORT_FINLINE void optimal_sort_32(reg_t *vecs)
317317
COEX<vtype>(vecs[23], vecs[24]);
318318
COEX<vtype>(vecs[25], vecs[26]);
319319
COEX<vtype>(vecs[27], vecs[28]);
320-
}
320+
}

0 commit comments

Comments
 (0)