@@ -34,8 +34,10 @@ X86_SIMD_SORT_INLINE reg_t sort_reg_4lanes(reg_t reg)
3434 const typename vtype::opmask_t oxA = convert_int_to_mask<vtype>(0xA );
3535 const typename vtype::opmask_t oxC = convert_int_to_mask<vtype>(0xC );
3636
37- reg = cmp_merge<vtype>(reg, swizzle::template swap_n<vtype, 2 >(reg), oxA);
38- reg = cmp_merge<vtype>(reg, vtype::reverse (reg), oxC);
37+ reg = cmp_merge<vtype>(
38+ reg, swizzle::template reverse_n<vtype, 2 >(reg), oxA);
39+ reg = cmp_merge<vtype>(
40+ reg, swizzle::template reverse_n<vtype, 4 >(reg), oxC);
3941 reg = cmp_merge<vtype>(reg, swizzle::template swap_n<vtype, 2 >(reg), oxA);
4042 return reg;
4143}
@@ -57,12 +59,11 @@ X86_SIMD_SORT_INLINE reg_t sort_reg_8lanes(reg_t reg)
5759 reg, swizzle::template reverse_n<vtype, 2 >(reg), oxAA);
5860 reg = cmp_merge<vtype>(
5961 reg, swizzle::template reverse_n<vtype, 4 >(reg), oxCC);
62+ reg = cmp_merge<vtype>(reg, swizzle::template swap_n<vtype, 2 >(reg), oxAA);
6063 reg = cmp_merge<vtype>(
61- reg, swizzle::template reverse_n<vtype, 2 >(reg), oxAA);
62- reg = cmp_merge<vtype>(reg, vtype::reverse (reg), oxF0);
64+ reg, swizzle::template reverse_n<vtype, 8 >(reg), oxF0);
6365 reg = cmp_merge<vtype>(reg, swizzle::template swap_n<vtype, 4 >(reg), oxCC);
64- reg = cmp_merge<vtype>(
65- reg, swizzle::template reverse_n<vtype, 2 >(reg), oxAA);
66+ reg = cmp_merge<vtype>(reg, swizzle::template swap_n<vtype, 2 >(reg), oxAA);
6667 return reg;
6768}
6869
@@ -85,20 +86,21 @@ X86_SIMD_SORT_INLINE reg_t sort_reg_16lanes(reg_t reg)
8586 reg = cmp_merge<vtype>(
8687 reg, swizzle::template reverse_n<vtype, 4 >(reg), oxCCCC);
8788 reg = cmp_merge<vtype>(
88- reg, swizzle::template reverse_n <vtype, 2 >(reg), oxAAAA);
89+ reg, swizzle::template swap_n <vtype, 2 >(reg), oxAAAA);
8990 reg = cmp_merge<vtype>(
9091 reg, swizzle::template reverse_n<vtype, 8 >(reg), oxF0F0);
9192 reg = cmp_merge<vtype>(
9293 reg, swizzle::template swap_n<vtype, 4 >(reg), oxCCCC);
9394 reg = cmp_merge<vtype>(
94- reg, swizzle::template reverse_n<vtype, 2 >(reg), oxAAAA);
95- reg = cmp_merge<vtype>(reg, vtype::reverse (reg), oxFF00);
95+ reg, swizzle::template swap_n<vtype, 2 >(reg), oxAAAA);
96+ reg = cmp_merge<vtype>(
97+ reg, swizzle::template reverse_n<vtype, 16 >(reg), oxFF00);
9698 reg = cmp_merge<vtype>(
9799 reg, swizzle::template swap_n<vtype, 8 >(reg), oxF0F0);
98100 reg = cmp_merge<vtype>(
99101 reg, swizzle::template swap_n<vtype, 4 >(reg), oxCCCC);
100102 reg = cmp_merge<vtype>(
101- reg, swizzle::template reverse_n <vtype, 2 >(reg), oxAAAA);
103+ reg, swizzle::template swap_n <vtype, 2 >(reg), oxAAAA);
102104 return reg;
103105}
104106
@@ -129,14 +131,14 @@ X86_SIMD_SORT_INLINE reg_t sort_reg_32lanes(reg_t reg)
129131 reg = cmp_merge<vtype>(
130132 reg, swizzle::template reverse_n<vtype, 4 >(reg), oxCCCCCCCC);
131133 reg = cmp_merge<vtype>(
132- reg, swizzle::template reverse_n <vtype, 2 >(reg), oxAAAAAAAA);
134+ reg, swizzle::template swap_n <vtype, 2 >(reg), oxAAAAAAAA);
133135 // Level 3
134136 reg = cmp_merge<vtype>(
135137 reg, swizzle::template reverse_n<vtype, 8 >(reg), oxF0F0F0F0);
136138 reg = cmp_merge<vtype>(
137139 reg, swizzle::template swap_n<vtype, 4 >(reg), oxCCCCCCCC);
138140 reg = cmp_merge<vtype>(
139- reg, swizzle::template reverse_n <vtype, 2 >(reg), oxAAAAAAAA);
141+ reg, swizzle::template swap_n <vtype, 2 >(reg), oxAAAAAAAA);
140142 // Level 4
141143 reg = cmp_merge<vtype>(
142144 reg, swizzle::template reverse_n<vtype, 16 >(reg), oxFF00FF00);
@@ -145,17 +147,18 @@ X86_SIMD_SORT_INLINE reg_t sort_reg_32lanes(reg_t reg)
145147 reg = cmp_merge<vtype>(
146148 reg, swizzle::template swap_n<vtype, 4 >(reg), oxCCCCCCCC);
147149 reg = cmp_merge<vtype>(
148- reg, swizzle::template reverse_n <vtype, 2 >(reg), oxAAAAAAAA);
150+ reg, swizzle::template swap_n <vtype, 2 >(reg), oxAAAAAAAA);
149151 // Level 5
150- reg = cmp_merge<vtype>(reg, vtype::reverse (reg), oxFFFF0000);
152+ reg = cmp_merge<vtype>(
153+ reg, swizzle::template reverse_n<vtype, 32 >(reg), oxFFFF0000);
151154 reg = cmp_merge<vtype>(
152155 reg, swizzle::template swap_n<vtype, 16 >(reg), oxFF00FF00);
153156 reg = cmp_merge<vtype>(
154157 reg, swizzle::template swap_n<vtype, 8 >(reg), oxF0F0F0F0);
155158 reg = cmp_merge<vtype>(
156159 reg, swizzle::template swap_n<vtype, 4 >(reg), oxCCCCCCCC);
157160 reg = cmp_merge<vtype>(
158- reg, swizzle::template reverse_n <vtype, 2 >(reg), oxAAAAAAAA);
161+ reg, swizzle::template swap_n <vtype, 2 >(reg), oxAAAAAAAA);
159162 return reg;
160163}
161164
@@ -175,15 +178,16 @@ X86_SIMD_SORT_INLINE reg_t sort_reg_4lanes(reg_t key_reg, index_type &index_reg)
175178
176179 key_reg = cmp_merge<vtype1, vtype2>(
177180 key_reg,
178- key_swizzle::template swap_n <vtype1, 2 >(key_reg),
181+ key_swizzle::template reverse_n <vtype1, 2 >(key_reg),
179182 index_reg,
180- index_swizzle::template swap_n <vtype2, 2 >(index_reg),
183+ index_swizzle::template reverse_n <vtype2, 2 >(index_reg),
181184 oxA);
182- key_reg = cmp_merge<vtype1, vtype2>(key_reg,
183- vtype1::reverse (key_reg),
184- index_reg,
185- vtype2::reverse (index_reg),
186- oxC);
185+ key_reg = cmp_merge<vtype1, vtype2>(
186+ key_reg,
187+ key_swizzle::template reverse_n<vtype1, 4 >(key_reg),
188+ index_reg,
189+ index_swizzle::template reverse_n<vtype2, 4 >(index_reg),
190+ oxC);
187191 key_reg = cmp_merge<vtype1, vtype2>(
188192 key_reg,
189193 key_swizzle::template swap_n<vtype1, 2 >(key_reg),
@@ -208,9 +212,9 @@ X86_SIMD_SORT_INLINE reg_t sort_reg_8lanes(reg_t key_reg, index_type &index_reg)
208212
209213 key_reg = cmp_merge<vtype1, vtype2>(
210214 key_reg,
211- key_swizzle::template swap_n <vtype1, 2 >(key_reg),
215+ key_swizzle::template reverse_n <vtype1, 2 >(key_reg),
212216 index_reg,
213- index_swizzle::template swap_n <vtype2, 2 >(index_reg),
217+ index_swizzle::template reverse_n <vtype2, 2 >(index_reg),
214218 oxAA);
215219 key_reg = cmp_merge<vtype1, vtype2>(
216220 key_reg,
@@ -224,11 +228,12 @@ X86_SIMD_SORT_INLINE reg_t sort_reg_8lanes(reg_t key_reg, index_type &index_reg)
224228 index_reg,
225229 index_swizzle::template swap_n<vtype2, 2 >(index_reg),
226230 oxAA);
227- key_reg = cmp_merge<vtype1, vtype2>(key_reg,
228- vtype1::reverse (key_reg),
229- index_reg,
230- vtype2::reverse (index_reg),
231- oxF0);
231+ key_reg = cmp_merge<vtype1, vtype2>(
232+ key_reg,
233+ key_swizzle::template reverse_n<vtype1, 8 >(key_reg),
234+ index_reg,
235+ index_swizzle::template reverse_n<vtype2, 8 >(index_reg),
236+ oxF0);
232237 key_reg = cmp_merge<vtype1, vtype2>(
233238 key_reg,
234239 key_swizzle::template swap_n<vtype1, 4 >(key_reg),
@@ -273,9 +278,9 @@ X86_SIMD_SORT_INLINE reg_t sort_reg_16lanes(reg_t key_reg,
273278 oxCCCC);
274279 key_reg = cmp_merge<vtype1, vtype2>(
275280 key_reg,
276- key_swizzle::template reverse_n <vtype1, 2 >(key_reg),
281+ key_swizzle::template swap_n <vtype1, 2 >(key_reg),
277282 index_reg,
278- index_swizzle::template reverse_n <vtype2, 2 >(index_reg),
283+ index_swizzle::template swap_n <vtype2, 2 >(index_reg),
279284 oxAAAA);
280285 key_reg = cmp_merge<vtype1, vtype2>(
281286 key_reg,
@@ -291,15 +296,16 @@ X86_SIMD_SORT_INLINE reg_t sort_reg_16lanes(reg_t key_reg,
291296 oxCCCC);
292297 key_reg = cmp_merge<vtype1, vtype2>(
293298 key_reg,
294- key_swizzle::template reverse_n <vtype1, 2 >(key_reg),
299+ key_swizzle::template swap_n <vtype1, 2 >(key_reg),
295300 index_reg,
296- index_swizzle::template reverse_n <vtype2, 2 >(index_reg),
301+ index_swizzle::template swap_n <vtype2, 2 >(index_reg),
297302 oxAAAA);
298- key_reg = cmp_merge<vtype1, vtype2>(key_reg,
299- vtype1::reverse (key_reg),
300- index_reg,
301- vtype2::reverse (index_reg),
302- oxFF00);
303+ key_reg = cmp_merge<vtype1, vtype2>(
304+ key_reg,
305+ key_swizzle::template reverse_n<vtype1, 16 >(key_reg),
306+ index_reg,
307+ index_swizzle::template reverse_n<vtype2, 16 >(index_reg),
308+ oxFF00);
303309 key_reg = cmp_merge<vtype1, vtype2>(
304310 key_reg,
305311 key_swizzle::template swap_n<vtype1, 8 >(key_reg),
@@ -314,9 +320,9 @@ X86_SIMD_SORT_INLINE reg_t sort_reg_16lanes(reg_t key_reg,
314320 oxCCCC);
315321 key_reg = cmp_merge<vtype1, vtype2>(
316322 key_reg,
317- key_swizzle::template reverse_n <vtype1, 2 >(key_reg),
323+ key_swizzle::template swap_n <vtype1, 2 >(key_reg),
318324 index_reg,
319- index_swizzle::template reverse_n <vtype2, 2 >(index_reg),
325+ index_swizzle::template swap_n <vtype2, 2 >(index_reg),
320326 oxAAAA);
321327 return key_reg;
322328}
@@ -427,9 +433,9 @@ X86_SIMD_SORT_INLINE reg_t bitonic_merge_reg_16lanes(reg_t key_reg,
427433 oxCCCC);
428434 key_reg = cmp_merge<vtype1, vtype2>(
429435 key_reg,
430- key_swizzle::template reverse_n <vtype1, 2 >(key_reg),
436+ key_swizzle::template swap_n <vtype1, 2 >(key_reg),
431437 index_reg,
432- index_swizzle::template reverse_n <vtype2, 2 >(index_reg),
438+ index_swizzle::template swap_n <vtype2, 2 >(index_reg),
433439 oxAAAA);
434440 return key_reg;
435441}
0 commit comments