@@ -34,8 +34,10 @@ X86_SIMD_SORT_INLINE reg_t sort_reg_4lanes(reg_t reg)
34
34
const typename vtype::opmask_t oxA = convert_int_to_mask<vtype>(0xA );
35
35
const typename vtype::opmask_t oxC = convert_int_to_mask<vtype>(0xC );
36
36
37
- reg = cmp_merge<vtype>(reg, swizzle::template swap_n<vtype, 2 >(reg), oxA);
38
- reg = cmp_merge<vtype>(reg, vtype::reverse (reg), oxC);
37
+ reg = cmp_merge<vtype>(
38
+ reg, swizzle::template reverse_n<vtype, 2 >(reg), oxA);
39
+ reg = cmp_merge<vtype>(
40
+ reg, swizzle::template reverse_n<vtype, 4 >(reg), oxC);
39
41
reg = cmp_merge<vtype>(reg, swizzle::template swap_n<vtype, 2 >(reg), oxA);
40
42
return reg;
41
43
}
@@ -57,12 +59,11 @@ X86_SIMD_SORT_INLINE reg_t sort_reg_8lanes(reg_t reg)
57
59
reg, swizzle::template reverse_n<vtype, 2 >(reg), oxAA);
58
60
reg = cmp_merge<vtype>(
59
61
reg, swizzle::template reverse_n<vtype, 4 >(reg), oxCC);
62
+ reg = cmp_merge<vtype>(reg, swizzle::template swap_n<vtype, 2 >(reg), oxAA);
60
63
reg = cmp_merge<vtype>(
61
- reg, swizzle::template reverse_n<vtype, 2 >(reg), oxAA);
62
- reg = cmp_merge<vtype>(reg, vtype::reverse (reg), oxF0);
64
+ reg, swizzle::template reverse_n<vtype, 8 >(reg), oxF0);
63
65
reg = cmp_merge<vtype>(reg, swizzle::template swap_n<vtype, 4 >(reg), oxCC);
64
- reg = cmp_merge<vtype>(
65
- reg, swizzle::template reverse_n<vtype, 2 >(reg), oxAA);
66
+ reg = cmp_merge<vtype>(reg, swizzle::template swap_n<vtype, 2 >(reg), oxAA);
66
67
return reg;
67
68
}
68
69
@@ -85,20 +86,21 @@ X86_SIMD_SORT_INLINE reg_t sort_reg_16lanes(reg_t reg)
85
86
reg = cmp_merge<vtype>(
86
87
reg, swizzle::template reverse_n<vtype, 4 >(reg), oxCCCC);
87
88
reg = cmp_merge<vtype>(
88
- reg, swizzle::template reverse_n <vtype, 2 >(reg), oxAAAA);
89
+ reg, swizzle::template swap_n <vtype, 2 >(reg), oxAAAA);
89
90
reg = cmp_merge<vtype>(
90
91
reg, swizzle::template reverse_n<vtype, 8 >(reg), oxF0F0);
91
92
reg = cmp_merge<vtype>(
92
93
reg, swizzle::template swap_n<vtype, 4 >(reg), oxCCCC);
93
94
reg = cmp_merge<vtype>(
94
- reg, swizzle::template reverse_n<vtype, 2 >(reg), oxAAAA);
95
- reg = cmp_merge<vtype>(reg, vtype::reverse (reg), oxFF00);
95
+ reg, swizzle::template swap_n<vtype, 2 >(reg), oxAAAA);
96
+ reg = cmp_merge<vtype>(
97
+ reg, swizzle::template reverse_n<vtype, 16 >(reg), oxFF00);
96
98
reg = cmp_merge<vtype>(
97
99
reg, swizzle::template swap_n<vtype, 8 >(reg), oxF0F0);
98
100
reg = cmp_merge<vtype>(
99
101
reg, swizzle::template swap_n<vtype, 4 >(reg), oxCCCC);
100
102
reg = cmp_merge<vtype>(
101
- reg, swizzle::template reverse_n <vtype, 2 >(reg), oxAAAA);
103
+ reg, swizzle::template swap_n <vtype, 2 >(reg), oxAAAA);
102
104
return reg;
103
105
}
104
106
@@ -129,14 +131,14 @@ X86_SIMD_SORT_INLINE reg_t sort_reg_32lanes(reg_t reg)
129
131
reg = cmp_merge<vtype>(
130
132
reg, swizzle::template reverse_n<vtype, 4 >(reg), oxCCCCCCCC);
131
133
reg = cmp_merge<vtype>(
132
- reg, swizzle::template reverse_n <vtype, 2 >(reg), oxAAAAAAAA);
134
+ reg, swizzle::template swap_n <vtype, 2 >(reg), oxAAAAAAAA);
133
135
// Level 3
134
136
reg = cmp_merge<vtype>(
135
137
reg, swizzle::template reverse_n<vtype, 8 >(reg), oxF0F0F0F0);
136
138
reg = cmp_merge<vtype>(
137
139
reg, swizzle::template swap_n<vtype, 4 >(reg), oxCCCCCCCC);
138
140
reg = cmp_merge<vtype>(
139
- reg, swizzle::template reverse_n <vtype, 2 >(reg), oxAAAAAAAA);
141
+ reg, swizzle::template swap_n <vtype, 2 >(reg), oxAAAAAAAA);
140
142
// Level 4
141
143
reg = cmp_merge<vtype>(
142
144
reg, swizzle::template reverse_n<vtype, 16 >(reg), oxFF00FF00);
@@ -145,17 +147,18 @@ X86_SIMD_SORT_INLINE reg_t sort_reg_32lanes(reg_t reg)
145
147
reg = cmp_merge<vtype>(
146
148
reg, swizzle::template swap_n<vtype, 4 >(reg), oxCCCCCCCC);
147
149
reg = cmp_merge<vtype>(
148
- reg, swizzle::template reverse_n <vtype, 2 >(reg), oxAAAAAAAA);
150
+ reg, swizzle::template swap_n <vtype, 2 >(reg), oxAAAAAAAA);
149
151
// Level 5
150
- reg = cmp_merge<vtype>(reg, vtype::reverse (reg), oxFFFF0000);
152
+ reg = cmp_merge<vtype>(
153
+ reg, swizzle::template reverse_n<vtype, 32 >(reg), oxFFFF0000);
151
154
reg = cmp_merge<vtype>(
152
155
reg, swizzle::template swap_n<vtype, 16 >(reg), oxFF00FF00);
153
156
reg = cmp_merge<vtype>(
154
157
reg, swizzle::template swap_n<vtype, 8 >(reg), oxF0F0F0F0);
155
158
reg = cmp_merge<vtype>(
156
159
reg, swizzle::template swap_n<vtype, 4 >(reg), oxCCCCCCCC);
157
160
reg = cmp_merge<vtype>(
158
- reg, swizzle::template reverse_n <vtype, 2 >(reg), oxAAAAAAAA);
161
+ reg, swizzle::template swap_n <vtype, 2 >(reg), oxAAAAAAAA);
159
162
return reg;
160
163
}
161
164
@@ -175,15 +178,16 @@ X86_SIMD_SORT_INLINE reg_t sort_reg_4lanes(reg_t key_reg, index_type &index_reg)
175
178
176
179
key_reg = cmp_merge<vtype1, vtype2>(
177
180
key_reg,
178
- key_swizzle::template swap_n <vtype1, 2 >(key_reg),
181
+ key_swizzle::template reverse_n <vtype1, 2 >(key_reg),
179
182
index_reg,
180
- index_swizzle::template swap_n <vtype2, 2 >(index_reg),
183
+ index_swizzle::template reverse_n <vtype2, 2 >(index_reg),
181
184
oxA);
182
- key_reg = cmp_merge<vtype1, vtype2>(key_reg,
183
- vtype1::reverse (key_reg),
184
- index_reg,
185
- vtype2::reverse (index_reg),
186
- oxC);
185
+ key_reg = cmp_merge<vtype1, vtype2>(
186
+ key_reg,
187
+ key_swizzle::template reverse_n<vtype1, 4 >(key_reg),
188
+ index_reg,
189
+ index_swizzle::template reverse_n<vtype2, 4 >(index_reg),
190
+ oxC);
187
191
key_reg = cmp_merge<vtype1, vtype2>(
188
192
key_reg,
189
193
key_swizzle::template swap_n<vtype1, 2 >(key_reg),
@@ -208,9 +212,9 @@ X86_SIMD_SORT_INLINE reg_t sort_reg_8lanes(reg_t key_reg, index_type &index_reg)
208
212
209
213
key_reg = cmp_merge<vtype1, vtype2>(
210
214
key_reg,
211
- key_swizzle::template swap_n <vtype1, 2 >(key_reg),
215
+ key_swizzle::template reverse_n <vtype1, 2 >(key_reg),
212
216
index_reg,
213
- index_swizzle::template swap_n <vtype2, 2 >(index_reg),
217
+ index_swizzle::template reverse_n <vtype2, 2 >(index_reg),
214
218
oxAA);
215
219
key_reg = cmp_merge<vtype1, vtype2>(
216
220
key_reg,
@@ -224,11 +228,12 @@ X86_SIMD_SORT_INLINE reg_t sort_reg_8lanes(reg_t key_reg, index_type &index_reg)
224
228
index_reg,
225
229
index_swizzle::template swap_n<vtype2, 2 >(index_reg),
226
230
oxAA);
227
- key_reg = cmp_merge<vtype1, vtype2>(key_reg,
228
- vtype1::reverse (key_reg),
229
- index_reg,
230
- vtype2::reverse (index_reg),
231
- oxF0);
231
+ key_reg = cmp_merge<vtype1, vtype2>(
232
+ key_reg,
233
+ key_swizzle::template reverse_n<vtype1, 8 >(key_reg),
234
+ index_reg,
235
+ index_swizzle::template reverse_n<vtype2, 8 >(index_reg),
236
+ oxF0);
232
237
key_reg = cmp_merge<vtype1, vtype2>(
233
238
key_reg,
234
239
key_swizzle::template swap_n<vtype1, 4 >(key_reg),
@@ -273,9 +278,9 @@ X86_SIMD_SORT_INLINE reg_t sort_reg_16lanes(reg_t key_reg,
273
278
oxCCCC);
274
279
key_reg = cmp_merge<vtype1, vtype2>(
275
280
key_reg,
276
- key_swizzle::template reverse_n <vtype1, 2 >(key_reg),
281
+ key_swizzle::template swap_n <vtype1, 2 >(key_reg),
277
282
index_reg,
278
- index_swizzle::template reverse_n <vtype2, 2 >(index_reg),
283
+ index_swizzle::template swap_n <vtype2, 2 >(index_reg),
279
284
oxAAAA);
280
285
key_reg = cmp_merge<vtype1, vtype2>(
281
286
key_reg,
@@ -291,15 +296,16 @@ X86_SIMD_SORT_INLINE reg_t sort_reg_16lanes(reg_t key_reg,
291
296
oxCCCC);
292
297
key_reg = cmp_merge<vtype1, vtype2>(
293
298
key_reg,
294
- key_swizzle::template reverse_n <vtype1, 2 >(key_reg),
299
+ key_swizzle::template swap_n <vtype1, 2 >(key_reg),
295
300
index_reg,
296
- index_swizzle::template reverse_n <vtype2, 2 >(index_reg),
301
+ index_swizzle::template swap_n <vtype2, 2 >(index_reg),
297
302
oxAAAA);
298
- key_reg = cmp_merge<vtype1, vtype2>(key_reg,
299
- vtype1::reverse (key_reg),
300
- index_reg,
301
- vtype2::reverse (index_reg),
302
- oxFF00);
303
+ key_reg = cmp_merge<vtype1, vtype2>(
304
+ key_reg,
305
+ key_swizzle::template reverse_n<vtype1, 16 >(key_reg),
306
+ index_reg,
307
+ index_swizzle::template reverse_n<vtype2, 16 >(index_reg),
308
+ oxFF00);
303
309
key_reg = cmp_merge<vtype1, vtype2>(
304
310
key_reg,
305
311
key_swizzle::template swap_n<vtype1, 8 >(key_reg),
@@ -314,9 +320,9 @@ X86_SIMD_SORT_INLINE reg_t sort_reg_16lanes(reg_t key_reg,
314
320
oxCCCC);
315
321
key_reg = cmp_merge<vtype1, vtype2>(
316
322
key_reg,
317
- key_swizzle::template reverse_n <vtype1, 2 >(key_reg),
323
+ key_swizzle::template swap_n <vtype1, 2 >(key_reg),
318
324
index_reg,
319
- index_swizzle::template reverse_n <vtype2, 2 >(index_reg),
325
+ index_swizzle::template swap_n <vtype2, 2 >(index_reg),
320
326
oxAAAA);
321
327
return key_reg;
322
328
}
@@ -427,9 +433,9 @@ X86_SIMD_SORT_INLINE reg_t bitonic_merge_reg_16lanes(reg_t key_reg,
427
433
oxCCCC);
428
434
key_reg = cmp_merge<vtype1, vtype2>(
429
435
key_reg,
430
- key_swizzle::template reverse_n <vtype1, 2 >(key_reg),
436
+ key_swizzle::template swap_n <vtype1, 2 >(key_reg),
431
437
index_reg,
432
- index_swizzle::template reverse_n <vtype2, 2 >(index_reg),
438
+ index_swizzle::template swap_n <vtype2, 2 >(index_reg),
433
439
oxAAAA);
434
440
return key_reg;
435
441
}
0 commit comments