Skip to content

Commit 6713f1d

Browse files
sse4.2: added the implementation for mm_cmpestra
1 parent 6ee040c commit 6713f1d

File tree

3 files changed

+1619
-0
lines changed

3 files changed

+1619
-0
lines changed

simde/simde-common.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -693,6 +693,7 @@ typedef SIMDE_FLOAT64_TYPE simde_float64;
693693
# if defined(SIMDE_ARCH_AARCH64)
694694
# define SIMDE_BUG_CLANG_45541
695695
# endif
696+
# define SIMDE_BUG_CLANG_45959
696697
# endif
697698
# if defined(HEDLEY_EMSCRIPTEN_VERSION)
698699
# define SIMDE_BUG_EMSCRIPTEN_MISSING_IMPL /* Placeholder for (as yet) unfiled issues. */

simde/x86/sse4.2.h

Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,237 @@ SIMDE_BEGIN_DECLS_
9292
#define _SIDD_UNIT_MASK SIMDE_SIDD_UNIT_MASK
9393
#endif
9494

95+
SIMDE_FUNCTION_ATTRIBUTES
96+
int
97+
simde_mm_cmpestra_8_(simde__m128i a, int la, simde__m128i b, int lb, const int imm8)
98+
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
99+
const int cmp_op = imm8 & 0x0c;
100+
const int polarity = imm8 & 0x30;
101+
simde__m128i_private
102+
bool_res_ = simde__m128i_to_private(simde_mm_setzero_si128()),
103+
a_ = simde__m128i_to_private(a),
104+
b_ = simde__m128i_to_private(b);
105+
const int upper_bound = (128 / 8) - 1;
106+
int a_invalid = 0;
107+
int b_invalid = 0;
108+
for(int i = 0 ; i < upper_bound ; i++) {
109+
for(int j = 0; j< upper_bound ; j++){
110+
int bitvalue = ((a_.i8[i] == b_.i8[j]) ? 1 : 0);
111+
if(i == la)
112+
a_invalid = 1;
113+
if(j == lb)
114+
b_invalid = 1;
115+
switch(cmp_op){
116+
case SIMDE_SIDD_CMP_EQUAL_ANY:
117+
case SIMDE_SIDD_CMP_RANGES:
118+
bitvalue = 0;
119+
break;
120+
case SIMDE_SIDD_CMP_EQUAL_EACH:
121+
if(a_invalid && b_invalid)
122+
bitvalue = 1;
123+
else
124+
bitvalue = 0;
125+
break;
126+
case SIMDE_SIDD_CMP_EQUAL_ORDERED:
127+
if(a_invalid && !b_invalid)
128+
bitvalue = 1;
129+
else if(a_invalid && b_invalid)
130+
bitvalue = 1;
131+
else
132+
bitvalue = 0;
133+
break;
134+
}
135+
bool_res_.i8[i] |= (bitvalue << j);
136+
}
137+
}
138+
int32_t int_res_1 = 0;
139+
int32_t int_res_2 = 0;
140+
switch(cmp_op) {
141+
case SIMDE_SIDD_CMP_EQUAL_ANY:
142+
for(int i = 0 ; i < upper_bound ; i++){
143+
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
144+
for(int j = 0 ; j < upper_bound ; j++){
145+
int_res_1 |= (((bool_res_.i8[i] >> j) & 1) << i);
146+
}
147+
}
148+
break;
149+
case SIMDE_SIDD_CMP_RANGES:
150+
for(int i = 0 ; i < upper_bound ; i++){
151+
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
152+
for(int j = 0 ; j < upper_bound ; j++){
153+
int_res_1 |= ((((bool_res_.i8[i] >> j) & 1) & ((bool_res_.i8[i] >> (j + 1)) & 1)) << i);
154+
j += 2;
155+
}
156+
}
157+
break;
158+
case SIMDE_SIDD_CMP_EQUAL_EACH:
159+
for(int i = 0 ; i < upper_bound ; i++){
160+
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
161+
for(int j = 0 ; j < upper_bound ; j++){
162+
int_res_1 |= (((bool_res_.i8[i] >> i) & 1) << i);
163+
}
164+
}
165+
break;
166+
case SIMDE_SIDD_CMP_EQUAL_ORDERED:
167+
int_res_1 = 0xff;
168+
for(int i = 0 ; i < upper_bound ; i++){
169+
int k = i;
170+
HEDLEY_DIAGNOSTIC_PUSH
171+
#if defined(SIMDE_BUG_CLANG_45959)
172+
#pragma clang diagnostic ignored "-Wsign-conversion"
173+
#endif
174+
SIMDE_VECTORIZE_REDUCTION(&:int_res_1)
175+
for(int j = 0 ; j < (upper_bound-i) ; j++){
176+
int_res_1 &= (((bool_res_.i8[k] >> j) & 1 ) << i) ;
177+
k += 1;
178+
}
179+
HEDLEY_DIAGNOSTIC_POP
180+
}
181+
break;
182+
}
183+
for(int i = 0; i < upper_bound ; i++){
184+
if(polarity & SIMDE_SIDD_NEGATIVE_POLARITY){
185+
if(polarity & SIMDE_SIDD_MASKED_POSITIVE_POLARITY) {
186+
if (i >= lb) {
187+
int_res_2 |= (((int_res_1 >> i) & 1) << i);
188+
}
189+
else {
190+
int_res_2 |= ((((int_res_1 >> i) & 1) ^ (-1)) << i);
191+
}
192+
}
193+
else{
194+
int_res_2 |= ((((int_res_1 >> i) & 1) ^ (-1)) << i);
195+
}
196+
}
197+
else{
198+
int_res_2 |= ( ((int_res_1 >> i) & 1) << i);
199+
}
200+
}
201+
return !int_res_2 & (lb > upper_bound);
202+
}
203+
204+
SIMDE_FUNCTION_ATTRIBUTES
205+
int
206+
simde_mm_cmpestra_16_(simde__m128i a, int la, simde__m128i b, int lb, const int imm8)
207+
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
208+
const int cmp_op = imm8 & 0x0c;
209+
const int polarity = imm8 & 0x30;
210+
simde__m128i_private
211+
bool_res_ = simde__m128i_to_private(simde_mm_setzero_si128()),
212+
a_ = simde__m128i_to_private(a),
213+
b_ = simde__m128i_to_private(b);
214+
const int upper_bound = (128 / 16) - 1;
215+
int a_invalid = 0;
216+
int b_invalid = 0;
217+
for(int i = 0 ; i < upper_bound ; i++) {
218+
for(int j = 0; j< upper_bound ; j++)
219+
{
220+
int bitvalue = ((a_.i16[i] == b_.i16[j]) ? 1 : 0);
221+
if(i == la)
222+
a_invalid = 1;
223+
if(j == lb)
224+
b_invalid = 1;
225+
switch(cmp_op){
226+
case SIMDE_SIDD_CMP_EQUAL_ANY:
227+
case SIMDE_SIDD_CMP_RANGES:
228+
bitvalue = 0;
229+
break;
230+
case SIMDE_SIDD_CMP_EQUAL_EACH:
231+
if(a_invalid && b_invalid)
232+
bitvalue = 1;
233+
else
234+
bitvalue = 0;
235+
break;
236+
case SIMDE_SIDD_CMP_EQUAL_ORDERED:
237+
if(a_invalid && !b_invalid)
238+
bitvalue = 1;
239+
else if(a_invalid && b_invalid)
240+
bitvalue = 1;
241+
else
242+
bitvalue = 0;
243+
break;
244+
}
245+
bool_res_.i16[i] |= (bitvalue << j);
246+
}
247+
}
248+
int32_t int_res_1 = 0;
249+
int32_t int_res_2 = 0;
250+
switch(cmp_op) {
251+
case SIMDE_SIDD_CMP_EQUAL_ANY:
252+
for(int i = 0 ; i < upper_bound ; i++){
253+
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
254+
for (int j = 0 ; j < upper_bound ; j++){
255+
int_res_1 |= (((bool_res_.i16[i] >> j) & 1) << i) ;
256+
}
257+
}
258+
break;
259+
case SIMDE_SIDD_CMP_RANGES:
260+
for(int i = 0 ; i < upper_bound ; i++){
261+
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
262+
for(int j = 0 ; j < upper_bound ; j++){
263+
int_res_1 |= ((((bool_res_.i16[i] >> j) & 1) & ((bool_res_.i16[i] >> (j + 1)) & 1)) << i);
264+
j += 2;
265+
}
266+
}
267+
break;
268+
case SIMDE_SIDD_CMP_EQUAL_EACH:
269+
for(int i = 0 ; i < upper_bound ; i++){
270+
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
271+
for(int j = 0 ; j < upper_bound ; j++){
272+
int_res_1 |= (((bool_res_.i16[i] >> i) & 1) << i);
273+
}
274+
}
275+
break;
276+
case SIMDE_SIDD_CMP_EQUAL_ORDERED:
277+
int_res_1 = 0xffff;
278+
for(int i = 0 ; i < upper_bound ; i++){
279+
int k = i;
280+
HEDLEY_DIAGNOSTIC_PUSH
281+
#if defined(SIMDE_BUG_CLANG_45959)
282+
#pragma clang diagnostic ignored "-Wsign-conversion"
283+
#endif
284+
SIMDE_VECTORIZE_REDUCTION(&:int_res_1)
285+
for(int j = 0 ; j < (upper_bound-i) ; j++){
286+
int_res_1 &= (((bool_res_.i16[k] >> j) & 1) << i) ;
287+
k += 1;
288+
}
289+
HEDLEY_DIAGNOSTIC_POP
290+
}
291+
break;
292+
}
293+
for(int i = 0; i < upper_bound ; i++){
294+
if(polarity & SIMDE_SIDD_NEGATIVE_POLARITY){
295+
if(polarity & SIMDE_SIDD_MASKED_POSITIVE_POLARITY) {
296+
if (i >= lb) {
297+
int_res_2 |= (((int_res_1 >> i) & 1) << i);
298+
}
299+
else {
300+
int_res_2 |= ((((int_res_1 >> i) & 1) ^ (-1)) << i);
301+
}
302+
}
303+
else{
304+
int_res_2 |= ((((int_res_1 >> i) & 1) ^ (-1)) << i);
305+
}
306+
}
307+
else{
308+
int_res_2 |= (((int_res_1 >> i) & 1) << i);
309+
}
310+
}
311+
return !int_res_2 & (lb > upper_bound);
312+
}
313+
314+
/* simde_mm_cmpestra(a, la, b, lb, imm8)
 *
 * Without SIMDE_X86_SSE4_2_NATIVE the call is routed to one of the portable
 * helpers: the 16-bit one when imm8 has SIMDE_SIDD_UWORD_OPS set, the 8-bit
 * one otherwise.  With it, the call expands directly to _mm_cmpestra.
 * Note that the portable expansion mentions imm8 twice. */
#if !defined(SIMDE_X86_SSE4_2_NATIVE)
  #define simde_mm_cmpestra(a, la, b, lb, imm8) \
    (((imm8) & SIMDE_SIDD_UWORD_OPS) \
      ? simde_mm_cmpestra_16_((a), (la), (b), (lb), (imm8)) \
      : simde_mm_cmpestra_8_((a), (la), (b), (lb), (imm8)))
#else
  #define simde_mm_cmpestra(a, la, b, lb, imm8) _mm_cmpestra(a, la, b, lb, imm8)
#endif
/* Expose the unprefixed Intel name when native aliases are requested. */
#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES)
  #define _mm_cmpestra(a, la, b, lb, imm8) simde_mm_cmpestra(a, la, b, lb, imm8)
#endif
325+
95326
SIMDE_FUNCTION_ATTRIBUTES
96327
int simde_mm_cmpestrs (simde__m128i a, int la, simde__m128i b, int lb, const int imm8)
97328
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 127) {

0 commit comments

Comments
 (0)