Skip to content

Commit dd9be1d

Browse files
sse4.2: bug fixes
1 parent 6713f1d commit dd9be1d

File tree

2 files changed

+86
-1424
lines changed

2 files changed

+86
-1424
lines changed

simde/x86/sse4.2.h

Lines changed: 85 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -105,8 +105,8 @@ simde_mm_cmpestra_8_(simde__m128i a, int la, simde__m128i b, int lb, const int i
105105
const int upper_bound = (128 / 8) - 1;
106106
int a_invalid = 0;
107107
int b_invalid = 0;
108-
for(int i = 0 ; i < upper_bound ; i++) {
109-
for(int j = 0; j< upper_bound ; j++){
108+
for(int i = 0 ; i <= upper_bound ; i++) {
109+
for(int j = 0; j <= upper_bound ; j++){
110110
int bitvalue = ((a_.i8[i] == b_.i8[j]) ? 1 : 0);
111111
if(i == la)
112112
a_invalid = 1;
@@ -132,70 +132,93 @@ simde_mm_cmpestra_8_(simde__m128i a, int la, simde__m128i b, int lb, const int i
132132
bitvalue = 0;
133133
break;
134134
}
135-
bool_res_.i8[i] |= (bitvalue << j);
135+
if(bitvalue)
136+
bool_res_.i16[i] |= (1UL << j);
137+
else
138+
bool_res_.i16[i] &= ~(1UL << j);
136139
}
137140
}
138141
int32_t int_res_1 = 0;
139142
int32_t int_res_2 = 0;
140143
switch(cmp_op) {
141144
case SIMDE_SIDD_CMP_EQUAL_ANY:
142-
for(int i = 0 ; i < upper_bound ; i++){
145+
for(int i = 0 ; i <= upper_bound ; i++){
143146
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
144-
for(int j = 0 ; j < upper_bound ; j++){
147+
for(int j = 0 ; j <= upper_bound ; j++){
145148
int_res_1 |= (((bool_res_.i8[i] >> j) & 1) << i);
146149
}
147150
}
148151
break;
149152
case SIMDE_SIDD_CMP_RANGES:
150-
for(int i = 0 ; i < upper_bound ; i++){
153+
for(int i = 0 ; i <= upper_bound ; i++){
151154
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
152-
for(int j = 0 ; j < upper_bound ; j++){
155+
for(int j = 0 ; j <= upper_bound ; j++){
153156
int_res_1 |= ((((bool_res_.i8[i] >> j) & 1) & ((bool_res_.i8[i] >> (j + 1)) & 1)) << i);
154157
j += 2;
155158
}
156159
}
157160
break;
158161
case SIMDE_SIDD_CMP_EQUAL_EACH:
159-
for(int i = 0 ; i < upper_bound ; i++){
160-
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
161-
for(int j = 0 ; j < upper_bound ; j++){
162-
int_res_1 |= (((bool_res_.i8[i] >> i) & 1) << i);
162+
for(int i = 0 ; i <= upper_bound ; i++){
163+
//SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
164+
for(int j = 0 ; j <= upper_bound ; j++){
165+
int bitvalue = ((bool_res_.i8[i] >> i) & 1);
166+
if(bitvalue)
167+
int_res_1 |= (1UL << i);
168+
else
169+
int_res_1 &= ~(1UL << i);
163170
}
164171
}
165172
break;
166173
case SIMDE_SIDD_CMP_EQUAL_ORDERED:
167174
int_res_1 = 0xff;
168-
for(int i = 0 ; i < upper_bound ; i++){
175+
for(int i = 0 ; i <= upper_bound ; i++){
169176
int k = i;
170177
HEDLEY_DIAGNOSTIC_PUSH
171178
#if defined(SIMDE_BUG_CLANG_45959)
172179
#pragma clang diagnostic ignored "-Wsign-conversion"
173180
#endif
174181
SIMDE_VECTORIZE_REDUCTION(&:int_res_1)
175-
for(int j = 0 ; j < (upper_bound-i) ; j++){
182+
for(int j = 0 ; j <= (upper_bound-i) ; j++){
176183
int_res_1 &= (((bool_res_.i8[k] >> j) & 1 ) << i) ;
177184
k += 1;
178185
}
179186
HEDLEY_DIAGNOSTIC_POP
180187
}
181188
break;
182189
}
183-
for(int i = 0; i < upper_bound ; i++){
190+
for(int i = 0; i <= upper_bound ; i++){
184191
if(polarity & SIMDE_SIDD_NEGATIVE_POLARITY){
185192
if(polarity & SIMDE_SIDD_MASKED_POSITIVE_POLARITY) {
186193
if (i >= lb) {
187-
int_res_2 |= (((int_res_1 >> i) & 1) << i);
194+
int bitvalue = ((int_res_1 >> i) & 1);
195+
if(bitvalue)
196+
int_res_2 |= (1UL << i);
197+
else
198+
int_res_2 &= ~(1UL << i);
188199
}
189200
else {
190-
int_res_2 |= ((((int_res_1 >> i) & 1) ^ (-1)) << i);
201+
int bitvalue = (((int_res_1 >> i) & 1) ^ (-1));
202+
if(bitvalue)
203+
int_res_2 |= (1UL << i);
204+
else
205+
int_res_2 &= ~(1UL << i);
191206
}
192207
}
193208
else{
194-
int_res_2 |= ((((int_res_1 >> i) & 1) ^ (-1)) << i);
209+
int bitvalue = (((int_res_1 >> i) & 1) ^ (-1));
210+
if(bitvalue)
211+
int_res_2 |= (1UL << i);
212+
else
213+
int_res_2 &= ~(1UL << i);
195214
}
196215
}
197216
else{
198-
int_res_2 |= ( ((int_res_1 >> i) & 1) << i);
217+
int bitvalue = ((int_res_1 >> i) & 1);
218+
if(bitvalue)
219+
int_res_2 |= (1UL << i);
220+
else
221+
int_res_2 &= ~(1UL << i);
199222
}
200223
}
201224
return !int_res_2 & (lb > upper_bound);
@@ -214,8 +237,8 @@ simde_mm_cmpestra_16_(simde__m128i a, int la, simde__m128i b, int lb, const int
214237
const int upper_bound = (128 / 16) - 1;
215238
int a_invalid = 0;
216239
int b_invalid = 0;
217-
for(int i = 0 ; i < upper_bound ; i++) {
218-
for(int j = 0; j< upper_bound ; j++)
240+
for(int i = 0 ; i <= upper_bound ; i++) {
241+
for(int j = 0; j <= upper_bound ; j++)
219242
{
220243
int bitvalue = ((a_.i16[i] == b_.i16[j]) ? 1 : 0);
221244
if(i == la)
@@ -242,70 +265,93 @@ simde_mm_cmpestra_16_(simde__m128i a, int la, simde__m128i b, int lb, const int
242265
bitvalue = 0;
243266
break;
244267
}
245-
bool_res_.i16[i] |= (bitvalue << j);
268+
if(bitvalue)
269+
bool_res_.i16[i] |= (1UL << j);
270+
else
271+
bool_res_.i16[i] &= ~(1UL << j);
246272
}
247273
}
248274
int32_t int_res_1 = 0;
249275
int32_t int_res_2 = 0;
250276
switch(cmp_op) {
251277
case SIMDE_SIDD_CMP_EQUAL_ANY:
252-
for(int i = 0 ; i < upper_bound ; i++){
278+
for(int i = 0 ; i <= upper_bound ; i++){
253279
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
254-
for (int j = 0 ; j < upper_bound ; j++){
280+
for (int j = 0 ; j <= upper_bound ; j++){
255281
int_res_1 |= (((bool_res_.i16[i] >> j) & 1) << i) ;
256282
}
257283
}
258284
break;
259285
case SIMDE_SIDD_CMP_RANGES:
260-
for(int i = 0 ; i < upper_bound ; i++){
286+
for(int i = 0 ; i <= upper_bound ; i++){
261287
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
262-
for(int j = 0 ; j < upper_bound ; j++){
288+
for(int j = 0 ; j <= upper_bound ; j++){
263289
int_res_1 |= ((((bool_res_.i16[i] >> j) & 1) & ((bool_res_.i16[i] >> (j + 1)) & 1)) << i);
264290
j += 2;
265291
}
266292
}
267293
break;
268294
case SIMDE_SIDD_CMP_EQUAL_EACH:
269-
for(int i = 0 ; i < upper_bound ; i++){
295+
for(int i = 0 ; i <= upper_bound ; i++){
270296
SIMDE_VECTORIZE_REDUCTION(|:int_res_1)
271-
for(int j = 0 ; j < upper_bound ; j++){
272-
int_res_1 |= (((bool_res_.i16[i] >> i) & 1) << i);
297+
for(int j = 0 ; j <= upper_bound ; j++){
298+
int bitvalue = ((bool_res_.i16[i] >> i) & 1);
299+
if(bitvalue)
300+
int_res_1 |= (1UL << i);
301+
else
302+
int_res_1 &= ~(1UL << i);
273303
}
274304
}
275305
break;
276306
case SIMDE_SIDD_CMP_EQUAL_ORDERED:
277307
int_res_1 = 0xffff;
278-
for(int i = 0 ; i < upper_bound ; i++){
308+
for(int i = 0 ; i <= upper_bound ; i++){
279309
int k = i;
280310
HEDLEY_DIAGNOSTIC_PUSH
281311
#if defined(SIMDE_BUG_CLANG_45959)
282312
#pragma clang diagnostic ignored "-Wsign-conversion"
283313
#endif
284314
SIMDE_VECTORIZE_REDUCTION(&:int_res_1)
285-
for(int j = 0 ; j < (upper_bound-i) ; j++){
315+
for(int j = 0 ; j <= (upper_bound-i) ; j++){
286316
int_res_1 &= (((bool_res_.i16[k] >> j) & 1) << i) ;
287317
k += 1;
288318
}
289319
HEDLEY_DIAGNOSTIC_POP
290320
}
291321
break;
292322
}
293-
for(int i = 0; i < upper_bound ; i++){
323+
for(int i = 0; i <= upper_bound ; i++){
294324
if(polarity & SIMDE_SIDD_NEGATIVE_POLARITY){
295325
if(polarity & SIMDE_SIDD_MASKED_POSITIVE_POLARITY) {
296326
if (i >= lb) {
297-
int_res_2 |= (((int_res_1 >> i) & 1) << i);
327+
int bitvalue = ((int_res_1 >> i) & 1);
328+
if(bitvalue)
329+
int_res_2 |= (1UL << i);
330+
else
331+
int_res_2 &= ~(1UL << i);
298332
}
299333
else {
300-
int_res_2 |= ((((int_res_1 >> i) & 1) ^ (-1)) << i);
334+
int bitvalue = (((int_res_1 >> i) & 1) ^ (-1));
335+
if(bitvalue)
336+
int_res_2 |= (1UL << i);
337+
else
338+
int_res_2 &= ~(1UL << i);
301339
}
302340
}
303341
else{
304-
int_res_2 |= ((((int_res_1 >> i) & 1) ^ (-1)) << i);
342+
int bitvalue = (((int_res_1 >> i) & 1) ^ (-1));
343+
if(bitvalue)
344+
int_res_2 |= (1UL << i);
345+
else
346+
int_res_2 &= ~(1UL << i);
305347
}
306348
}
307349
else{
308-
int_res_2 |= (((int_res_1 >> i) & 1) << i);
350+
int bitvalue = ((int_res_1 >> i) & 1);
351+
if(bitvalue)
352+
int_res_2 |= (1UL << i);
353+
else
354+
int_res_2 &= ~(1UL << i);
309355
}
310356
}
311357
return !int_res_2 & (lb > upper_bound);
@@ -399,7 +445,7 @@ simde_mm_cmpistrs_8_(simde__m128i a) {
399445
const int upper_bound = (128 / 8) - 1;
400446
int a_invalid = 0;
401447
SIMDE_VECTORIZE
402-
for (int i = 0 ; i < upper_bound ; i++) {
448+
for (int i = 0 ; i <= upper_bound ; i++) {
403449
if(!a_.i8[i])
404450
a_invalid = 1;
405451
}
@@ -413,7 +459,7 @@ simde_mm_cmpistrs_16_(simde__m128i a) {
413459
const int upper_bound = (128 / 16) - 1;
414460
int a_invalid = 0;
415461
SIMDE_VECTORIZE
416-
for (int i = 0 ; i < upper_bound ; i++) {
462+
for (int i = 0 ; i <= upper_bound ; i++) {
417463
if(!a_.i16[i])
418464
a_invalid = 1;
419465
}
@@ -439,7 +485,7 @@ simde_mm_cmpistrz_8_(simde__m128i b) {
439485
const int upper_bound = (128 / 8) - 1;
440486
int b_invalid = 0;
441487
SIMDE_VECTORIZE
442-
for (int i = 0 ; i < upper_bound ; i++) {
488+
for (int i = 0 ; i <= upper_bound ; i++) {
443489
if(!b_.i8[i])
444490
b_invalid = 1;
445491
}
@@ -453,7 +499,7 @@ simde_mm_cmpistrz_16_(simde__m128i b) {
453499
const int upper_bound = (128 / 16) - 1;
454500
int b_invalid = 0;
455501
SIMDE_VECTORIZE
456-
for (int i = 0 ; i < upper_bound ; i++) {
502+
for (int i = 0 ; i <= upper_bound ; i++) {
457503
if(!b_.i16[i])
458504
b_invalid = 1;
459505
}

0 commit comments

Comments (0)