diff --git a/simde/x86/sse4.2.h b/simde/x86/sse4.2.h
index 243dac381..c296ff0d7 100644
--- a/simde/x86/sse4.2.h
+++ b/simde/x86/sse4.2.h
@@ -37,6 +37,268 @@ SIMDE__BEGIN_DECLS
 # define SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES
 #endif
 
+/* imm8 layout: bits [1:0] select the element format, bits [3:2] the
+ * aggregation operation, bits [5:4] the polarity, and bit [6] the output
+ * selection. */
+#if defined(SIMDE_X86_SSE4_2_NATIVE)
+#define SIMDE_SIDD_UBYTE_OPS _SIDD_UBYTE_OPS
+#define SIMDE_SIDD_UWORD_OPS _SIDD_UWORD_OPS
+#define SIMDE_SIDD_SBYTE_OPS _SIDD_SBYTE_OPS
+#define SIMDE_SIDD_SWORD_OPS _SIDD_SWORD_OPS
+#define SIMDE_SIDD_CMP_EQUAL_ANY _SIDD_CMP_EQUAL_ANY
+#define SIMDE_SIDD_CMP_RANGES _SIDD_CMP_RANGES
+#define SIMDE_SIDD_CMP_EQUAL_EACH _SIDD_CMP_EQUAL_EACH
+#define SIMDE_SIDD_CMP_EQUAL_ORDERED _SIDD_CMP_EQUAL_ORDERED
+#define SIMDE_SIDD_POSITIVE_POLARITY _SIDD_POSITIVE_POLARITY
+#define SIMDE_SIDD_NEGATIVE_POLARITY _SIDD_NEGATIVE_POLARITY
+#define SIMDE_SIDD_MASKED_POSITIVE_POLARITY _SIDD_MASKED_POSITIVE_POLARITY
+#define SIMDE_SIDD_MASKED_NEGATIVE_POLARITY _SIDD_MASKED_NEGATIVE_POLARITY
+#define SIMDE_SIDD_LEAST_SIGNIFICANT _SIDD_LEAST_SIGNIFICANT
+#define SIMDE_SIDD_MOST_SIGNIFICANT _SIDD_MOST_SIGNIFICANT
+#define SIMDE_SIDD_BIT_MASK _SIDD_BIT_MASK
+#define SIMDE_SIDD_UNIT_MASK _SIDD_UNIT_MASK
+#else
+#define SIMDE_SIDD_UBYTE_OPS 0x00
+#define SIMDE_SIDD_UWORD_OPS 0x01
+#define SIMDE_SIDD_SBYTE_OPS 0x02
+#define SIMDE_SIDD_SWORD_OPS 0x03
+#define SIMDE_SIDD_CMP_EQUAL_ANY 0x00
+#define SIMDE_SIDD_CMP_RANGES 0x04
+#define SIMDE_SIDD_CMP_EQUAL_EACH 0x08
+#define SIMDE_SIDD_CMP_EQUAL_ORDERED 0x0c
+#define SIMDE_SIDD_POSITIVE_POLARITY 0x00
+#define SIMDE_SIDD_NEGATIVE_POLARITY 0x10
+#define SIMDE_SIDD_MASKED_POSITIVE_POLARITY 0x20
+#define SIMDE_SIDD_MASKED_NEGATIVE_POLARITY 0x30
+#define SIMDE_SIDD_LEAST_SIGNIFICANT 0x00
+#define SIMDE_SIDD_MOST_SIGNIFICANT 0x40
+#define SIMDE_SIDD_BIT_MASK 0x00
+#define SIMDE_SIDD_UNIT_MASK 0x40
+#endif
+
+SIMDE__FUNCTION_ATTRIBUTES
+int
+simde_mm_cmpestra_8_(simde__m128i a, int la, simde__m128i b, int lb, const int imm8)
+    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 127) {
+  const int cmp_op = imm8 & 0x0c;    /* imm8[3:2]: aggregation operation */
+  const int polarity = imm8 & 0x30;  /* imm8[5:4]: polarity */
+  simde__m128i_private
+    a_ = simde__m128i_to_private(a),
+    b_ = simde__m128i_to_private(b);
+  const int upper_bound = (128 / 8) - 1;
+
+  /* Negative lengths are treated as their absolute value. */
+  la = (la < 0) ? -la : la;
+  lb = (lb < 0) ? -lb : lb;
+
+  /* Bit j of bool_res[i] answers "does b[i] match a[j]?".  Comparisons
+   * involving invalid (past-the-length) elements are overridden according
+   * to the aggregation operation. */
+  int32_t bool_res[16] = { 0 };
+  for (int i = 0 ; i <= upper_bound ; i++) {
+    const int b_invalid = (i >= lb);
+    for (int j = 0 ; j <= upper_bound ; j++) {
+      const int a_invalid = (j >= la);
+      int bitvalue;
+      if (cmp_op == SIMDE_SIDD_CMP_RANGES) {
+        /* Even elements of a are lower bounds, odd elements upper bounds;
+         * imm8 bit 1 selects signed vs. unsigned elements. */
+        if (imm8 & 0x02)
+          bitvalue = (j & 1) ? (b_.i8[i] <= a_.i8[j]) : (b_.i8[i] >= a_.i8[j]);
+        else
+          bitvalue = (j & 1) ? (b_.u8[i] <= a_.u8[j]) : (b_.u8[i] >= a_.u8[j]);
+      } else {
+        bitvalue = (b_.i8[i] == a_.i8[j]) ? 1 : 0;
+      }
+      if (a_invalid || b_invalid) {
+        switch (cmp_op) {
+          case SIMDE_SIDD_CMP_EQUAL_ANY:
+          case SIMDE_SIDD_CMP_RANGES:
+            bitvalue = 0;
+            break;
+          case SIMDE_SIDD_CMP_EQUAL_EACH:
+            bitvalue = (a_invalid && b_invalid) ? 1 : 0;
+            break;
+          case SIMDE_SIDD_CMP_EQUAL_ORDERED:
+            bitvalue = a_invalid ? 1 : 0;
+            break;
+        }
+      }
+      bool_res[i] |= (bitvalue << j);
+    }
+  }
+
+  int32_t int_res_1 = 0;
+  int32_t int_res_2 = 0;
+  switch (cmp_op) {
+    case SIMDE_SIDD_CMP_EQUAL_ANY:
+      /* Bit i is set if b[i] matches any element of a. */
+      for (int i = 0 ; i <= upper_bound ; i++) {
+        if (bool_res[i] != 0)
+          int_res_1 |= (1 << i);
+      }
+      break;
+    case SIMDE_SIDD_CMP_RANGES:
+      /* Bit i is set if b[i] falls inside any (lower, upper) pair in a. */
+      for (int i = 0 ; i <= upper_bound ; i++) {
+        for (int j = 0 ; j <= upper_bound ; j += 2) {
+          if (((bool_res[i] >> j) & 1) && ((bool_res[i] >> (j + 1)) & 1)) {
+            int_res_1 |= (1 << i);
+            break;
+          }
+        }
+      }
+      break;
+    case SIMDE_SIDD_CMP_EQUAL_EACH:
+      /* Bit i is set if a[i] == b[i] (string compare). */
+      for (int i = 0 ; i <= upper_bound ; i++) {
+        int_res_1 |= (((bool_res[i] >> i) & 1) << i);
+      }
+      break;
+    case SIMDE_SIDD_CMP_EQUAL_ORDERED:
+      /* Bit i is set if a occurs as a substring of b starting at b[i]. */
+      for (int i = 0 ; i <= upper_bound ; i++) {
+        int bitvalue = 1;
+        for (int j = 0, k = i ; j <= (upper_bound - i) ; j++, k++) {
+          bitvalue &= (bool_res[k] >> j) & 1;
+        }
+        int_res_1 |= (bitvalue << i);
+      }
+      break;
+  }
+
+  switch (polarity) {
+    case SIMDE_SIDD_NEGATIVE_POLARITY:
+      int_res_2 = int_res_1 ^ 0xffff;
+      break;
+    case SIMDE_SIDD_MASKED_NEGATIVE_POLARITY:
+      /* Negate only the bits that correspond to valid elements of b. */
+      for (int i = 0 ; i <= upper_bound ; i++) {
+        const int bitvalue = (int_res_1 >> i) & 1;
+        int_res_2 |= (((i < lb) ? !bitvalue : bitvalue) << i);
+      }
+      break;
+    default:
+      int_res_2 = int_res_1;
+      break;
+  }
+
+  /* CFlag clear (no match) and ZFlag clear (b does not contain the end). */
+  return ((int_res_2 == 0) && (lb > upper_bound)) ? 1 : 0;
+}
+
+SIMDE__FUNCTION_ATTRIBUTES
+int
+simde_mm_cmpestra_16_(simde__m128i a, int la, simde__m128i b, int lb, const int imm8)
+    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 127) {
+  const int cmp_op = imm8 & 0x0c;
+  const int polarity = imm8 & 0x30;
+  simde__m128i_private
+    a_ = simde__m128i_to_private(a),
+    b_ = simde__m128i_to_private(b);
+  const int upper_bound = (128 / 16) - 1;
+
+  la = (la < 0) ? -la : la;
+  lb = (lb < 0) ? -lb : lb;
+
+  int32_t bool_res[8] = { 0 };
+  for (int i = 0 ; i <= upper_bound ; i++) {
+    const int b_invalid = (i >= lb);
+    for (int j = 0 ; j <= upper_bound ; j++) {
+      const int a_invalid = (j >= la);
+      int bitvalue;
+      if (cmp_op == SIMDE_SIDD_CMP_RANGES) {
+        if (imm8 & 0x02)
+          bitvalue = (j & 1) ? (b_.i16[i] <= a_.i16[j]) : (b_.i16[i] >= a_.i16[j]);
+        else
+          bitvalue = (j & 1) ? (b_.u16[i] <= a_.u16[j]) : (b_.u16[i] >= a_.u16[j]);
+      } else {
+        bitvalue = (b_.i16[i] == a_.i16[j]) ? 1 : 0;
+      }
+      if (a_invalid || b_invalid) {
+        switch (cmp_op) {
+          case SIMDE_SIDD_CMP_EQUAL_ANY:
+          case SIMDE_SIDD_CMP_RANGES:
+            bitvalue = 0;
+            break;
+          case SIMDE_SIDD_CMP_EQUAL_EACH:
+            bitvalue = (a_invalid && b_invalid) ? 1 : 0;
+            break;
+          case SIMDE_SIDD_CMP_EQUAL_ORDERED:
+            bitvalue = a_invalid ? 1 : 0;
+            break;
+        }
+      }
+      bool_res[i] |= (bitvalue << j);
+    }
+  }
+
+  int32_t int_res_1 = 0;
+  int32_t int_res_2 = 0;
+  switch (cmp_op) {
+    case SIMDE_SIDD_CMP_EQUAL_ANY:
+      for (int i = 0 ; i <= upper_bound ; i++) {
+        if (bool_res[i] != 0)
+          int_res_1 |= (1 << i);
+      }
+      break;
+    case SIMDE_SIDD_CMP_RANGES:
+      for (int i = 0 ; i <= upper_bound ; i++) {
+        for (int j = 0 ; j <= upper_bound ; j += 2) {
+          if (((bool_res[i] >> j) & 1) && ((bool_res[i] >> (j + 1)) & 1)) {
+            int_res_1 |= (1 << i);
+            break;
+          }
+        }
+      }
+      break;
+    case SIMDE_SIDD_CMP_EQUAL_EACH:
+      for (int i = 0 ; i <= upper_bound ; i++) {
+        int_res_1 |= (((bool_res[i] >> i) & 1) << i);
+      }
+      break;
+    case SIMDE_SIDD_CMP_EQUAL_ORDERED:
+      for (int i = 0 ; i <= upper_bound ; i++) {
+        int bitvalue = 1;
+        for (int j = 0, k = i ; j <= (upper_bound - i) ; j++, k++) {
+          bitvalue &= (bool_res[k] >> j) & 1;
+        }
+        int_res_1 |= (bitvalue << i);
+      }
+      break;
+  }
+
+  switch (polarity) {
+    case SIMDE_SIDD_NEGATIVE_POLARITY:
+      int_res_2 = int_res_1 ^ 0xff;
+      break;
+    case SIMDE_SIDD_MASKED_NEGATIVE_POLARITY:
+      for (int i = 0 ; i <= upper_bound ; i++) {
+        const int bitvalue = (int_res_1 >> i) & 1;
+        int_res_2 |= (((i < lb) ? !bitvalue : bitvalue) << i);
+      }
+      break;
+    default:
+      int_res_2 = int_res_1;
+      break;
+  }
+
+  return ((int_res_2 == 0) && (lb > upper_bound)) ? 1 : 0;
+}
+
+#if defined(SIMDE_X86_SSE4_2_NATIVE)
+  #define simde_mm_cmpestra(a, la, b, lb, imm8) _mm_cmpestra(a, la, b, lb, imm8)
+#else
+  /* imm8 bit 0 selects 16-bit elements, otherwise 8-bit elements. */
+  #define simde_mm_cmpestra(a, la, b, lb, imm8) \
+    (((imm8) & SIMDE_SIDD_UWORD_OPS) \
+      ? simde_mm_cmpestra_16_((a), (la), (b), (lb), (imm8)) \
+      : simde_mm_cmpestra_8_((a), (la), (b), (lb), (imm8)))
+#endif
+#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES)
+  #define _mm_cmpestra(a, la, b, lb, imm8) simde_mm_cmpestra(a, la, b, lb, imm8)
+#endif
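+
+/* Illustrative usage sketch, not part of this patch's API: with
+ * SIMDE_SIDD_CMP_EQUAL_ANY, simde_mm_cmpestra() returns 1 only when no valid
+ * byte of b matches any valid byte of a *and* b holds a full 16 valid bytes,
+ * so it can be used to skip delimiter-free chunks in a scanner loop.
+ * Assuming `p` points at (at least) 16 readable bytes:
+ *
+ *   simde__m128i delims = simde_mm_setr_epi8(',', ';', ' ', '\t', 0, 0, 0, 0,
+ *                                            0, 0, 0, 0, 0, 0, 0, 0);
+ *   simde__m128i chunk = simde_mm_loadu_si128((simde__m128i const*) p);
+ *   if (simde_mm_cmpestra(delims, 4, chunk, 16,
+ *                         SIMDE_SIDD_UBYTE_OPS | SIMDE_SIDD_CMP_EQUAL_ANY)) {
+ *     p += 16;   -- no delimiter anywhere in this chunk, safe to skip it whole
+ *   }
+ */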
+
 SIMDE__FUNCTION_ATTRIBUTES
 simde__m128i
 simde_mm_cmpgt_epi64 (simde__m128i a, simde__m128i b) {
diff --git a/test/x86/sse4.2.c b/test/x86/sse4.2.c
index d00fe0fe2..5201d62c5 100644
--- a/test/x86/sse4.2.c
+++ b/test/x86/sse4.2.c
@@ -24,9 +24,116 @@
 #define SIMDE_TESTS_CURRENT_ISAX sse4_2
 #include
 #include
+#include
 
 #if defined(SIMDE_X86_SSE4_2_NATIVE) || defined(SIMDE_NO_NATIVE) || defined(SIMDE_ALWAYS_BUILD_NATIVE_TESTS)
 
+/*
+static MunitResult
+test_simde_mm_cmpestra_16(const MunitParameter params[], void* data) {
+  (void) params;
+  (void) data;
+
+  const struct {
+    simde__m128i a;
+    int la;
+    simde__m128i b;
+    int lb;
+    const int imm8;
+    int r;
+  } test_vec[8] = {
+
+  };
+
+  // Test-vector generator: prints random inputs and the computed result so
+  // the table above can be filled in, then fails on purpose.
+  printf("\n");
+  for (size_t i = 0 ; i < (sizeof(test_vec) / (sizeof(test_vec[0]))) ; i++) {
+    simde__m128i_private a, b;
+    int la, lb, r;
+    // Force imm8 bit 0 so the 16-bit (word) path is exercised.
+    const int imm8 = (munit_rand_int_range(0, UINT8_MAX) | 1);
+
+    munit_rand_memory(sizeof(a), (uint8_t*) &a);
+    munit_rand_memory(sizeof(b), (uint8_t*) &b);
+    la = munit_rand_int_range(0, 128 / 16);
+    lb = munit_rand_int_range(0, 128 / 16);
+
+    r = simde_mm_cmpestra(simde__m128i_from_private(a), la, simde__m128i_from_private(b), lb, imm8);
+
+    printf("    { simde_mm_set_epi16(INT16_C(%6" PRId16 "), INT16_C(%6" PRId16 "), INT16_C(%6" PRId16 "), INT16_C(%6" PRId16 "),\n"
+           "                         INT16_C(%6" PRId16 "), INT16_C(%6" PRId16 "), INT16_C(%6" PRId16 "), INT16_C(%6" PRId16 ")),\n",
+           a.i16[7], a.i16[6], a.i16[5], a.i16[4], a.i16[3], a.i16[2], a.i16[1], a.i16[0]);
+    printf("      %d,\n", la);
+    printf("      simde_mm_set_epi16(INT16_C(%6" PRId16 "), INT16_C(%6" PRId16 "), INT16_C(%6" PRId16 "), INT16_C(%6" PRId16 "),\n"
+           "                         INT16_C(%6" PRId16 "), INT16_C(%6" PRId16 "), INT16_C(%6" PRId16 "), INT16_C(%6" PRId16 ")),\n",
+           b.i16[7], b.i16[6], b.i16[5], b.i16[4], b.i16[3], b.i16[2], b.i16[1], b.i16[0]);
+    printf("      %d,\n", lb);
+    printf("      %d,\n", imm8);
+    printf("      %d },\n", r);
+  }
+  return MUNIT_FAIL;
+
+  for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) {
+    int r = simde_mm_cmpestra(test_vec[i].a, test_vec[i].la, test_vec[i].b, test_vec[i].lb, test_vec[i].imm8);
+    assert(r == test_vec[i].r);
+  }
+
+  return MUNIT_OK;
+}
+
+static MunitResult
+test_simde_mm_cmpestra_8(const MunitParameter params[], void* data) {
+  (void) params;
+  (void) data;
+
+  const struct {
+    simde__m128i a;
+    int la;
+    simde__m128i b;
+    int lb;
+    const int imm8;
+    int r;
+  } test_vec[8] = {
+
+  };
+
+  printf("\n");
+  for (size_t i = 0 ; i < (sizeof(test_vec) / (sizeof(test_vec[0]))) ; i++) {
+    simde__m128i_private a, b;
+    int la, lb, r;
+    // Clear imm8 bit 0 so the 8-bit (byte) path is exercised.
+    const int imm8 = (munit_rand_int_range(0, UINT8_MAX) & ~1);
+
+    munit_rand_memory(sizeof(a), (uint8_t*) &a);
+    munit_rand_memory(sizeof(b), (uint8_t*) &b);
+    la = munit_rand_int_range(0, 128 / 8);
+    lb = munit_rand_int_range(0, 128 / 8);
+
+    r = simde_mm_cmpestra(simde__m128i_from_private(a), la, simde__m128i_from_private(b), lb, imm8);
+
+    printf("    { simde_mm_set_epi8(INT8_C(%4" PRId8 "), INT8_C(%4" PRId8 "), INT8_C(%4" PRId8 "), INT8_C(%4" PRId8 "),\n"
+           "                        INT8_C(%4" PRId8 "), INT8_C(%4" PRId8 "), INT8_C(%4" PRId8 "), INT8_C(%4" PRId8 "),\n"
+           "                        INT8_C(%4" PRId8 "), INT8_C(%4" PRId8 "), INT8_C(%4" PRId8 "), INT8_C(%4" PRId8 "),\n"
+           "                        INT8_C(%4" PRId8 "), INT8_C(%4" PRId8 "), INT8_C(%4" PRId8 "), INT8_C(%4" PRId8 ")),\n",
+           a.i8[15], a.i8[14], a.i8[13], a.i8[12], a.i8[11], a.i8[10], a.i8[ 9], a.i8[ 8],
+           a.i8[ 7], a.i8[ 6], a.i8[ 5], a.i8[ 4], a.i8[ 3], a.i8[ 2], a.i8[ 1], a.i8[ 0]);
+    printf("      %d,\n", la);
+    printf("      simde_mm_set_epi8(INT8_C(%4" PRId8 "), INT8_C(%4" PRId8 "), INT8_C(%4" PRId8 "), INT8_C(%4" PRId8 "),\n"
+           "                        INT8_C(%4" PRId8 "), INT8_C(%4" PRId8 "), INT8_C(%4" PRId8 "), INT8_C(%4" PRId8 "),\n"
+           "                        INT8_C(%4" PRId8 "), INT8_C(%4" PRId8 "), INT8_C(%4" PRId8 "), INT8_C(%4" PRId8 "),\n"
+           "                        INT8_C(%4" PRId8 "), INT8_C(%4" PRId8 "), INT8_C(%4" PRId8 "), INT8_C(%4" PRId8 ")),\n",
+           b.i8[15], b.i8[14], b.i8[13], b.i8[12], b.i8[11], b.i8[10], b.i8[ 9], b.i8[ 8],
+           b.i8[ 7], b.i8[ 6], b.i8[ 5], b.i8[ 4], b.i8[ 3], b.i8[ 2], b.i8[ 1], b.i8[ 0]);
+    printf("      %d,\n", lb);
+    printf("      %d,\n", imm8);
+    printf("      %d },\n", r);
+  }
+  return MUNIT_FAIL;
+
+  for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) {
+    int r = simde_mm_cmpestra(test_vec[i].a, test_vec[i].la, test_vec[i].b, test_vec[i].lb, test_vec[i].imm8);
+    assert(r == test_vec[i].r);
+  }
+
+  return MUNIT_OK;
+}
+*/
 static MunitResult
 test_simde_mm_cmpgt_epi64(const MunitParameter params[], void* data) {
   (void) params;