 #include <emmintrin.h>
 #define ZEND_HAVE_VECTOR_128
 
-typedef __m128i zend_vec_8x16_t;
-typedef __m128i zend_vec_16x8_t;
-typedef __m128i zend_vec_32x4_t;
-typedef __m128i zend_vec_64x2_t;
-
-#define zend_vec_setzero_8x16() _mm_setzero_si128()
-#define zend_vec_set_8x16(x) _mm_set1_epi8(x)
-#define zend_vec_set_8x16_from_16x8(x0, x1, x2, x3, x4, x5, x6, x7) _mm_set_epi16(x0, x1, x2, x3, x4, x5, x6, x7)
-#define zend_vec_set_8x16_from_32x4(x0, x1, x2, x3) _mm_set_epi32(x0, x1, x2, x3)
-#define zend_vec_set_8x16_from_64x2(x0, x1) _mm_set_epi64(x0, x1)
-#define zend_vec_load_8x16(x) _mm_load_si128((const __m128i *) (x))
-#define zend_vec_loadu_8x16(x) _mm_loadu_si128((const __m128i *) (x))
-#define zend_vec_store_8x16(to, x) _mm_store_si128((__m128i *) (to), x)
-#define zend_vec_storeu_8x16(to, x) _mm_storeu_si128((__m128i *) (to), x)
-
-#define zend_vec_or_8x16(a, b) _mm_or_si128(a, b)
-#define zend_vec_xor_8x16(a, b) _mm_xor_si128(a, b)
-#define zend_vec_and_8x16(a, b) _mm_and_si128(a, b)
-#define zend_vec_rshift_128_from_8x16(x, bytes) _mm_srli_si128(x, bytes)
-#define zend_vec_lshift_128_from_8x16(x, bytes) _mm_slli_si128(x, bytes)
-
-#define zend_vec_add_8x16(a, b) _mm_add_epi8(a, b)
-
-#define zend_vec_cmpeq_8x16(a, b) _mm_cmpeq_epi8(a, b)
-#define zend_vec_cmplt_8x16(a, b) _mm_cmplt_epi8(a, b)
-#define zend_vec_cmpgt_8x16(a, b) _mm_cmpgt_epi8(a, b)
-
-#define zend_vec_movemask_8x16(x) _mm_movemask_epi8(x)
-
 
 #elif defined(__aarch64__) || defined(_M_ARM64)
 #include <arm_neon.h>
 #define ZEND_HAVE_VECTOR_128
 
-typedef int8x16_t zend_vec_8x16_t;
-typedef int16x8_t zend_vec_16x8_t;
-typedef int32x4_t zend_vec_32x4_t;
-typedef int64x2_t zend_vec_64x2_t;
+typedef int8x16_t __m128i;
 
-#define zend_vec_setzero_8x16() vdupq_n_s8(0)
-#define zend_vec_set_8x16(x) vdupq_n_s8(x)
-#define zend_vec_set_8x16_from_16x8(x0, x1, x2, x3, x4, x5, x6, x7) \
+#define _mm_setzero_si128() vdupq_n_s8(0)
+#define _mm_set1_epi8(x) vdupq_n_s8(x)
+#define _mm_set_epi16(x0, x1, x2, x3, x4, x5, x6, x7) \
 	vreinterpretq_s8_s16((int16x8_t) { \
 		(int16_t) (x7), (int16_t) (x6), (int16_t) (x5), (int16_t) (x4), \
 		(int16_t) (x3), (int16_t) (x2), (int16_t) (x1), (int16_t) (x0) })
-#define zend_vec_set_8x16_from_32x4(x0, x1, x2, x3) \
+#define _mm_set_epi32(x0, x1, x2, x3) \
 	vreinterpretq_s8_s32((int32x4_t) { (int32_t) (x3), (int32_t) (x2), (int32_t) (x1), (int32_t) (x0) })
-#define zend_vec_set_8x16_from_64x2(x0, x1) vreinterpretq_s8_s64((int64x2_t) { (int64_t) (x1), (int64_t) (x0) })
-#define zend_vec_load_8x16(x) vld1q_s8((const int8_t *) (x))
-#define zend_vec_loadu_8x16(x) zend_vec_load_8x16(x)
-#define zend_vec_store_8x16(to, x) vst1q_s8((int8_t *) (to), x)
-#define zend_vec_storeu_8x16(to, x) zend_vec_store_8x16(to, x)
+#define _mm_set_epi64(x0, x1) vreinterpretq_s8_s64((int64x2_t) { (int64_t) (x1), (int64_t) (x0) })
+#define _mm_load_si128(x) vld1q_s8((const int8_t *) (x))
+#define _mm_loadu_si128(x) _mm_load_si128(x)
+#define _mm_store_si128(to, x) vst1q_s8((int8_t *) (to), x)
+#define _mm_storeu_si128(to, x) _mm_store_si128(to, x)
 
-#define zend_vec_or_8x16(a, b) vorrq_s8(a, b)
-#define zend_vec_xor_8x16(a, b) veorq_s8(a, b)
-#define zend_vec_and_8x16(a, b) vandq_s8(a, b)
-#define zend_vec_rshift_128_from_8x16(x, bytes) vreinterpretq_s8_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s8(x), bytes))
-#define zend_vec_lshift_128_from_8x16(x, bytes) vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), 16 - bytes))
+#define _mm_or_si128(a, b) vorrq_s8(a, b)
+#define _mm_xor_si128(a, b) veorq_s8(a, b)
+#define _mm_and_si128(a, b) vandq_s8(a, b)
+#define _mm_srli_si128(x, bytes) vreinterpretq_s8_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s8(x), bytes))
+#define _mm_slli_si128(x, bytes) vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), 16 - bytes))
 
-#define zend_vec_add_8x16(a, b) vaddq_s8(a, b)
+#define _mm_add_epi8(a, b) vaddq_s8(a, b)
 
-#define zend_vec_cmpeq_8x16(a, b) (vreinterpretq_s8_u8(vceqq_s8(a, b)))
-#define zend_vec_cmplt_8x16(a, b) (vreinterpretq_s8_u8(vcltq_s8(a, b)))
-#define zend_vec_cmpgt_8x16(a, b) (vreinterpretq_s8_u8(vcgtq_s8(a, b)))
+#define _mm_cmpeq_epi8(a, b) (vreinterpretq_s8_u8(vceqq_s8(a, b)))
+#define _mm_cmplt_epi8(a, b) (vreinterpretq_s8_u8(vcltq_s8(a, b)))
+#define _mm_cmpgt_epi8(a, b) (vreinterpretq_s8_u8(vcgtq_s8(a, b)))
 
-static zend_always_inline int zend_vec_movemask_8x16(int8x16_t x)
+static zend_always_inline int _mm_movemask_epi8(int8x16_t x)
 {
 	/**
 	 * based on code from