|
8 | 8 | #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_STRLEN_H |
9 | 9 | #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_STRLEN_H |
10 | 10 |
|
| 11 | +#include "src/__support/CPP/bit.h" // countr_zero |
11 | 12 | #include "src/string/memory_utils/op_x86.h" // K_AVX |
12 | 13 |
|
13 | 14 | #include <stddef.h> // size_t |
14 | | -#include <x86intrin.h> |
| 15 | + |
15 | 16 | namespace LIBC_NAMESPACE_DECL { |
16 | 17 |
|
17 | | -#if defined(__SSE2__) |
18 | 18 | [[maybe_unused]] LIBC_INLINE size_t string_length_sse2(const char *src) { |
19 | 19 | using Vector __attribute__((may_alias)) = __m128i; |
| 20 | + |
20 | 21 | Vector z = _mm_setzero_si128(); |
21 | 22 | uintptr_t misalign_bytes = reinterpret_cast<uintptr_t>(src) % sizeof(Vector); |
22 | 23 | const Vector *block_ptr = |
23 | 24 | reinterpret_cast<const Vector *>(src - misalign_bytes); |
24 | | - if (misalign_bytes) { |
25 | | - Vector v = _mm_load_si128(block_ptr); |
26 | | - Vector vcmp = _mm_cmpeq_epi8(z, v); |
27 | | - // shift away results in irrelevant bytes. |
28 | | - int cmp = _mm_movemask_epi8(vcmp) >> misalign_bytes; |
29 | | - if (cmp) |
30 | | - return __builtin_ctz(cmp); |
31 | | - block_ptr++; |
32 | | - } |
| 25 | + Vector v = _mm_load_si128(block_ptr); |
| 26 | + Vector vcmp = _mm_cmpeq_epi8(z, v); |
| 27 | + // shift away results in irrelevant bytes. |
| 28 | + uint32_t cmp = _mm_movemask_epi8(vcmp) >> misalign_bytes; |
| 29 | + if (cmp) |
| 30 | + return cpp::countr_zero(cmp); |
| 31 | + |
33 | 32 | while (true) { |
34 | | - Vector v = _mm_load_si128(block_ptr); |
35 | | - Vector vcmp = _mm_cmpeq_epi8(z, v); |
36 | | - int cmp = _mm_movemask_epi8(vcmp); |
| 33 | + block_ptr++; |
| 34 | + v = _mm_load_si128(block_ptr); |
| 35 | + vcmp = _mm_cmpeq_epi8(z, v); |
| 36 | + cmp = _mm_movemask_epi8(vcmp); |
37 | 37 | if (cmp) |
38 | 38 | return static_cast<size_t>(reinterpret_cast<uintptr_t>(block_ptr) - |
39 | 39 | reinterpret_cast<uintptr_t>(src) + |
40 | | - __builtin_ctz(cmp)); |
41 | | - block_ptr++; |
| 40 | + cpp::countr_zero(cmp)); |
42 | 41 | } |
43 | 42 | } |
44 | | -#endif |
45 | 43 |
|
46 | 44 | #if defined(__AVX2__) |
47 | 45 | [[maybe_unused]] LIBC_INLINE size_t string_length_avx2(const char *src) { |
48 | 46 | using Vector __attribute__((may_alias)) = __mm256i; |
| 47 | + |
49 | 48 | Vector z = _mm256_setzero_si256(); |
50 | 49 | uintptr_t misalign_bytes = reinterpret_cast<uintptr_t>(src) % sizeof(Vector); |
51 | 50 | const Vector *block_ptr = |
52 | 51 | reinterpret_cast<const Vector *>(src - misalign_bytes); |
53 | | - if (misalign_bytes) { |
54 | | - Vector v = _mm256_load_si256(block_ptr); |
55 | | - Vector vcmp = _mm256_cmpeq_epi8(z, v); |
56 | | - // shift away results in irrelevant bytes. |
57 | | - int cmp = _mm256_movemask_epi8(vcmp) >> misalign_bytes; |
58 | | - if (cmp) |
59 | | - return __builtin_ctz(cmp); |
60 | | - block_ptr++; |
61 | | - } |
| 52 | + Vector v = _mm256_load_si256(block_ptr); |
| 53 | + Vector vcmp = _mm256_cmpeq_epi8(z, v); |
| 54 | + // shift away results in irrelevant bytes. |
| 55 | + int cmp = _mm256_movemask_epi8(vcmp) >> misalign_bytes; |
| 56 | + if (cmp) |
| 57 | + return cpp::countr_zero(cmp); |
| 58 | + |
62 | 59 | while (true) { |
63 | | - Vector v = _mm256_load_si256(block_ptr); |
64 | | - Vector vcmp = _mm256_cmpeq_epi8(z, v); |
65 | | - int cmp = _mm256_movemask_epi8(vcmp); |
| 60 | + block_ptr++; |
| 61 | + v = _mm256_load_si256(block_ptr); |
| 62 | + vcmp = _mm256_cmpeq_epi8(z, v); |
| 63 | + cmp = _mm256_movemask_epi8(vcmp); |
66 | 64 | if (cmp) |
67 | 65 | return static_cast<size_t>(reinterpret_cast<uintptr_t>(block_ptr) - |
68 | 66 | reinterpret_cast<uintptr_t>(src) + |
69 | | - __builtin_ctz(cmp)); |
70 | | - block_ptr++; |
| 67 | + cpp::countr_zero(cmp)); |
71 | 68 | } |
72 | 69 | } |
73 | | -#endif // __AVX__ |
| 70 | +#endif // __AVX2__ |
74 | 71 |
|
75 | 72 | #if defined(__AVX512F__) |
76 | 73 | [[maybe_unused]] LIBC_INLINE size_t string_length_avx512(const char *src) { |
77 | 74 | using Vector __attribute__((may_alias)) = __mm512i; |
| 75 | + |
78 | 76 | Vector z = _mm512_setzero_si512(); |
79 | 77 | uintptr_t misalign_bytes = reinterpret_cast<uintptr_t>(src) % sizeof(Vector); |
80 | 78 | const Vector *block_ptr = |
81 | 79 | reinterpret_cast<const Vector *>(src - misalign_bytes); |
82 | | - if (misalign_bytes) { |
83 | | - Vector v = _mm512_load_si512(block_ptr); |
84 | | - __mmask64 cmp = _mm512_cmp_epu8_mask(z, v, _MM_CMPINT_EQ) >> misalign_bytes; |
85 | | - if (cmp) |
86 | | - return __builtin_ctzl(cmp); |
87 | | - block_ptr++; |
88 | | - } |
| 80 | + Vector v = _mm512_load_si512(block_ptr); |
| 81 | + __mmask64 cmp = _mm512_cmp_epu8_mask(z, v, _MM_CMPINT_EQ) >> misalign_bytes; |
| 82 | + if (cmp) |
| 83 | + return cpp::countr_zero(cmp); |
| 84 | + |
89 | 85 | while (true) { |
| 86 | + block_ptr++; |
90 | 87 | Vector v = _mm512_load_si512(block_ptr); |
91 | 88 | __mmask64 cmp = _mm512_cmp_epu8_mask(z, v, _MM_CMPINT_EQ); |
92 | 89 | if (cmp) |
93 | 90 | return static_cast<size_t>(reinterpret_cast<uintptr_t>(block_ptr) - |
94 | 91 | reinterpret_cast<uintptr_t>(src) + |
95 | | - __builtin_ctz(cmp)); |
96 | | - block_ptr++; |
| 92 | + cpp::countr_zero(cmp)); |
97 | 93 | } |
98 | 94 | } |
99 | 95 | #endif // __AVX512F__ |
100 | 96 |
|
| 97 | +namespace x86 { |
101 | 98 | template <typename T> LIBC_INLINE size_t string_length_x86_64(const char *src) { |
102 | 99 | #if defined(__AVX512F__) |
103 | 100 | return string_length_avx512(src); |
104 | | -#endif |
105 | | -#if defined(__AVX__) |
| 101 | +#elif defined(__AVX2__) |
106 | 102 | return string_length_avx2(src); |
107 | 103 | #endif |
108 | 104 | return string_length_sse2(src); |
109 | 105 | } |
| 106 | +} |
110 | 107 |
|
111 | 108 | } // namespace LIBC_NAMESPACE_DECL |
112 | 109 |
|
|
0 commit comments