Skip to content

Commit 90ed1dd

Browse files
lemire and Daniel Lemire authored
One simple SIMD optimization (#402)
* Adding some SIMD. * SSE2 version. * Reformat. --------- Co-authored-by: Daniel Lemire <[email protected]>
1 parent 349d926 commit 90ed1dd

File tree

3 files changed

+70
-3
lines changed

3 files changed

+70
-3
lines changed

include/ada/common_defs.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -285,4 +285,15 @@ namespace ada {
285285
if (!(COND)) __builtin_unreachable(); \
286286
} while (0)
287287
#endif
288+
289+
// ADA_SSE2 is defined to 1 when any of the following holds:
//   - __SSE2__ is defined,
//   - one of the x86-64 target macros is defined (__x86_64__, __x86_64,
//     _M_AMD64, _M_X64),
//   - _M_IX86_FP is defined and equal to 2.
// It is left undefined by this block otherwise, so `#if ADA_SSE2` evaluates to
// false on other targets.
#if defined(__SSE2__) || defined(__x86_64__) || defined(__x86_64) || \
290+
(defined(_M_AMD64) || defined(_M_X64) || \
291+
(defined(_M_IX86_FP) && _M_IX86_FP == 2))
292+
#define ADA_SSE2 1
293+
#endif
294+
295+
// ADA_NEON is defined to 1 when either __aarch64__ or _M_ARM64 is defined
// (AArch64 targets). It is left undefined by this block otherwise, so
// `#if ADA_NEON` evaluates to false on other targets.
#if defined(__aarch64__) || defined(_M_ARM64)
296+
#define ADA_NEON 1
297+
#endif
298+
288299
#endif // ADA_COMMON_DEFS_H

include/ada/unicode.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ std::string to_unicode(std::string_view input);
7171
* @attention The has_tabs_or_newline function is a bottleneck and it is simple
7272
* enough that compilers like GCC can 'autovectorize' it.
7373
*/
74-
ada_really_inline constexpr bool has_tabs_or_newline(
74+
ada_really_inline bool has_tabs_or_newline(
7575
std::string_view user_input) noexcept;
7676

7777
/**

src/unicode.cpp

Lines changed: 58 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,11 @@ ADA_PUSH_DISABLE_ALL_WARNINGS
88
ADA_POP_DISABLE_WARNINGS
99

1010
#include <algorithm>
11+
#if ADA_NEON
12+
#include <arm_neon.h>
13+
#elif ADA_SSE2
14+
#include <emmintrin.h>
15+
#endif
1116

1217
namespace ada::unicode {
1318

@@ -39,8 +44,58 @@ constexpr bool to_lower_ascii(char* input, size_t length) noexcept {
3944
}
4045
return non_ascii == 0;
4146
}
42-
43-
ada_really_inline constexpr bool has_tabs_or_newline(
47+
#if ADA_NEON
48+
/**
 * NEON implementation: reports whether user_input contains an ASCII tab
 * ('\t'), line feed ('\n') or carriage return ('\r').
 *
 * The input is consumed in 16-byte blocks. The per-byte comparison results
 * are OR-ed into a single accumulator that is inspected once at the end, so
 * the scan has no early exit.
 *
 * @param user_input bytes to scan; may be empty.
 * @return true if at least one '\t', '\n' or '\r' byte is present.
 */
ada_really_inline bool has_tabs_or_newline(
    std::string_view user_input) noexcept {
  const uint8x16_t mask1 = vmovq_n_u8('\r');
  const uint8x16_t mask2 = vmovq_n_u8('\n');
  const uint8x16_t mask3 = vmovq_n_u8('\t');
  // Lanes are 0xFF where the byte equals one of the three characters.
  const auto matches = [&](uint8x16_t word) noexcept {
    return vorrq_u8(vorrq_u8(vceqq_u8(word, mask1), vceqq_u8(word, mask2)),
                    vceqq_u8(word, mask3));
  };

  uint8x16_t running = vmovq_n_u8(0);
  size_t i = 0;
  for (; i + 15 < user_input.size(); i += 16) {
    const uint8x16_t word =
        vld1q_u8(reinterpret_cast<const uint8_t*>(user_input.data()) + i);
    running = vorrq_u8(running, matches(word));
  }
  if (i < user_input.size()) {
    // Fewer than 16 bytes remain. Copy them into a zero-filled scratch block
    // and load from that block: loading straight from the input would read
    // past its end, and unfilled padding must be zero (which never matches
    // '\r', '\n' or '\t') to avoid spurious hits.
    uint8_t buffer[16]{};
    memcpy(buffer, user_input.data() + i, user_input.size() - i);
    const uint8x16_t word = vld1q_u8(buffer);
    running = vorrq_u8(running, matches(word));
  }
  // Any non-zero lane means at least one comparison succeeded.
  return vmaxvq_u8(running) != 0;
}
71+
#elif ADA_SSE2
72+
/**
 * SSE2 implementation: reports whether user_input contains an ASCII tab
 * ('\t'), line feed ('\n') or carriage return ('\r').
 *
 * The input is consumed in 16-byte blocks. The per-byte comparison results
 * are OR-ed into a single accumulator that is inspected once at the end, so
 * the scan has no early exit.
 *
 * @param user_input bytes to scan; may be empty.
 * @return true if at least one '\t', '\n' or '\r' byte is present.
 */
ada_really_inline bool has_tabs_or_newline(
    std::string_view user_input) noexcept {
  const __m128i mask1 = _mm_set1_epi8('\r');
  const __m128i mask2 = _mm_set1_epi8('\n');
  const __m128i mask3 = _mm_set1_epi8('\t');
  // Bytes are 0xFF where the input equals one of the three characters.
  const auto matches = [&](__m128i word) noexcept {
    return _mm_or_si128(
        _mm_or_si128(_mm_cmpeq_epi8(word, mask1), _mm_cmpeq_epi8(word, mask2)),
        _mm_cmpeq_epi8(word, mask3));
  };

  __m128i running = _mm_setzero_si128();
  size_t i = 0;
  for (; i + 15 < user_input.size(); i += 16) {
    const __m128i word = _mm_loadu_si128(
        reinterpret_cast<const __m128i*>(user_input.data() + i));
    running = _mm_or_si128(running, matches(word));
  }
  if (i < user_input.size()) {
    // Fewer than 16 bytes remain. Copy them into a zero-filled scratch block;
    // the padding must be zero (which never matches '\r', '\n' or '\t'),
    // otherwise indeterminate stack bytes could produce a spurious hit.
    uint8_t buffer[16]{};
    memcpy(buffer, user_input.data() + i, user_input.size() - i);
    const __m128i word =
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer));
    running = _mm_or_si128(running, matches(word));
  }
  // The movemask gathers the top bit of every byte; non-zero means a match.
  return _mm_movemask_epi8(running) != 0;
}
97+
#else
98+
ada_really_inline bool has_tabs_or_newline(
4499
std::string_view user_input) noexcept {
45100
auto has_zero_byte = [](uint64_t v) {
46101
return ((v - 0x0101010101010101) & ~(v)&0x8080808080808080);
@@ -71,6 +126,7 @@ ada_really_inline constexpr bool has_tabs_or_newline(
71126
}
72127
return running;
73128
}
129+
#endif
74130

75131
// A forbidden host code point is U+0000 NULL, U+0009 TAB, U+000A LF, U+000D CR,
76132
// U+0020 SPACE, U+0023 (#), U+002F (/), U+003A (:), U+003C (<), U+003E (>),

0 commit comments

Comments
 (0)