|
9 | 9 | #include <string.h>
|
10 | 10 | #include <stdint.h>
|
11 | 11 | #include <stdio.h>
|
| 12 | +#if defined(__x86_64__) |
| 13 | +#include <emmintrin.h> |
| 14 | +#include <xmmintrin.h> |
| 15 | +#endif |
| 16 | + |
12 | 17 | #include "text_cbits.h"
|
13 | 18 |
|
| 19 | + |
14 | 20 | void _hs_text_memcpy(void *dest, size_t doff, const void *src, size_t soff,
|
15 | 21 | size_t n)
|
16 | 22 | {
|
@@ -157,24 +163,40 @@ _hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
|
157 | 163 | */
|
158 | 164 |
|
159 | 165 | if (state == UTF8_ACCEPT) {
|
| 166 | +#if defined(__x86_64__) |
| 167 | + const __m128i zeros = _mm_set1_epi32(0); |
| 168 | + while (s < srcend - 8) { |
| 169 | + const uint64_t hopefully_eight_ascii_chars = *((uint64_t *) s); |
| 170 | + if ((hopefully_eight_ascii_chars & 0x8080808080808080LL) != 0LL) |
| 171 | + break; |
| 172 | + s += 8; |
| 173 | + |
| 174 | + /* Load 8 bytes of ASCII data */ |
| 175 | + const __m128i eight_ascii_chars = _mm_cvtsi64_si128(hopefully_eight_ascii_chars); |
| 176 | + /* Interleave with zeros */ |
| 177 | + const __m128i eight_utf16_chars = _mm_unpacklo_epi8(eight_ascii_chars, zeros); |
| 178 | + /* Store the resulting 8 UTF-16 code units (16 bytes) into destination */ |
| 179 | + _mm_storeu_si128((__m128i *)d, eight_utf16_chars); |
| 180 | + d += 8; |
| 181 | + } |
| 182 | +#else |
160 | 183 | while (s < srcend - 4) {
|
161 |
| - codepoint = *((uint32_t *) s); |
162 |
| - if ((codepoint & 0x80808080) != 0) |
163 |
| - break; |
164 |
| - s += 4; |
165 |
| - |
166 |
| - /* |
167 |
| - * Tried 32-bit stores here, but the extra bit-twiddling |
168 |
| - * slowed the code down. |
169 |
| - */ |
170 |
| - |
171 |
| - *d++ = (uint16_t) (codepoint & 0xff); |
172 |
| - *d++ = (uint16_t) ((codepoint >> 8) & 0xff); |
173 |
| - *d++ = (uint16_t) ((codepoint >> 16) & 0xff); |
174 |
| - *d++ = (uint16_t) ((codepoint >> 24) & 0xff); |
| 184 | + codepoint = *((uint32_t *) s); |
| 185 | + if ((codepoint & 0x80808080) != 0) |
| 186 | + break; |
| 187 | + s += 4; |
| 188 | + /* |
| 189 | + * Tried 32-bit stores here, but the extra bit-twiddling |
| 190 | + * slowed the code down. |
| 191 | + */ |
| 192 | + *d++ = (uint16_t) (codepoint & 0xff); |
| 193 | + *d++ = (uint16_t) ((codepoint >> 8) & 0xff); |
| 194 | + *d++ = (uint16_t) ((codepoint >> 16) & 0xff); |
| 195 | + *d++ = (uint16_t) ((codepoint >> 24) & 0xff); |
175 | 196 | }
|
| 197 | +#endif |
176 | 198 | last = s;
|
177 |
| - } |
| 199 | + } /* end if (state == UTF8_ACCEPT) */ |
178 | 200 | #endif
|
179 | 201 |
|
180 | 202 | if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
|
|
0 commit comments