@@ -297,13 +297,43 @@ ALWAYS_INLINE(static) char *copy_remaining_bytes(search_state *search, unsigned
297297 char * s = (buf -> ptr + buf -> len );
298298
299299 // Pad the buffer with dummy characters that won't need escaping.
300- // This seem wateful at first sight, but memset of vector length is very fast.
301- memset (s , 'X' , vec_len );
300+ // This seem wasteful at first sight, but memset of vector length is very fast.
301+ // This is a space as it can be directly represented as an immediate on AArch64.
302+ memset (s , ' ' , vec_len );
302303
303304 // Optimistically copy the remaining 'len' characters to the output FBuffer. If there are no characters
304305 // to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage.
305- MEMCPY (s , search -> ptr , char , len );
306+ #if defined(__has_builtin ) && __has_builtin (__builtin_memcpy )
307+
308+ #ifdef RBIMPL_ASSERT_OR_ASSUME
309+ RBIMPL_ASSERT_OR_ASSUME (len < 16 );
310+ #endif
306311
312+ if (vec_len == 16 && len >= 4 ) {
313+ // If __builtin_memcpy is available, use it to copy between SIMD_MINIMUM_THRESHOLD and vec_len-1 bytes.
314+ // These copies overlap. The first copy will copy the first 8 (or 4) bytes. The second copy will copy
315+ // the last 8 (or 4) bytes but overlap with the first copy. The overlapping bytes will be in the correct
316+ // position in both copies.
317+
318+ // Please do not attempt to replace __builtin_memcpy with memcpy without profiling and/or looking at the
319+ // generated assembly. On clang-specifically (tested on Apple clang version 17.0.0 (clang-1700.0.13.3)),
320+ // when using memcpy, the compiler will notice the only difference is a 4 or 8 and generate a conditional
321+ // select instruction instead of direct loads and stores with a branch. This ends up slower than the branch
322+ // plus two loads and stores generated when using __builtin_memcpy.
323+ if (len >= 8 ) {
324+ __builtin_memcpy (s , search -> ptr , 8 );
325+ __builtin_memcpy (s + len - 8 , search -> ptr + len - 8 , 8 );
326+ } else {
327+ __builtin_memcpy (s , search -> ptr , 4 );
328+ __builtin_memcpy (s + len - 4 , search -> ptr + len - 4 , 4 );
329+ }
330+ } else {
331+ MEMCPY (s , search -> ptr , char , len );
332+ }
333+ #else
334+ MEMCPY (s , search -> ptr , char , len );
335+ #endif
336+
307337 return s ;
308338}
309339
0 commit comments