@@ -288,6 +288,8 @@ static inline void escape_UTF8_char(search_state *search, unsigned char ch_len)
 
 ALWAYS_INLINE(static) char *copy_remaining_bytes(search_state *search, unsigned long vec_len, unsigned long len)
 {
+    RBIMPL_ASSERT_OR_ASSUME(len < vec_len);
+
     // Flush the buffer so everything up until the last 'len' characters are unflushed.
     search_flush(search);
 
@@ -303,37 +305,13 @@ ALWAYS_INLINE(static) char *copy_remaining_bytes(search_state *search, unsigned
 
     // Optimistically copy the remaining 'len' characters to the output FBuffer. If there are no characters
     // to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage.
-#if defined(__has_builtin) && __has_builtin(__builtin_memcpy)
-
-#ifdef RBIMPL_ASSERT_OR_ASSUME
-    RBIMPL_ASSERT_OR_ASSUME(len < 16);
-#endif
-
-    if (vec_len == 16 && len >= 4) {
-        // If __builtin_memcpy is available, use it to copy between SIMD_MINIMUM_THRESHOLD and vec_len-1 bytes.
-        // These copies overlap. The first copy will copy the first 8 (or 4) bytes. The second copy will copy
-        // the last 8 (or 4) bytes but overlap with the first copy. The overlapping bytes will be in the correct
-        // position in both copies.
-
-        // Please do not attempt to replace __builtin_memcpy with memcpy without profiling and/or looking at the
-        // generated assembly. On clang specifically (tested on Apple clang version 17.0.0 (clang-1700.0.13.3)),
-        // when using memcpy, the compiler will notice the only difference is a 4 or 8 and generate a conditional
-        // select instruction instead of direct loads and stores with a branch. This ends up slower than the branch
-        // plus two loads and stores generated when using __builtin_memcpy.
-        if (len >= 8) {
-            __builtin_memcpy(s, search->ptr, 8);
-            __builtin_memcpy(s + len - 8, search->ptr + len - 8, 8);
-        } else {
-            __builtin_memcpy(s, search->ptr, 4);
-            __builtin_memcpy(s + len - 4, search->ptr + len - 4, 4);
-        }
+    if (vec_len == 16) {
+        RBIMPL_ASSERT_OR_ASSUME(len >= SIMD_MINIMUM_THRESHOLD);
+        json_fast_memcpy16(s, search->ptr, len);
     } else {
         MEMCPY(s, search->ptr, char, len);
     }
-#else
-    MEMCPY(s, search->ptr, char, len);
-#endif
-
+
     return s;
 }
 
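The hunk above replaces the inline overlapping-copy code with a call to json_fast_memcpy16, whose definition is not part of this diff. As a rough sketch, assuming the helper simply packages the overlapping __builtin_memcpy technique that the removed comment describes (and that SIMD_MINIMUM_THRESHOLD is 4, as the removed len >= 4 check suggests), it could look like this; the body below is an illustration, not the gem's actual implementation:

static inline void json_fast_memcpy16(char *dst, const char *src, unsigned long len)
{
    // Hypothetical body: copy 'len' bytes (4 <= len < 16) with two fixed-size
    // overlapping copies, so the compiler can emit plain loads and stores
    // instead of a variable-length memcpy call.
    if (len >= 8) {
        __builtin_memcpy(dst, src, 8);                      // first 8 bytes
        __builtin_memcpy(dst + len - 8, src + len - 8, 8);  // last 8 bytes; the middle overlaps safely
    } else {
        __builtin_memcpy(dst, src, 4);                      // first 4 bytes
        __builtin_memcpy(dst + len - 4, src + len - 4, 4);  // last 4 bytes
    }
}

Folding the trick into a named helper keeps the caller readable while the vec_len == 16 guard still routes the non-SIMD path through plain MEMCPY.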