
Commit 3164d4e

byroot authored and matzbot committed
[ruby/json] Extract json_fast_memcpy16 for readability
ruby/json@1b276c8623
1 parent 456ef91 commit 3164d4e

Showing 3 changed files with 37 additions and 28 deletions.


ext/json/generator/generator.c

Lines changed: 6 additions & 28 deletions
@@ -288,6 +288,8 @@ static inline void escape_UTF8_char(search_state *search, unsigned char ch_len)
 
 ALWAYS_INLINE(static) char *copy_remaining_bytes(search_state *search, unsigned long vec_len, unsigned long len)
 {
+    RBIMPL_ASSERT_OR_ASSUME(len < vec_len);
+
     // Flush the buffer so everything up until the last 'len' characters are unflushed.
     search_flush(search);
 
@@ -303,37 +305,13 @@ ALWAYS_INLINE(static) char *copy_remaining_bytes(search_state *search, unsigned
 
     // Optimistically copy the remaining 'len' characters to the output FBuffer. If there are no characters
     // to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage.
-#if defined(__has_builtin) && __has_builtin(__builtin_memcpy)
-
-#ifdef RBIMPL_ASSERT_OR_ASSUME
-    RBIMPL_ASSERT_OR_ASSUME(len < 16);
-#endif
-
-    if (vec_len == 16 && len >= 4) {
-        // If __builtin_memcpy is available, use it to copy between SIMD_MINIMUM_THRESHOLD and vec_len-1 bytes.
-        // These copies overlap. The first copy will copy the first 8 (or 4) bytes. The second copy will copy
-        // the last 8 (or 4) bytes but overlap with the first copy. The overlapping bytes will be in the correct
-        // position in both copies.
-
-        // Please do not attempt to replace __builtin_memcpy with memcpy without profiling and/or looking at the
-        // generated assembly. On clang-specifically (tested on Apple clang version 17.0.0 (clang-1700.0.13.3)),
-        // when using memcpy, the compiler will notice the only difference is a 4 or 8 and generate a conditional
-        // select instruction instead of direct loads and stores with a branch. This ends up slower than the branch
-        // plus two loads and stores generated when using __builtin_memcpy.
-        if (len >= 8) {
-            __builtin_memcpy(s, search->ptr, 8);
-            __builtin_memcpy(s + len - 8, search->ptr + len - 8, 8);
-        } else {
-            __builtin_memcpy(s, search->ptr, 4);
-            __builtin_memcpy(s + len - 4, search->ptr + len - 4, 4);
-        }
+    if (vec_len == 16) {
+        RBIMPL_ASSERT_OR_ASSUME(len >= SIMD_MINIMUM_THRESHOLD);
+        json_fast_memcpy16(s, search->ptr, len);
     } else {
         MEMCPY(s, search->ptr, char, len);
     }
-#else
-    MEMCPY(s, search->ptr, char, len);
-#endif
-
+
     return s;
 }
 

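A note on the codegen warning in the deleted comment above: the claim is that with plain memcpy, clang notices the two branches differ only in the constant 4 vs 8 and folds them into a conditional select of the size operand, which benchmarked slower than a branch with fixed-size loads and stores. Below is a minimal standalone sketch for comparing the two, under the assumption that you compile it with clang -O2 -S and diff the assembly; the file name and function names are illustrative, not part of the commit, and the effect is compiler- and version-dependent (observed on Apple clang 17 per the comment):

/* tail_copy_repro.c -- hypothetical repro, not from the commit.
 * Build: clang -O2 -S tail_copy_repro.c -o - */
#include <string.h>

/* The variant the comment warns against: plain memcpy. Clang may
 * collapse the 4-vs-8 choice into a conditional select. */
void tail_copy_memcpy(char *dst, const char *src, unsigned long len)
{
    if (len >= 8) {
        memcpy(dst, src, 8);
        memcpy(dst + len - 8, src + len - 8, 8);
    } else {
        memcpy(dst, src, 4);
        memcpy(dst + len - 4, src + len - 4, 4);
    }
}

/* The variant the commit keeps: __builtin_memcpy, reported to stay
 * a branch plus two fixed-size load/store pairs. */
void tail_copy_builtin(char *dst, const char *src, unsigned long len)
{
    if (len >= 8) {
        __builtin_memcpy(dst, src, 8);
        __builtin_memcpy(dst + len - 8, src + len - 8, 8);
    } else {
        __builtin_memcpy(dst, src, 4);
        __builtin_memcpy(dst + len - 4, src + len - 4, 4);
    }
}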
ext/json/json.h

Lines changed: 4 additions & 0 deletions
@@ -5,6 +5,10 @@
 #include "ruby/encoding.h"
 #include <stdint.h>
 
+#ifndef RBIMPL_ASSERT_OR_ASSUME
+# define RBIMPL_ASSERT_OR_ASSUME(x)
+#endif
+
 #if defined(RUBY_DEBUG) && RUBY_DEBUG
 # define JSON_ASSERT RUBY_ASSERT
 #else

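The point of this fallback: the old generator.c code had to wrap its assert in #ifdef RBIMPL_ASSERT_OR_ASSUME because the macro is an internal Ruby header detail that isn't always available; with the no-op default in json.h, call sites can assert unconditionally. A tiny standalone illustration (the caller function is hypothetical, not from the commit):

#include <stddef.h>

/* Same fallback as json.h: expands to nothing when ruby's headers
 * don't define the macro. */
#ifndef RBIMPL_ASSERT_OR_ASSUME
# define RBIMPL_ASSERT_OR_ASSUME(x)
#endif

/* Hypothetical caller: no #ifdef guard needed around the assert. */
static char first_byte(const char *buf, size_t len)
{
    RBIMPL_ASSERT_OR_ASSUME(len > 0); /* preprocesses to a bare `;` here */
    return buf[0];
}

int main(void)
{
    return first_byte("json", 4) == 'j' ? 0 : 1;
}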
ext/json/simd/simd.h

Lines changed: 27 additions & 0 deletions
@@ -60,6 +60,33 @@ static inline int trailing_zeros(int input)
 
 #define SIMD_MINIMUM_THRESHOLD 4
 
+ALWAYS_INLINE(static) void json_fast_memcpy16(char *dst, const char *src, size_t len)
+{
+    RBIMPL_ASSERT_OR_ASSUME(len < 16);
+    RBIMPL_ASSERT_OR_ASSUME(len >= SIMD_MINIMUM_THRESHOLD); // 4
+#if defined(__has_builtin) && __has_builtin(__builtin_memcpy)
+    // If __builtin_memcpy is available, use it to copy between SIMD_MINIMUM_THRESHOLD (4) and vec_len-1 (15) bytes.
+    // These copies overlap. The first copy will copy the first 8 (or 4) bytes. The second copy will copy
+    // the last 8 (or 4) bytes but overlap with the first copy. The overlapping bytes will be in the correct
+    // position in both copies.
+
+    // Please do not attempt to replace __builtin_memcpy with memcpy without profiling and/or looking at the
+    // generated assembly. On clang-specifically (tested on Apple clang version 17.0.0 (clang-1700.0.13.3)),
+    // when using memcpy, the compiler will notice the only difference is a 4 or 8 and generate a conditional
+    // select instruction instead of direct loads and stores with a branch. This ends up slower than the branch
+    // plus two loads and stores generated when using __builtin_memcpy.
+    if (len >= 8) {
+        __builtin_memcpy(dst, src, 8);
+        __builtin_memcpy(dst + len - 8, src + len - 8, 8);
+    } else {
+        __builtin_memcpy(dst, src, 4);
+        __builtin_memcpy(dst + len - 4, src + len - 4, 4);
+    }
+#else
+    MEMCPY(dst, src, char, len);
+#endif
+}
+
 #if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
 #include <arm_neon.h>
 
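Why the two overlapping fixed-size copies cover every valid length: for 8 <= len <= 15 the pair writes [0, 8) and [len-8, len), and their union is [0, len) because len - 8 <= 7; for 4 <= len <= 7 the 4-byte pair covers [0, 4) and [len-4, len) by the same argument, and overlapping bytes land at the same position in both copies. Below is a standalone sketch that checks this for all twelve lengths; the stand-in definitions of ALWAYS_INLINE, RBIMPL_ASSERT_OR_ASSUME and MEMCPY are assumptions for the demo, since the real ones come from ruby/json's and Ruby's headers:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* Stand-in macros so the helper compiles outside ruby/json (assumptions). */
#define ALWAYS_INLINE(x) x inline
#define RBIMPL_ASSERT_OR_ASSUME(expr) assert(expr)
#define MEMCPY(p1, p2, type, n) memcpy((p1), (p2), sizeof(type) * (n))
#define SIMD_MINIMUM_THRESHOLD 4

/* Trimmed copy of the new helper: two overlapping fixed-size copies. */
ALWAYS_INLINE(static) void json_fast_memcpy16(char *dst, const char *src, size_t len)
{
    RBIMPL_ASSERT_OR_ASSUME(len < 16);
    RBIMPL_ASSERT_OR_ASSUME(len >= SIMD_MINIMUM_THRESHOLD);
#if defined(__has_builtin) && __has_builtin(__builtin_memcpy)
    if (len >= 8) {
        __builtin_memcpy(dst, src, 8);                     /* covers [0, 8)       */
        __builtin_memcpy(dst + len - 8, src + len - 8, 8); /* covers [len-8, len) */
    } else {
        __builtin_memcpy(dst, src, 4);                     /* covers [0, 4)       */
        __builtin_memcpy(dst + len - 4, src + len - 4, 4); /* covers [len-4, len) */
    }
#else
    MEMCPY(dst, src, char, len);
#endif
}

int main(void)
{
    char src[16], dst[16];
    for (int i = 0; i < 16; i++) src[i] = (char)('a' + i);

    /* Exercise every length the helper's preconditions allow: 4..15. */
    for (size_t len = SIMD_MINIMUM_THRESHOLD; len < 16; len++) {
        memset(dst, 0, sizeof(dst));
        json_fast_memcpy16(dst, src, len);
        assert(memcmp(dst, src, len) == 0); /* union of both copies is [0, len) */
        assert(dst[len] == 0);              /* nothing written at or past len   */
    }
    puts("json_fast_memcpy16: all lengths 4..15 copied correctly");
    return 0;
}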