Skip to content

Commit d7b5e8b

Browse files
Refactor WIDE_READ mechanism to allow finer control over function selection
1 parent a065505 commit d7b5e8b

File tree

11 files changed

+126
-80
lines changed

11 files changed

+126
-80
lines changed

libc/cmake/modules/LLVMLibCCompileOptionRules.cmake

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -81,9 +81,8 @@ function(_get_compile_options_from_config output_var)
8181
list(APPEND config_options "-DLIBC_QSORT_IMPL=${LIBC_CONF_QSORT_IMPL}")
8282
endif()
8383

84-
if(LIBC_CONF_STRING_UNSAFE_WIDE_READ)
85-
list(APPEND config_options "-DLIBC_COPT_STRING_UNSAFE_WIDE_READ")
86-
endif()
84+
list(APPEND config_options "-DLIBC_COPT_STRING_LENGTH_IMPL=${LIBC_CONF_STRING_LENGTH_IMPL}")
85+
list(APPEND config_options "-DLIBC_COPT_FIND_FIRST_CHARACTER_IMPL=${LIBC_CONF_FIND_FIRST_CHARACTER_IMPL}")
8786

8887
if(LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING)
8988
list(APPEND config_options "-DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING")

libc/config/config.json

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
"value": false,
4141
"doc": "Use an alternative printf float implementation based on 320-bit floats"
4242
},
43+
4344
"LIBC_CONF_PRINTF_DISABLE_FIXED_POINT": {
4445
"value": false,
4546
"doc": "Disable printing fixed point values in printf and friends."
@@ -64,9 +65,13 @@
6465
}
6566
},
6667
"string": {
67-
"LIBC_CONF_STRING_UNSAFE_WIDE_READ": {
68-
"value": false,
69-
"doc": "Read more than a byte at a time to perform byte-string operations like strlen."
68+
"LIBC_CONF_STRING_LENGTH_IMPL": {
69+
"value": "element",
70+
"doc": "Selects the implementation for string-length: 'element', 'wide', 'generic' (vector), or 'arch'."
71+
},
72+
"LIBC_CONF_FIND_FIRST_CHARACTER_IMPL": {
73+
"value": "element",
74+
"doc": "Selects the implementation for find-first-character-related functions: 'element', 'wide', 'generic' (vector), or 'arch'."
7075
},
7176
"LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING": {
7277
"value": false,

libc/config/linux/arm/config.json

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
{
22
"string": {
3-
"LIBC_CONF_STRING_UNSAFE_WIDE_READ": {
4-
"value": false
3+
"LIBC_CONF_STRING_LENGTH_IMPL": {
4+
"value": "element"
5+
},
6+
"LIBC_CONF_FIND_FIRST_CHARACTER_IMPL": {
7+
"value": "element"
58
}
69
}
710
}

libc/config/linux/config.json

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
{
22
"string": {
3-
"LIBC_CONF_STRING_UNSAFE_WIDE_READ": {
4-
"value": true
3+
"LIBC_CONF_STRING_LENGTH_IMPL": {
4+
"value": "generic"
5+
},
6+
"LIBC_CONF_FIND_FIRST_CHARACTER_IMPL": {
7+
"value": "wide"
58
}
69
}
710
}
Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
{
22
"string": {
3-
"LIBC_CONF_STRING_UNSAFE_WIDE_READ": {
4-
"value": false
3+
"LIBC_CONF_STRING_LENGTH_IMPL": {
4+
"value": "element"
5+
},
6+
"LIBC_CONF_FIND_FIRST_CHARACTER_IMPL": {
7+
"value": "element"
58
}
69
}
710
}

libc/docs/configure.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,9 @@ to learn about the defaults for your platform and target.
5858
* **"setjmp" options**
5959
- ``LIBC_CONF_SETJMP_AARCH64_RESTORE_PLATFORM_REGISTER``: Make setjmp save the value of x18, and longjmp restore it. The AArch64 ABI delegates this register to platform ABIs, which can choose whether to make it caller-saved.
6060
* **"string" options**
61+
- ``LIBC_CONF_FIND_FIRST_CHARACTER_IMPL``: Selects the implementation for find-first-character-related functions: 'element', 'wide', 'generic' (vector), or 'arch'.
6162
- ``LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING``: Inserts prefetch for write instructions (PREFETCHW) for memset on x86 to recover performance when hardware prefetcher is disabled.
62-
- ``LIBC_CONF_STRING_UNSAFE_WIDE_READ``: Read more than a byte at a time to perform byte-string operations like strlen.
63+
- ``LIBC_CONF_STRING_LENGTH_IMPL``: Selects the implementation for string-length: 'element', 'wide', 'generic' (vector), or 'arch'.
6364
* **"threads" options**
6465
- ``LIBC_CONF_THREAD_MODE``: The implementation used for Mutex, acceptable values are LIBC_THREAD_MODE_PLATFORM, LIBC_THREAD_MODE_SINGLE, and LIBC_THREAD_MODE_EXTERNAL.
6566
* **"time" options**

libc/src/string/memory_utils/aarch64/inline_strlen.h

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
namespace LIBC_NAMESPACE_DECL {
1818

19-
namespace neon {
19+
namespace arch {
2020
[[maybe_unused]] LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE static size_t
2121
string_length(const char *src) {
2222
using Vector __attribute__((may_alias)) = uint8x8_t;
@@ -44,9 +44,7 @@ string_length(const char *src) {
4444
(cpp::countr_zero(cmp) >> 3));
4545
}
4646
}
47-
} // namespace neon
48-
49-
namespace string_length_impl = neon;
47+
} // namespace arch
5048

5149
} // namespace LIBC_NAMESPACE_DECL
5250
#endif // __ARM_NEON

libc/src/string/memory_utils/generic/inline_strlen.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
#include "src/__support/common.h"
1515

1616
namespace LIBC_NAMESPACE_DECL {
17-
namespace internal {
17+
namespace generic {
1818

1919
// Exploit the underlying integer representation to do a variable shift.
2020
LIBC_INLINE constexpr cpp::simd_mask<char> shift_mask(cpp::simd_mask<char> m,
@@ -46,9 +46,8 @@ LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE size_t string_length(const char *src) {
4646
cpp::find_first_set(mask);
4747
}
4848
}
49-
} // namespace internal
49+
} // namespace generic
5050

51-
namespace string_length_impl = internal;
5251
} // namespace LIBC_NAMESPACE_DECL
5352

5453
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_GENERIC_INLINE_STRLEN_H

libc/src/string/memory_utils/x86_64/inline_strlen.h

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515

1616
namespace LIBC_NAMESPACE_DECL {
1717

18-
namespace string_length_internal {
18+
namespace internal::arch {
19+
1920
// Return a bit-mask with the nth bit set if the nth-byte in block_ptr is zero.
2021
template <typename Vector, typename Mask>
2122
LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE static Mask
@@ -92,15 +93,18 @@ namespace avx512 {
9293
}
9394
} // namespace avx512
9495
#endif
95-
} // namespace string_length_internal
9696

97+
[[maybe_unused]] LIBC_INLINE size_t string_length(const char *src) {
9798
#if defined(__AVX512F__)
98-
namespace string_length_impl = string_length_internal::avx512;
99+
return avx512::string_length(src);
99100
#elif defined(__AVX2__)
100-
namespace string_length_impl = string_length_internal::avx2;
101+
return avx2::string_length(src);
101102
#else
102-
namespace string_length_impl = string_length_internal::sse2;
103+
return sse2::string_length(src);
103104
#endif
105+
}
106+
107+
} // namespace internal::arch
104108

105109
} // namespace LIBC_NAMESPACE_DECL
106110

libc/src/string/string_utils.h

Lines changed: 84 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -24,21 +24,56 @@
2424
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
2525
#include "src/string/memory_utils/inline_memcpy.h"
2626

27-
#if defined(LIBC_COPT_STRING_UNSAFE_WIDE_READ)
2827
#if LIBC_HAS_VECTOR_TYPE
2928
#include "src/string/memory_utils/generic/inline_strlen.h"
30-
#elif defined(LIBC_TARGET_ARCH_IS_X86)
29+
#endif
30+
#if defined(LIBC_TARGET_ARCH_IS_X86)
3131
#include "src/string/memory_utils/x86_64/inline_strlen.h"
32-
#elif defined(LIBC_TARGET_ARCH_IS_AARCH64) && defined(__ARM_NEON)
32+
#elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
3333
#include "src/string/memory_utils/aarch64/inline_strlen.h"
34-
#else
35-
namespace string_length_impl = LIBC_NAMESPACE::wide_read;
3634
#endif
37-
#endif // defined(LIBC_COPT_STRING_UNSAFE_WIDE_READ)
3835

3936
namespace LIBC_NAMESPACE_DECL {
4037
namespace internal {
4138

39+
#if !LIBC_HAS_VECTOR_TYPE
40+
// Forward any generic vector impls to architecture-specific ones
41+
namespace arch {}
42+
namespace generic = arch;
43+
#endif
44+
45+
namespace element {
46+
// Element-by-element (usually a byte, but wider for wchar) implementations of
47+
// functions that search for data. Slow, but easy to understand and analyze.
48+
49+
// Returns the length of a string, denoted by the first occurrence
50+
// of a null terminator.
51+
LIBC_INLINE size_t string_length(const char *src) {
52+
size_t length;
53+
for (length = 0; *src; ++src, ++length)
54+
;
55+
return length;
56+
}
57+
58+
template <typename T> LIBC_INLINE size_t string_length_element(const T *src) {
59+
size_t length;
60+
for (length = 0; *src; ++src, ++length)
61+
;
62+
return length;
63+
}
64+
65+
LIBC_INLINE void *find_first_character(const unsigned char *src,
66+
unsigned char ch, size_t n) {
67+
for (; n && *src != ch; --n, ++src)
68+
;
69+
return n ? const_cast<unsigned char *>(src) : nullptr;
70+
}
71+
} // namespace element
72+
73+
namespace wide {
74+
// Generic, non-vector, implementations of functions that search for data
75+
// by reading from memory block-by-block.
76+
4277
template <typename Word> LIBC_INLINE constexpr Word repeat_byte(Word byte) {
4378
static_assert(CHAR_BIT == 8, "repeat_byte assumes a byte is 8 bits.");
4479
constexpr size_t BITS_IN_BYTE = CHAR_BIT;
@@ -74,8 +109,13 @@ template <typename Word> LIBC_INLINE constexpr bool has_zeroes(Word block) {
74109
return (subtracted & inverted & HIGH_BITS) != 0;
75110
}
76111

77-
template <typename Word>
78-
LIBC_INLINE size_t string_length_wide_read(const char *src) {
112+
// Unsigned int is the default size for most processors, and on x86-64 it
113+
// performs better than larger sizes when the src pointer can't be assumed to
114+
// be aligned to a word boundary, so it's the size we use for reading the
115+
// string a block at a time.
116+
117+
LIBC_INLINE size_t string_length(const char *src) {
118+
using Word = unsigned int;
79119
const char *char_ptr = src;
80120
// Step 1: read 1 byte at a time to align to block size
81121
for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0;
@@ -95,37 +135,23 @@ LIBC_INLINE size_t string_length_wide_read(const char *src) {
95135
return static_cast<size_t>(char_ptr - src);
96136
}
97137

98-
namespace wide_read {
99-
LIBC_INLINE size_t string_length(const char *src) {
100-
// Unsigned int is the default size for most processors, and on x86-64 it
101-
// performs better than larger sizes when the src pointer can't be assumed to
102-
// be aligned to a word boundary, so it's the size we use for reading the
103-
// string a block at a time.
104-
return string_length_wide_read<unsigned int>(src);
105-
}
106-
107-
} // namespace wide_read
108-
109-
// Returns the length of a string, denoted by the first occurrence
110-
// of a null terminator.
111-
template <typename T> LIBC_INLINE size_t string_length(const T *src) {
112-
#ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ
113-
if constexpr (cpp::is_same_v<T, char>)
114-
return string_length_impl::string_length(src);
115-
#endif
116-
size_t length;
117-
for (length = 0; *src; ++src, ++length)
118-
;
119-
return length;
120-
}
121-
122-
template <typename Word>
123138
LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE void *
124-
find_first_character_wide_read(const unsigned char *src, unsigned char ch,
125-
size_t n) {
139+
find_first_character(const unsigned char *src, unsigned char ch,
140+
size_t max_strlen = cpp::numeric_limits<size_t>::max()) {
141+
using Word = unsigned int;
126142
const unsigned char *char_ptr = src;
127143
size_t cur = 0;
128144

145+
// If the maximum size of the string is small, the overhead of aligning to a
146+
// word boundary and generating a bitmask of the appropriate size may be
147+
// greater than the gains from reading larger chunks. Based on some testing,
148+
// the crossover point between when it's faster to just read bytewise and read
149+
// blocks is somewhere between 16 and 32, so 4 times the size of the block
150+
// should be in that range.
151+
if (max_strlen < (sizeof(Word) * 4)) {
152+
return element::find_first_character(src, ch, max_strlen);
153+
}
154+
size_t n = max_strlen;
129155
// Step 1: read 1 byte at a time to align to block size
130156
for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0 && cur < n;
131157
++char_ptr, ++cur) {
@@ -153,31 +179,35 @@ find_first_character_wide_read(const unsigned char *src, unsigned char ch,
153179
return const_cast<unsigned char *>(char_ptr);
154180
}
155181

156-
LIBC_INLINE void *find_first_character_byte_read(const unsigned char *src,
157-
unsigned char ch, size_t n) {
158-
for (; n && *src != ch; --n, ++src)
159-
;
160-
return n ? const_cast<unsigned char *>(src) : nullptr;
182+
} // namespace wide
183+
184+
// Dispatch mechanism for implementations of performance-sensitive
185+
// functions. Always measure, but generally from lower- to higher-performance
186+
// order:
187+
//
188+
// 1. element - read char-by-char or wchar-by-wchar
189+
// 2. wide - read word-by-word
190+
// 3. generic - read using clang's internal vector types
191+
// 4. arch - hand-coded per architecture. Possibly in asm, or with intrinsics.
192+
//
193+
// The called implementation is chosen at build time by setting
194+
// LIBC_CONF_{FUNC}_IMPL in config.json
195+
static constexpr auto &string_length_impl =
196+
LIBC_COPT_STRING_LENGTH_IMPL::string_length;
197+
static constexpr auto &find_first_character_impl =
198+
LIBC_COPT_FIND_FIRST_CHARACTER_IMPL::find_first_character;
199+
200+
template <typename T> LIBC_INLINE size_t string_length(const T *src) {
201+
if constexpr (cpp::is_same_v<T, char>)
202+
return string_length_impl(src);
203+
return element::string_length_element<T>(src);
161204
}
162205

163206
// Returns the first occurrence of 'ch' within the first 'n' characters of
164207
// 'src'. If 'ch' is not found, returns nullptr.
165208
LIBC_INLINE void *find_first_character(const unsigned char *src,
166209
unsigned char ch, size_t max_strlen) {
167-
#ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ
168-
// If the maximum size of the string is small, the overhead of aligning to a
169-
// word boundary and generating a bitmask of the appropriate size may be
170-
// greater than the gains from reading larger chunks. Based on some testing,
171-
// the crossover point between when it's faster to just read bytewise and read
172-
// blocks is somewhere between 16 and 32, so 4 times the size of the block
173-
// should be in that range.
174-
// Unsigned int is used for the same reason as in strlen.
175-
using BlockType = unsigned int;
176-
if (max_strlen > (sizeof(BlockType) * 4)) {
177-
return find_first_character_wide_read<BlockType>(src, ch, max_strlen);
178-
}
179-
#endif
180-
return find_first_character_byte_read(src, ch, max_strlen);
210+
return find_first_character_impl(src, ch, max_strlen);
181211
}
182212

183213
// Returns the maximum length span that contains only characters not found in

0 commit comments

Comments
 (0)