Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions libc/cmake/modules/LLVMLibCCompileOptionRules.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,8 @@ function(_get_compile_options_from_config output_var)
list(APPEND config_options "-DLIBC_QSORT_IMPL=${LIBC_CONF_QSORT_IMPL}")
endif()

if(LIBC_CONF_STRING_UNSAFE_WIDE_READ)
list(APPEND config_options "-DLIBC_COPT_STRING_UNSAFE_WIDE_READ")
endif()
list(APPEND config_options "-DLIBC_COPT_STRING_LENGTH_IMPL=${LIBC_CONF_STRING_LENGTH_IMPL}")
list(APPEND config_options "-DLIBC_COPT_FIND_FIRST_CHARACTER_IMPL=${LIBC_CONF_FIND_FIRST_CHARACTER_IMPL}")

if(LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING)
list(APPEND config_options "-DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING")
Expand Down
11 changes: 8 additions & 3 deletions libc/config/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
"value": false,
"doc": "Use an alternative printf float implementation based on 320-bit floats"
},

"LIBC_CONF_PRINTF_DISABLE_FIXED_POINT": {
"value": false,
"doc": "Disable printing fixed point values in printf and friends."
Expand All @@ -64,9 +65,13 @@
}
},
"string": {
"LIBC_CONF_STRING_UNSAFE_WIDE_READ": {
"value": false,
"doc": "Read more than a byte at a time to perform byte-string operations like strlen."
"LIBC_CONF_STRING_LENGTH_IMPL": {
"value": "element",
"doc": "Selects the implementation for string-length: 'element', 'wide', 'generic' (vector), or 'arch'."
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel like we could bikeshed on the names forever, so I'll try to only make one round of suggestions:
wide -> word: all of the non-element ones are "wide" in the sense that they read more than one element at a time. The difference for this implementation is that it reads one word at a time, more than element but less than the full vector implementations.
generic -> clang_vector: generic sounds like it'd be the trivial default, whereas this is an optimized but multiplatform implementation using clang specific intrinsics.
arch -> arch_vector: parallel to clang_vector, makes it clear that this is arch specific whereas clang_vector is clang specific.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

},
"LIBC_CONF_FIND_FIRST_CHARACTER_IMPL": {
"value": "element",
"doc": "Selects the implementation for find-first-character-related functions: 'element', 'wide', 'generic' (vector), or 'arch'."
},
"LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING": {
"value": false,
Expand Down
7 changes: 5 additions & 2 deletions libc/config/linux/arm/config.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
{
"string": {
"LIBC_CONF_STRING_UNSAFE_WIDE_READ": {
"value": false
"LIBC_CONF_STRING_LENGTH_IMPL": {
"value": "element"
}
"LIBC_CONF_FIND_FIRST_CHARACTER_IMPL": {
"value": "element"
}
}
}
7 changes: 5 additions & 2 deletions libc/config/linux/config.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
{
"string": {
"LIBC_CONF_STRING_UNSAFE_WIDE_READ": {
"value": true
"LIBC_CONF_STRING_LENGTH_IMPL": {
"value": "generic",
},
"LIBC_CONF_FIND_FIRST_CHARACTER_IMPL": {
"value": "wide",
}
}
}
7 changes: 5 additions & 2 deletions libc/config/linux/riscv/config.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
{
"string": {
"LIBC_CONF_STRING_UNSAFE_WIDE_READ": {
"value": false
"LIBC_CONF_STRING_LENGTH_IMPL": {
"value": "element"
}
"LIBC_CONF_FIND_FIRST_CHARACTER_IMPL": {
"value": "element"
}
}
}
3 changes: 2 additions & 1 deletion libc/docs/configure.rst
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,9 @@ to learn about the defaults for your platform and target.
* **"setjmp" options**
- ``LIBC_CONF_SETJMP_AARCH64_RESTORE_PLATFORM_REGISTER``: Make setjmp save the value of x18, and longjmp restore it. The AArch64 ABI delegates this register to platform ABIs, which can choose whether to make it caller-saved.
* **"string" options**
- ``LIBC_CONF_FIND_FIRST_CHARACTER_IMPL``: Selects the implementation for find-first-character-related functions: 'element', 'wide', 'generic' (vector), or 'arch'.
- ``LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING``: Inserts prefetch for write instructions (PREFETCHW) for memset on x86 to recover performance when hardware prefetcher is disabled.
- ``LIBC_CONF_STRING_UNSAFE_WIDE_READ``: Read more than a byte at a time to perform byte-string operations like strlen.
- ``LIBC_CONF_STRING_LENGTH_IMPL``: Selects the implementation for string-length: 'element', 'wide', 'generic' (vector), or 'arch'.
* **"threads" options**
- ``LIBC_CONF_THREAD_MODE``: The implementation used for Mutex, acceptable values are LIBC_THREAD_MODE_PLATFORM, LIBC_THREAD_MODE_SINGLE, and LIBC_THREAD_MODE_EXTERNAL.
* **"time" options**
Expand Down
6 changes: 2 additions & 4 deletions libc/src/string/memory_utils/aarch64/inline_strlen.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

namespace LIBC_NAMESPACE_DECL {

namespace neon {
namespace internal::arch {
[[maybe_unused]] LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE static size_t
string_length(const char *src) {
using Vector __attribute__((may_alias)) = uint8x8_t;
Expand Down Expand Up @@ -44,9 +44,7 @@ string_length(const char *src) {
(cpp::countr_zero(cmp) >> 3));
}
}
} // namespace neon

namespace string_length_impl = neon;
} // namespace internal::arch

} // namespace LIBC_NAMESPACE_DECL
#endif // __ARM_NEON
Expand Down
5 changes: 2 additions & 3 deletions libc/src/string/memory_utils/generic/inline_strlen.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
#include "src/__support/common.h"

namespace LIBC_NAMESPACE_DECL {
namespace internal {
namespace generic {

// Exploit the underlying integer representation to do a variable shift.
LIBC_INLINE constexpr cpp::simd_mask<char> shift_mask(cpp::simd_mask<char> m,
Expand Down Expand Up @@ -46,9 +46,8 @@ LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE size_t string_length(const char *src) {
cpp::find_first_set(mask);
}
}
} // namespace internal
} // namespace generic

namespace string_length_impl = internal;
} // namespace LIBC_NAMESPACE_DECL

#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_GENERIC_INLINE_STRLEN_H
14 changes: 9 additions & 5 deletions libc/src/string/memory_utils/x86_64/inline_strlen.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@

namespace LIBC_NAMESPACE_DECL {

namespace string_length_internal {
namespace internal::arch {

// Return a bit-mask with the nth bit set if the nth-byte in block_ptr is zero.
template <typename Vector, typename Mask>
LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE static Mask
Expand Down Expand Up @@ -92,15 +93,18 @@ namespace avx512 {
}
} // namespace avx512
#endif
} // namespace string_length_internal

[[maybe_unused]] LIBC_INLINE size_t string_length(const char *src) {
#if defined(__AVX512F__)
namespace string_length_impl = string_length_internal::avx512;
return avx512::string_length(src);
#elif defined(__AVX2__)
namespace string_length_impl = string_length_internal::avx2;
return avx2::string_length(src);
#else
namespace string_length_impl = string_length_internal::sse2;
return sse2::string_length(src);
#endif
}

} // namespace internal::arch

} // namespace LIBC_NAMESPACE_DECL

Expand Down
138 changes: 84 additions & 54 deletions libc/src/string/string_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,21 +24,56 @@
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
#include "src/string/memory_utils/inline_memcpy.h"

#if defined(LIBC_COPT_STRING_UNSAFE_WIDE_READ)
#if LIBC_HAS_VECTOR_TYPE
#include "src/string/memory_utils/generic/inline_strlen.h"
#elif defined(LIBC_TARGET_ARCH_IS_X86)
#endif
#if defined(LIBC_TARGET_ARCH_IS_X86)
#include "src/string/memory_utils/x86_64/inline_strlen.h"
#elif defined(LIBC_TARGET_ARCH_IS_AARCH64) && defined(__ARM_NEON)
#elif defined(LIBC_TARGET_ARCH_IS_AARCH64)
#include "src/string/memory_utils/aarch64/inline_strlen.h"
#else
namespace string_length_impl = LIBC_NAMESPACE::wide_read;
#endif
#endif // defined(LIBC_COPT_STRING_UNSAFE_WIDE_READ)

namespace LIBC_NAMESPACE_DECL {
namespace internal {

#if !LIBC_HAS_VECTOR_TYPE
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

all of the strlen code should probably be moved out of string_utils and into its own file since it seems to be outgrowing being a generic utility.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. Not sure "string_optimization.h" is the right file name; happy for better suggestions.

// Forward any generic vector impls to architecture specific ones
namespace arch {}
namespace generic = arch;
#endif

namespace element {
// Element-by-element (usually a byte, but wider for wchar) implementations of
// functions that search for data. Slow, but easy to understand and analyze.

// Returns the length of a string, denoted by the first occurrence
// of a null terminator.
LIBC_INLINE size_t string_length(const char *src) {
size_t length;
for (length = 0; *src; ++src, ++length)
;
return length;
}

template <typename T> LIBC_INLINE size_t string_length_element(const T *src) {
size_t length;
for (length = 0; *src; ++src, ++length)
;
return length;
}

LIBC_INLINE void *find_first_character(const unsigned char *src,
unsigned char ch, size_t n) {
for (; n && *src != ch; --n, ++src)
;
return n ? const_cast<unsigned char *>(src) : nullptr;
}
} // namespace element

namespace wide {
// Generic, non-vector, implementations of functions that search for data
// by reading from memory block-by-block.

template <typename Word> LIBC_INLINE constexpr Word repeat_byte(Word byte) {
static_assert(CHAR_BIT == 8, "repeat_byte assumes a byte is 8 bits.");
constexpr size_t BITS_IN_BYTE = CHAR_BIT;
Expand Down Expand Up @@ -74,8 +109,13 @@ template <typename Word> LIBC_INLINE constexpr bool has_zeroes(Word block) {
return (subtracted & inverted & HIGH_BITS) != 0;
}

template <typename Word>
LIBC_INLINE size_t string_length_wide_read(const char *src) {
// Unsigned int is the default size for most processors, and on x86-64 it
// performs better than larger sizes when the src pointer can't be assumed to
// be aligned to a word boundary, so it's the size we use for reading the
// string a block at a time.

LIBC_INLINE size_t string_length(const char *src) {
using Word = unsigned int;
const char *char_ptr = src;
// Step 1: read 1 byte at a time to align to block size
for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0;
Expand All @@ -95,37 +135,23 @@ LIBC_INLINE size_t string_length_wide_read(const char *src) {
return static_cast<size_t>(char_ptr - src);
}

namespace wide_read {
LIBC_INLINE size_t string_length(const char *src) {
// Unsigned int is the default size for most processors, and on x86-64 it
// performs better than larger sizes when the src pointer can't be assumed to
// be aligned to a word boundary, so it's the size we use for reading the
// string a block at a time.
return string_length_wide_read<unsigned int>(src);
}

} // namespace wide_read

// Returns the length of a string, denoted by the first occurrence
// of a null terminator.
template <typename T> LIBC_INLINE size_t string_length(const T *src) {
#ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ
if constexpr (cpp::is_same_v<T, char>)
return string_length_impl::string_length(src);
#endif
size_t length;
for (length = 0; *src; ++src, ++length)
;
return length;
}

template <typename Word>
LIBC_NO_SANITIZE_OOB_ACCESS LIBC_INLINE void *
find_first_character_wide_read(const unsigned char *src, unsigned char ch,
size_t n) {
find_first_character(const unsigned char *src, unsigned char ch,
size_t max_strlen = cpp::numeric_limits<size_t>::max()) {
using Word = unsigned int;
const unsigned char *char_ptr = src;
size_t cur = 0;

// If the maximum size of the string is small, the overhead of aligning to a
// word boundary and generating a bitmask of the appropriate size may be
// greater than the gains from reading larger chunks. Based on some testing,
// the crossover point between when it's faster to just read bytewise and read
// blocks is somewhere between 16 and 32, so 4 times the size of the block
// should be in that range.
if (max_strlen < (sizeof(Word) * 4)) {
return element::find_first_character(src, ch, max_strlen);
}
size_t n = max_strlen;
// Step 1: read 1 byte at a time to align to block size
for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0 && cur < n;
++char_ptr, ++cur) {
Expand Down Expand Up @@ -153,31 +179,35 @@ find_first_character_wide_read(const unsigned char *src, unsigned char ch,
return const_cast<unsigned char *>(char_ptr);
}

LIBC_INLINE void *find_first_character_byte_read(const unsigned char *src,
unsigned char ch, size_t n) {
for (; n && *src != ch; --n, ++src)
;
return n ? const_cast<unsigned char *>(src) : nullptr;
} // namespace wide

// Dispatch mechanism for implementations of performance-sensitive
// functions. Always measure, but generally from lower- to higher-performance
// order:
//
// 1. element - read char-by-char or wchar-by-wchar
// 3. wide - read word-by-word
// 3. generic - read using clang's internal vector types
// 4. arch - hand-coded per architecture. Possibly in asm, or with intrinsics.
//
// The called implemenation is chosen at build-time by setting
// LIBC_CONF_{FUNC}_IMPL in config.json
static constexpr auto &string_length_impl =
LIBC_COPT_STRING_LENGTH_IMPL::string_length;
static constexpr auto &find_first_character_impl =
LIBC_COPT_FIND_FIRST_CHARACTER_IMPL::find_first_character;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this seems like a good mechanism, but I'd like to make sure it has a default. Basically

#ifndef LIBC_COPT_STRING_LENGTH_IMPL
#define LIBC_COPT_STRING_LENGTH_IMPL element
#endif

or something similar.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done


template <typename T> LIBC_INLINE size_t string_length(const T *src) {
if constexpr (cpp::is_same_v<T, char>)
return string_length_impl(src);
return element::string_length_element<T>(src);
}

// Returns the first occurrence of 'ch' within the first 'n' characters of
// 'src'. If 'ch' is not found, returns nullptr.
LIBC_INLINE void *find_first_character(const unsigned char *src,
unsigned char ch, size_t max_strlen) {
#ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ
// If the maximum size of the string is small, the overhead of aligning to a
// word boundary and generating a bitmask of the appropriate size may be
// greater than the gains from reading larger chunks. Based on some testing,
// the crossover point between when it's faster to just read bytewise and read
// blocks is somewhere between 16 and 32, so 4 times the size of the block
// should be in that range.
// Unsigned int is used for the same reason as in strlen.
using BlockType = unsigned int;
if (max_strlen > (sizeof(BlockType) * 4)) {
return find_first_character_wide_read<BlockType>(src, ch, max_strlen);
}
#endif
return find_first_character_byte_read(src, ch, max_strlen);
return find_first_character_impl(src, ch, max_strlen);
}

// Returns the maximum length span that contains only characters not found in
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ LIBC_CONFIGURE_OPTIONS = [
# "LIBC_COPT_SCANF_DISABLE_FLOAT",
# "LIBC_COPT_SCANF_DISABLE_INDEX_MODE",
"LIBC_COPT_STDIO_USE_SYSTEM_FILE",
"LIBC_COPT_STRING_UNSAFE_WIDE_READ",
"LIBC_COPT_STRING_LENGTH_IMPL=generic",
"LIBC_COPT_FIND_FIRST_CHARACTER_IMPL=wide",
# "LIBC_COPT_STRTOFLOAT_DISABLE_CLINGER_FAST_PATH",
# "LIBC_COPT_STRTOFLOAT_DISABLE_EISEL_LEMIRE",
# "LIBC_COPT_STRTOFLOAT_DISABLE_SIMPLE_DECIMAL_CONVERSION",
Expand Down
Loading