Skip to content

Commit 3179200

Browse files
Add vector-based strlen implementation for x86_64 and aarch64 (#152389)
These replace the default LIBC_CONF_STRING_UNSAFE_WIDE_READ implementation on x86_64 and aarch64. These are substantially faster than both the character-by-character implementation and the original unsafe_wide_read implementation. Some below I have been unable to performance-test the aarch64 version, but I suspect speedups similar to avx2. ``` Function: strlen Variant: char wide ull sse2 avx2 avx512 ============================================================================================================================================================= length=1, alignment=1: 13.18 20.47 (-55.24%) 20.21 (-53.27%) 32.50 (-146.54%) 26.05 (-97.61%) 18.03 (-36.74%) length=1, alignment=0: 12.80 34.92 (-172.89%) 20.01 (-56.39%) 17.52 (-36.86%) 17.78 (-38.92%) 18.04 (-40.94%) length=2, alignment=2: 9.91 19.02 (-91.95%) 12.64 (-27.52%) 11.06 (-11.59%) 9.48 ( 4.38%) 9.48 ( 4.34%) length=2, alignment=0: 9.56 26.88 (-181.24%) 12.64 (-32.31%) 11.06 (-15.73%) 11.06 (-15.72%) 11.83 (-23.80%) length=3, alignment=3: 8.31 10.45 (-25.84%) 8.28 ( 0.32%) 8.28 ( 0.36%) 6.21 ( 25.28%) 6.21 ( 25.24%) length=3, alignment=0: 8.39 14.53 (-73.20%) 8.28 ( 1.33%) 7.24 ( 13.69%) 7.56 ( 9.94%) 7.25 ( 13.65%) length=4, alignment=4: 9.84 21.76 (-121.24%) 15.55 (-58.11%) 6.57 ( 33.18%) 5.02 ( 48.98%) 6.00 ( 39.00%) length=4, alignment=0: 8.64 13.70 (-58.51%) 7.28 ( 15.73%) 6.37 ( 26.31%) 6.36 ( 26.36%) 6.36 ( 26.36%) length=5, alignment=5: 11.85 23.81 (-100.97%) 12.17 ( -2.67%) 5.68 ( 52.09%) 4.87 ( 58.94%) 6.48 ( 45.33%) length=5, alignment=0: 11.82 13.64 (-15.42%) 7.27 ( 38.45%) 6.36 ( 46.15%) 6.37 ( 46.11%) 6.36 ( 46.14%) length=6, alignment=6: 10.50 19.37 (-84.56%) 13.64 (-29.93%) 6.54 ( 37.71%) 6.89 ( 34.35%) 9.45 ( 10.01%) length=6, alignment=0: 14.96 14.05 ( 6.04%) 6.49 ( 56.62%) 5.68 ( 62.04%) 5.68 ( 62.04%) 13.15 ( 12.05%) length=7, alignment=7: 10.97 18.02 (-64.35%) 14.59 (-33.06%) 6.36 ( 41.96%) 5.46 ( 50.25%) 5.46 ( 50.25%) length=7, alignment=0: 10.96 15.76 (-43.77%) 15.37 (-40.15%) 6.96 ( 36.51%) 5.68 ( 48.22%) 7.04 ( 35.83%) length=4, alignment=0: 8.66 13.69 (-58.02%) 7.28 ( 16.00%) 6.37 ( 26.44%) 6.37 ( 26.52%) 6.61 ( 23.74%) length=4, alignment=7: 8.87 17.35 (-95.73%) 12.18 (-37.39%) 5.68 ( 35.94%) 4.87 ( 45.11%) 6.00 ( 32.36%) length=4, alignment=2: 8.67 10.05 (-15.91%) 7.28 ( 16.01%) 7.37 ( 15.02%) 5.46 ( 37.02%) 5.47 ( 36.89%) length=2, alignment=2: 5.64 10.01 (-77.64%) 7.29 (-29.34%) 6.37 (-13.04%) 5.46 ( 3.19%) 5.46 ( 3.19%) length=8, alignment=0: 12.78 16.52 (-29.33%) 18.27 (-43.00%) 11.82 ( 7.47%) 9.83 ( 23.03%) 11.46 ( 10.27%) length=8, alignment=7: 14.24 17.30 (-21.49%) 12.16 ( 14.59%) 5.68 ( 60.14%) 4.87 ( 65.83%) 6.23 ( 56.28%) length=8, alignment=3: 12.34 26.15 (-111.98%) 12.20 ( 1.14%) 6.50 ( 47.34%) 4.87 ( 60.54%) 6.18 ( 49.94%) length=5, alignment=3: 10.95 19.74 (-80.30%) 12.17 (-11.11%) 5.68 ( 48.16%) 4.87 ( 55.56%) 5.96 ( 45.55%) length=16, alignment=0: 20.33 29.29 (-44.08%) 36.18 (-77.97%) 5.68 ( 72.06%) 5.68 ( 72.08%) 10.60 ( 47.86%) length=16, alignment=7: 19.29 17.52 ( 9.16%) 12.98 ( 32.73%) 7.05 ( 63.47%) 4.87 ( 74.75%) 6.23 ( 67.71%) length=16, alignment=4: 20.54 25.18 (-22.56%) 15.42 ( 24.92%) 7.31 ( 64.43%) 4.87 ( 76.29%) 5.98 ( 70.88%) length=10, alignment=4: 14.59 21.26 (-45.71%) 12.17 ( 16.58%) 5.68 ( 61.07%) 4.87 ( 66.65%) 6.00 ( 58.91%) length=32, alignment=0: 35.46 22.00 ( 37.95%) 16.22 ( 54.26%) 7.32 ( 79.35%) 5.68 ( 83.98%) 7.01 ( 80.22%) length=32, alignment=7: 35.23 24.14 ( 31.48%) 16.22 ( 53.96%) 7.30 ( 79.28%) 8.76 ( 75.12%) 6.14 ( 82.58%) length=32, alignment=5: 35.16 28.56 ( 18.76%) 16.22 ( 53.87%) 7.30 ( 79.23%) 6.77 ( 80.75%) 9.82 ( 72.07%) length=21, alignment=5: 26.47 27.66 ( -4.49%) 15.04 ( 43.17%) 6.90 ( 73.95%) 4.87 ( 81.60%) 6.04 ( 77.18%) length=64, alignment=0: 66.45 25.16 ( 62.14%) 22.70 ( 65.83%) 12.99 ( 80.44%) 7.47 ( 88.77%) 8.70 ( 86.90%) length=64, alignment=7: 64.75 27.78 ( 57.10%) 22.72 ( 64.91%) 10.85 ( 83.25%) 7.46 ( 88.48%) 8.68 ( 86.60%) length=64, alignment=6: 67.26 28.58 ( 57.51%) 22.70 ( 66.24%) 11.26 ( 83.25%) 9.46 ( 85.94%) 13.90 ( 79.33%) length=42, alignment=6: 73.42 27.97 ( 61.91%) 19.46 ( 73.49%) 8.92 ( 87.84%) 6.49 ( 91.16%) 6.00 ( 91.83%) length=128, alignment=0: 172.07 39.18 ( 77.23%) 35.68 ( 79.26%) 13.02 ( 92.43%) 12.98 ( 92.46%) 9.76 ( 94.33%) length=128, alignment=7: 163.98 43.79 ( 73.30%) 36.03 ( 78.03%) 15.68 ( 90.44%) 11.35 ( 93.08%) 10.51 ( 93.59%) length=128, alignment=7: 185.86 40.27 ( 78.33%) 36.04 ( 80.61%) 13.78 ( 92.58%) 11.35 ( 93.89%) 10.49 ( 94.36%) length=85, alignment=7: 121.61 55.66 ( 54.23%) 32.34 ( 73.40%) 13.88 ( 88.59%) 7.30 ( 94.00%) 8.72 ( 92.83%) length=256, alignment=0: 295.54 66.48 ( 77.50%) 61.63 ( 79.15%) 19.54 ( 93.39%) 12.97 ( 95.61%) 12.45 ( 95.79%) length=256, alignment=7: 308.06 78.92 ( 74.38%) 61.63 ( 80.00%) 22.90 ( 92.57%) 12.97 ( 95.79%) 13.23 ( 95.71%) length=256, alignment=8: 295.32 65.83 ( 77.71%) 61.62 ( 79.13%) 23.19 ( 92.15%) 12.97 ( 95.61%) 13.50 ( 95.43%) length=170, alignment=8: 234.39 48.79 ( 79.18%) 43.79 ( 81.32%) 16.22 ( 93.08%) 13.97 ( 94.04%) 10.48 ( 95.53%) length=512, alignment=0: 563.75 116.89 ( 79.27%) 114.99 ( 79.60%) 62.71 ( 88.88%) 19.58 ( 96.53%) 17.76 ( 96.85%) length=512, alignment=7: 580.53 120.91 ( 79.17%) 114.47 ( 80.28%) 37.75 ( 93.50%) 19.55 ( 96.63%) 18.68 ( 96.78%) length=512, alignment=9: 584.05 128.35 ( 78.02%) 114.74 ( 80.35%) 39.09 ( 93.31%) 19.76 ( 96.62%) 18.71 ( 96.80%) length=341, alignment=9: 405.84 90.87 ( 77.61%) 78.79 ( 80.59%) 28.77 ( 92.91%) 14.60 ( 96.40%) 14.15 ( 96.51%) length=1024, alignment=0: 1143.61 247.03 ( 78.40%) 243.70 ( 78.69%) 75.59 ( 93.39%) 67.02 ( 94.14%) 28.99 ( 97.46%) length=1024, alignment=7: 1124.55 267.87 ( 76.18%) 259.16 ( 76.95%) 64.96 ( 94.22%) 33.05 ( 97.06%) 30.91 ( 97.25%) length=1024, alignment=10: 1459.58 257.79 ( 82.34%) 239.91 ( 83.56%) 65.00 ( 95.55%) 33.10 ( 97.73%) 30.33 ( 97.92%) length=682, alignment=10: 732.89 163.67 ( 77.67%) 170.54 ( 76.73%) 46.48 ( 93.66%) 24.32 ( 96.68%) 21.44 ( 97.07%) length=2048, alignment=0: 2141.96 451.61 ( 78.92%) 448.00 ( 79.08%) 133.24 ( 93.78%) 61.22 ( 97.14%) 80.08 ( 96.26%) length=2048, alignment=7: 2145.05 458.26 ( 78.64%) 449.99 ( 79.02%) 140.19 ( 93.46%) 60.26 ( 97.19%) 51.71 ( 97.59%) length=2048, alignment=11: 2162.61 463.37 ( 78.57%) 448.07 ( 79.28%) 140.29 ( 93.51%) 59.51 ( 97.25%) 51.59 ( 97.61%) length=1365, alignment=11: 1439.74 322.86 ( 77.58%) 310.84 ( 78.41%) 116.08 ( 91.94%) 42.43 ( 97.05%) 36.15 ( 97.49%) length=4096, alignment=0: 4278.68 871.60 ( 79.63%) 865.25 ( 79.78%) 252.50 ( 94.10%) 161.17 ( 96.23%) 94.97 ( 97.78%) length=4096, alignment=7: 4253.01 871.62 ( 79.51%) 864.21 ( 79.68%) 243.90 ( 94.27%) 171.17 ( 95.98%) 95.14 ( 97.76%) length=4096, alignment=12: 4252.18 879.66 ( 79.31%) 863.68 ( 79.69%) 244.26 ( 94.26%) 185.36 ( 95.64%) 93.61 ( 97.80%) length=2730, alignment=12: 2868.22 597.65 ( 79.16%) 586.22 ( 79.56%) 175.09 ( 93.90%) 120.35 ( 95.80%) 101.35 ( 96.47%) length=0, alignment=0: 4.87 8.11 (-66.73%) 6.49 (-33.34%) 5.80 (-19.26%) 5.68 (-16.67%) 6.86 (-40.91%) length=32, alignment=0: 33.82 22.36 ( 33.89%) 17.03 ( 49.66%) 7.30 ( 78.42%) 5.68 ( 83.22%) 7.50 ( 77.83%) length=64, alignment=0: 66.20 26.76 ( 59.58%) 23.22 ( 64.93%) 12.99 ( 80.37%) 7.34 ( 88.92%) 8.44 ( 87.25%) length=96, alignment=0: 130.26 31.62 ( 75.72%) 30.00 ( 76.97%) 11.39 ( 91.26%) 10.54 ( 91.91%) 8.68 ( 93.34%) length=128, alignment=0: 164.66 39.05 ( 76.29%) 35.68 ( 78.33%) 13.07 ( 92.07%) 12.97 ( 92.12%) 9.59 ( 94.18%) length=160, alignment=0: 196.63 45.18 ( 77.02%) 42.16 ( 78.56%) 14.65 ( 92.55%) 10.87 ( 94.47%) 9.31 ( 95.27%) length=192, alignment=0: 225.50 52.71 ( 76.63%) 49.61 ( 78.00%) 16.22 ( 92.81%) 11.36 ( 94.96%) 11.08 ( 95.09%) length=224, alignment=0: 261.08 57.57 ( 77.95%) 55.82 ( 78.62%) 17.84 ( 93.17%) 12.16 ( 95.34%) 11.51 ( 95.59%) length=256, alignment=0: 295.13 65.56 ( 77.79%) 62.59 ( 78.79%) 19.46 ( 93.41%) 13.12 ( 95.56%) 12.33 ( 95.82%) length=288, alignment=0: 325.69 72.16 ( 77.84%) 69.20 ( 78.75%) 21.08 ( 93.53%) 13.94 ( 95.72%) 12.32 ( 96.22%) length=320, alignment=0: 364.18 78.78 ( 78.37%) 75.69 ( 79.21%) 22.71 ( 93.77%) 14.70 ( 95.96%) 14.46 ( 96.03%) length=352, alignment=0: 391.40 84.87 ( 78.32%) 82.15 ( 79.01%) 24.50 ( 93.74%) 15.62 ( 96.01%) 14.27 ( 96.35%) length=384, alignment=0: 428.50 91.43 ( 78.66%) 88.70 ( 79.30%) 26.16 ( 93.90%) 17.29 ( 95.97%) 15.04 ( 96.49%) length=416, alignment=0: 457.30 98.23 ( 78.52%) 95.02 ( 79.22%) 27.81 ( 93.92%) 17.22 ( 96.23%) 15.05 ( 96.71%) length=448, alignment=0: 488.38 104.52 ( 78.60%) 101.87 ( 79.14%) 31.22 ( 93.61%) 18.07 ( 96.30%) 16.89 ( 96.54%) length=480, alignment=0: 526.44 109.61 ( 79.18%) 108.11 ( 79.46%) 31.11 ( 94.09%) 18.88 ( 96.41%) 17.10 ( 96.75%) length=512, alignment=0: 556.50 117.29 ( 78.92%) 113.78 ( 79.56%) 62.57 ( 88.76%) 19.88 ( 96.43%) 17.80 ( 96.80%) length=576, alignment=0: 622.17 152.93 ( 75.42%) 127.58 ( 79.49%) 39.34 ( 93.68%) 21.31 ( 96.58%) 19.99 ( 96.79%) length=640, alignment=0: 691.01 142.56 ( 79.37%) 161.78 ( 76.59%) 39.20 ( 94.33%) 22.98 ( 96.67%) 20.13 ( 97.09%) length=704, alignment=0: 756.90 156.31 ( 79.35%) 176.19 ( 76.72%) 45.03 ( 94.05%) 24.82 ( 96.72%) 22.33 ( 97.05%) length=768, alignment=0: 826.23 193.17 ( 76.62%) 188.41 ( 77.20%) 50.81 ( 93.85%) 27.46 ( 96.68%) 23.25 ( 97.19%) length=832, alignment=0: 890.17 204.81 ( 76.99%) 201.61 ( 77.35%) 53.77 ( 93.96%) 27.73 ( 96.88%) 25.06 ( 97.18%) length=896, alignment=0: 959.52 217.89 ( 77.29%) 213.86 ( 77.71%) 57.99 ( 93.96%) 29.53 ( 96.92%) 26.29 ( 97.26%) length=960, alignment=0: 1024.52 231.06 ( 77.45%) 227.05 ( 77.84%) 60.36 ( 94.11%) 32.29 ( 96.85%) 27.94 ( 97.27%) length=1024, alignment=0: 1086.71 244.17 ( 77.53%) 239.87 ( 77.93%) 64.72 ( 94.04%) 72.38 ( 93.34%) 28.72 ( 97.36%) length=1152, alignment=0: 1231.48 270.22 ( 78.06%) 266.47 ( 78.36%) 73.38 ( 94.04%) 40.24 ( 96.73%) 32.42 ( 97.37%) length=1280, alignment=0: 1349.29 295.45 ( 78.10%) 292.69 ( 78.31%) 111.80 ( 91.71%) 42.44 ( 96.85%) 34.59 ( 97.44%) length=1408, alignment=0: 1487.13 322.57 ( 78.31%) 318.18 ( 78.60%) 84.47 ( 94.32%) 44.35 ( 97.02%) 37.31 ( 97.49%) length=1536, alignment=0: 1623.52 347.98 ( 78.57%) 344.24 ( 78.80%) 108.31 ( 93.33%) 49.82 ( 96.93%) 39.94 ( 97.54%) length=1664, alignment=0: 1748.88 373.80 ( 78.63%) 370.03 ( 78.84%) 118.76 ( 93.21%) 52.89 ( 96.98%) 42.93 ( 97.55%) length=1792, alignment=0: 1886.22 399.59 ( 78.82%) 397.39 ( 78.93%) 127.32 ( 93.25%) 53.64 ( 97.16%) 45.39 ( 97.59%) length=1920, alignment=0: 2018.37 425.98 ( 78.89%) 422.31 ( 79.08%) 126.70 ( 93.72%) 57.08 ( 97.17%) 48.12 ( 97.62%) length=2048, alignment=0: 2167.09 451.70 ( 79.16%) 447.70 ( 79.34%) 141.68 ( 93.46%) 61.63 ( 97.16%) 79.06 ( 96.35%) length=2304, alignment=0: 2422.03 503.63 ( 79.21%) 502.23 ( 79.26%) 149.62 ( 93.82%) 73.10 ( 96.98%) 56.97 ( 97.65%) length=2560, alignment=0: 2678.68 556.84 ( 79.21%) 553.24 ( 79.35%) 161.06 ( 93.99%) 127.74 ( 95.23%) 58.81 ( 97.80%) length=2816, alignment=0: 2941.95 608.70 ( 79.31%) 604.03 ( 79.47%) 171.85 ( 94.16%) 87.11 ( 97.04%) 67.08 ( 97.72%) length=3072, alignment=0: 3229.89 660.14 ( 79.56%) 659.19 ( 79.59%) 183.85 ( 94.31%) 140.25 ( 95.66%) 73.01 ( 97.74%) length=3328, alignment=0: 3496.08 713.05 ( 79.60%) 710.00 ( 79.69%) 209.72 ( 94.00%) 138.78 ( 96.03%) 77.81 ( 97.77%) length=3584, alignment=0: 3756.52 766.19 ( 79.60%) 763.94 ( 79.66%) 214.16 ( 94.30%) 146.36 ( 96.10%) 83.43 ( 97.78%) length=3840, alignment=0: 4017.15 817.43 ( 79.65%) 819.77 ( 79.59%) 242.07 ( 93.97%) 164.56 ( 95.90%) 89.72 ( 97.77%) length=4096, alignment=0: 4281.59 867.87 ( 79.73%) 864.71 ( 79.80%) 243.33 ( 94.32%) 173.11 ( 95.96%) 95.65 ( 97.77%) length=4608, alignment=0: 4810.30 977.80 ( 79.67%) 985.03 ( 79.52%) 271.13 ( 94.36%) 190.62 ( 96.04%) 107.82 ( 97.76%) length=5120, alignment=0: 5380.16 1075.77 ( 80.00%) 1071.80 ( 80.08%) 294.27 ( 94.53%) 206.04 ( 96.17%) 141.90 ( 97.36%) length=5632, alignment=0: 5925.70 1195.61 ( 79.82%) 1193.68 ( 79.86%) 323.42 ( 94.54%) 223.55 ( 96.23%) 125.28 ( 97.89%) length=6144, alignment=0: 6402.20 1285.52 ( 79.92%) 1281.04 ( 79.99%) 342.68 ( 94.65%) 234.84 ( 96.33%) 167.01 ( 97.39%) length=6656, alignment=0: 6997.01 1387.32 ( 80.17%) 1384.21 ( 80.22%) 365.93 ( 94.77%) 269.89 ( 96.14%) 176.40 ( 97.48%) length=7168, alignment=0: 7454.76 1492.10 ( 79.98%) 1488.45 ( 80.03%) 391.92 ( 94.74%) 280.81 ( 96.23%) 187.73 ( 97.48%) length=7680, alignment=0: 8163.34 1608.43 ( 80.30%) 1615.98 ( 80.20%) 460.03 ( 94.36%) 299.86 ( 96.33%) 201.40 ( 97.53%) ```
1 parent d4673fe commit 3179200

File tree

4 files changed

+179
-6
lines changed

4 files changed

+179
-6
lines changed
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
//===-- Strlen implementation for aarch64 ---------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H
9+
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H
10+
11+
#if defined(__ARM_NEON)
12+
#include "src/__support/CPP/bit.h" // countr_zero
13+
14+
#include <arm_neon.h>
15+
#include <stddef.h> // size_t
16+
17+
namespace LIBC_NAMESPACE_DECL {
18+
19+
namespace neon {
20+
[[maybe_unused]] LIBC_INLINE size_t string_length(const char *src) {
21+
using Vector __attribute__((may_alias)) = uint8x8_t;
22+
23+
uintptr_t misalign_bytes = reinterpret_cast<uintptr_t>(src) % sizeof(Vector);
24+
Vector *block_ptr = reinterpret_cast<Vector *>(src - misalign_bytes);
25+
Vector v = *block_ptr;
26+
Vector vcmp = vceqz_u8(v);
27+
uint64x1_t cmp_mask = vreinterpret_u64_s8(vcmp);
28+
uint64_t cmp = vget_lane_u64(cmp_mask, 0);
29+
cmp = cmp >> (misalign_bytes << 3);
30+
if (cmp)
31+
return cpp::countr_zero(cmp) >> 3;
32+
33+
while (true) {
34+
++block_ptr;
35+
v = *block_ptr;
36+
vcmp = vceqz_u8(v);
37+
cmp_mask = vreinterpret_u64_s8(vcmp);
38+
cmp = vget_lane_u64(cmp_mask, 0);
39+
if (cmp)
40+
return static_cast<size_t>(reinterpret_cast<uintptr_t>(block_ptr) -
41+
reinterpret_cast<uintptr_t>(src) +
42+
(cpp::countr_zero(cmp) >> 3));
43+
}
44+
}
45+
} // namespace neon
46+
47+
namespace string_length_impl = neon;
48+
49+
} // namespace LIBC_NAMESPACE_DECL
50+
#endif // __ARM_NEON
51+
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_STRLEN_H
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
//===-- Strlen implementation for x86_64 ----------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_STRLEN_H
9+
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_STRLEN_H
10+
11+
#include "src/__support/CPP/bit.h" // countr_zero
12+
13+
#include <immintrin.h>
14+
#include <stddef.h> // size_t
15+
16+
namespace LIBC_NAMESPACE_DECL {
17+
18+
namespace string_length_internal {
19+
// Return a bit-mask with the nth bit set if the nth-byte in block_ptr is zero.
20+
template <typename Vector, typename Mask>
21+
Mask CompareAndMask(const Vector *block_ptr);
22+
23+
template <typename Vector, typename Mask,
24+
decltype(CompareAndMask<Vector, Mask>)>
25+
size_t string_length_vector(const char *src) {
26+
uintptr_t misalign_bytes = reinterpret_cast<uintptr_t>(src) % sizeof(Vector);
27+
28+
const Vector *block_ptr =
29+
reinterpret_cast<const Vector *>(src - misalign_bytes);
30+
auto cmp = CompareAndMask<Vector, Mask>(block_ptr) >> misalign_bytes;
31+
if (cmp)
32+
return cpp::countr_zero(cmp);
33+
34+
while (true) {
35+
block_ptr++;
36+
cmp = CompareAndMask<Vector, Mask>(block_ptr);
37+
if (cmp)
38+
return static_cast<size_t>(reinterpret_cast<uintptr_t>(block_ptr) -
39+
reinterpret_cast<uintptr_t>(src) +
40+
cpp::countr_zero(cmp));
41+
}
42+
}
43+
44+
template <>
45+
uint32_t CompareAndMask<__m128i, uint32_t>(const __m128i *block_ptr) {
46+
__m128i v = _mm_load_si128(block_ptr);
47+
__m128i z = _mm_setzero_si128();
48+
__m128i c = _mm_cmpeq_epi8(z, v);
49+
return _mm_movemask_epi8(c);
50+
}
51+
52+
namespace sse2 {
53+
[[maybe_unused]] LIBC_INLINE size_t string_length(const char *src) {
54+
return string_length_vector<__m128i, uint32_t,
55+
CompareAndMask<__m128i, uint32_t>>(src);
56+
}
57+
} // namespace sse2
58+
59+
#if defined(__AVX2__)
60+
template <>
61+
uint32_t CompareAndMask<__m256i, uint32_t>(const __m256i *block_ptr) {
62+
__m256i v = _mm256_load_si256(block_ptr);
63+
__m256i z = _mm256_setzero_si256();
64+
__m256i c = _mm256_cmpeq_epi8(z, v);
65+
return _mm256_movemask_epi8(c);
66+
}
67+
68+
namespace avx2 {
69+
[[maybe_unused]] LIBC_INLINE size_t string_length(const char *src) {
70+
return string_length_vector<__m256i, uint32_t,
71+
CompareAndMask<__m256i, uint32_t>>(src);
72+
}
73+
} // namespace avx2
74+
#endif
75+
76+
#if defined(__AVX512F__)
77+
template <>
78+
__mmask64 CompareAndMask<__m512i, __mmask64>(const __m512i *block_ptr) {
79+
__m512i v = _mm512_load_si512(block_ptr);
80+
__m512i z = _mm512_setzero_si512();
81+
return _mm512_cmp_epu8_mask(z, v, _MM_CMPINT_EQ);
82+
}
83+
namespace avx512 {
84+
[[maybe_unused]] LIBC_INLINE size_t string_length(const char *src) {
85+
return string_length_vector<__m512i, __mmask64,
86+
CompareAndMask<__m512i, __mmask64>>(src);
87+
}
88+
} // namespace avx512
89+
#endif
90+
} // namespace string_length_internal
91+
92+
#if defined(__AVX512F__)
93+
namespace string_length_impl = string_length_internal::avx512;
94+
#elif defined(__AVX2__)
95+
namespace string_length_impl = string_length_internal::avx2;
96+
#else
97+
namespace string_length_impl = string_length_internal::sse2;
98+
#endif
99+
100+
} // namespace LIBC_NAMESPACE_DECL
101+
102+
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_STRLEN_H

libc/src/string/string_utils.h

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,16 @@
2222
#include "src/__support/macros/config.h"
2323
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
2424

25+
#if defined(LIBC_COPT_STRING_UNSAFE_WIDE_READ)
26+
#if defined(LIBC_TARGET_ARCH_IS_X86)
27+
#include "src/string/memory_utils/x86_64/inline_strlen.h"
28+
#elif defined(LIBC_TARGET_ARCH_IS_AARCH64) && defined(__ARM_NEON)
29+
#include "src/string/memory_utils/aarch64/inline_strlen.h"
30+
#else
31+
namespace string_length_impl = LIBC_NAMESPACE::wide_read;
32+
#endif
33+
#endif
34+
2535
namespace LIBC_NAMESPACE_DECL {
2636
namespace internal {
2737

@@ -53,7 +63,7 @@ template <typename Word> LIBC_INLINE constexpr Word repeat_byte(Word byte) {
5363
// high bit set will no longer have it set, narrowing the list of bytes which
5464
// result in non-zero values to just the zero byte.
5565
template <typename Word> LIBC_INLINE constexpr bool has_zeroes(Word block) {
56-
constexpr Word LOW_BITS = repeat_byte<Word>(0x01);
66+
constexpr unsigned int LOW_BITS = repeat_byte<Word>(0x01);
5767
constexpr Word HIGH_BITS = repeat_byte<Word>(0x80);
5868
Word subtracted = block - LOW_BITS;
5969
Word inverted = ~block;
@@ -81,16 +91,23 @@ LIBC_INLINE size_t string_length_wide_read(const char *src) {
8191
return static_cast<size_t>(char_ptr - src);
8292
}
8393

84-
// Returns the length of a string, denoted by the first occurrence
85-
// of a null terminator.
86-
template <typename T> LIBC_INLINE size_t string_length(const T *src) {
87-
#ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ
94+
namespace wide_read {
95+
LIBC_INLINE size_t string_length(const char *src) {
8896
// Unsigned int is the default size for most processors, and on x86-64 it
8997
// performs better than larger sizes when the src pointer can't be assumed to
9098
// be aligned to a word boundary, so it's the size we use for reading the
9199
// string a block at a time.
100+
return string_length_wide_read<unsigned int>(src);
101+
}
102+
103+
} // namespace wide_read
104+
105+
// Returns the length of a string, denoted by the first occurrence
106+
// of a null terminator.
107+
template <typename T> LIBC_INLINE size_t string_length(const T *src) {
108+
#ifdef LIBC_COPT_STRING_UNSAFE_WIDE_READ
92109
if constexpr (cpp::is_same_v<T, char>)
93-
return string_length_wide_read<unsigned int>(src);
110+
return string_length_impl::string_length(src);
94111
#endif
95112
size_t length;
96113
for (length = 0; *src; ++src, ++length)

utils/bazel/llvm-project-overlay/libc/BUILD.bazel

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4913,6 +4913,7 @@ libc_support_library(
49134913
"src/string/memory_utils/aarch64/inline_memcpy.h",
49144914
"src/string/memory_utils/aarch64/inline_memmove.h",
49154915
"src/string/memory_utils/aarch64/inline_memset.h",
4916+
"src/string/memory_utils/aarch64/inline_strlen.h",
49164917
"src/string/memory_utils/arm/common.h",
49174918
"src/string/memory_utils/arm/inline_memcpy.h",
49184919
"src/string/memory_utils/arm/inline_memset.h",
@@ -4937,6 +4938,7 @@ libc_support_library(
49374938
"src/string/memory_utils/x86_64/inline_memcpy.h",
49384939
"src/string/memory_utils/x86_64/inline_memmove.h",
49394940
"src/string/memory_utils/x86_64/inline_memset.h",
4941+
"src/string/memory_utils/x86_64/inline_strlen.h",
49404942
],
49414943
deps = [
49424944
":__support_common",
@@ -4961,6 +4963,7 @@ libc_support_library(
49614963
":__support_macros_optimization",
49624964
":hdr_limits_macros",
49634965
":llvm_libc_types_size_t",
4966+
":string_memory_utils",
49644967
":types_size_t",
49654968
],
49664969
)

0 commit comments

Comments
 (0)