Skip to content

Commit d9a4851

Browse files
SchrodingerZhu gchatelet
authored and committed
[libc][SVE] add sve handling for memcpy with count less than 32b (llvm#167446)
Add SVE optimization for AArch64 architectures. The idea is to use predicate registers to avoid branching. Microbench in repo shows considerable improvements on NV GB10 (locked on largest X925): ``` ====================================================================== BENCHMARK STATISTICS (time in nanoseconds) ====================================================================== memcpy_Google_A: Old - Mean: 3.1257 ns, Median: 3.1162 ns New - Mean: 2.8402 ns, Median: 2.8265 ns Improvement: +9.14% (mean), +9.30% (median) memcpy_Google_B: Old - Mean: 2.3171 ns, Median: 2.3159 ns New - Mean: 1.6589 ns, Median: 1.6593 ns Improvement: +28.40% (mean), +28.35% (median) memcpy_Google_D: Old - Mean: 8.7602 ns, Median: 8.7645 ns New - Mean: 8.4307 ns, Median: 8.4308 ns Improvement: +3.76% (mean), +3.81% (median) memcpy_Google_L: Old - Mean: 1.7137 ns, Median: 1.7091 ns New - Mean: 1.4530 ns, Median: 1.4553 ns Improvement: +15.22% (mean), +14.85% (median) memcpy_Google_M: Old - Mean: 1.9823 ns, Median: 1.9825 ns New - Mean: 1.4826 ns, Median: 1.4840 ns Improvement: +25.20% (mean), +25.15% (median) memcpy_Google_Q: Old - Mean: 1.6812 ns, Median: 1.6784 ns New - Mean: 1.1538 ns, Median: 1.1517 ns Improvement: +31.37% (mean), +31.38% (median) memcpy_Google_S: Old - Mean: 2.1816 ns, Median: 2.1786 ns New - Mean: 1.6297 ns, Median: 1.6287 ns Improvement: +25.29% (mean), +25.24% (median) memcpy_Google_U: Old - Mean: 2.2851 ns, Median: 2.2825 ns New - Mean: 1.7219 ns, Median: 1.7187 ns Improvement: +24.65% (mean), +24.70% (median) memcpy_Google_W: Old - Mean: 2.0408 ns, Median: 2.0361 ns New - Mean: 1.5260 ns, Median: 1.5252 ns Improvement: +25.23% (mean), +25.09% (median) uniform_384_to_4096: Old - Mean: 26.9067 ns, Median: 26.8845 ns New - Mean: 26.8083 ns, Median: 26.8149 ns Improvement: +0.37% (mean), +0.26% (median) ``` The beginning of the memcpy function looks like the following: ``` Dump of assembler code for function _ZN22__llvm_libc_22_0_0_git6memcpyEPvPKvm: 
0x0000000000001340 <+0>: cbz x2, 0x143c <_ZN22__llvm_libc_22_0_0_git6memcpyEPvPKvm+252> 0x0000000000001344 <+4>: cbz x0, 0x1440 <_ZN22__llvm_libc_22_0_0_git6memcpyEPvPKvm+256> 0x0000000000001348 <+8>: cbz x1, 0x1444 <_ZN22__llvm_libc_22_0_0_git6memcpyEPvPKvm+260> 0x000000000000134c <+12>: subs x8, x2, #0x20 0x0000000000001350 <+16>: b.hi 0x1374 <_ZN22__llvm_libc_22_0_0_git6memcpyEPvPKvm+52> // b.pmore 0x0000000000001354 <+20>: rdvl x8, #1 0x0000000000001358 <+24>: whilelo p0.b, xzr, x2 0x000000000000135c <+28>: ld1b {z0.b}, p0/z, [x1] 0x0000000000001360 <+32>: whilelo p1.b, x8, x2 0x0000000000001364 <+36>: ld1b {z1.b}, p1/z, [x1, #1, mul vl] 0x0000000000001368 <+40>: st1b {z0.b}, p0, [x0] 0x000000000000136c <+44>: st1b {z1.b}, p1, [x0, #1, mul vl] 0x0000000000001370 <+48>: ret ``` --------- Co-authored-by: Guillaume Chatelet <chatelet.guillaume@gmail.com>
1 parent 989f736 commit d9a4851

File tree

1 file changed

+25
-1
lines changed

1 file changed

+25
-1
lines changed

libc/src/string/memory_utils/aarch64/inline_memcpy.h

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,40 @@
99
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_MEMCPY_H
1010

1111
#include "src/__support/macros/attributes.h" // LIBC_INLINE
12+
#include "src/__support/macros/properties/cpu_features.h"
1213
#include "src/string/memory_utils/op_builtin.h"
1314
#include "src/string/memory_utils/utils.h"
1415

1516
#include <stddef.h> // size_t
1617

18+
#if defined(LIBC_TARGET_CPU_HAS_SVE)
19+
#include <arm_sve.h>
20+
#endif
1721
namespace LIBC_NAMESPACE_DECL {
18-
1922
[[maybe_unused]] LIBC_INLINE void
2023
inline_memcpy_aarch64(Ptr __restrict dst, CPtr __restrict src, size_t count) {
24+
// Always avoid emitting any memory operation if count == 0.
2125
if (count == 0)
2226
return;
27+
// Use predicated load/store on SVE-capable targets to avoid branching in
28+
// small cases.
29+
#ifdef LIBC_TARGET_CPU_HAS_SVE
30+
auto src_ptr = reinterpret_cast<const uint8_t *>(src);
31+
auto dst_ptr = reinterpret_cast<uint8_t *>(dst);
32+
if (count <= 16) {
33+
const svbool_t mask = svwhilelt_b8_u64(0, count);
34+
svst1_u8(mask, dst_ptr, svld1_u8(mask, src_ptr));
35+
return;
36+
}
37+
if (count <= 32) {
38+
const size_t vlen = svcntb();
39+
svbool_t m0 = svwhilelt_b8_u64(0, count);
40+
svbool_t m1 = svwhilelt_b8_u64(vlen, count);
41+
svst1_u8(m0, dst_ptr, svld1_u8(m0, src_ptr));
42+
svst1_u8(m1, dst_ptr + vlen, svld1_u8(m1, src_ptr + vlen));
43+
return;
44+
}
45+
#else
2346
if (count == 1)
2447
return builtin::Memcpy<1>::block(dst, src);
2548
if (count == 2)
@@ -34,6 +57,7 @@ inline_memcpy_aarch64(Ptr __restrict dst, CPtr __restrict src, size_t count) {
3457
return builtin::Memcpy<8>::head_tail(dst, src, count);
3558
if (count < 32)
3659
return builtin::Memcpy<16>::head_tail(dst, src, count);
60+
#endif
3761
if (count < 64)
3862
return builtin::Memcpy<32>::head_tail(dst, src, count);
3963
if (count < 128)

0 commit comments

Comments
 (0)