[libc][SVE] add SVE handling for memcpy with count up to 32 bytes (llvm#167446)

SchrodingerZhu · gchatelet · rishabhmadan19 · commit d9a4851cc822 · 2026-02-09T20:32:26.000+05:30
Add SVE optimization for AArch64 architectures. The idea is to use predicate registers to avoid branching.

The microbenchmark in the repo shows considerable improvements on NV GB10 (locked on largest X925):

```
======================================================================
BENCHMARK STATISTICS (time in nanoseconds)
======================================================================
memcpy_Google_A:
  Old - Mean: 3.1257 ns, Median: 3.1162 ns
  New - Mean: 2.8402 ns, Median: 2.8265 ns
  Improvement: +9.14% (mean), +9.30% (median)
memcpy_Google_B:
  Old - Mean: 2.3171 ns, Median: 2.3159 ns
  New - Mean: 1.6589 ns, Median: 1.6593 ns
  Improvement: +28.40% (mean), +28.35% (median)
memcpy_Google_D:
  Old - Mean: 8.7602 ns, Median: 8.7645 ns
  New - Mean: 8.4307 ns, Median: 8.4308 ns
  Improvement: +3.76% (mean), +3.81% (median)
memcpy_Google_L:
  Old - Mean: 1.7137 ns, Median: 1.7091 ns
  New - Mean: 1.4530 ns, Median: 1.4553 ns
  Improvement: +15.22% (mean), +14.85% (median)
memcpy_Google_M:
  Old - Mean: 1.9823 ns, Median: 1.9825 ns
  New - Mean: 1.4826 ns, Median: 1.4840 ns
  Improvement: +25.20% (mean), +25.15% (median)
memcpy_Google_Q:
  Old - Mean: 1.6812 ns, Median: 1.6784 ns
  New - Mean: 1.1538 ns, Median: 1.1517 ns
  Improvement: +31.37% (mean), +31.38% (median)
memcpy_Google_S:
  Old - Mean: 2.1816 ns, Median: 2.1786 ns
  New - Mean: 1.6297 ns, Median: 1.6287 ns
  Improvement: +25.29% (mean), +25.24% (median)
memcpy_Google_U:
  Old - Mean: 2.2851 ns, Median: 2.2825 ns
  New - Mean: 1.7219 ns, Median: 1.7187 ns
  Improvement: +24.65% (mean), +24.70% (median)
memcpy_Google_W:
  Old - Mean: 2.0408 ns, Median: 2.0361 ns
  New - Mean: 1.5260 ns, Median: 1.5252 ns
  Improvement: +25.23% (mean), +25.09% (median)
uniform_384_to_4096:
  Old - Mean: 26.9067 ns, Median: 26.8845 ns
  New - Mean: 26.8083 ns, Median: 26.8149 ns
  Improvement: +0.37% (mean), +0.26% (median)
```

The beginning of the memcpy function looks like the following:

```
Dump of assembler code for function _ZN22__llvm_libc_22_0_0_git6memcpyEPvPKvm:
0x0000000000001340 <+0>: cbz x2, 0x143c <_ZN22__llvm_libc_22_0_0_git6memcpyEPvPKvm+252>
0x0000000000001344 <+4>: cbz x0, 0x1440 <_ZN22__llvm_libc_22_0_0_git6memcpyEPvPKvm+256>
0x0000000000001348 <+8>: cbz x1, 0x1444 <_ZN22__llvm_libc_22_0_0_git6memcpyEPvPKvm+260>
0x000000000000134c <+12>: subs x8, x2, #0x20
0x0000000000001350 <+16>: b.hi 0x1374 <_ZN22__llvm_libc_22_0_0_git6memcpyEPvPKvm+52> // b.pmore
0x0000000000001354 <+20>: rdvl x8, #1
0x0000000000001358 <+24>: whilelo p0.b, xzr, x2
0x000000000000135c <+28>: ld1b {z0.b}, p0/z, [x1]
0x0000000000001360 <+32>: whilelo p1.b, x8, x2
0x0000000000001364 <+36>: ld1b {z1.b}, p1/z, [x1, #1, mul vl]
0x0000000000001368 <+40>: st1b {z0.b}, p0, [x0]
0x000000000000136c <+44>: st1b {z1.b}, p1, [x0, #1, mul vl]
0x0000000000001370 <+48>: ret
```

---------

Co-authored-by: Guillaume Chatelet <chatelet.guillaume@gmail.com>
diff --git a/libc/src/string/memory_utils/aarch64/inline_memcpy.h b/libc/src/string/memory_utils/aarch64/inline_memcpy.h
@@ -9,17 +9,40 @@
 #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_MEMCPY_H
 
 #include "src/__support/macros/attributes.h" // LIBC_INLINE
+#include "src/__support/macros/properties/cpu_features.h"
 #include "src/string/memory_utils/op_builtin.h"
 #include "src/string/memory_utils/utils.h"
 
 #include <stddef.h> // size_t
 
+#if defined(LIBC_TARGET_CPU_HAS_SVE)
+#include <arm_sve.h>
+#endif
 namespace LIBC_NAMESPACE_DECL {
-
 [[maybe_unused]] LIBC_INLINE void
 inline_memcpy_aarch64(Ptr __restrict dst, CPtr __restrict src, size_t count) {
+  // Always avoid emitting any memory operation if count == 0.
   if (count == 0)
     return;
+  // Use predicated load/store on SVE available targets to avoid branching in
+  // small cases.
+#ifdef LIBC_TARGET_CPU_HAS_SVE
+  auto src_ptr = reinterpret_cast<const uint8_t *>(src);
+  auto dst_ptr = reinterpret_cast<uint8_t *>(dst);
+  if (count <= 16) {
+    const svbool_t mask = svwhilelt_b8_u64(0, count);
+    svst1_u8(mask, dst_ptr, svld1_u8(mask, src_ptr));
+    return;
+  }
+  if (count <= 32) {
+    const size_t vlen = svcntb();
+    svbool_t m0 = svwhilelt_b8_u64(0, count);
+    svbool_t m1 = svwhilelt_b8_u64(vlen, count);
+    svst1_u8(m0, dst_ptr, svld1_u8(m0, src_ptr));
+    svst1_u8(m1, dst_ptr + vlen, svld1_u8(m1, src_ptr + vlen));
+    return;
+  }
+#else
   if (count == 1)
     return builtin::Memcpy<1>::block(dst, src);
   if (count == 2)
@@ -34,6 +57,7 @@ inline_memcpy_aarch64(Ptr __restrict dst, CPtr __restrict src, size_t count) {
     return builtin::Memcpy<8>::head_tail(dst, src, count);
   if (count < 32)
     return builtin::Memcpy<16>::head_tail(dst, src, count);
+#endif
   if (count < 64)
     return builtin::Memcpy<32>::head_tail(dst, src, count);
   if (count < 128)