 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+// The functions defined in this file give approximate code size. These sizes
+// assume the following configuration options:
+// - LIBC_CONF_KEEP_FRAME_POINTER = false
+// - LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR = false
+// - LIBC_ADD_NULL_CHECKS = false
 #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H
 #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H
 
@@ -19,26 +24,34 @@ namespace LIBC_NAMESPACE_DECL {
 
 namespace {
 
-template <size_t bytes>
-LIBC_INLINE void copy_assume_aligned(void *dst, const void *src) {
-  constexpr size_t alignment = bytes > kWordSize ? kWordSize : bytes;
-  memcpy_inline<bytes>(assume_aligned<alignment>(dst),
-                       assume_aligned<alignment>(src));
+// Performs a copy of `bytes` bytes from `src` to `dst`. This function has the
+// semantics of `memcpy` where `src` and `dst` are `__restrict`. The compiler is
+// free to use whatever instruction is best for the size and assumed access.
+template <size_t bytes, AssumeAccess access>
+LIBC_INLINE void copy(void *dst, const void *src) {
+  if constexpr (access == AssumeAccess::kAligned) {
+    constexpr size_t alignment = bytes > kWordSize ? kWordSize : bytes;
+    memcpy_inline<bytes>(assume_aligned<alignment>(dst),
+                         assume_aligned<alignment>(src));
+  } else if constexpr (access == AssumeAccess::kUnknown) {
+    memcpy_inline<bytes>(dst, src);
+  } else {
+    static_assert(false);
+  }
 }
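+// For example, `copy<16, AssumeAccess::kAligned>(dst, src)` tells the compiler
+// that both pointers are word-aligned, so it may lower the copy to load/store
+// multiple (LDM, STM); with `AssumeAccess::kUnknown` no alignment is assumed.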
 
-template <size_t bytes, BlockOp block_op = BlockOp::kFull>
+template <size_t bytes, BlockOp block_op = BlockOp::kFull,
+          AssumeAccess access = AssumeAccess::kUnknown>
 LIBC_INLINE void copy_block_and_bump_pointers(Ptr &dst, CPtr &src) {
   if constexpr (block_op == BlockOp::kFull) {
-    copy_assume_aligned<bytes>(dst, src);
+    copy<bytes, access>(dst, src);
   } else if constexpr (block_op == BlockOp::kByWord) {
     // We restrict loads/stores to 4 bytes to prevent the use of load/store
-    // multiple (LDM, STM) and load/store double (LDRD, STRD). First, they
-    // may fault (see notes below) and second, they use more registers which
-    // in turn adds push/pop instructions in the hot path.
-    static_assert(bytes >= kWordSize);
+    // multiple (LDM, STM) and load/store double (LDRD, STRD).
+    static_assert((bytes % kWordSize == 0) && (bytes >= kWordSize));
     LIBC_LOOP_UNROLL
     for (size_t offset = 0; offset < bytes; offset += kWordSize) {
-      copy_assume_aligned<kWordSize>(dst + offset, src + offset);
+      copy<kWordSize, access>(dst + offset, src + offset);
     }
   } else {
     static_assert(false, "Invalid BlockOp");
@@ -52,28 +65,27 @@ LIBC_INLINE void copy_block_and_bump_pointers(Ptr &dst, CPtr &src) {
   src += bytes;
 }
 
-template <size_t bytes, BlockOp block_op, BumpSize bump_size = BumpSize::kYes>
-LIBC_INLINE void consume_by_aligned_block(Ptr &dst, CPtr &src, size_t &size) {
+template <size_t bytes, BlockOp block_op, AssumeAccess access>
+LIBC_INLINE void consume_by_block(Ptr &dst, CPtr &src, size_t &size) {
   LIBC_LOOP_NOUNROLL
   for (size_t i = 0; i < size / bytes; ++i)
-    copy_block_and_bump_pointers<bytes, block_op>(dst, src);
-  if constexpr (bump_size == BumpSize::kYes) {
-    size %= bytes;
-  }
+    copy_block_and_bump_pointers<bytes, block_op, access>(dst, src);
+  size %= bytes;
 }
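+// The callers below chain this helper with decreasing block sizes (64, then
+// 16, then 4) so that after each call fewer than `bytes` bytes remain.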
 
-LIBC_INLINE void copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src,
-                                              size_t size) {
-  consume_by_aligned_block<1, BlockOp::kFull, BumpSize::kNo>(dst, src, size);
+[[maybe_unused]] LIBC_INLINE void
+copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) {
+  LIBC_LOOP_NOUNROLL
+  for (size_t i = 0; i < size; ++i)
+    *dst++ = *src++;
 }
 
 } // namespace
 
-// Implementation for Cortex-M0, M0+, M1.
-// Notes:
-// - It compiles down to 196 bytes, but 220 bytes when used through `memcpy`
-//   that also needs to return the `dst` ptr.
-// - These cores do not allow for unaligned loads/stores.
+// Implementation for Cortex-M0, M0+, M1 cores that do not allow for unaligned
+// loads/stores. It compiles down to 208 bytes when used through `memcpy` that
+// also needs to return the `dst` ptr.
+// Note:
 // - When `src` and `dst` are coaligned, we start by aligning them and perform
 //   bulk copies. We let the compiler know the pointers are aligned so it can
 //   use load/store multiple (LDM, STM). This significantly increases throughput
@@ -94,21 +106,29 @@ LIBC_INLINE void copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src,
   if (src_alignment == 0)
     LIBC_ATTR_LIKELY {
       // Both `src` and `dst` are now word-aligned.
-      consume_by_aligned_block<64, BlockOp::kFull>(dst, src, size);
-      consume_by_aligned_block<16, BlockOp::kFull>(dst, src, size);
-      consume_by_aligned_block<4, BlockOp::kFull>(dst, src, size);
+      // We first copy by blocks of 64 bytes; the compiler will use 4
+      // load/store multiple (LDM, STM), each of 4 words. This requires more
+      // registers so additional push/pop instructions are needed but the
+      // speedup is worth it.
+      consume_by_block<64, BlockOp::kFull, AssumeAccess::kAligned>(dst, src,
+                                                                   size);
+      // Then we use blocks of 4 word-sized loads/stores.
+      consume_by_block<16, BlockOp::kByWord, AssumeAccess::kAligned>(dst, src,
+                                                                     size);
+      // Then we copy word by word.
+      consume_by_block<4, BlockOp::kByWord, AssumeAccess::kAligned>(dst, src,
+                                                                    size);
     }
   else {
     // `dst` is aligned but `src` is not.
     LIBC_LOOP_NOUNROLL
     while (size >= kWordSize) {
-      // Recompose word from multiple loads depending on the
-      // alignment.
+      // Recompose word from multiple loads depending on the alignment.
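+      // (`load_aligned<uint32_t, uint16_t, uint16_t>` performs two halfword
+      // loads and merges them into a single word; the byte/halfword/byte
+      // variant below handles the odd-alignment cases.)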
       const uint32_t value =
           src_alignment == 2
               ? load_aligned<uint32_t, uint16_t, uint16_t>(src)
               : load_aligned<uint32_t, uint8_t, uint16_t, uint8_t>(src);
-      copy_assume_aligned<kWordSize>(dst, &value);
+      copy<kWordSize, AssumeAccess::kAligned>(dst, &value);
       dst += kWordSize;
       src += kWordSize;
       size -= kWordSize;
@@ -121,17 +141,8 @@ LIBC_INLINE void copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src,
 }
 
 // Implementation for Cortex-M3, M4, M7, M23, M33, M35P, M52 with hardware
-// support for unaligned loads and stores.
-// Notes:
-// - It compiles down to 266 bytes.
-// - `dst` and `src` are not `__restrict` to prevent the compiler from
-//   reordering loads/stores.
-// - We keep state variables to a strict minimum to keep everything in the free
-//   registers and prevent costly push / pop.
-// - If unaligned single loads/stores to normal memory are supported, unaligned
-//   accesses for load/store multiple (LDM, STM) and load/store double (LDRD,
-//   STRD) instructions are generally not supported and will still fault so we
-//   make sure to restrict unrolling to word loads/stores.
+// support for unaligned loads and stores. It compiles down to 272 bytes when
+// used through `memcpy` that also needs to return the `dst` ptr.
 [[maybe_unused]] LIBC_INLINE void inline_memcpy_arm_mid_end(Ptr dst, CPtr src,
                                                             size_t size) {
   if (misaligned(bitwise_or(src, dst)))
@@ -157,22 +168,40 @@ LIBC_INLINE void copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src,
         }
     }
   // `dst` and `src` are not necessarily both aligned at that point but this
-  // implementation assumes hardware support for unaligned loads and stores.
-  consume_by_aligned_block<64, BlockOp::kByWord>(dst, src, size);
-  consume_by_aligned_block<16, BlockOp::kByWord>(dst, src, size);
-  consume_by_aligned_block<4, BlockOp::kFull>(dst, src, size);
+  // implementation assumes hardware support for unaligned loads and stores so
+  // it is still fast to perform an unrolled word-by-word copy. Note that wider
+  // accesses through the use of load/store multiple (LDM, STM) and load/store
+  // double (LDRD, STRD) instructions are generally not supported and can fault.
+  // By forcing decomposition of the 64-byte copy into word-by-word copies, the
+  // compiler can use the first load to prefetch memory:
+  //   ldr r3, [r1, #64]!  <- prefetch next cache line
+  //   str r3, [r0]
+  //   ldr r3, [r1, #0x4]
+  //   str r3, [r0, #0x4]
+  //   ...
+  //   ldr r3, [r1, #0x3c]
+  //   str r3, [r0, #0x3c]
+  // This is a bit detrimental for sizes between 64 and 256 (less than 10%
+  // penalty) but the prefetch yields better throughput for larger copies.
+  consume_by_block<64, BlockOp::kByWord, AssumeAccess::kUnknown>(dst, src,
+                                                                 size);
+  consume_by_block<16, BlockOp::kByWord, AssumeAccess::kUnknown>(dst, src,
+                                                                 size);
+  consume_by_block<4, BlockOp::kByWord, AssumeAccess::kUnknown>(dst, src, size);
   if (size & 1)
     copy_block_and_bump_pointers<1>(dst, src);
   if (size & 2)
-    LIBC_ATTR_UNLIKELY
-      copy_block_and_bump_pointers<2>(dst, src);
+    copy_block_and_bump_pointers<2>(dst, src);
 }
 
-[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm(void *__restrict dst_,
-                                                    const void *__restrict src_,
+[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm(Ptr dst, CPtr src,
                                                     size_t size) {
-  Ptr dst = cpp::bit_cast<Ptr>(dst_);
-  CPtr src = cpp::bit_cast<CPtr>(src_);
+  // The compiler performs alias analysis and is able to prove that `dst` and
+  // `src` do not alias by propagating the `__restrict` keyword from the
+  // `memcpy` prototype. This allows the compiler to merge consecutive
+  // load/store (LDR, STR) instructions into load/store double (LDRD, STRD)
+  // instructions.
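+  // The empty asm below makes the pointer values opaque to the compiler so
+  // that this merging does not happen (LDRD/STRD can fault on unaligned
+  // addresses, see the note in `inline_memcpy_arm_mid_end` above).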
+  asm volatile("" : "+r"(dst), "+r"(src));
 #ifdef __ARM_FEATURE_UNALIGNED
   return inline_memcpy_arm_mid_end(dst, src, size);
 #else