// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
+ // The functions defined in this file give approximate code size. These sizes
+ // assume the following configuration options:
+ // - LIBC_CONF_KEEP_FRAME_POINTER = false
+ // - LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR = false
+ // - LIBC_ADD_NULL_CHECKS = false
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H

+ #include "src/__support/CPP/type_traits.h" // always_false
#include "src/__support/macros/attributes.h"   // LIBC_INLINE
#include "src/__support/macros/optimization.h" // LIBC_LOOP_NOUNROLL
+ #include "src/string/memory_utils/arm/common.h" // LIBC_ATTR_LIKELY, LIBC_ATTR_UNLIKELY
#include "src/string/memory_utils/utils.h" // memcpy_inline, distance_to_align

#include <stddef.h> // size_t

- // https://libc.llvm.org/compiler_support.html
- // Support for [[likely]] / [[unlikely]]
- //   [X] GCC 12.2
- //   [X] Clang 12
- //   [ ] Clang 11
- #define LIBC_ATTR_LIKELY [[likely]]
- #define LIBC_ATTR_UNLIKELY [[unlikely]]
-
- #if defined(LIBC_COMPILER_IS_CLANG)
- #if LIBC_COMPILER_CLANG_VER < 1200
- #undef LIBC_ATTR_LIKELY
- #undef LIBC_ATTR_UNLIKELY
- #define LIBC_ATTR_LIKELY
- #define LIBC_ATTR_UNLIKELY
- #endif
- #endif
-
namespace LIBC_NAMESPACE_DECL {

namespace {

- LIBC_INLINE_VAR constexpr size_t kWordSize = sizeof(uint32_t);
-
- enum Strategy {
-   ForceWordLdStChain,
-   AssumeWordAligned,
-   AssumeUnaligned,
- };
+ // Performs a copy of `bytes` bytes from `src` to `dst`. This function has the
+ // semantics of `memcpy` where `src` and `dst` are `__restrict`. The compiler
+ // is free to use whatever instruction is best for the size and assumed access.
+ template <size_t bytes, AssumeAccess access>
+ LIBC_INLINE void copy(void *dst, const void *src) {
+   if constexpr (access == AssumeAccess::kAligned) {
+     constexpr size_t alignment = bytes > kWordSize ? kWordSize : bytes;
+     memcpy_inline<bytes>(assume_aligned<alignment>(dst),
+                          assume_aligned<alignment>(src));
+   } else if constexpr (access == AssumeAccess::kUnknown) {
+     memcpy_inline<bytes>(dst, src);
+   } else {
+     static_assert(cpp::always_false<decltype(access)>, "Invalid AssumeAccess");
+   }
+ }
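+ // For example, a hypothetical call `copy<16, AssumeAccess::kAligned>(dst,
+ // src)` tells the compiler both pointers are word-aligned, letting it select
+ // load/store multiple (LDM, STM), while `AssumeAccess::kUnknown` leaves it
+ // free to emit whatever unaligned-safe sequence it sees fit.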

- template <size_t bytes, Strategy strategy = AssumeUnaligned>
- LIBC_INLINE void copy_and_bump_pointers(Ptr &dst, CPtr &src) {
-   if constexpr (strategy == AssumeUnaligned) {
-     memcpy_inline<bytes>(assume_aligned<1>(dst), assume_aligned<1>(src));
-   } else if constexpr (strategy == AssumeWordAligned) {
-     static_assert(bytes >= kWordSize);
-     memcpy_inline<bytes>(assume_aligned<kWordSize>(dst),
-                          assume_aligned<kWordSize>(src));
-   } else if constexpr (strategy == ForceWordLdStChain) {
+ template <size_t bytes, BlockOp block_op = BlockOp::kFull,
+           AssumeAccess access = AssumeAccess::kUnknown>
+ LIBC_INLINE void copy_block_and_bump_pointers(Ptr &dst, CPtr &src) {
+   if constexpr (block_op == BlockOp::kFull) {
+     copy<bytes, access>(dst, src);
+   } else if constexpr (block_op == BlockOp::kByWord) {
    // We restrict loads/stores to 4 bytes to prevent the use of load/store
-     // multiple (LDM, STM) and load/store double (LDRD, STRD). First, they may
-     // fault (see notes below) and second, they use more registers, which in
-     // turn adds push/pop instructions in the hot path.
+     // multiple (LDM, STM) and load/store double (LDRD, STRD).
    static_assert((bytes % kWordSize == 0) && (bytes >= kWordSize));
    LIBC_LOOP_UNROLL
-     for (size_t i = 0; i < bytes / kWordSize; ++i) {
-       const size_t offset = i * kWordSize;
-       memcpy_inline<kWordSize>(dst + offset, src + offset);
+     for (size_t offset = 0; offset < bytes; offset += kWordSize) {
+       copy<kWordSize, access>(dst + offset, src + offset);
    }
+   } else {
+     static_assert(cpp::always_false<decltype(block_op)>, "Invalid BlockOp");
  }
  // In the 1, 2, 4 byte copy case, the compiler can fold pointer offsetting
  // into the load/store instructions.
@@ -72,39 +66,27 @@ LIBC_INLINE void copy_and_bump_pointers(Ptr &dst, CPtr &src) {
  src += bytes;
}
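+ // Where the ISA allows it, the bump can fold into post-indexed addressing,
+ // e.g. `ldrb r3, [r1], #1` / `strb r3, [r0], #1` for a one-byte copy, which
+ // avoids separate pointer-increment instructions.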

- LIBC_INLINE void copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src,
-                                               const size_t size) {
+ template <size_t bytes, BlockOp block_op, AssumeAccess access>
+ LIBC_INLINE void consume_by_block(Ptr &dst, CPtr &src, size_t &size) {
  LIBC_LOOP_NOUNROLL
-   for (size_t i = 0; i < size; ++i)
-     *dst++ = *src++;
+   for (size_t i = 0; i < size / bytes; ++i)
+     copy_block_and_bump_pointers<bytes, block_op, access>(dst, src);
+   size %= bytes;
}
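+ // For instance, with `size == 70`, `consume_by_block<16, ...>` performs four
+ // 16-byte block copies, bumps both pointers by 64 bytes, and leaves
+ // `size == 6` for the smaller block sizes that follow.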

- template <size_t block_size, Strategy strategy>
- LIBC_INLINE void copy_blocks_and_update_args(Ptr &dst, CPtr &src,
-                                              size_t &size) {
+ [[maybe_unused]] LIBC_INLINE void
+ copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) {
  LIBC_LOOP_NOUNROLL
-   for (size_t i = 0; i < size / block_size; ++i)
-     copy_and_bump_pointers<block_size, strategy>(dst, src);
-   // Update `size` once at the end instead of once per iteration.
-   size %= block_size;
- }
-
- LIBC_INLINE CPtr bitwise_or(CPtr a, CPtr b) {
-   return cpp::bit_cast<CPtr>(cpp::bit_cast<uintptr_t>(a) |
-                              cpp::bit_cast<uintptr_t>(b));
- }
-
- LIBC_INLINE auto misaligned(CPtr a) {
-   return distance_to_align_down<kWordSize>(a);
+   for (size_t i = 0; i < size; ++i)
+     *dst++ = *src++;
}

} // namespace

- // Implementation for Cortex-M0, M0+, M1.
- // Notes:
- // - It compiles down to 196 bytes, but 220 bytes when used through `memcpy`
- //   that also needs to return the `dst` ptr.
- // - These cores do not allow for unaligned loads/stores.
+ // Implementation for Cortex-M0, M0+, M1 cores that do not allow for unaligned
+ // loads/stores. It compiles down to 208 bytes when used through `memcpy`,
+ // which also needs to return the `dst` ptr.
+ // Note:
// - When `src` and `dst` are coaligned, we start by aligning them and perform
//   bulk copies. We let the compiler know the pointers are aligned so it can
//   use load/store multiple (LDM, STM). This significantly increases throughput
@@ -121,13 +103,20 @@ LIBC_INLINE auto misaligned(CPtr a) {
        copy_bytes_and_bump_pointers(dst, src, offset);
        size -= offset;
      }
+   constexpr AssumeAccess kAligned = AssumeAccess::kAligned;
  const auto src_alignment = distance_to_align_down<kWordSize>(src);
  if (src_alignment == 0)
    LIBC_ATTR_LIKELY {
      // Both `src` and `dst` are now word-aligned.
-       copy_blocks_and_update_args<64, AssumeWordAligned>(dst, src, size);
-       copy_blocks_and_update_args<16, AssumeWordAligned>(dst, src, size);
-       copy_blocks_and_update_args<4, AssumeWordAligned>(dst, src, size);
+       // We first copy by blocks of 64 bytes; the compiler will use 4
+       // load/store multiple (LDM, STM) instructions, each of 4 words. This
+       // requires more registers, so additional push/pop are needed, but the
+       // speedup is worth it.
+       consume_by_block<64, BlockOp::kFull, kAligned>(dst, src, size);
+       // Then we use blocks of 16 bytes, copied 4 words at a time.
+       consume_by_block<16, BlockOp::kByWord, kAligned>(dst, src, size);
+       // Then we copy the remaining words one at a time.
+       consume_by_block<4, BlockOp::kByWord, kAligned>(dst, src, size);
    }
  else {
    // `dst` is aligned but `src` is not.
@@ -138,7 +127,7 @@ LIBC_INLINE auto misaligned(CPtr a) {
          src_alignment == 2
              ? load_aligned<uint32_t, uint16_t, uint16_t>(src)
              : load_aligned<uint32_t, uint8_t, uint16_t, uint8_t>(src);
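+       // `load_aligned` assembles the word from narrower aligned loads: two
+       // halfword loads when `src` is halfword-aligned, otherwise a byte, a
+       // halfword, and a byte.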
-       memcpy_inline<kWordSize>(assume_aligned<kWordSize>(dst), &value);
+       copy<kWordSize, kAligned>(dst, &value);
      dst += kWordSize;
      src += kWordSize;
      size -= kWordSize;
@@ -151,56 +140,68 @@ LIBC_INLINE auto misaligned(CPtr a) {
}

// Implementation for Cortex-M3, M4, M7, M23, M33, M35P, M52 with hardware
- // support for unaligned loads and stores.
- // Notes:
- // - It compiles down to 266 bytes.
- // - `dst` and `src` are not `__restrict` to prevent the compiler from
- //   reordering loads/stores.
- // - We keep state variables to a strict minimum to keep everything in the free
- //   registers and prevent costly push / pop.
- // - If unaligned single loads/stores to normal memory are supported, unaligned
- //   accesses for load/store multiple (LDM, STM) and load/store double (LDRD,
- //   STRD) instructions are generally not supported and will still fault, so we
- //   make sure to restrict unrolling to word loads/stores.
+ // support for unaligned loads and stores. It compiles down to 272 bytes when
+ // used through `memcpy`, which also needs to return the `dst` ptr.
[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm_mid_end(Ptr dst, CPtr src,
                                                            size_t size) {
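+   // OR-ing the pointers folds two alignment checks into one below: the OR is
+   // word-aligned only if both `src` and `dst` are.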
  if (misaligned(bitwise_or(src, dst)))
    LIBC_ATTR_UNLIKELY {
      if (size < 8)
        LIBC_ATTR_UNLIKELY {
          if (size & 1)
-             copy_and_bump_pointers<1>(dst, src);
+             copy_block_and_bump_pointers<1>(dst, src);
          if (size & 2)
-             copy_and_bump_pointers<2>(dst, src);
+             copy_block_and_bump_pointers<2>(dst, src);
          if (size & 4)
-             copy_and_bump_pointers<4>(dst, src);
+             copy_block_and_bump_pointers<4>(dst, src);
          return;
        }
      if (misaligned(src))
        LIBC_ATTR_UNLIKELY {
          const size_t offset = distance_to_align_up<kWordSize>(dst);
          if (offset & 1)
-             copy_and_bump_pointers<1>(dst, src);
+             copy_block_and_bump_pointers<1>(dst, src);
          if (offset & 2)
-             copy_and_bump_pointers<2>(dst, src);
+             copy_block_and_bump_pointers<2>(dst, src);
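+           // Note that `offset` is at most 3, so the two tests above fully
+           // enumerate the byte and halfword copies needed to word-align `dst`.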
          size -= offset;
        }
    }
-   copy_blocks_and_update_args<64, ForceWordLdStChain>(dst, src, size);
-   copy_blocks_and_update_args<16, ForceWordLdStChain>(dst, src, size);
-   copy_blocks_and_update_args<4, AssumeUnaligned>(dst, src, size);
+   // `dst` and `src` are not necessarily both aligned at this point but this
+   // implementation assumes hardware support for unaligned loads and stores so
+   // it is still fast to perform an unrolled word by word copy. Note that wider
+   // accesses through load/store multiple (LDM, STM) and load/store double
+   // (LDRD, STRD) instructions are generally not supported and can fault.
+   // By forcing decomposition of the 64-byte copy into word by word copies, the
+   // compiler uses a load to prefetch the next cache line:
+   //   ldr r3, [r1, #64]!  <- prefetch next cache line
+   //   str r3, [r0]
+   //   ldr r3, [r1, #0x4]
+   //   str r3, [r0, #0x4]
+   //   ...
+   //   ldr r3, [r1, #0x3c]
+   //   str r3, [r0, #0x3c]
+   // This is a bit detrimental for sizes between 64 and 256 (less than 10%
+   // penalty) but the prefetch yields better throughput for larger copies.
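+   // The `!` is pre-indexed writeback: `r1` advances by 64 before the first
+   // load, so each iteration's first access touches the next cache line.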
+   constexpr AssumeAccess kUnknown = AssumeAccess::kUnknown;
+   consume_by_block<64, BlockOp::kByWord, kUnknown>(dst, src, size);
+   consume_by_block<16, BlockOp::kByWord, kUnknown>(dst, src, size);
+   consume_by_block<4, BlockOp::kByWord, kUnknown>(dst, src, size);
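+   // At this point `size` is at most 3, so the two tests below cover every
+   // remaining length (1, 2, or 3 bytes).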
  if (size & 1)
-     copy_and_bump_pointers<1>(dst, src);
+     copy_block_and_bump_pointers<1>(dst, src);
  if (size & 2)
-     LIBC_ATTR_UNLIKELY
-       copy_and_bump_pointers<2>(dst, src);
+     copy_block_and_bump_pointers<2>(dst, src);
}

- [[maybe_unused]] LIBC_INLINE void inline_memcpy_arm(void *__restrict dst_,
-                                                     const void *__restrict src_,
+ [[maybe_unused]] LIBC_INLINE void inline_memcpy_arm(Ptr dst, CPtr src,
                                                     size_t size) {
-   Ptr dst = cpp::bit_cast<Ptr>(dst_);
-   CPtr src = cpp::bit_cast<CPtr>(src_);
+   // The compiler performs alias analysis and is able to prove that `dst` and
+   // `src` do not alias by propagating the `__restrict` keyword from the
+   // `memcpy` prototype. This allows it to merge consecutive load/store (LDR,
+   // STR) instructions generated in `copy_block_and_bump_pointers` with
+   // `BlockOp::kByWord` into load/store double (LDRD, STRD) instructions,
+   // which is undesirable, so we prevent the compiler from inferring
+   // `__restrict` with the following line.
+   asm volatile("" : "+r"(dst), "+r"(src));
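+   // The empty asm statement declares both pointers as read-and-written
+   // ("+r" constraints), so the compiler must assume they may have changed
+   // and can no longer carry the no-alias fact across it.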
#ifdef __ARM_FEATURE_UNALIGNED
  return inline_memcpy_arm_mid_end(dst, src, size);
#else
@@ -210,8 +211,4 @@ LIBC_INLINE auto misaligned(CPtr a) {

} // namespace LIBC_NAMESPACE_DECL

- // Cleanup local macros
- #undef LIBC_ATTR_LIKELY
- #undef LIBC_ATTR_UNLIKELY
-
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H