@@ -20,19 +20,29 @@ namespace {
 
 LIBC_INLINE_VAR constexpr size_t kWordSize = sizeof(uint32_t);
 
-template <size_t bytes>
+enum Strategy {
+  ForceWordLdStChain,
+  AssumeWordAligned,
+  AssumeUnaligned,
+};
+
+template <size_t bytes, Strategy strategy = AssumeUnaligned>
 LIBC_INLINE void copy_and_bump_pointers(Ptr &dst, CPtr &src) {
-  if constexpr (bytes == 1 || bytes == 2 || bytes == 4) {
-    memcpy_inline<bytes>(dst, src);
-  } else {
+  if constexpr (strategy == AssumeUnaligned) {
+    memcpy_inline<bytes>(assume_aligned<1>(dst), assume_aligned<1>(src));
+  } else if constexpr (strategy == AssumeWordAligned) {
+    static_assert(bytes >= kWordSize);
+    memcpy_inline<bytes>(assume_aligned<kWordSize>(dst),
+                         assume_aligned<kWordSize>(src));
+  } else if constexpr (strategy == ForceWordLdStChain) {
     // We restrict loads/stores to 4 bytes to prevent the use of load/store
     // multiple (LDM, STM) and load/store double (LDRD, STRD). First, they may
     // fault (see notes below) and second, they use more registers which in turn
     // adds push/pop instructions in the hot path.
-    static_assert(bytes % kWordSize == 0);
+    static_assert((bytes % kWordSize == 0) && (bytes >= kWordSize));
     LIBC_LOOP_UNROLL
     for (size_t i = 0; i < bytes / kWordSize; ++i) {
-      const uintptr_t offset = i * kWordSize;
+      const size_t offset = i * kWordSize;
       memcpy_inline<kWordSize>(dst + offset, src + offset);
     }
   }
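
As an aside on how these hints reach the compiler: below is a minimal standalone sketch of the three strategies, using the GCC/Clang builtins `__builtin_memcpy` and `__builtin_assume_aligned` in place of the libc-internal `memcpy_inline`/`assume_aligned` helpers. It mirrors the structure of the function above but is illustrative only, not the library code.

```cpp
#include <cstddef>
#include <cstdint>

enum Strategy { ForceWordLdStChain, AssumeWordAligned, AssumeUnaligned };
constexpr size_t kWordSize = sizeof(uint32_t);

template <size_t bytes, Strategy strategy = AssumeUnaligned>
void copy_and_bump_pointers(char *&dst, const char *&src) {
  if constexpr (strategy == AssumeUnaligned) {
    // No alignment promise: the compiler must emit accesses that are legal
    // for arbitrary addresses on the target.
    __builtin_memcpy(dst, src, bytes);
  } else if constexpr (strategy == AssumeWordAligned) {
    // Promise 4-byte alignment: the compiler is free to merge accesses,
    // e.g. into LDM/STM on ARM.
    __builtin_memcpy(__builtin_assume_aligned(dst, kWordSize),
                     __builtin_assume_aligned(src, kWordSize), bytes);
  } else { // ForceWordLdStChain
    // A fixed chain of word copies nudges the compiler towards individual
    // LDR/STR pairs instead of LDM/STM or LDRD/STRD.
    static_assert(bytes % kWordSize == 0);
    for (size_t i = 0; i < bytes / kWordSize; ++i)
      __builtin_memcpy(dst + i * kWordSize, src + i * kWordSize, kWordSize);
  }
  dst += bytes;
  src += bytes;
}
```

Instantiating, for example, `copy_and_bump_pointers<16, ForceWordLdStChain>` would then be expected to lower to four separate LDR/STR pairs on these cores.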
@@ -45,11 +55,19 @@ LIBC_INLINE void copy_and_bump_pointers(Ptr &dst, CPtr &src) {
   src += bytes;
 }
 
-template <size_t block_size>
-LIBC_INLINE void copy_blocks(Ptr &dst, CPtr &src, size_t &size) {
+LIBC_INLINE void copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src,
+                                              const size_t size) {
+  LIBC_LOOP_NOUNROLL
+  for (size_t i = 0; i < size; ++i)
+    *dst++ = *src++;
+}
+
+template <size_t block_size, Strategy strategy>
+LIBC_INLINE void copy_blocks_and_update_args(Ptr &dst, CPtr &src,
+                                             size_t &size) {
   LIBC_LOOP_NOUNROLL
   for (size_t i = 0; i < size / block_size; ++i)
-    copy_and_bump_pointers<block_size>(dst, src);
+    copy_and_bump_pointers<block_size, strategy>(dst, src);
   // Update `size` once at the end instead of once per iteration.
   size %= block_size;
 }
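
To make the size bookkeeping concrete, here is a small worked sketch, in plain C++ rather than the library code, tracing how chaining block sizes 64, 16 and 4 decomposes a hypothetical 150-byte copy:

```cpp
#include <cstddef>
#include <cstdio>

int main() {
  // Models only the `size` arithmetic of copy_blocks_and_update_args:
  // each call performs size / block_size block copies, then size %= block_size.
  size_t size = 150; // hypothetical copy size
  for (size_t block_size : {64u, 16u, 4u}) {
    std::printf("%zu block(s) of %zu bytes\n", size / block_size, block_size);
    size %= block_size; // updated once per block size, not once per iteration
  }
  std::printf("%zu tail byte(s) copied byte per byte\n", size);
  // Prints: 2 block(s) of 64 bytes, 1 block(s) of 16 bytes,
  //         1 block(s) of 4 bytes, 2 tail byte(s) copied byte per byte.
}
```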
@@ -66,19 +84,57 @@ LIBC_INLINE auto misaligned(CPtr a) {
 } // namespace
 
 // Implementation for Cortex-M0, M0+, M1.
-// The implementation makes sure that all accesses are aligned.
+// Notes:
+// - It compiles down to 196 bytes, but 220 bytes when used through `memcpy`
+//   that also needs to return the `dst` ptr.
+// - These cores do not allow for unaligned loads/stores.
+// - When `src` and `dst` are coaligned, we start by aligning them and then
+//   perform bulk copies. We let the compiler know the pointers are aligned so
+//   it can use load/store multiple (LDM, STM). This significantly increases
+//   throughput but it also requires more registers and push/pop instructions,
+//   which impacts latency for small size copies.
+// - When `src` and `dst` are misaligned, we align `dst` and recompose words
+//   using multiple aligned loads. `load_aligned` takes care of endianness
+//   issues.
 [[maybe_unused]] LIBC_INLINE void inline_memcpy_arm_low_end(Ptr dst, CPtr src,
                                                             size_t size) {
-  // For now, dummy implementation that performs byte per byte copy.
-  LIBC_LOOP_NOUNROLL
-  for (size_t i = 0; i < size; ++i)
-    dst[i] = src[i];
+  if (size >= 8) {
+    if (const size_t offset = distance_to_align_up<kWordSize>(dst))
+      [[unlikely]] {
+        copy_bytes_and_bump_pointers(dst, src, offset);
+        size -= offset;
+      }
+    const auto src_alignment = distance_to_align_down<kWordSize>(src);
+    if (src_alignment == 0) [[likely]] {
+      // Both `src` and `dst` are now word-aligned.
+      copy_blocks_and_update_args<64, AssumeWordAligned>(dst, src, size);
+      copy_blocks_and_update_args<16, AssumeWordAligned>(dst, src, size);
+      copy_blocks_and_update_args<4, AssumeWordAligned>(dst, src, size);
+    } else {
+      // `dst` is aligned but `src` is not.
+      LIBC_LOOP_NOUNROLL
+      while (size >= kWordSize) {
+        // Recompose word from multiple loads depending on the alignment.
+        const uint32_t value =
+            src_alignment == 2
+                ? load_aligned<uint32_t, uint16_t, uint16_t>(src)
+                : load_aligned<uint32_t, uint8_t, uint16_t, uint8_t>(src);
+        memcpy_inline<kWordSize>(assume_aligned<kWordSize>(dst), &value);
+        dst += kWordSize;
+        src += kWordSize;
+        size -= kWordSize;
+      }
+    }
+    // Up to 3 bytes may still need to be copied.
+    // Handle them with the slow loop below.
+  }
+  copy_bytes_and_bump_pointers(dst, src, size);
 }
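
For the misaligned branch, the following hedged sketch shows how a 32-bit word can be recomposed from smaller aligned loads, assuming a little-endian target. The real `load_aligned` helper also handles big-endian targets and can promise the compiler that the partial loads are aligned; the function names below are illustrative only.

```cpp
#include <cstdint>
#include <cstring>

// `src` is 2 bytes past a word boundary: two aligned halfword loads.
uint32_t load_word_offset2(const unsigned char *src) {
  uint16_t lo, hi;
  std::memcpy(&lo, src, sizeof(lo));     // 2-byte aligned load
  std::memcpy(&hi, src + 2, sizeof(hi)); // 2-byte aligned load
  return static_cast<uint32_t>(lo) | (static_cast<uint32_t>(hi) << 16);
}

// `src` is 1 or 3 bytes past a word boundary: byte, halfword, byte loads
// (src + 1 is 2-byte aligned in both cases).
uint32_t load_word_offset1or3(const unsigned char *src) {
  const uint8_t first = src[0];
  uint16_t mid;
  std::memcpy(&mid, src + 1, sizeof(mid));
  const uint8_t last = src[3];
  return static_cast<uint32_t>(first) | (static_cast<uint32_t>(mid) << 8) |
         (static_cast<uint32_t>(last) << 24);
}
```

On a little-endian core this composition reproduces the bytes at `src[0..3]` in order, which is exactly what the aligned word store to `dst` then writes out.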
 
 // Implementation for Cortex-M3, M4, M7, M23, M33, M35P, M52 with hardware
 // support for unaligned loads and stores.
 // Notes:
-// - It compiles down to <300 bytes.
+// - It compiles down to 266 bytes.
 // - `dst` and `src` are not `__restrict` to prevent the compiler from
 //   reordering loads/stores.
 // - We keep state variables to a strict minimum to keep everything in the free
@@ -108,9 +164,9 @@ LIBC_INLINE auto misaligned(CPtr a) {
       size -= offset;
     }
   }
-  copy_blocks<64>(dst, src, size);
-  copy_blocks<16>(dst, src, size);
-  copy_blocks<4>(dst, src, size);
+  copy_blocks_and_update_args<64, ForceWordLdStChain>(dst, src, size);
+  copy_blocks_and_update_args<16, ForceWordLdStChain>(dst, src, size);
+  copy_blocks_and_update_args<4, AssumeUnaligned>(dst, src, size);
   if (size & 1)
     copy_and_bump_pointers<1>(dst, src);
   if (size & 2) [[unlikely]]
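
With this many size and alignment branches, an exhaustive check over small sizes and all src/dst misalignments is the natural way to validate either routine. The harness below is a generic sketch written against the standard library only; it is not the project's test suite, and the callable passed in `main` is a stand-in for the routine under test.

```cpp
#include <cassert>
#include <cstddef>
#include <cstring>

// Checks a memcpy-like callable over sizes 0..96 (covering the 64/16/4 block
// boundaries) and all four src/dst word offsets, which is where the branches
// above take different paths.
template <typename Impl> void check_small_copies(Impl impl) {
  constexpr size_t kMaxSize = 96;
  alignas(4) unsigned char src[kMaxSize + 8];
  alignas(4) unsigned char dst[kMaxSize + 8];
  for (size_t i = 0; i < sizeof(src); ++i)
    src[i] = static_cast<unsigned char>(i * 7 + 1);
  for (size_t size = 0; size <= kMaxSize; ++size)
    for (size_t src_off = 0; src_off < 4; ++src_off)
      for (size_t dst_off = 0; dst_off < 4; ++dst_off) {
        std::memset(dst, 0, sizeof(dst));
        impl(dst + dst_off, src + src_off, size);
        assert(std::memcmp(dst + dst_off, src + src_off, size) == 0);
      }
}

int main() {
  check_small_copies([](unsigned char *d, const unsigned char *s, size_t n) {
    std::memcpy(d, s, n); // stand-in for the routine under test
  });
}
```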