Skip to content

Commit 25eac99

Browse files
committed
Add an optimized memcpy version for Cortex M0 as well
1 parent 82a6075 commit 25eac99

File tree

1 file changed

+74
-18
lines changed

1 file changed

+74
-18
lines changed

libc/src/string/memory_utils/arm/inline_memcpy.h

Lines changed: 74 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -20,19 +20,29 @@ namespace {
2020

2121
LIBC_INLINE_VAR constexpr size_t kWordSize = sizeof(uint32_t);
2222

23-
template <size_t bytes>
23+
// Selects how `copy_and_bump_pointers` communicates alignment to the
// compiler. The choice trades code size / register pressure against
// throughput on cores with and without unaligned access support.
enum Strategy {
  ForceWordLdStChain, // Chain of single 4-byte loads/stores (no LDM/STM).
  AssumeWordAligned,  // Both pointers word-aligned; LDM/STM allowed.
  AssumeUnaligned,    // No alignment assumption.
};
28+
29+
template <size_t bytes, Strategy strategy = AssumeUnaligned>
2430
LIBC_INLINE void copy_and_bump_pointers(Ptr &dst, CPtr &src) {
25-
if constexpr (bytes == 1 || bytes == 2 || bytes == 4) {
26-
memcpy_inline<bytes>(dst, src);
27-
} else {
31+
if constexpr (strategy == AssumeUnaligned) {
32+
memcpy_inline<bytes>(assume_aligned<1>(dst), assume_aligned<1>(src));
33+
} else if constexpr (strategy == AssumeWordAligned) {
34+
static_assert(bytes >= kWordSize);
35+
memcpy_inline<bytes>(assume_aligned<kWordSize>(dst),
36+
assume_aligned<kWordSize>(src));
37+
} else if constexpr (strategy == ForceWordLdStChain) {
2838
// We restrict loads/stores to 4 byte to prevent the use of load/store
2939
// multiple (LDM, STM) and load/store double (LDRD, STRD). First, they may
3040
// fault (see notes below) and second, they use more registers which in turn
3141
// adds push/pop instructions in the hot path.
32-
static_assert(bytes % kWordSize == 0);
42+
static_assert((bytes % kWordSize == 0) && (bytes >= kWordSize));
3343
LIBC_LOOP_UNROLL
3444
for (size_t i = 0; i < bytes / kWordSize; ++i) {
35-
const uintptr_t offset = i * kWordSize;
45+
const size_t offset = i * kWordSize;
3646
memcpy_inline<kWordSize>(dst + offset, src + offset);
3747
}
3848
}
@@ -45,11 +55,19 @@ LIBC_INLINE void copy_and_bump_pointers(Ptr &dst, CPtr &src) {
4555
src += bytes;
4656
}
4757

48-
template <size_t block_size>
49-
LIBC_INLINE void copy_blocks(Ptr &dst, CPtr &src, size_t &size) {
58+
LIBC_INLINE void copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src,
59+
const size_t size) {
60+
LIBC_LOOP_NOUNROLL
61+
for (size_t i = 0; i < size; ++i)
62+
*dst++ = *src++;
63+
}
64+
65+
template <size_t block_size, Strategy strategy>
66+
LIBC_INLINE void copy_blocks_and_update_args(Ptr &dst, CPtr &src,
67+
size_t &size) {
5068
LIBC_LOOP_NOUNROLL
5169
for (size_t i = 0; i < size / block_size; ++i)
52-
copy_and_bump_pointers<block_size>(dst, src);
70+
copy_and_bump_pointers<block_size, strategy>(dst, src);
5371
// Update `size` once at the end instead of once per iteration.
5472
size %= block_size;
5573
}
@@ -66,19 +84,57 @@ LIBC_INLINE auto misaligned(CPtr a) {
6684
} // namespace
6785

6886
// Implementation for Cortex-M0, M0+, M1.
69-
// The implementation makes sure that all accesses are aligned.
87+
// Notes:
88+
// - It compiles down to 196 bytes, but 220 bytes when used through `memcpy`
89+
// that also needs to return the `dst` ptr.
90+
// - These cores do not allow for unaligned loads/stores.
91+
// - When `src` and `dst` are coaligned, we start by aligning them and perform
92+
// bulk copies. We let the compiler know the pointers are aligned so it can
93+
// use load/store multiple (LDM, STM). This significantly increase throughput
94+
// but it also requires more registers and push/pop instructions. This impacts
95+
// latency for small size copies.
96+
// - When `src` and `dst` are misaligned, we align `dst` and recompose words
97+
// using multiple aligned loads. `load_aligned` takes care of endianness
98+
// issues.
7099
[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm_low_end(Ptr dst, CPtr src,
71100
size_t size) {
72-
// For now, dummy implementation that performs byte per byte copy.
73-
LIBC_LOOP_NOUNROLL
74-
for (size_t i = 0; i < size; ++i)
75-
dst[i] = src[i];
101+
if (size >= 8) {
102+
if (const size_t offset = distance_to_align_up<kWordSize>(dst))
103+
[[unlikely]] {
104+
copy_bytes_and_bump_pointers(dst, src, offset);
105+
size -= offset;
106+
}
107+
const auto src_alignment = distance_to_align_down<kWordSize>(src);
108+
if (src_alignment == 0) [[likely]] {
109+
// Both `src` and `dst` are now word-aligned.
110+
copy_blocks_and_update_args<64, AssumeWordAligned>(dst, src, size);
111+
copy_blocks_and_update_args<16, AssumeWordAligned>(dst, src, size);
112+
copy_blocks_and_update_args<4, AssumeWordAligned>(dst, src, size);
113+
} else {
114+
// `dst` is aligned but `src` is not.
115+
LIBC_LOOP_NOUNROLL
116+
while (size >= kWordSize) {
117+
// Recompose word from multiple loads depending on the alignment.
118+
const uint32_t value =
119+
src_alignment == 2
120+
? load_aligned<uint32_t, uint16_t, uint16_t>(src)
121+
: load_aligned<uint32_t, uint8_t, uint16_t, uint8_t>(src);
122+
memcpy_inline<kWordSize>(assume_aligned<kWordSize>(dst), &value);
123+
dst += kWordSize;
124+
src += kWordSize;
125+
size -= kWordSize;
126+
}
127+
}
128+
// Up to 3 bytes may still need to be copied.
129+
// Handling them with the slow loop below.
130+
}
131+
copy_bytes_and_bump_pointers(dst, src, size);
76132
}
77133

78134
// Implementation for Cortex-M3, M4, M7, M23, M33, M35P, M52 with hardware
79135
// support for unaligned loads and stores.
80136
// Notes:
81-
// - It compiles down to <300 bytes.
137+
// - It compiles down to 266 bytes.
82138
// - `dst` and `src` are not `__restrict` to prevent the compiler from
83139
// reordering loads/stores.
84140
// - We keep state variables to a strict minimum to keep everything in the free
@@ -108,9 +164,9 @@ LIBC_INLINE auto misaligned(CPtr a) {
108164
size -= offset;
109165
}
110166
}
111-
copy_blocks<64>(dst, src, size);
112-
copy_blocks<16>(dst, src, size);
113-
copy_blocks<4>(dst, src, size);
167+
copy_blocks_and_update_args<64, ForceWordLdStChain>(dst, src, size);
168+
copy_blocks_and_update_args<16, ForceWordLdStChain>(dst, src, size);
169+
copy_blocks_and_update_args<4, AssumeUnaligned>(dst, src, size);
114170
if (size & 1)
115171
copy_and_bump_pointers<1>(dst, src);
116172
if (size & 2) [[unlikely]]

0 commit comments

Comments
 (0)