Skip to content

Commit de6d0df

Browse files
committed
string: Improve performance of unaligned memcpy
Use long access for the bulk of unaligned memcpy operations by fetching two aligned source words and combining those into an aligned destination word. Signed-off-by: Keith Packard <[email protected]>
1 parent 4b2b54d commit de6d0df

File tree

1 file changed

+84
-45
lines changed

1 file changed

+84
-45
lines changed

newlib/libc/string/memcpy.c

Lines changed: 84 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,9 @@ QUICKREF
4747
#include "local.h"
4848
#include <stdint.h>
4949

50-
/* Nonzero if either X or Y is not aligned on a "long" boundary. */
51-
#define UNALIGNED(X, Y) \
52-
(((uintptr_t)X & (sizeof (long) - 1)) | ((uintptr_t)Y & (sizeof (long) - 1)))
50+
/* Byte distance from X back to the previous long-aligned boundary; zero if X is aligned. */
51+
#define UNALIGNED(X) \
52+
(((uintptr_t)X & (sizeof (long) - 1)))
5353

5454
/* How many bytes are copied each iteration of the 4X unrolled loop. */
5555
#define BIGBLOCKSIZE (sizeof (long) << 2)
@@ -68,56 +68,95 @@ memcpy (void *__restrict dst0,
6868
const void *__restrict src0,
6969
size_t len0)
7070
{
71-
#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
72-
char *dst = (char *) dst0;
73-
char *src = (char *) src0;
74-
75-
void *save = dst0;
76-
77-
while (len0--)
78-
{
79-
*dst++ = *src++;
80-
}
81-
82-
return save;
83-
#else
8471
char *dst = dst0;
8572
const char *src = src0;
86-
long *aligned_dst;
87-
const long *aligned_src;
73+
#if !(defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__))
74+
unsigned long *aligned_dst;
75+
const unsigned long *aligned_src;
8876

8977
/* If the size is small, punt into the byte copy loop. Otherwise
9078
align DST and copy a long at a time, even when SRC is unaligned. */
91-
if (!TOO_SMALL(len0) && !UNALIGNED (src, dst))
92-
{
93-
aligned_dst = (long*)dst;
94-
aligned_src = (long*)src;
95-
96-
/* Copy 4X long words at a time if possible. */
97-
while (len0 >= BIGBLOCKSIZE)
98-
{
99-
*aligned_dst++ = *aligned_src++;
100-
*aligned_dst++ = *aligned_src++;
101-
*aligned_dst++ = *aligned_src++;
102-
*aligned_dst++ = *aligned_src++;
103-
len0 -= BIGBLOCKSIZE;
104-
}
105-
106-
/* Copy one long word at a time if possible. */
107-
while (len0 >= LITTLEBLOCKSIZE)
108-
{
109-
*aligned_dst++ = *aligned_src++;
110-
len0 -= LITTLEBLOCKSIZE;
111-
}
112-
113-
/* Pick up any residual with a byte copier. */
114-
dst = (char*)aligned_dst;
115-
src = (char*)aligned_src;
116-
}
79+
if (!TOO_SMALL(len0))
80+
{
81+
/*
82+
* Byte-copy until dst is aligned. More than LITTLEBLOCKSIZE bytes are
83+
* copied, so rounding src down to a word boundary never lands before src0
84+
*/
85+
unsigned start = LITTLEBLOCKSIZE*2 - UNALIGNED(dst);
86+
while(start--) {
87+
*dst++ = *src++;
88+
len0--;
89+
}
90+
91+
aligned_dst = (unsigned long*)dst;
92+
int byte_shift = UNALIGNED(src);
93+
94+
if (!byte_shift)
95+
{
96+
aligned_src = (unsigned long*)src;
97+
98+
/* Copy 4X long words at a time if possible. */
99+
while (len0 >= BIGBLOCKSIZE)
100+
{
101+
*aligned_dst++ = *aligned_src++;
102+
*aligned_dst++ = *aligned_src++;
103+
*aligned_dst++ = *aligned_src++;
104+
*aligned_dst++ = *aligned_src++;
105+
len0 -= BIGBLOCKSIZE;
106+
}
107+
108+
/* Copy one long word at a time if possible. */
109+
while (len0 >= LITTLEBLOCKSIZE)
110+
{
111+
*aligned_dst++ = *aligned_src++;
112+
len0 -= LITTLEBLOCKSIZE;
113+
}
114+
115+
src = (char*)aligned_src;
116+
}
117+
else
118+
{
119+
/*
120+
* Fetch source words and then merge two of them for each
121+
* dest word:
122+
*
123+
* byte_shift remain
124+
* | | |
125+
* xxxxxxxL RRRRRRRy
126+
* D DDDDDDD
127+
*
128+
* We don't want to fetch past the source at all, so stop
129+
* when fewer than LITTLEBLOCKSIZE + 'remain' bytes are left
130+
*/
131+
132+
/* bytes used from the left word */
133+
int remain = LITTLEBLOCKSIZE - byte_shift;
134+
/* bit shifts for the left and right words */
135+
int left_shift = byte_shift << 3;
136+
int right_shift = remain << 3;
137+
138+
aligned_src = (unsigned long*)(src - byte_shift);
139+
unsigned long left = *aligned_src++, right;
140+
141+
while (len0 >= LITTLEBLOCKSIZE + remain) {
142+
right = *aligned_src++;
143+
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
144+
*aligned_dst++ = (left >> left_shift) | (right << right_shift);
145+
#else
146+
*aligned_dst++ = (left << left_shift) | (right >> right_shift);
147+
#endif
148+
left = right;
149+
len0 -= LITTLEBLOCKSIZE;
150+
}
151+
src = (char *)aligned_src - remain;
152+
}
153+
/* Pick up any residual with a byte copier. */
154+
dst = (char*)aligned_dst;
155+
}
156+
#endif
117157

118158
while (len0--)
119159
*dst++ = *src++;
120160

121161
return dst0;
122-
#endif /* not PREFER_SIZE_OVER_SPEED */
123162
}

0 commit comments

Comments
 (0)