@@ -47,9 +47,9 @@ QUICKREF
4747#include "local.h"
4848#include <stdint.h>
4949
50- /* Nonzero if either X or Y is not aligned on a "long" boundary. */
51- #define UNALIGNED (X , Y ) \
52- (((uintptr_t)X & (sizeof (long) - 1)) | (( uintptr_t)Y & (sizeof (long) - 1)))
50+ /* Byte offset of X from the previous "long"-aligned boundary; zero if X is aligned. */
51+ #define UNALIGNED (X ) \
52+ ((( uintptr_t)X & (sizeof (long) - 1)))
5353
5454/* How many bytes are copied each iteration of the 4X unrolled loop. */
5555#define BIGBLOCKSIZE (sizeof (long) << 2)
@@ -68,56 +68,95 @@ memcpy (void *__restrict dst0,
6868 const void * __restrict src0 ,
6969 size_t len0 )
7070{
71- #if defined(PREFER_SIZE_OVER_SPEED ) || defined(__OPTIMIZE_SIZE__ )
72- char * dst = (char * ) dst0 ;
73- char * src = (char * ) src0 ;
74-
75- void * save = dst0 ;
76-
77- while (len0 -- )
78- {
79- * dst ++ = * src ++ ;
80- }
81-
82- return save ;
83- #else
8471 char * dst = dst0 ;
8572 const char * src = src0 ;
86- long * aligned_dst ;
87- const long * aligned_src ;
73+ #if !(defined(PREFER_SIZE_OVER_SPEED ) || defined(__OPTIMIZE_SIZE__ ))
74+ unsigned long * aligned_dst ;
75+ const unsigned long * aligned_src ;
8876
8977 /* If the size is small, punt into the byte copy loop. This should be
9078 rare. Unaligned SRC or DST is handled by the word copy below. */
91- if (!TOO_SMALL (len0 ) && !UNALIGNED (src , dst ))
92- {
93- aligned_dst = (long * )dst ;
94- aligned_src = (long * )src ;
95-
96- /* Copy 4X long words at a time if possible. */
97- while (len0 >= BIGBLOCKSIZE )
98- {
99- * aligned_dst ++ = * aligned_src ++ ;
100- * aligned_dst ++ = * aligned_src ++ ;
101- * aligned_dst ++ = * aligned_src ++ ;
102- * aligned_dst ++ = * aligned_src ++ ;
103- len0 -= BIGBLOCKSIZE ;
104- }
105-
106- /* Copy one long word at a time if possible. */
107- while (len0 >= LITTLEBLOCKSIZE )
108- {
109- * aligned_dst ++ = * aligned_src ++ ;
110- len0 -= LITTLEBLOCKSIZE ;
111- }
112-
113- /* Pick up any residual with a byte copier. */
114- dst = (char * )aligned_dst ;
115- src = (char * )aligned_src ;
116- }
79+ if (!TOO_SMALL (len0 ))
80+ {
81+ /*
82+ * Byte-copy until dst is long-aligned. More than LITTLEBLOCKSIZE bytes are
83+ * consumed, so the aligned fetch at src - byte_shift below never reads before src0.
84+ */
85+ unsigned start = LITTLEBLOCKSIZE * 2 - UNALIGNED (dst );
86+ while (start -- ) {
87+ * dst ++ = * src ++ ;
88+ len0 -- ;
89+ }
90+
91+ aligned_dst = (unsigned long * )dst ;
92+ int byte_shift = UNALIGNED (src );
93+
94+ if (!byte_shift )
95+ {
96+ aligned_src = (unsigned long * )src ;
97+
98+ /* Copy 4X long words at a time if possible. */
99+ while (len0 >= BIGBLOCKSIZE )
100+ {
101+ * aligned_dst ++ = * aligned_src ++ ;
102+ * aligned_dst ++ = * aligned_src ++ ;
103+ * aligned_dst ++ = * aligned_src ++ ;
104+ * aligned_dst ++ = * aligned_src ++ ;
105+ len0 -= BIGBLOCKSIZE ;
106+ }
107+
108+ /* Copy one long word at a time if possible. */
109+ while (len0 >= LITTLEBLOCKSIZE )
110+ {
111+ * aligned_dst ++ = * aligned_src ++ ;
112+ len0 -= LITTLEBLOCKSIZE ;
113+ }
114+
115+ src = (char * )aligned_src ;
116+ }
117+ else
118+ {
119+ /*
120+ * Fetch source words and then merge two of them for each
121+ * dest word:
122+ *
123+ * byte_shift remain
124+ * | | |
125+ * xxxxxxxL RRRRRRRy
126+ * D DDDDDDD
127+ *
128+ * We don't want to fetch past the source at all, so stop
129+ * when we have fewer than LITTLEBLOCKSIZE + 'remain' bytes left
130+ */
131+
132+ /* bytes used from the left word */
133+ int remain = LITTLEBLOCKSIZE - byte_shift ;
134+ /* bit shifts for the left and right words */
135+ int left_shift = byte_shift << 3 ;
136+ int right_shift = remain << 3 ;
137+
138+ aligned_src = (unsigned long * )(src - byte_shift );
139+ unsigned long left = * aligned_src ++ , right ;
140+
141+ while (len0 >= LITTLEBLOCKSIZE + remain ) {
142+ right = * aligned_src ++ ;
143+ #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
144+ * aligned_dst ++ = (left >> left_shift ) | (right << right_shift );
145+ #else
146+ * aligned_dst ++ = (left << left_shift ) | (right >> right_shift );
147+ #endif
148+ left = right ;
149+ len0 -= LITTLEBLOCKSIZE ;
150+ }
151+ src = (char * )aligned_src - remain ;
152+ }
153+ /* Pick up any residual with a byte copier. */
154+ dst = (char * )aligned_dst ;
155+ }
156+ #endif
117157
118158 while (len0 -- )
119159 * dst ++ = * src ++ ;
120160
121161 return dst0 ;
122- #endif /* not PREFER_SIZE_OVER_SPEED */
123162}
0 commit comments