Commit 2d817de

wyr-7 authored and xiaoxiang781216 committed

armv8m/strcpy: add arch optimized version

1 parent abfb7da · commit 2d817de

File tree

3 files changed: +319 -0 lines changed

libs/libc/machine/arm/armv8-m/Kconfig
libs/libc/machine/arm/armv8-m/Make.defs
libs/libc/machine/arm/armv8-m/gnu/arch_strcpy.S
libs/libc/machine/arm/armv8-m/Kconfig

Lines changed: 9 additions & 0 deletions
@@ -14,6 +14,7 @@ config ARMV8M_STRING_FUNCTION
 	select ARMV8M_MEMSET
 	select ARMV8M_MEMMOVE
 	select ARMV8M_STRCMP
+	select ARMV8M_STRCPY
 	select ARMV8M_STRLEN

 config ARMV8M_MEMCHR
@@ -56,6 +57,14 @@ config ARMV8M_STRCMP
 	---help---
 		Enable optimized ARMv8-M specific strcmp() library function

+config ARMV8M_STRCPY
+	bool "Enable optimized strcpy() for ARMv8-M"
+	default n
+	select LIBC_ARCH_STRCPY
+	depends on ARCH_TOOLCHAIN_GNU
+	---help---
+		Enable optimized ARMv8-M specific strcpy() library function
+
 config ARMV8M_STRLEN
 	bool "Enable optimized strlen() for ARMv8-M"
 	default n
libs/libc/machine/arm/armv8-m/Make.defs

Lines changed: 4 additions & 0 deletions
@@ -42,6 +42,10 @@ ifeq ($(CONFIG_ARMV8M_STRCMP),y)
 ASRCS += arch_strcmp.S
 endif

+ifeq ($(CONFIG_ARMV8M_STRCPY),y)
+ASRCS += arch_strcpy.S
+endif
+
 ifeq ($(CONFIG_ARMV8M_STRLEN),y)
 ASRCS += arch_strlen.S
 endif
libs/libc/machine/arm/armv8-m/gnu/arch_strcpy.S

Lines changed: 306 additions & 0 deletions

@@ -0,0 +1,306 @@
/***************************************************************************
 * libs/libc/machine/arm/armv8-m/gnu/arch_strcpy.S
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.  The
 * ASF licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the
 * License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 ***************************************************************************/

/* This strcpy implementation borrows some ideas from arch_strcmp.S. */

/* Parameters and result. */

#define dst     r0
#define src     r1
#define result  r0

/* Internal variables (callee-saved registers) */

#define tmp1        r4
#define tmp2        r5
#define tmp3        r6
#define src_offset  r7

#ifdef __ARM_BIG_ENDIAN
#  define MASK_0        0xff000000
#  define MASK_1        0x00ff0000
#  define MASK_2        0x0000ff00
#  define MASK_3        0x000000ff
#  define BYTE_0_SHIFT  24
#  define BYTE_1_SHIFT  16
#  define BYTE_2_SHIFT  8
#  define BYTE_3_SHIFT  0
#else
#  define MASK_0        0x000000ff
#  define MASK_1        0x0000ff00
#  define MASK_2        0x00ff0000
#  define MASK_3        0xff000000
#  define BYTE_0_SHIFT  0
#  define BYTE_1_SHIFT  8
#  define BYTE_2_SHIFT  16
#  define BYTE_3_SHIFT  24
#endif
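
/* MASK_i and BYTE_i_SHIFT together select byte i of a loaded word in
 * string (address) order on either endianness.  A rough C equivalent,
 * in the same #if 0 style as the pseudo code at the end of this file
 * (byte_i is an illustrative helper name, not part of this commit):
 */

#if 0
/* Extract the i-th string byte of a word w loaded from a word-aligned
 * address (illustrative sketch only).
 */
unsigned char byte_i(unsigned int w, int i)
{
  static const unsigned int mask[4]  = { MASK_0, MASK_1, MASK_2, MASK_3 };
  static const int          shift[4] = { BYTE_0_SHIFT, BYTE_1_SHIFT,
                                         BYTE_2_SHIFT, BYTE_3_SHIFT };

  return (unsigned char)((w & mask[i]) >> shift[i]);
}
#endif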

        .syntax  unified
        .text
        .align   2
        .global  strcpy
        .thumb
        .type    strcpy, %function

strcpy:
        push    {result, tmp1, tmp2, tmp3, src_offset}
        eor     tmp1, dst, src
        tst     tmp1, #3
        /* If dst and src are not at the same byte offset from a word
         * boundary, take the different-offset path.
         */
        bne     .Lstrs_diff_offset
        /* Same byte offset: get the offset within the word */
        ands    tmp1, src, #3
        beq     .Ldst_src_aligned
        /* Get the number of unaligned bytes */
        rsb     tmp1, #4

.Lbyte_copy_until_dst_src_aligned:
        ldrb    tmp2, [src], #1
        cmp     tmp2, #0
        beq     .Lcopy_done
        strb    tmp2, [dst], #1
        subs    tmp1, #1
        bne     .Lbyte_copy_until_dst_src_aligned

.Ldst_src_aligned:
        /* Now dst and src are both word aligned */
        ldr     tmp1, [src], #4
        sub     tmp2, tmp1, #0x01010101
        bic     tmp2, tmp1
        tst     tmp2, #0x80808080
        /* A zero result means no zero byte was detected */
        it      eq
        streq   tmp1, [dst], #4
        beq     .Ldst_src_aligned

        /* There is a zero byte in the word; copy byte by byte until zero */
        sub     src, #4
.Lbyte_copy_until_zero:
        ldrb    tmp2, [src], #1
        cmp     tmp2, #0
        beq     .Lcopy_done
        strb    tmp2, [dst], #1
        b       .Lbyte_copy_until_zero
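
/* The sub/bic/tst triple above is the classic "does this word contain a
 * zero byte" test: (w - 0x01010101) & ~w & 0x80808080 is non-zero exactly
 * when some byte of w is 0x00.  The subtraction sets the high bit of any
 * byte that borrows through zero, and the & ~w term filters out bytes
 * whose high bit was already set.  A C sketch of the same predicate
 * (has_zero_byte is an illustrative helper name):
 */

#if 0
/* Return non-zero iff any byte of w is zero (illustrative sketch). */
static inline unsigned int has_zero_byte(unsigned int w)
{
  return (w - 0x01010101u) & ~w & 0x80808080u;
}

/* e.g. has_zero_byte(0x41424344) == 0, has_zero_byte(0x41420044) != 0 */
#endif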

/* Make dst aligned, so we never write anything before dst.  If we
 * attempted to write before dst, an atomic read-modify-write would have
 * to be ensured, and atomic operations complicate things.  So the
 * solution here is byte-by-byte copy until dst is aligned.
 */
.Lstrs_diff_offset:
        ands    tmp1, dst, #3
        beq     .Ldiff_offset_loop_begin
        /* Get the number of unaligned dst bytes */
        rsb     tmp1, #4

.Lbyte_copy_until_dst_aligned:
        ldrb    tmp2, [src], #1
        cmp     tmp2, #0
        beq     .Lcopy_done
        strb    tmp2, [dst], #1
        subs    tmp1, #1
        bne     .Lbyte_copy_until_dst_aligned

.Ldiff_offset_loop_begin:
        /* src_offset must not be 0 here; in bits it is 8, 16 or 24 */
        and     src_offset, src, #3
        lsls    src_offset, #3
        bic     src, #3
        /* First word logic:
         * prepend 0xff bytes to make the algorithm simpler;
         * only the first word needs to be prepended.
         */
        ldr     tmp1, [src], #4
        mov     tmp2, #0xffffffff
        rsb     tmp3, src_offset, #32

#ifdef __ARM_BIG_ENDIAN
        lsls    tmp2, tmp3
#else
        lsrs    tmp2, tmp3
#endif
        orr     tmp1, tmp1, tmp2
        /* Test whether the first word contains a zero byte */
        sub     tmp3, tmp1, #0x01010101
        bic     tmp3, tmp1
        tst     tmp3, #0x80808080
        /* Non-zero means a zero byte was detected */
        bne     .Ltail_copy
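
/* Why prepend 0xff?  The first aligned load pulls in up to three bytes
 * that live before src.  Overwriting them with 0xff keeps the zero-byte
 * test honest (0xff can never look like a terminator), so the loop needs
 * no first-iteration special case.  A C sketch for the little-endian
 * branch, with src_offset already scaled to bits (first_word is an
 * illustrative helper name):
 */

#if 0
unsigned int first_word(const unsigned int *aligned_src,
                        unsigned int src_offset /* 8, 16 or 24 */)
{
  unsigned int w    = *aligned_src;
  unsigned int junk = 0xffffffffu >> (32 - src_offset);

  return w | junk;  /* the bytes before src become 0xff */
}
#endif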

        /* Before the loop, set tmp2 = tmp1 to simplify the loop logic */
        mov     tmp2, tmp1
.Ldiff_offset_loop:
        mov     tmp1, tmp2
        ldr     tmp2, [src], #4
        /* Test whether the new word contains a zero byte */
        sub     tmp3, tmp2, #0x01010101
        bic     tmp3, tmp2
        tst     tmp3, #0x80808080
        /* Non-zero means a zero byte was detected */
        bne     .Ltail_copy
        /* No zero byte: combine the two words and fill dst */
#ifdef __ARM_BIG_ENDIAN
        lsls    tmp1, src_offset
        rsb     tmp3, src_offset, #32
        lsrs    tmp3, tmp2, tmp3
        orr     tmp1, tmp1, tmp3
#else
        lsrs    tmp1, src_offset
        rsb     tmp3, src_offset, #32
        lsls    tmp3, tmp2, tmp3
        orr     tmp1, tmp1, tmp3
#endif
        str     tmp1, [dst], #4
        b       .Ldiff_offset_loop
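
/* One concrete loop step, little-endian, src_offset == 8: if memory from
 * the aligned base holds "abcdefgh", then tmp1 holds bytes a,b,c,d and
 * tmp2 holds e,f,g,h, and the combined store writes b,c,d,e to dst.  A C
 * sketch of the combine (illustrative helper, mirrors the pseudo code at
 * the end of this file):
 */

#if 0
unsigned int combine(unsigned int prev, unsigned int next,
                     unsigned int src_offset /* bits: 8, 16 or 24 */)
{
#ifdef __ARM_BIG_ENDIAN
  return (prev << src_offset) | (next >> (32 - src_offset));
#else
  return (prev >> src_offset) | (next << (32 - src_offset));
#endif
}
#endif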

/* Byte-by-byte copy of the tail bytes left in tmp1 (and possibly tmp2) */
.Ltail_copy:
        cmp     src_offset, #24
        beq     .Loffset_3
        cmp     src_offset, #16
        beq     .Loffset_2
        /* src_offset == 8 here */
        ands    tmp3, tmp1, MASK_1
        beq     .Lcopy_done
        lsrs    tmp3, BYTE_1_SHIFT
        strb    tmp3, [dst], #1
.Loffset_2:
        ands    tmp3, tmp1, MASK_2
        beq     .Lcopy_done
        lsrs    tmp3, BYTE_2_SHIFT
        strb    tmp3, [dst], #1
.Loffset_3:
        ands    tmp3, tmp1, MASK_3
        beq     .Lcopy_done
        lsrs    tmp3, BYTE_3_SHIFT
        strb    tmp3, [dst], #1
        ands    tmp3, tmp2, MASK_0
        beq     .Lcopy_done
        lsrs    tmp3, BYTE_0_SHIFT
        strb    tmp3, [dst], #1
        ands    tmp3, tmp2, MASK_1
        beq     .Lcopy_done
        lsrs    tmp3, BYTE_1_SHIFT
        strb    tmp3, [dst], #1
        ands    tmp3, tmp2, MASK_2
        beq     .Lcopy_done
        lsrs    tmp3, BYTE_2_SHIFT
        strb    tmp3, [dst], #1
        /* tmp2 byte 3 must be zero here */
.Lcopy_done:
        /* Write the terminating NUL */
        mov     tmp3, #0
        strb    tmp3, [dst]
        /* The entry push saved the original dst in result (r0), so the
         * pop restores it as the return value.
         */
        pop     {result, tmp1, tmp2, tmp3, src_offset}
        bx      lr

#if 0
/* Pseudo code for strcpy when dst/src are not at the same byte offset */

/* Make dst aligned, so we never write anything before dst.  If we
 * attempted to write before dst, an atomic read-modify-write would have
 * to be ensured, and atomic operations complicate things.  So the
 * solution here is byte-by-byte copy until dst is aligned.
 */
if ((dst & 3) == 0)
  goto diff_offset_loop_begin;
ByteCopyUntilDstAligned();

diff_offset_loop_begin:
/* src_offset mustn't be 0 here */
src_offset = src & 3;
src_offset = src_offset * 8;    /* scale to bits: 8, 16 or 24 */
src = src & 0xfffffffc;
tmp1 = *src;
src += 4;
/* First word logic:
 * prepend 0xff to make the algorithm simpler;
 * only the first word needs to be prepended.
 */
if (src_offset != 0)
  {
    tmp2 = 0xffffffff;
#if big endian
    tmp2 = tmp2 << (32 - src_offset);
#else
    tmp2 = tmp2 >> (32 - src_offset);
#endif
    tmp1 |= tmp2;
  }

if (HasZeroByte(tmp1))
  {
    goto tail_copy;
  }

/* Before the loop, set tmp2 = tmp1 to simplify the loop logic */
tmp2 = tmp1;
diff_offset_loop:
tmp1 = tmp2;
tmp2 = *src;
src += 4;

/* A zero byte in tmp2 means the tail must copy from both tmp1 and tmp2 */
if (HasZeroByte(tmp2))
  {
    goto tail_copy;
  }

/* Now let's fill dst */
#if big endian
tmp1 = tmp1 << src_offset;
tmp1 |= tmp2 >> (32 - src_offset);
#else
tmp1 = tmp1 >> src_offset;
tmp1 |= tmp2 << (32 - src_offset);
#endif
*dst = tmp1;
dst += 4;
goto diff_offset_loop;

/* Byte-by-byte copy at the tail */
tail_copy:
if (src_offset == 24)   /* src was 3 bytes into the word */
  goto offset_3;
if (src_offset == 16)   /* src was 2 bytes into the word */
  goto offset_2;

/* src_offset mustn't be 0 here, so the remaining case is 8 (1 byte) */
if ((tmp1 & MASK_1) == 0)
  goto cpy_done;
*dst++ = (tmp1 & MASK_1) >> BYTE_1_SHIFT;
offset_2:
if ((tmp1 & MASK_2) == 0)
  goto cpy_done;
*dst++ = (tmp1 & MASK_2) >> BYTE_2_SHIFT;
offset_3:
if ((tmp1 & MASK_3) == 0)
  goto cpy_done;
*dst++ = (tmp1 & MASK_3) >> BYTE_3_SHIFT;
if ((tmp2 & MASK_0) == 0)
  goto cpy_done;
*dst++ = (tmp2 & MASK_0) >> BYTE_0_SHIFT;
if ((tmp2 & MASK_1) == 0)
  goto cpy_done;
*dst++ = (tmp2 & MASK_1) >> BYTE_1_SHIFT;
if ((tmp2 & MASK_2) == 0)
  goto cpy_done;
*dst++ = (tmp2 & MASK_2) >> BYTE_2_SHIFT;
/* tmp2 byte 3 must be zero here */

cpy_done:
*dst = 0;
#endif /* Pseudo code end */
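
For review purposes, the interesting paths (same-offset vs. different-offset,
all four src/dst alignments, and a terminator landing in each byte of a word)
can be exercised with a harness like the minimal sketch below. It assumes the
routine above is linked in as strcpy on an ARMv8-M target (on a host build it
would just test the C library's strcpy); the buffer sizes and loop bounds are
arbitrary test choices, not part of the commit.

  #include <assert.h>
  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
    /* Word-aligned backing buffers; offsets create every alignment mix */
    static char src_buf[64] __attribute__((aligned(8)));
    static char dst_buf[64] __attribute__((aligned(8)));
    const char *sample = "abcdefghijklmnopqrstuvwxyz";

    for (int so = 0; so < 4; so++)          /* src alignment */
      for (int doff = 0; doff < 4; doff++)  /* dst alignment */
        for (int len = 0; len < 16; len++)  /* terminator position */
          {
            memset(src_buf, 'x', sizeof(src_buf));
            memset(dst_buf, 'y', sizeof(dst_buf));
            memcpy(src_buf + so, sample, len);
            src_buf[so + len] = '\0';

            char *ret = strcpy(dst_buf + doff, src_buf + so);

            assert(ret == dst_buf + doff);  /* must return original dst */
            assert(strcmp(dst_buf + doff, src_buf + so) == 0);
          }

    printf("all strcpy alignment cases passed\n");
    return 0;
  }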
