66
77#include "../assembly.h"
88
9- #define L(l) .L ## l
10-
119//
1210// __arm_sc_memcpy / __arm_sc_memmove
1311//
5250 The loop tail is handled by always copying 64 bytes from the end.
5351* /
5452
55- DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED (__arm_sc_memcpy)
53+ DEFINE_COMPILERRT_FUNCTION (__arm_sc_memcpy)
5654 add srcend1 , src , count
5755 add dstend1 , dstin , count
5856 cmp count , 128
59- b.hi L( copy_long)
57+ b.hi 7f // copy_long
6058 cmp count , 32
61- b.hi L( copy32_128)
59+ b.hi 4f // copy32_128
6260
6361 / * Small copies: 0 .. 32 bytes. * /
6462 cmp count , 16
65- b.lo L( copy16)
63+ b.lo 0f // copy16
6664 ldp A_l , A_h , [ src ]
6765 ldp D_l , D_h , [ srcend1 , - 16 ]
6866 stp A_l , A_h , [ dstin ]
6967 stp D_l , D_h , [ dstend1 , - 16 ]
7068 ret
7169
7270 / * Copy 8 - 15 bytes. * /
73- L(copy16):
74- tbz count , 3 , L( copy8)
71+ 0 : // copy16
72+ tbz count , 3 , 1f // copy8
7573 ldr A_l , [ src ]
7674 ldr A_h , [ srcend1 , - 8 ]
7775 str A_l , [ dstin ]
@@ -80,36 +78,36 @@ L(copy16):
8078
8179 .p2align 3
8280 / * Copy 4 - 7 bytes. * /
83- L(copy8):
84- tbz count , 2 , L( copy4)
81+ 1 : // copy8
82+ tbz count , 2 , 2f // copy4
8583 ldr A_lw , [ src ]
8684 ldr B_lw , [ srcend1 , - 4 ]
8785 str A_lw , [ dstin ]
8886 str B_lw , [ dstend1 , - 4 ]
8987 ret
9088
9189 / * Copy 0 .. 3 bytes using a branchless sequence. * /
92- L(copy4):
93- cbz count , L( copy0)
90+ 2 : // copy4
91+ cbz count , 3f // copy0
9492 lsr tmp1 , count , 1
9593 ldrb A_lw , [ src ]
9694 ldrb C_lw , [ srcend1 , - 1 ]
9795 ldrb B_lw , [ src , tmp1 ]
9896 strb A_lw , [ dstin ]
9997 strb B_lw , [ dstin , tmp1 ]
10098 strb C_lw , [ dstend1 , - 1 ]
101- L(copy0):
99+ 3 : // copy0
102100 ret
103101
104102 .p2align 4
105103 / * Medium copies: 33 .. 128 bytes. * /
106- L(copy32_128):
104+ 4 : // copy32_128
107105 ldp A_l , A_h , [ src ]
108106 ldp B_l , B_h , [ src , 16 ]
109107 ldp C_l , C_h , [ srcend1 , - 32 ]
110108 ldp D_l , D_h , [ srcend1 , - 16 ]
111109 cmp count , 64
112- b.hi L( copy128)
110+ b.hi 5f // copy128
113111 stp A_l , A_h , [ dstin ]
114112 stp B_l , B_h , [ dstin , 16 ]
115113 stp C_l , C_h , [ dstend1 , - 32 ]
@@ -118,16 +116,16 @@ L(copy32_128):
118116
119117 .p2align 4
120118 / * Copy 65 .. 128 bytes. * /
121- L(copy128):
119+ 5 : // copy128
122120 ldp E_l , E_h , [ src , 32 ]
123121 ldp F_l , F_h , [ src , 48 ]
124122 cmp count , 96
125- b.ls L( copy96)
123+ b.ls 6f // copy96
126124 ldp G_l , G_h , [ srcend1 , - 64 ]
127125 ldp H_l , H_h , [ srcend1 , - 48 ]
128126 stp G_l , G_h , [ dstend1 , - 64 ]
129127 stp H_l , H_h , [ dstend1 , - 48 ]
130- L(copy96):
128+ 6 : // copy96
131129 stp A_l , A_h , [ dstin ]
132130 stp B_l , B_h , [ dstin , 16 ]
133131 stp E_l , E_h , [ dstin , 32 ]
@@ -138,12 +136,12 @@ L(copy96):
138136
139137 .p2align 4
140138 / * Copy more than 128 bytes. * /
141- L(copy_long):
139+ 7 : // copy_long
142140 / * Use backwards copy if there is an overlap. * /
143141 sub tmp1 , dstin , src
144- cbz tmp1 , L( copy0)
142+ cbz tmp1 , 3b // copy0
145143 cmp tmp1 , count
146- b.lo L( copy_long_backwards)
144+ b.lo 10f // copy_long_backwards
147145
148146 / * Copy 16 bytes and then align dst to 16 - byte alignment. * /
149147
@@ -158,8 +156,8 @@ L(copy_long):
158156 ldp C_l , C_h , [ src , 48 ]
159157 ldp D_l , D_h , [ src , 64 ] !
160158 subs count , count , 128 + 16 / * Test and readjust count. * /
161- b.ls L( copy64_from_end)
162- L(loop64):
159+ b.ls 9f // copy64_from_end
160+ 8 : // loop64
163161 stp A_l , A_h , [ dst , 16 ]
164162 ldp A_l , A_h , [ src , 16 ]
165163 stp B_l , B_h , [ dst , 32 ]
@@ -169,10 +167,10 @@ L(loop64):
169167 stp D_l , D_h , [ dst , 64 ] !
170168 ldp D_l , D_h , [ src , 64 ] !
171169 subs count , count , 64
172- b.hi L( loop64)
170+ b.hi 8b // loop64
173171
174172 / * Write the last iteration and copy 64 bytes from the end. * /
175- L(copy64_from_end):
173+ 9 : // copy64_from_end
176174 ldp E_l , E_h , [ srcend1 , - 64 ]
177175 stp A_l , A_h , [ dst , 16 ]
178176 ldp A_l , A_h , [ srcend1 , - 48 ]
@@ -191,7 +189,7 @@ L(copy64_from_end):
191189
192190 / * Large backwards copy for overlapping copies.
193191 Copy 16 bytes and then align dst to 16 - byte alignment. * /
194- L(copy_long_backwards):
192+ 10 : // copy_long_backwards
195193 ldp D_l , D_h , [ srcend1 , - 16 ]
196194 and tmp1 , dstend1 , 15
197195 sub srcend1 , srcend1 , tmp1
@@ -203,9 +201,9 @@ L(copy_long_backwards):
203201 ldp D_l , D_h , [ srcend1 , - 64 ] !
204202 sub dstend1 , dstend1 , tmp1
205203 subs count , count , 128
206- b.ls L( copy64_from_start)
204+ b.ls 12f // copy64_from_start
207205
208- L(loop64_backwards):
206+ 11 : // loop64_backwards
209207 stp A_l , A_h , [ dstend1 , - 16 ]
210208 ldp A_l , A_h , [ srcend1 , - 16 ]
211209 stp B_l , B_h , [ dstend1 , - 32 ]
@@ -215,10 +213,10 @@ L(loop64_backwards):
215213 stp D_l , D_h , [ dstend1 , - 64 ] !
216214 ldp D_l , D_h , [ srcend1 , - 64 ] !
217215 subs count , count , 64
218- b.hi L( loop64_backwards)
216+ b.hi 11b // loop64_backwards
219217
220218 / * Write the last iteration and copy 64 bytes from the start. * /
221- L(copy64_from_start):
219+ 12 : // copy64_from_start
222220 ldp G_l , G_h , [ src , 48 ]
223221 stp A_l , A_h , [ dstend1 , - 16 ]
224222 ldp A_l , A_h , [ src , 32 ]
@@ -232,7 +230,7 @@ L(copy64_from_start):
232230 stp B_l , B_h , [ dstin , 16 ]
233231 stp C_l , C_h , [ dstin ]
234232 ret
235- END_COMPILERRT_OUTLINE_FUNCTION (__arm_sc_memcpy)
233+ END_COMPILERRT_FUNCTION (__arm_sc_memcpy)
236234
237235DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove , __arm_sc_memcpy)
238236
@@ -250,7 +248,7 @@ DEFINE_COMPILERRT_FUNCTION_ALIAS(__arm_sc_memmove, __arm_sc_memcpy)
250248#define dstend2 x4
251249#define zva_val x5
252250
253- DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED (__arm_sc_memset)
251+ DEFINE_COMPILERRT_FUNCTION (__arm_sc_memset)
254252#ifdef __ARM_FEATURE_SVE
255253 mov z0.b , valw
256254#else
@@ -263,9 +261,9 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memset)
263261 add dstend2 , dstin , count
264262
265263 cmp count , 96
266- b.hi L( set_long)
264+ b.hi 7f // set_long
267265 cmp count , 16
268- b.hs L( set_medium)
266+ b.hs 4f // set_medium
269267 mov val , v0.D [ 0 ]
270268
271269 / * Set 0 .. 15 bytes. * /
@@ -285,38 +283,38 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_sc_memset)
2852833 : ret
286284
287285 / * Set 17 .. 96 bytes. * /
288- L(set_medium):
286+ 4 : // set_medium
289287 str q0 , [ dstin ]
290- tbnz count , 6 , L( set96)
288+ tbnz count , 6 , 6f // set96
291289 str q0 , [ dstend2 , - 16 ]
292- tbz count , 5 , 1f
290+ tbz count , 5 , 5f
293291 str q0 , [ dstin , 16 ]
294292 str q0 , [ dstend2 , - 32 ]
295- 1 : ret
293+ 5 : ret
296294
297295 .p2align 4
298296 / * Set 64 .. 96 bytes. Write 64 bytes from the start and
299297 32 bytes from the end. * /
300- L(set96):
298+ 6 : // set96
301299 str q0 , [ dstin , 16 ]
302300 stp q0 , q0 , [ dstin , 32 ]
303301 stp q0 , q0 , [ dstend2 , - 32 ]
304302 ret
305303
306304 .p2align 4
307- L(set_long):
305+ 7 : // set_long
308306 and valw , valw , 255
309307 bic dst , dstin , 15
310308 str q0 , [ dstin ]
311309 cmp count , 160
312310 ccmp valw , 0 , 0 , hs
313- b.ne L( no_zva)
311+ b.ne 9f // no_zva
314312
315313#ifndef SKIP_ZVA_CHECK
316314 mrs zva_val , dczid_el0
317315 and zva_val , zva_val , 31
318316 cmp zva_val , 4 / * ZVA size is 64 bytes. * /
319- b.ne L( no_zva)
317+ b.ne 9f // no_zva
320318#endif
321319 str q0 , [ dst , 16 ]
322320 stp q0 , q0 , [ dst , 32 ]
@@ -325,27 +323,27 @@ L(set_long):
325323 sub count , count , 128 / * Adjust count and bias for loop . * /
326324
327325 .p2align 4
328- L(zva_loop):
326+ 8 : // zva_loop
329327 add dst , dst , 64
330328 dc zva , dst
331329 subs count , count , 64
332- b.hi L( zva_loop)
330+ b.hi 8b // zva_loop
333331 stp q0 , q0 , [ dstend2 , - 64 ]
334332 stp q0 , q0 , [ dstend2 , - 32 ]
335333 ret
336334
337- L(no_zva):
335+ 9 : // no_zva
338336 sub count , dstend2 , dst / * Count is 16 too large. * /
339337 sub dst , dst , 16 / * Dst is biased by - 32 . * /
340338 sub count , count , 64 + 16 / * Adjust count and bias for loop . * /
341- L(no_zva_loop):
339+ 10 : // no_zva_loop
342340 stp q0 , q0 , [ dst , 32 ]
343341 stp q0 , q0 , [ dst , 64 ] !
344342 subs count , count , 64
345- b.hi L( no_zva_loop)
343+ b.hi 10b // no_zva_loop
346344 stp q0 , q0 , [ dstend2 , - 64 ]
347345 stp q0 , q0 , [ dstend2 , - 32 ]
348346 ret
349- END_COMPILERRT_OUTLINE_FUNCTION (__arm_sc_memset)
347+ END_COMPILERRT_FUNCTION (__arm_sc_memset)
350348
351349#endif // __aarch64__
0 commit comments