	.file	"musl_memcpy.c"
# memcpy_loop: copy the source in 16-byte chunks while at least 16 bytes
# remain.  Replaces the former custom-instruction encoding (.word 0x72 with
# the shift in the U-type immediate field) with plain RV32I code.
#
# In:       a3 = dst pointer
#           a4 = src pointer
#           a2 = remaining length in bytes
# Out:      a3, a4 advanced past every chunk copied; a2 reduced to < 16
# Clobbers: t0, t1, t2, t3, t4, t5, t6
#
# \shift is unused by this software implementation; the argument is kept so
# existing expansion sites (which passed the old immediate shift) still
# assemble unchanged.
#
# Numeric local labels (1/2/3) are used deliberately: they stay correct even
# if the macro is expanded more than once in the same file.
	.macro	memcpy_loop shift
	li	t0, 16			# chunk size — loop-invariant, load once
1:
	bltu	a2, t0, 3f		# while (len >= 16); unsigned compare

	# Fast path only if both src and dst are 4-byte aligned
	# (RV32I lw/sw require natural alignment).
	or	t2, a3, a4
	andi	t2, t2, 3
	bnez	t2, 2f			# either pointer misaligned -> bytewise

	# 16 bytes via four word load/store pairs
	lw	t3, 0(a4)
	lw	t4, 4(a4)
	lw	t5, 8(a4)
	lw	t6, 12(a4)
	sw	t3, 0(a3)
	sw	t4, 4(a3)
	sw	t5, 8(a3)
	sw	t6, 12(a3)
	addi	a4, a4, 16
	addi	a3, a3, 16
	addi	a2, a2, -16
	j	1b

2:	# Fallback: alignment-safe bytewise 16-byte copy
	lb	t1, 0(a4)
	sb	t1, 0(a3)
	lb	t1, 1(a4)
	sb	t1, 1(a3)
	lb	t1, 2(a4)
	sb	t1, 2(a3)
	lb	t1, 3(a4)
	sb	t1, 3(a3)
	lb	t1, 4(a4)
	sb	t1, 4(a3)
	lb	t1, 5(a4)
	sb	t1, 5(a3)
	lb	t1, 6(a4)
	sb	t1, 6(a3)
	lb	t1, 7(a4)
	sb	t1, 7(a3)
	lb	t1, 8(a4)
	sb	t1, 8(a3)
	lb	t1, 9(a4)
	sb	t1, 9(a3)
	lb	t1, 10(a4)
	sb	t1, 10(a3)
	lb	t1, 11(a4)
	sb	t1, 11(a3)
	lb	t1, 12(a4)
	sb	t1, 12(a3)
	lb	t1, 13(a4)
	sb	t1, 13(a3)
	lb	t1, 14(a4)
	sb	t1, 14(a3)
	lb	t1, 15(a4)
	sb	t1, 15(a3)
	addi	a4, a4, 16
	addi	a3, a3, 16
	addi	a2, a2, -16
	j	1b

3:
	.endm
	.globl	memcpy
	.p2align	2
	.type	memcpy,@function
# NOTE(review): the memcpy function body (the macro's expansion site) is
# omitted from this chunk — confirm it still passes dst/src/len in a3/a4/a2.
	.ident	"Ubuntu clang version 14.0.6-++20220622053131+f28c006a5895-1~exp1~20220622173215.157"
	.section	".note.GNU-stack","",@progbits
	.addrsig