Skip to content

Commit a275a82

Browse files
committed
LoongArch: Use alternative to optimize libraries
Use the alternative mechanism to optimize common library routines according to whether the CPU has the UAL (hardware unaligned access support) feature, including memset(), memcpy(), memmove(), copy_user() and clear_user().

We have tested UnixBench on a Loongson-3A5000 quad-core machine (1.6GHz):

1, One copy, before patch:

System Benchmarks Index Values           BASELINE        RESULT     INDEX
Dhrystone 2 using register variables     116700.0     9566582.0     819.8
Double-Precision Whetstone                   55.0        2805.3     510.1
Execl Throughput                             43.0        2120.0     493.0
File Copy 1024 bufsize 2000 maxblocks      3960.0      209833.0     529.9
File Copy 256 bufsize 500 maxblocks        1655.0       89400.0     540.2
File Copy 4096 bufsize 8000 maxblocks      5800.0      320036.0     551.8
Pipe Throughput                           12440.0      340624.0     273.8
Pipe-based Context Switching               4000.0      109939.1     274.8
Process Creation                            126.0        4728.7     375.3
Shell Scripts (1 concurrent)                 42.4        2223.1     524.3
Shell Scripts (8 concurrent)                  6.0         883.1    1471.9
System Call Overhead                      15000.0      518639.1     345.8
                                                                 ========
System Benchmarks Index Score                                       500.2

2, One copy, after patch:

System Benchmarks Index Values           BASELINE        RESULT     INDEX
Dhrystone 2 using register variables     116700.0     9567674.7     819.9
Double-Precision Whetstone                   55.0        2805.5     510.1
Execl Throughput                             43.0        2392.7     556.4
File Copy 1024 bufsize 2000 maxblocks      3960.0      417804.0    1055.1
File Copy 256 bufsize 500 maxblocks        1655.0      112909.5     682.2
File Copy 4096 bufsize 8000 maxblocks      5800.0     1255207.4    2164.2
Pipe Throughput                           12440.0      555712.0     446.7
Pipe-based Context Switching               4000.0       99964.5     249.9
Process Creation                            126.0        5192.5     412.1
Shell Scripts (1 concurrent)                 42.4        2302.4     543.0
Shell Scripts (8 concurrent)                  6.0         919.6    1532.6
System Call Overhead                      15000.0      511159.3     340.8
                                                                 ========
System Benchmarks Index Score                                       640.1

3, Four copies, before patch:

System Benchmarks Index Values           BASELINE        RESULT     INDEX
Dhrystone 2 using register variables     116700.0    38268610.5    3279.2
Double-Precision Whetstone                   55.0       11222.2    2040.4
Execl Throughput                             43.0        7892.0    1835.3
File Copy 1024 bufsize 2000 maxblocks      3960.0      235149.6     593.8
File Copy 256 bufsize 500 maxblocks        1655.0       74959.6     452.9
File Copy 4096 bufsize 8000 maxblocks      5800.0      545048.5     939.7
Pipe Throughput                           12440.0     1337359.0    1075.0
Pipe-based Context Switching               4000.0      473663.9    1184.2
Process Creation                            126.0       17491.2    1388.2
Shell Scripts (1 concurrent)                 42.4        6865.7    1619.3
Shell Scripts (8 concurrent)                  6.0        1015.9    1693.1
System Call Overhead                      15000.0     1899535.2    1266.4
                                                                 ========
System Benchmarks Index Score                                      1278.3

4, Four copies, after patch:

System Benchmarks Index Values           BASELINE        RESULT     INDEX
Dhrystone 2 using register variables     116700.0    38272815.5    3279.6
Double-Precision Whetstone                   55.0       11222.8    2040.5
Execl Throughput                             43.0        8839.2    2055.6
File Copy 1024 bufsize 2000 maxblocks      3960.0      313912.9     792.7
File Copy 256 bufsize 500 maxblocks        1655.0       80976.1     489.3
File Copy 4096 bufsize 8000 maxblocks      5800.0     1176594.3    2028.6
Pipe Throughput                           12440.0     2100941.9    1688.9
Pipe-based Context Switching               4000.0      476696.4    1191.7
Process Creation                            126.0       18394.7    1459.9
Shell Scripts (1 concurrent)                 42.4        7172.2    1691.6
Shell Scripts (8 concurrent)                  6.0        1058.3    1763.9
System Call Overhead                      15000.0     1874714.7    1249.8
                                                                 ========
System Benchmarks Index Score                                      1488.8

Signed-off-by: Jun Yi <[email protected]>
Signed-off-by: Huacai Chen <[email protected]>
1 parent 19e5eb1 commit a275a82

File tree

7 files changed

+465
-11
lines changed

7 files changed

+465
-11
lines changed

arch/loongarch/include/asm/string.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,13 @@
55
#ifndef _ASM_STRING_H
66
#define _ASM_STRING_H
77

8+
#define __HAVE_ARCH_MEMSET
89
extern void *memset(void *__s, int __c, size_t __count);
10+
11+
#define __HAVE_ARCH_MEMCPY
912
extern void *memcpy(void *__to, __const__ void *__from, size_t __n);
13+
14+
#define __HAVE_ARCH_MEMMOVE
1015
extern void *memmove(void *__dest, __const__ void *__src, size_t __n);
1116

1217
#endif /* _ASM_STRING_H */

arch/loongarch/lib/Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,5 @@
33
# Makefile for LoongArch-specific library files.
44
#
55

6-
lib-y += delay.o clear_user.o copy_user.o dump_tlb.o unaligned.o
6+
lib-y += delay.o memset.o memcpy.o memmove.o \
7+
clear_user.o copy_user.o dump_tlb.o unaligned.o

arch/loongarch/lib/clear_user.S

Lines changed: 65 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,25 +3,37 @@
33
* Copyright (C) 2020-2022 Loongson Technology Corporation Limited
44
*/
55

6+
#include <asm/alternative-asm.h>
67
#include <asm/asm.h>
78
#include <asm/asmmacro.h>
89
#include <asm/asm-extable.h>
10+
#include <asm/cpu.h>
911
#include <asm/export.h>
1012
#include <asm/regdef.h>
1113

12-
.irp to, 0
14+
.irp to, 0, 1, 2, 3, 4, 5, 6, 7
1315
.L_fixup_handle_\to\():
1416
addi.d a0, a1, (\to) * (-8)
1517
jr ra
1618
.endr
1719

20+
SYM_FUNC_START(__clear_user)
	/*
	 * Entry stub consisting of a single branch wrapped in ALTERNATIVE.
	 * The default instruction branches to __clear_user_generic; the
	 * replacement, keyed on CPU_FEATURE_UAL (hardware unaligned access
	 * support), branches to __clear_user_fast.  a0/a1 are not touched
	 * here, so the chosen routine sees the caller's arguments.
	 */
	ALTERNATIVE	"b __clear_user_generic",	\
			"b __clear_user_fast",		\
			CPU_FEATURE_UAL
SYM_FUNC_END(__clear_user)
27+
28+
EXPORT_SYMBOL(__clear_user)
29+
1830
/*
19-
* unsigned long __clear_user(void *addr, size_t size)
31+
* unsigned long __clear_user_generic(void *addr, size_t size)
2032
*
2133
* a0: addr
2234
* a1: size
2335
*/
24-
SYM_FUNC_START(__clear_user)
36+
SYM_FUNC_START(__clear_user_generic)
2537
beqz a1, 2f
2638

2739
1: st.b zero, a0, 0
@@ -33,6 +45,54 @@ SYM_FUNC_START(__clear_user)
3345
jr ra
3446

3547
_asm_extable 1b, .L_fixup_handle_0
36-
SYM_FUNC_END(__clear_user)
48+
SYM_FUNC_END(__clear_user_generic)
3749

38-
EXPORT_SYMBOL(__clear_user)
50+
/*
 * unsigned long __clear_user_fast(void *addr, unsigned long size)
 *
 * Zero 'size' bytes starting at 'addr', using 8-byte stores for the bulk
 * and single-byte stores for the tail.
 *
 * In:   a0 = addr
 *       a1 = size
 * Out:  a0 = remaining byte count (0 when the whole range was written)
 * Uses: a2 (holds the constant 64)
 *
 * Every store carries an exception-table entry.  The .L_fixup_handle_N
 * stubs defined earlier in this file return a1 - N * 8, i.e. the count at
 * the start of the current 64-byte pass minus the N doublewords that were
 * stored before the faulting one.
 */
SYM_FUNC_START(__clear_user_fast)
	beqz	a1, 10f			/* size == 0: nothing to write */

	ori	a2, zero, 64
	blt	a1, a2, 9f		/* under one 64-byte pass: tail only */

	/*
	 * Bulk loop, 64 bytes per pass.  a0 and a1 are advanced only after
	 * all eight stores, so a fault in slot N still sees the a1 value
	 * from the start of the pass.  The .irp expands to eight st.d
	 * instructions at offsets 0, 8, ..., 56, each paired with
	 * .L_fixup_handle_<slot>.
	 */
1:
	.irp	slot, 0, 1, 2, 3, 4, 5, 6, 7
100:	st.d	zero, a0, (\slot) * 8
	_asm_extable 100b, .L_fixup_handle_\slot
	.endr

	addi.d	a1, a1, -64
	addi.d	a0, a0, 64
	bge	a1, a2, 1b

	beqz	a1, 10f			/* size was a multiple of 64 */

	/* Tail: one byte per pass while a1 is still positive. */
9:	st.b	zero, a0, 0
	_asm_extable 9b, .L_fixup_handle_0
	addi.d	a1, a1, -1
	addi.d	a0, a0, 1
	blt	zero, a1, 9b

	/* Return the remaining count held in a1. */
10:	move	a0, a1
	jr	ra
SYM_FUNC_END(__clear_user_fast)

arch/loongarch/lib/copy_user.S

Lines changed: 86 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,26 +3,38 @@
33
* Copyright (C) 2020-2022 Loongson Technology Corporation Limited
44
*/
55

6+
#include <asm/alternative-asm.h>
67
#include <asm/asm.h>
78
#include <asm/asmmacro.h>
89
#include <asm/asm-extable.h>
10+
#include <asm/cpu.h>
911
#include <asm/export.h>
1012
#include <asm/regdef.h>
1113

12-
.irp to, 0
14+
.irp to, 0, 1, 2, 3, 4, 5, 6, 7
1315
.L_fixup_handle_\to\():
1416
addi.d a0, a2, (\to) * (-8)
1517
jr ra
1618
.endr
1719

20+
SYM_FUNC_START(__copy_user)
	/*
	 * Entry stub consisting of a single branch wrapped in ALTERNATIVE.
	 * The default instruction branches to __copy_user_generic; the
	 * replacement, keyed on CPU_FEATURE_UAL (hardware unaligned access
	 * support), branches to __copy_user_fast.  a0-a2 are not touched
	 * here, so the chosen routine sees the caller's arguments.
	 */
	ALTERNATIVE	"b __copy_user_generic",	\
			"b __copy_user_fast",		\
			CPU_FEATURE_UAL
SYM_FUNC_END(__copy_user)
27+
28+
EXPORT_SYMBOL(__copy_user)
29+
1830
/*
19-
* unsigned long __copy_user(void *to, const void *from, size_t n)
31+
* unsigned long __copy_user_generic(void *to, const void *from, size_t n)
2032
*
2133
* a0: to
2234
* a1: from
2335
* a2: n
2436
*/
25-
SYM_FUNC_START(__copy_user)
37+
SYM_FUNC_START(__copy_user_generic)
2638
beqz a2, 3f
2739

2840
1: ld.b t0, a1, 0
@@ -37,6 +49,75 @@ SYM_FUNC_START(__copy_user)
3749

3850
_asm_extable 1b, .L_fixup_handle_0
3951
_asm_extable 2b, .L_fixup_handle_0
40-
SYM_FUNC_END(__copy_user)
52+
SYM_FUNC_END(__copy_user_generic)
4153

42-
EXPORT_SYMBOL(__copy_user)
54+
/*
 * unsigned long __copy_user_fast(void *to, const void *from, unsigned long n)
 *
 * Copy 'n' bytes from 'from' to 'to', 64 bytes per pass with 8-byte
 * loads/stores, then byte by byte for the tail.
 *
 * In:   a0 = to
 *       a1 = from
 *       a2 = n
 * Out:  a0 = number of bytes not copied (0 when everything was copied)
 * Uses: a3 (holds the constant 64), t0-t7 (data in flight)
 *
 * Fault handling: every load and store has an exception-table entry.
 * The .L_fixup_handle_N stubs defined earlier in this file return
 * a2 - N * 8.  Within one bulk pass a0/a1/a2 are advanced only after the
 * last store, so a2 still holds the count from the start of the pass:
 *
 *   - a fault on any of the eight loads happens before the first store
 *     of the pass, so nothing from this pass has reached the destination
 *     and the result must be a2 itself (.L_fixup_handle_0);
 *   - a fault on store N (0..7) happens after N doublewords were stored,
 *     so the result is a2 - N * 8 (.L_fixup_handle_N).
 */
SYM_FUNC_START(__copy_user_fast)
	beqz	a2, 19f			/* n == 0: nothing to copy */

	ori	a3, zero, 64
	blt	a2, a3, 17f		/* under one 64-byte pass: tail only */

	/* copy 64 bytes at a time: all loads first, then all stores */
1:	ld.d	t0, a1, 0
2:	ld.d	t1, a1, 8
3:	ld.d	t2, a1, 16
4:	ld.d	t3, a1, 24
5:	ld.d	t4, a1, 32
6:	ld.d	t5, a1, 40
7:	ld.d	t6, a1, 48
8:	ld.d	t7, a1, 56
9:	st.d	t0, a0, 0
10:	st.d	t1, a0, 8
11:	st.d	t2, a0, 16
12:	st.d	t3, a0, 24
13:	st.d	t4, a0, 32
14:	st.d	t5, a0, 40
15:	st.d	t6, a0, 48
16:	st.d	t7, a0, 56

	addi.d	a0, a0, 64
	addi.d	a1, a1, 64
	addi.d	a2, a2, -64
	bge	a2, a3, 1b

	beqz	a2, 19f			/* n was a multiple of 64 */

	/* copy the remaining bytes */
17:	ld.b	t0, a1, 0
18:	st.b	t0, a0, 0
	addi.d	a0, a0, 1
	addi.d	a1, a1, 1
	addi.d	a2, a2, -1
	bgt	a2, zero, 17b

	/* return the remaining count held in a2 */
19:	move	a0, a2
	jr	ra

	/*
	 * Bulk loads: no byte of the current pass has been stored yet, so
	 * every load fault reports the full a2 as uncopied.
	 */
	_asm_extable 1b, .L_fixup_handle_0
	_asm_extable 2b, .L_fixup_handle_0
	_asm_extable 3b, .L_fixup_handle_0
	_asm_extable 4b, .L_fixup_handle_0
	_asm_extable 5b, .L_fixup_handle_0
	_asm_extable 6b, .L_fixup_handle_0
	_asm_extable 7b, .L_fixup_handle_0
	_asm_extable 8b, .L_fixup_handle_0

	/* Bulk stores: store N faults after N doublewords were written. */
	_asm_extable 9b, .L_fixup_handle_0
	_asm_extable 10b, .L_fixup_handle_1
	_asm_extable 11b, .L_fixup_handle_2
	_asm_extable 12b, .L_fixup_handle_3
	_asm_extable 13b, .L_fixup_handle_4
	_asm_extable 14b, .L_fixup_handle_5
	_asm_extable 15b, .L_fixup_handle_6
	_asm_extable 16b, .L_fixup_handle_7

	/* Byte tail: a2 is decremented only after a byte is fully copied. */
	_asm_extable 17b, .L_fixup_handle_0
	_asm_extable 18b, .L_fixup_handle_0
SYM_FUNC_END(__copy_user_fast)

arch/loongarch/lib/memcpy.S

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
/* SPDX-License-Identifier: GPL-2.0 */
2+
/*
3+
* Copyright (C) 2020-2022 Loongson Technology Corporation Limited
4+
*/
5+
6+
#include <asm/alternative-asm.h>
7+
#include <asm/asm.h>
8+
#include <asm/asmmacro.h>
9+
#include <asm/cpu.h>
10+
#include <asm/export.h>
11+
#include <asm/regdef.h>
12+
13+
SYM_FUNC_START(memcpy)
	/*
	 * Entry stub consisting of a single branch wrapped in ALTERNATIVE.
	 * The default instruction branches to __memcpy_generic; the
	 * replacement, keyed on CPU_FEATURE_UAL (hardware unaligned access
	 * support), branches to __memcpy_fast.  a0-a2 are not touched here,
	 * so the chosen routine sees the caller's arguments.
	 */
	ALTERNATIVE	"b __memcpy_generic",	\
			"b __memcpy_fast",	\
			CPU_FEATURE_UAL
SYM_FUNC_END(memcpy)
20+
21+
EXPORT_SYMBOL(memcpy)
22+
23+
/*
 * void *__memcpy_generic(void *dst, const void *src, size_t n)
 *
 * Byte-at-a-time copy that uses only ld.b/st.b.
 *
 * In:   a0 = dst
 *       a1 = src
 *       a2 = n
 * Out:  a0 = dst (the value passed in)
 * Uses: a3 (saved dst), t0 (byte in flight)
 */
SYM_FUNC_START(__memcpy_generic)
	move	a3, a0			/* keep dst for the return value */
	beqz	a2, .Lgeneric_done	/* n == 0: nothing to copy */

.Lgeneric_byte:
	ld.b	t0, a1, 0
	st.b	t0, a0, 0
	addi.d	a2, a2, -1
	addi.d	a1, a1, 1
	addi.d	a0, a0, 1
	blt	zero, a2, .Lgeneric_byte	/* loop while count > 0 */

.Lgeneric_done:
	move	a0, a3
	jr	ra
SYM_FUNC_END(__memcpy_generic)
44+
45+
/*
 * void *__memcpy_fast(void *dst, const void *src, size_t n)
 *
 * Copy in 64-byte passes of 8-byte loads/stores, then finish the tail one
 * byte at a time.  No alignment check is made on dst or src before the
 * ld.d/st.d accesses.
 *
 * In:   a0 = dst
 *       a1 = src
 *       a2 = n
 * Out:  a0 = dst (the value passed in)
 * Uses: a3 (saved dst), a4 (holds the constant 64), t0-t7 (data in flight)
 */
SYM_FUNC_START(__memcpy_fast)
	move	a3, a0			/* keep dst for the return value */
	beqz	a2, .Lfast_done		/* n == 0: nothing to copy */

	ori	a4, zero, 64
	blt	a2, a4, .Lfast_tail	/* under one 64-byte pass: tail only */

	/* One pass = eight doubleword loads followed by eight stores. */
.Lfast_chunk:
	ld.d	t0, a1, 0
	ld.d	t1, a1, 8
	ld.d	t2, a1, 16
	ld.d	t3, a1, 24
	ld.d	t4, a1, 32
	ld.d	t5, a1, 40
	ld.d	t6, a1, 48
	ld.d	t7, a1, 56
	st.d	t0, a0, 0
	st.d	t1, a0, 8
	st.d	t2, a0, 16
	st.d	t3, a0, 24
	st.d	t4, a0, 32
	st.d	t5, a0, 40
	st.d	t6, a0, 48
	st.d	t7, a0, 56

	addi.d	a2, a2, -64
	addi.d	a1, a1, 64
	addi.d	a0, a0, 64
	bge	a2, a4, .Lfast_chunk	/* at least 64 bytes still to go */

	beqz	a2, .Lfast_done		/* n was a multiple of 64 */

	/* Tail: one byte per pass while the count is still positive. */
.Lfast_tail:
	ld.b	t0, a1, 0
	st.b	t0, a0, 0
	addi.d	a2, a2, -1
	addi.d	a1, a1, 1
	addi.d	a0, a0, 1
	blt	zero, a2, .Lfast_tail

.Lfast_done:
	move	a0, a3
	jr	ra
SYM_FUNC_END(__memcpy_fast)

0 commit comments

Comments
 (0)