Skip to content

Commit 9805f39

Browse files
xry111 authored and zx2c4 committed
LoongArch: vDSO: Tune chacha implementation
As Christophe pointed out, tuning the chacha implementation by scheduling the instructions like what GCC does can improve the performance. The tuning does not introduce too much complexity (basically it's just reordering some instructions). And the tuning does not hurt readability too much: actually the tuned code looks even more similar to a textbook-style implementation based on 128-bit vectors. So overall it's a good deal to me. Tested with vdso_test_getchacha and benched with vdso_test_getrandom. On a LA664 the speedup is 5%, and I expect a larger speedup on LA[2-4]64 with a lower issue rate. Suggested-by: Christophe Leroy <[email protected]> Link: https://lore.kernel.org/all/[email protected]/ Signed-off-by: Xi Ruoyao <[email protected]> Reviewed-by: Huacai Chen <[email protected]> Signed-off-by: Jason A. Donenfeld <[email protected]>
1 parent 6ff2c29 commit 9805f39

File tree

1 file changed

+55
-37
lines changed

1 file changed

+55
-37
lines changed

arch/loongarch/vdso/vgetrandom-chacha.S

Lines changed: 55 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -9,23 +9,11 @@
99

1010
.text
1111

12-
/* Salsa20 quarter-round */
13-
.macro QR a b c d
14-
add.w \a, \a, \b
15-
xor \d, \d, \a
16-
rotri.w \d, \d, 16
17-
18-
add.w \c, \c, \d
19-
xor \b, \b, \c
20-
rotri.w \b, \b, 20
21-
22-
add.w \a, \a, \b
23-
xor \d, \d, \a
24-
rotri.w \d, \d, 24
25-
26-
add.w \c, \c, \d
27-
xor \b, \b, \c
28-
rotri.w \b, \b, 25
12+
.macro OP_4REG op d0 d1 d2 d3 s0 s1 s2 s3
13+
\op \d0, \d0, \s0
14+
\op \d1, \d1, \s1
15+
\op \d2, \d2, \s2
16+
\op \d3, \d3, \s3
2917
.endm
3018

3119
/*
@@ -74,6 +62,23 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
7462
/* Reuse i as copy3 */
7563
#define copy3 i
7664

65+
/* Packs to be used with OP_4REG */
66+
#define line0 state0, state1, state2, state3
67+
#define line1 state4, state5, state6, state7
68+
#define line2 state8, state9, state10, state11
69+
#define line3 state12, state13, state14, state15
70+
71+
#define line1_perm state5, state6, state7, state4
72+
#define line2_perm state10, state11, state8, state9
73+
#define line3_perm state15, state12, state13, state14
74+
75+
#define copy copy0, copy1, copy2, copy3
76+
77+
#define _16 16, 16, 16, 16
78+
#define _20 20, 20, 20, 20
79+
#define _24 24, 24, 24, 24
80+
#define _25 25, 25, 25, 25
81+
7782
/*
7883
* The ABI requires s0-s9 saved, and sp aligned to 16-byte.
7984
* This does not violate the stack-less requirement: no sensitive data
@@ -126,16 +131,38 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
126131
li.w i, 10
127132
.Lpermute:
128133
/* odd round */
129-
QR state0, state4, state8, state12
130-
QR state1, state5, state9, state13
131-
QR state2, state6, state10, state14
132-
QR state3, state7, state11, state15
134+
OP_4REG add.w line0, line1
135+
OP_4REG xor line3, line0
136+
OP_4REG rotri.w line3, _16
137+
138+
OP_4REG add.w line2, line3
139+
OP_4REG xor line1, line2
140+
OP_4REG rotri.w line1, _20
141+
142+
OP_4REG add.w line0, line1
143+
OP_4REG xor line3, line0
144+
OP_4REG rotri.w line3, _24
145+
146+
OP_4REG add.w line2, line3
147+
OP_4REG xor line1, line2
148+
OP_4REG rotri.w line1, _25
133149

134150
/* even round */
135-
QR state0, state5, state10, state15
136-
QR state1, state6, state11, state12
137-
QR state2, state7, state8, state13
138-
QR state3, state4, state9, state14
151+
OP_4REG add.w line0, line1_perm
152+
OP_4REG xor line3_perm, line0
153+
OP_4REG rotri.w line3_perm, _16
154+
155+
OP_4REG add.w line2_perm, line3_perm
156+
OP_4REG xor line1_perm, line2_perm
157+
OP_4REG rotri.w line1_perm, _20
158+
159+
OP_4REG add.w line0, line1_perm
160+
OP_4REG xor line3_perm, line0
161+
OP_4REG rotri.w line3_perm, _24
162+
163+
OP_4REG add.w line2_perm, line3_perm
164+
OP_4REG xor line1_perm, line2_perm
165+
OP_4REG rotri.w line1_perm, _25
139166

140167
addi.w i, i, -1
141168
bnez i, .Lpermute
@@ -147,10 +174,7 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
147174
li.w copy3, 0x6b206574
148175

149176
/* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
150-
add.w state0, state0, copy0
151-
add.w state1, state1, copy1
152-
add.w state2, state2, copy2
153-
add.w state3, state3, copy3
177+
OP_4REG add.w line0, copy
154178
st.w state0, output, 0
155179
st.w state1, output, 4
156180
st.w state2, output, 8
@@ -165,10 +189,7 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
165189
ld.w state3, key, 12
166190

167191
/* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
168-
add.w state4, state4, state0
169-
add.w state5, state5, state1
170-
add.w state6, state6, state2
171-
add.w state7, state7, state3
192+
OP_4REG add.w line1, line0
172193
st.w state4, output, 16
173194
st.w state5, output, 20
174195
st.w state6, output, 24
@@ -181,10 +202,7 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
181202
ld.w state3, key, 28
182203

183204
/* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */
184-
add.w state8, state8, state0
185-
add.w state9, state9, state1
186-
add.w state10, state10, state2
187-
add.w state11, state11, state3
205+
OP_4REG add.w line2, line0
188206
st.w state8, output, 32
189207
st.w state9, output, 36
190208
st.w state10, output, 40

0 commit comments

Comments
 (0)