99
1010.text
1111
12- /* ChaCha20 quarter-round */
13- .macro QR a b c d
14- add.w \a, \a, \b
15- xor \d, \d, \a
16- rotri.w \d, \d, 16
17-
18- add.w \c, \c, \d
19- xor \b, \b, \c
20- rotri.w \b, \b, 20
21-
22- add.w \a, \a, \b
23- xor \d, \d, \a
24- rotri.w \d, \d, 24
25-
26- add.w \c, \c, \d
27- xor \b, \b, \c
28- rotri.w \b, \b, 25
/*
 * OP_4REG op d0 d1 d2 d3 s0 s1 s2 s3
 *
 * Emit the three-operand instruction "op" four times, once per lane k:
 *
 *	op dk, dk, sk
 *
 * Each dk is both the destination and the first source operand. sk is the
 * last operand; the users in this file pass it either as a register (with
 * add.w and xor) or as an immediate (with rotri.w). The eight operands are
 * normally supplied as two four-element packs (line0, line1_perm, _16, ...).
 */
12+ .macro OP_4REG op d0 d1 d2 d3 s0 s1 s2 s3
13+ \op \d0 , \d0 , \s0
14+ \op \d1 , \d1 , \s1
15+ \op \d2 , \d2 , \s2
16+ \op \d3 , \d3 , \s3
2917.endm
3018
3119/*
@@ -74,6 +62,23 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
7462/* Reuse i as copy3 */
7563#define copy3 i
7664
65+ /* Packs to be used with OP_4REG */
/*
 * Each pack expands to four comma-separated operands, i.e. it fills one
 * group of four OP_4REG arguments (d0-d3 or s0-s3).
 *
 * lineN lists four consecutive state registers: state(4N) .. state(4N+3).
 */
66+ #define line0 state0, state1, state2, state3
67+ #define line1 state4, state5, state6, state7
68+ #define line2 state8, state9, state10, state11
69+ #define line3 state12, state13, state14, state15
70+
/*
 * Permuted packs: the same registers as line1, line2 and line3, but listed
 * starting from the second, third and fourth register respectively and
 * wrapping around. The "even round" code uses these in place of lineN.
 */
71+ #define line1_perm state5, state6, state7, state4
72+ #define line2_perm state10, state11, state8, state9
73+ #define line3_perm state15, state12, state13, state14
74+
/* The four copy registers as one pack */
75+ #define copy copy0, copy1, copy2, copy3
76+
/*
 * One immediate repeated four times, so that a single value can be applied
 * to every lane; used as the rotri.w rotate amounts.
 */
77+ #define _16 16 , 16 , 16 , 16
78+ #define _20 20 , 20 , 20 , 20
79+ #define _24 24 , 24 , 24 , 24
80+ #define _25 25 , 25 , 25 , 25
81+
7782 /*
7883 * The ABI requires s0-s9 saved, and sp aligned to 16-byte.
7984 * This does not violate the stack-less requirement: no sensitive data
@@ -126,16 +131,38 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
126131 li.w i, 10
127132.Lpermute:
128133 /* odd round */
129- QR state0, state4, state8, state12
130- QR state1, state5, state9, state13
131- QR state2, state6, state10, state14
132- QR state3, state7, state11, state15
134+ OP_4REG add.w line0, line1
135+ OP_4REG xor line3, line0
136+ OP_4REG rotri.w line3, _16
137+
138+ OP_4REG add.w line2, line3
139+ OP_4REG xor line1, line2
140+ OP_4REG rotri.w line1, _20
141+
142+ OP_4REG add.w line0, line1
143+ OP_4REG xor line3, line0
144+ OP_4REG rotri.w line3, _24
145+
146+ OP_4REG add.w line2, line3
147+ OP_4REG xor line1, line2
148+ OP_4REG rotri.w line1, _25
133149
134150 /* even round */
135- QR state0, state5, state10, state15
136- QR state1, state6, state11, state12
137- QR state2, state7, state8, state13
138- QR state3, state4, state9, state14
151+ OP_4REG add.w line0, line1_perm
152+ OP_4REG xor line3_perm, line0
153+ OP_4REG rotri.w line3_perm, _16
154+
155+ OP_4REG add.w line2_perm, line3_perm
156+ OP_4REG xor line1_perm, line2_perm
157+ OP_4REG rotri.w line1_perm, _20
158+
159+ OP_4REG add.w line0, line1_perm
160+ OP_4REG xor line3_perm, line0
161+ OP_4REG rotri.w line3_perm, _24
162+
163+ OP_4REG add.w line2_perm, line3_perm
164+ OP_4REG xor line1_perm, line2_perm
165+ OP_4REG rotri.w line1_perm, _25
139166
140167 addi.w i, i, -1
141168 bnez i, .Lpermute
@@ -147,10 +174,7 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
147174 li.w copy3, 0x6b206574
148175
149176 /* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
150- add.w state0, state0, copy0
151- add.w state1, state1, copy1
152- add.w state2, state2, copy2
153- add.w state3, state3, copy3
177+ OP_4REG add.w line0, copy
154178 st .w state0, output, 0
155179 st .w state1, output, 4
156180 st .w state2, output, 8
@@ -165,10 +189,7 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
165189 ld.w state3, key, 12
166190
167191 /* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
168- add.w state4, state4, state0
169- add.w state5, state5, state1
170- add.w state6, state6, state2
171- add.w state7, state7, state3
192+ OP_4REG add.w line1, line0
172193 st .w state4, output, 16
173194 st .w state5, output, 20
174195 st .w state6, output, 24
@@ -181,10 +202,7 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
181202 ld.w state3, key, 28
182203
183204 /* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */
184- add.w state8, state8, state0
185- add.w state9, state9, state1
186- add.w state10, state10, state2
187- add.w state11, state11, state3
205+ OP_4REG add.w line2, line0
188206 st .w state8, output, 32
189207 st .w state9, output, 36
190208 st .w state10, output, 40
0 commit comments