9
9
10
10
.text
11
11
12
- /* Salsa20 quarter-round */
13
- .macro QR a b c d
14
- add.w \a, \a, \b
15
- xor \d, \d, \a
16
- rotri.w \d, \d, 16
17
-
18
- add.w \c, \c, \d
19
- xor \b, \b, \c
20
- rotri.w \b, \b, 20
21
-
22
- add.w \a, \a, \b
23
- xor \d, \d, \a
24
- rotri.w \d, \d, 24
25
-
26
- add.w \c, \c, \d
27
- xor \b, \b, \c
28
- rotri.w \b, \b, 25
12
+ .macro OP_4REG op d0 d1 d2 d3 s0 s1 s2 s3
13
+ \op \d0 , \d0 , \s0
14
+ \op \d1 , \d1 , \s1
15
+ \op \d2 , \d2 , \s2
16
+ \op \d3 , \d3 , \s3
29
17
.endm
30
18
31
19
/*
@@ -74,6 +62,23 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
74
62
/* Reuse i as copy3 */
75
63
#define copy3 i
76
64
65
+ /* Packs to be used with OP_4REG */
66
+ #define line0 state0, state1, state2, state3
67
+ #define line1 state4, state5, state6, state7
68
+ #define line2 state8, state9, state10, state11
69
+ #define line3 state12, state13, state14, state15
70
+
71
+ #define line1_perm state5, state6, state7, state4
72
+ #define line2_perm state10, state11, state8, state9
73
+ #define line3_perm state15, state12, state13, state14
74
+
75
+ #define copy copy0, copy1, copy2, copy3
76
+
77
+ #define _16 16 , 16 , 16 , 16
78
+ #define _20 20 , 20 , 20 , 20
79
+ #define _24 24 , 24 , 24 , 24
80
+ #define _25 25 , 25 , 25 , 25
81
+
77
82
/*
78
83
* The ABI requires s0-s9 saved, and sp aligned to 16-byte.
79
84
* This does not violate the stack-less requirement: no sensitive data
@@ -126,16 +131,38 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
126
131
li.w i, 10
127
132
.Lpermute:
128
133
/* odd round */
129
- QR state0, state4, state8, state12
130
- QR state1, state5, state9, state13
131
- QR state2, state6, state10, state14
132
- QR state3, state7, state11, state15
134
+ OP_4REG add.w line0, line1
135
+ OP_4REG xor line3, line0
136
+ OP_4REG rotri.w line3, _16
137
+
138
+ OP_4REG add.w line2, line3
139
+ OP_4REG xor line1, line2
140
+ OP_4REG rotri.w line1, _20
141
+
142
+ OP_4REG add.w line0, line1
143
+ OP_4REG xor line3, line0
144
+ OP_4REG rotri.w line3, _24
145
+
146
+ OP_4REG add.w line2, line3
147
+ OP_4REG xor line1, line2
148
+ OP_4REG rotri.w line1, _25
133
149
134
150
/* even round */
135
- QR state0, state5, state10, state15
136
- QR state1, state6, state11, state12
137
- QR state2, state7, state8, state13
138
- QR state3, state4, state9, state14
151
+ OP_4REG add.w line0, line1_perm
152
+ OP_4REG xor line3_perm, line0
153
+ OP_4REG rotri.w line3_perm, _16
154
+
155
+ OP_4REG add.w line2_perm, line3_perm
156
+ OP_4REG xor line1_perm, line2_perm
157
+ OP_4REG rotri.w line1_perm, _20
158
+
159
+ OP_4REG add.w line0, line1_perm
160
+ OP_4REG xor line3_perm, line0
161
+ OP_4REG rotri.w line3_perm, _24
162
+
163
+ OP_4REG add.w line2_perm, line3_perm
164
+ OP_4REG xor line1_perm, line2_perm
165
+ OP_4REG rotri.w line1_perm, _25
139
166
140
167
addi.w i, i, -1
141
168
bnez i, .Lpermute
@@ -147,10 +174,7 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
147
174
li.w copy3, 0x6b206574
148
175
149
176
/* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
150
- add.w state0, state0, copy0
151
- add.w state1, state1, copy1
152
- add.w state2, state2, copy2
153
- add.w state3, state3, copy3
177
+ OP_4REG add.w line0, copy
154
178
st.w state0, output, 0
155
179
st.w state1, output, 4
156
180
st.w state2, output, 8
@@ -165,10 +189,7 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
165
189
ld.w state3, key, 12
166
190
167
191
/* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
168
- add.w state4, state4, state0
169
- add.w state5, state5, state1
170
- add.w state6, state6, state2
171
- add.w state7, state7, state3
192
+ OP_4REG add.w line1, line0
172
193
st.w state4, output, 16
173
194
st.w state5, output, 20
174
195
st.w state6, output, 24
@@ -181,10 +202,7 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
181
202
ld.w state3, key, 28
182
203
183
204
/* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */
184
- add.w state8, state8, state0
185
- add.w state9, state9, state1
186
- add.w state10, state10, state2
187
- add.w state11, state11, state3
205
+ OP_4REG add.w line2, line0
188
206
st.w state8, output, 32
189
207
st.w state9, output, 36
190
208
st.w state10, output, 40
0 commit comments