@@ -167,7 +167,7 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
 	} while (--lines > 0);
 }
 
-struct xor_block_template const xor_block_inner_neon = {
+struct xor_block_template xor_block_inner_neon __ro_after_init = {
 	.name	= "__inner_neon__",
 	.do_2	= xor_arm64_neon_2,
 	.do_3	= xor_arm64_neon_3,
@@ -176,6 +176,151 @@ struct xor_block_template const xor_block_inner_neon = {
 };
 EXPORT_SYMBOL(xor_block_inner_neon);
 
+static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
+{
+	uint64x2_t res;
+
+	asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
+	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
+	    : "=w"(res) : "w"(p), "w"(q), "w"(r));
+	return res;
+}
+
+static void xor_arm64_eor3_3(unsigned long bytes, unsigned long *p1,
+			     unsigned long *p2, unsigned long *p3)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+	uint64_t *dp3 = (uint64_t *)p3;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 ^ p3 */
+		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+			  vld1q_u64(dp3 + 0));
+		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+			  vld1q_u64(dp3 + 2));
+		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+			  vld1q_u64(dp3 + 4));
+		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+			  vld1q_u64(dp3 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+		dp3 += 8;
+	} while (--lines > 0);
+}
+
+static void xor_arm64_eor3_4(unsigned long bytes, unsigned long *p1,
+			     unsigned long *p2, unsigned long *p3,
+			     unsigned long *p4)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+	uint64_t *dp3 = (uint64_t *)p3;
+	uint64_t *dp4 = (uint64_t *)p4;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 ^ p3 */
+		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+			  vld1q_u64(dp3 + 0));
+		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+			  vld1q_u64(dp3 + 2));
+		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+			  vld1q_u64(dp3 + 4));
+		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+			  vld1q_u64(dp3 + 6));
+
+		/* p1 ^= p4 */
+		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
+		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
+		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
+		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+		dp3 += 8;
+		dp4 += 8;
+	} while (--lines > 0);
+}
+
+static void xor_arm64_eor3_5(unsigned long bytes, unsigned long *p1,
+			     unsigned long *p2, unsigned long *p3,
+			     unsigned long *p4, unsigned long *p5)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+	uint64_t *dp3 = (uint64_t *)p3;
+	uint64_t *dp4 = (uint64_t *)p4;
+	uint64_t *dp5 = (uint64_t *)p5;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 ^ p3 */
+		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+			  vld1q_u64(dp3 + 0));
+		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+			  vld1q_u64(dp3 + 2));
+		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+			  vld1q_u64(dp3 + 4));
+		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+			  vld1q_u64(dp3 + 6));
+
+		/* p1 ^= p4 ^ p5 */
+		v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
+		v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
+		v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
+		v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+		dp3 += 8;
+		dp4 += 8;
+		dp5 += 8;
+	} while (--lines > 0);
+}
+
+static int __init xor_neon_init(void)
+{
+	if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
+		xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
+		xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
+		xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
+	}
+	return 0;
+}
+module_init(xor_neon_init);
+
+static void __exit xor_neon_exit(void)
+{
+}
+module_exit(xor_neon_exit);
+
 MODULE_AUTHOR("Jackie Liu <[email protected]>");
 MODULE_DESCRIPTION("ARMv8 XOR Extensions");
 MODULE_LICENSE("GPL");
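For reference, the new EOR3-based do_3/do_4/do_5 routines compute exactly the same result as the NEON paths they override (the /* p1 ^= p2 ^ p3 */ comments in the diff): EOR3 simply folds each pair of XORs into one instruction per 128-bit vector. Below is a minimal, self-contained sketch of that reference semantics in plain C (user-space, no SHA3 or NEON assumed; the xor_3_ref name and the test buffers are illustrative only, not part of the patch):

```c
#include <stdint.h>
#include <stddef.h>
#include <assert.h>

/* Reference semantics of the do_3 path: p1[i] ^= p2[i] ^ p3[i].
 * The kernel's EOR3/NEON versions produce a bit-identical result,
 * just 2/4/8 64-bit words at a time. */
static void xor_3_ref(size_t words, uint64_t *p1,
		      const uint64_t *p2, const uint64_t *p3)
{
	for (size_t i = 0; i < words; i++)
		p1[i] ^= p2[i] ^ p3[i];
}

int main(void)
{
	/* Illustrative buffers; the real callers pass page-sized blocks. */
	uint64_t p1[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
	uint64_t p2[8] = { 8, 7, 6, 5, 4, 3, 2, 1 };
	uint64_t p3[8] = { 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0 };

	xor_3_ref(8, p1, p2, p3);
	assert(p1[0] == (1ULL ^ 8ULL ^ 0xffULL));
	return 0;
}
```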