Commit 2c54b42

ardbiesheuvel authored and ctmarinas committed
arm64/xor: use EOR3 instructions when available
Use the EOR3 instruction to implement xor_blocks() if the instruction is available, which is the case if the CPU implements the SHA-3 extension. This is about 20% faster on Apple M1 when using the 5-way version.

Signed-off-by: Ard Biesheuvel <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Catalin Marinas <[email protected]>
1 parent d58071a commit 2c54b42
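
For context, each do_N hook in a struct xor_block_template XORs N source blocks into the first one. A minimal scalar sketch of what the 3-way kernel computes per word is shown below (illustrative only, not part of the patch); EOR3 performs that pair of XORs in a single NEON instruction on 128 bits at a time, which is where the speedup comes from.

/*
 * Illustrative scalar equivalent of the 3-way XOR kernel; the real
 * xor_arm64_eor3_3() in the diff below vectorizes this loop with EOR3.
 */
static void xor_scalar_3(unsigned long bytes, unsigned long *p1,
			 unsigned long *p2, unsigned long *p3)
{
	unsigned long i;

	for (i = 0; i < bytes / sizeof(*p1); i++)
		p1[i] ^= p2[i] ^ p3[i];	/* one EOR3 replaces these two XORs */
}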

File tree

3 files changed, +157 -1 lines changed

arch/arm64/Kconfig

Lines changed: 6 additions & 0 deletions
@@ -1545,6 +1545,12 @@ endmenu
 
 menu "ARMv8.2 architectural features"
 
+config AS_HAS_ARMV8_2
+	def_bool $(cc-option,-Wa$(comma)-march=armv8.2-a)
+
+config AS_HAS_SHA3
+	def_bool $(as-instr,.arch armv8.2-a+sha3)
+
 config ARM64_PMEM
 	bool "Enable support for persistent memory"
 	select ARCH_HAS_PMEM_API

arch/arm64/Makefile

Lines changed: 5 additions & 0 deletions
@@ -58,6 +58,11 @@ stack_protector_prepare: prepare0
 					include/generated/asm-offsets.h))
 endif
 
+ifeq ($(CONFIG_AS_HAS_ARMV8_2), y)
+# make sure to pass the newest target architecture to -march.
+asm-arch := armv8.2-a
+endif
+
 # Ensure that if the compiler supports branch protection we default it
 # off, this will be overridden if we are using branch protection.
 branch-prot-flags-y += $(call cc-option,-mbranch-protection=none)

arch/arm64/lib/xor-neon.c

Lines changed: 146 additions & 1 deletion
@@ -167,7 +167,7 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
 	} while (--lines > 0);
 }
 
-struct xor_block_template const xor_block_inner_neon = {
+struct xor_block_template xor_block_inner_neon __ro_after_init = {
 	.name = "__inner_neon__",
 	.do_2 = xor_arm64_neon_2,
 	.do_3 = xor_arm64_neon_3,
@@ -176,6 +176,151 @@ struct xor_block_template const xor_block_inner_neon = {
 };
 EXPORT_SYMBOL(xor_block_inner_neon);
 
+static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
+{
+	uint64x2_t res;
+
+	asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
+	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
+	    : "=w"(res) : "w"(p), "w"(q), "w"(r));
+	return res;
+}
+
+static void xor_arm64_eor3_3(unsigned long bytes, unsigned long *p1,
+			     unsigned long *p2, unsigned long *p3)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+	uint64_t *dp3 = (uint64_t *)p3;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 ^ p3 */
+		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+			  vld1q_u64(dp3 + 0));
+		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+			  vld1q_u64(dp3 + 2));
+		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+			  vld1q_u64(dp3 + 4));
+		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+			  vld1q_u64(dp3 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+		dp3 += 8;
+	} while (--lines > 0);
+}
+
+static void xor_arm64_eor3_4(unsigned long bytes, unsigned long *p1,
+			     unsigned long *p2, unsigned long *p3,
+			     unsigned long *p4)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+	uint64_t *dp3 = (uint64_t *)p3;
+	uint64_t *dp4 = (uint64_t *)p4;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 ^ p3 */
+		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+			  vld1q_u64(dp3 + 0));
+		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+			  vld1q_u64(dp3 + 2));
+		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+			  vld1q_u64(dp3 + 4));
+		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+			  vld1q_u64(dp3 + 6));
+
+		/* p1 ^= p4 */
+		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
+		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
+		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
+		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+		dp3 += 8;
+		dp4 += 8;
+	} while (--lines > 0);
+}
+
+static void xor_arm64_eor3_5(unsigned long bytes, unsigned long *p1,
+			     unsigned long *p2, unsigned long *p3,
+			     unsigned long *p4, unsigned long *p5)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+	uint64_t *dp3 = (uint64_t *)p3;
+	uint64_t *dp4 = (uint64_t *)p4;
+	uint64_t *dp5 = (uint64_t *)p5;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 ^ p3 */
+		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+			  vld1q_u64(dp3 + 0));
+		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+			  vld1q_u64(dp3 + 2));
+		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+			  vld1q_u64(dp3 + 4));
+		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+			  vld1q_u64(dp3 + 6));
+
+		/* p1 ^= p4 ^ p5 */
+		v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
+		v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
+		v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
+		v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+		dp3 += 8;
+		dp4 += 8;
+		dp5 += 8;
+	} while (--lines > 0);
+}
+
+static int __init xor_neon_init(void)
+{
+	if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
+		xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
+		xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
+		xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
+	}
+	return 0;
+}
+module_init(xor_neon_init);
+
+static void __exit xor_neon_exit(void)
+{
+}
+module_exit(xor_neon_exit);
+
 MODULE_AUTHOR("Jackie Liu <[email protected]>");
 MODULE_DESCRIPTION("ARMv8 XOR Extensions");
 MODULE_LICENSE("GPL");
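
As a usage note: xor_neon_init() only swaps in the EOR3 kernels when the kernel was built with an assembler that understands SHA-3 (CONFIG_AS_HAS_SHA3) and the running CPU advertises the feature. A hypothetical userspace sanity check for the eor3() helper might look like the sketch below; the explicit ".arch" directive standing in for the kernel's ARM64_ASM_PREAMBLE and the test values are assumptions, and it needs an AArch64 toolchain and CPU with the SHA-3 extension (e.g. built with -march=armv8.2-a+sha3).

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
{
	uint64x2_t res;

	/* same asm as the patch; an explicit .arch directive replaces ARM64_ASM_PREAMBLE */
	asm(".arch armv8.2-a+sha3\n"
	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
	    : "=w"(res) : "w"(p), "w"(q), "w"(r));
	return res;
}

int main(void)
{
	uint64_t a[2] = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL };
	uint64_t b[2] = { 0xaaaaaaaaaaaaaaaaULL, 0x5555555555555555ULL };
	uint64_t c[2] = { 0xdeadbeefdeadbeefULL, 0x0123012301230123ULL };
	uint64_t out[2];
	int i;

	/* out = a ^ b ^ c, computed by a single EOR3 on the 128-bit vectors */
	vst1q_u64(out, eor3(vld1q_u64(a), vld1q_u64(b), vld1q_u64(c)));

	for (i = 0; i < 2; i++)
		printf("lane %d: %s\n", i,
		       out[i] == (a[i] ^ b[i] ^ c[i]) ? "ok" : "MISMATCH");
	return 0;
}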
