
Commit 2717e01

ebiggers authored and herbertx committed
crypto: x86/aes-xts - handle AES-128 and AES-192 more efficiently
Decrease the amount of code specific to the different AES variants by
"right-aligning" the sequence of round keys, and for AES-128 and AES-192
just skipping irrelevant rounds at the beginning.

This shrinks the size of aes-xts-avx-x86_64.o by 13.3%, and it improves
the efficiency of AES-128 and AES-192. The tradeoff is that for AES-256
some additional not-taken conditional jumps are now executed. But these
are predicted well and are cheap on x86.

Note that the ARMv8 CE based AES-XTS implementation uses a similar
strategy to handle the different AES variants.

Signed-off-by: Eric Biggers <[email protected]>
Signed-off-by: Herbert Xu <[email protected]>
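As an illustrative aside (not part of the commit), the following C sketch shows the idea in miniature: the round keys are addressed relative to the *end* of the schedule, so the tail of the unrolled round sequence is shared by all key sizes, and AES-128/AES-192 simply enter it later. The helpers aes_round(), aes_last_round(), and xor_block() are hypothetical stand-ins for the AESENC/AESENCLAST instructions and the initial whitening XOR; the k[0]-k[14] naming mirrors the assembly's KEY0-KEY14, not the kernel's actual C code.

#include <stdint.h>

typedef struct { uint8_t b[16]; } aes_block;

/* Hypothetical stand-ins for AESENC, AESENCLAST, and the whitening XOR. */
extern void aes_round(aes_block *blk, const aes_block *rk);
extern void aes_last_round(aes_block *blk, const aes_block *rk);
extern void xor_block(aes_block *blk, const aes_block *rk);

/* keylen is 16, 24, or 32; rk[] holds the nrounds + 1 expanded round keys. */
void encrypt_block(aes_block *blk, const aes_block *rk, int keylen)
{
        int nrounds = 6 + keylen / 4;   /* 10, 12, or 14 */
        const aes_block *k[15];

        /*
         * "Right-align" the schedule: k[14] is always the last round key.
         * AES-128 leaves k[1]-k[4] unused and AES-192 leaves k[1]-k[2]
         * unused, mirroring how the assembly rebases KEY so that
         * 7*16(KEY) is always the last round key.
         */
        for (int i = 1; i <= nrounds; i++)
                k[14 - nrounds + i] = &rk[i];

        xor_block(blk, &rk[0]);         /* the XOR-only "round 0" */

        switch (nrounds) {              /* skip the irrelevant early rounds */
        case 14:
                aes_round(blk, k[1]);
                aes_round(blk, k[2]);
                /* fall through */
        case 12:
                aes_round(blk, k[3]);
                aes_round(blk, k[4]);
                /* fall through */
        default:                        /* AES-128 enters here, at k[5] */
                for (int i = 5; i <= 13; i++)
                        aes_round(blk, k[i]);
                aes_last_round(blk, k[14]);
        }
}

The switch fallthrough plays the same role as the cmp/jl/je dispatch to the .Laes192/.Laes128 entry labels that the patch adds throughout the assembly.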
1 parent ea9459e commit 2717e01

File tree: 1 file changed

arch/x86/crypto/aes-xts-avx-x86_64.S

Lines changed: 92 additions & 86 deletions
@@ -82,14 +82,15 @@
 
 // Function parameters
 .set KEY, %rdi // Initially points to crypto_aes_ctx, then is
-// advanced to point directly to 7th round key
+// advanced to point to 7th-from-last round key
 .set SRC, %rsi // Pointer to next source data
 .set DST, %rdx // Pointer to next destination data
 .set LEN, %rcx // Remaining length in bytes
 .set TWEAK, %r8 // Pointer to next tweak
 
-// %r9d holds the AES key length in bytes.
+// %r9 holds the AES key length in bytes.
 .set KEYLEN, %r9d
+.set KEYLEN64, %r9
 
 // %rax and %r10-r11 are available as temporaries.
 
@@ -165,12 +166,18 @@
 .set GF_POLY_XMM, %xmm14
 .set GF_POLY, V14
 
-// V15 holds the first AES round key, copied to all 128-bit lanes.
+// V15 holds the key for AES "round 0", copied to all 128-bit lanes.
 .set KEY0_XMM, %xmm15
 .set KEY0, V15
 
 // If 32 SIMD registers are available, then V16-V29 hold the remaining
 // AES round keys, copied to all 128-bit lanes.
+//
+// AES-128, AES-192, and AES-256 use different numbers of round keys.
+// To allow handling all three variants efficiently, we align the round
+// keys to the *end* of this register range. I.e., AES-128 uses
+// KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14.
+// (All also use KEY0 for the XOR-only "round" at the beginning.)
 .if USE_AVX10
 .set KEY1_XMM, %xmm16
 .set KEY1, V16
@@ -340,15 +347,15 @@
 .set PREV_TWEAK, NEXT_TWEAK2
 .set NEXT_TWEAK, NEXT_TWEAK3
 .endif
-.if \i < 20 && \i % 5 == 0
+.if \i >= 0 && \i < 20 && \i % 5 == 0
 vpshufd $0x13, PREV_TWEAK, V5
-.elseif \i < 20 && \i % 5 == 1
+.elseif \i >= 0 && \i < 20 && \i % 5 == 1
 vpaddq PREV_TWEAK, PREV_TWEAK, NEXT_TWEAK
-.elseif \i < 20 && \i % 5 == 2
+.elseif \i >= 0 && \i < 20 && \i % 5 == 2
 vpsrad $31, V5, V5
-.elseif \i < 20 && \i % 5 == 3
+.elseif \i >= 0 && \i < 20 && \i % 5 == 3
 vpand GF_POLY, V5, V5
-.elseif \i < 20 && \i % 5 == 4
+.elseif \i >= 0 && \i < 20 && \i % 5 == 4
 vpxor V5, NEXT_TWEAK, NEXT_TWEAK
 .elseif \i == 1000
 vmovdqa NEXT_TWEAK0, TWEAK0
@@ -364,21 +371,21 @@
 // when VL > 16 (which it is here), the needed shift amounts are byte-aligned,
 // which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts.
 .macro _tweak_step_pclmul i
-.if \i == 2
+.if \i == 0
 vpsrldq $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0
-.elseif \i == 4
+.elseif \i == 2
 vpsrldq $(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1
-.elseif \i == 6
+.elseif \i == 4
 vpsrldq $(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2
-.elseif \i == 8
+.elseif \i == 6
 vpsrldq $(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3
-.elseif \i == 10
+.elseif \i == 8
 vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK0, NEXT_TWEAK0
-.elseif \i == 12
+.elseif \i == 10
 vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK1, NEXT_TWEAK1
-.elseif \i == 14
+.elseif \i == 12
 vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK2, NEXT_TWEAK2
-.elseif \i == 16
+.elseif \i == 14
 vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK3, NEXT_TWEAK3
 .elseif \i == 1000
 vpslldq $(4*VL/16) / 8, TWEAK0, TWEAK0
@@ -393,8 +400,8 @@
 .endm
 
 // _tweak_step does one step of the computation of the next set of tweaks from
-// TWEAK[0-3]. To complete all steps, this must be invoked with \i values 0
-// through at least 19, then 1000 which signals the last step.
+// TWEAK[0-3]. To complete all steps, this is invoked with increasing values of
+// \i that include at least 0 through 19, then 1000 which signals the last step.
 //
 // This is used to interleave the computation of the next set of tweaks with the
 // AES en/decryptions, which increases performance in some cases.
@@ -406,22 +413,56 @@
 .endif
 .endm
 
-// Load the round keys: just the first one if !USE_AVX10, otherwise all of them.
-.macro _load_round_keys
-_vbroadcast128 -7*16(KEY), KEY0
+.macro _setup_round_keys enc
+
+// Select either the encryption round keys or the decryption round keys.
+.if \enc
+.set OFFS, 0
+.else
+.set OFFS, 240
+.endif
+
+// Load the round key for "round 0".
+_vbroadcast128 OFFS(KEY), KEY0
+
+// Increment KEY to make it so that 7*16(KEY) is the last round key.
+// For AES-128, increment by 3*16, resulting in the 10 round keys (not
+// counting the zero-th round key which was just loaded into KEY0) being
+// -2*16(KEY) through 7*16(KEY). For AES-192, increment by 5*16 and use
+// 12 round keys -4*16(KEY) through 7*16(KEY). For AES-256, increment
+// by 7*16 and use 14 round keys -6*16(KEY) through 7*16(KEY).
+//
+// This rebasing provides two benefits. First, it makes the offset to
+// any round key be in the range [-96, 112], fitting in a signed byte.
+// This shortens VEX-encoded instructions that access the later round
+// keys which otherwise would need 4-byte offsets. Second, it makes it
+// easy to do AES-128 and AES-192 by skipping irrelevant rounds at the
+// beginning. Skipping rounds at the end doesn't work as well because
+// the last round needs different instructions.
+//
+// An alternative approach would be to roll up all the round loops. We
+// don't do that because it isn't compatible with caching the round keys
+// in registers which we do when possible (see below), and also because
+// it seems unwise to rely *too* heavily on the CPU's branch predictor.
+lea OFFS-16(KEY, KEYLEN64, 4), KEY
+
+// If all 32 SIMD registers are available, cache all the round keys.
 .if USE_AVX10
+cmp $24, KEYLEN
+jl .Laes128\@
+je .Laes192\@
 _vbroadcast128 -6*16(KEY), KEY1
 _vbroadcast128 -5*16(KEY), KEY2
+.Laes192\@:
 _vbroadcast128 -4*16(KEY), KEY3
 _vbroadcast128 -3*16(KEY), KEY4
+.Laes128\@:
 _vbroadcast128 -2*16(KEY), KEY5
 _vbroadcast128 -1*16(KEY), KEY6
 _vbroadcast128 0*16(KEY), KEY7
 _vbroadcast128 1*16(KEY), KEY8
 _vbroadcast128 2*16(KEY), KEY9
 _vbroadcast128 3*16(KEY), KEY10
-// Note: if it's AES-128 or AES-192, the last several round keys won't
-// be used. We do the loads anyway to save a conditional jump.
 _vbroadcast128 4*16(KEY), KEY11
 _vbroadcast128 5*16(KEY), KEY12
 _vbroadcast128 6*16(KEY), KEY13
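An illustrative aside, not part of the diff: the effect of the `lea OFFS-16(KEY, KEYLEN64, 4), KEY` rebasing above can be checked with a few lines of C. Relative to the start of the selected schedule, the increment is 4*keylen - 16 bytes, which lands the last round key at 7*16(KEY) for all three key sizes; the program below just prints the offsets quoted in the new comment.

#include <stdio.h>

int main(void)
{
        for (int keylen = 16; keylen <= 32; keylen += 8) {
                int nrounds = 6 + keylen / 4;   /* 10, 12, or 14 rounds */
                int incr = 4 * keylen - 16;     /* bytes added to KEY by the lea */
                int first = 1 * 16 - incr;      /* offset of round key 1 */
                int last = nrounds * 16 - incr; /* offset of the last round key */

                printf("AES-%d: KEY += %d*16, round keys at %d*16(KEY)..%d*16(KEY)\n",
                       keylen * 8, incr / 16, first / 16, last / 16);
        }
        return 0;
}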
@@ -466,22 +507,22 @@
 
 // Do a single round of AES en/decryption on the blocks in registers V0-V3,
 // using the same key for all blocks. The round key is loaded from the
-// appropriate register or memory location for round \i. In addition, does step
-// \i of the computation of the next set of tweaks. May clobber V4.
+// appropriate register or memory location for round \i. In addition, does two
+// steps of the computation of the next set of tweaks. May clobber V4.
 .macro _vaes_4x enc, last, i
 .if USE_AVX10
-_tweak_step (2*(\i-1))
+_tweak_step (2*(\i-5))
 _vaes \enc, \last, KEY\i, V0
 _vaes \enc, \last, KEY\i, V1
-_tweak_step (2*(\i-1) + 1)
+_tweak_step (2*(\i-5) + 1)
 _vaes \enc, \last, KEY\i, V2
 _vaes \enc, \last, KEY\i, V3
 .else
 _vbroadcast128 (\i-7)*16(KEY), V4
-_tweak_step (2*(\i-1))
+_tweak_step (2*(\i-5))
 _vaes \enc, \last, V4, V0
 _vaes \enc, \last, V4, V1
-_tweak_step (2*(\i-1) + 1)
+_tweak_step (2*(\i-5) + 1)
 _vaes \enc, \last, V4, V2
 _vaes \enc, \last, V4, V3
 .endif
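Another aside for illustration only: with the rounds right-aligned, the rounds common to every key size are \i = 5 through 14, so the indices 2*(\i-5) and 2*(\i-5)+1 above still cover tweak steps 0 through 19 exactly once. The extra early rounds used by AES-192 and AES-256 produce negative step indices, which the new `\i >= 0` guards added earlier in the diff turn into no-ops. A small C check of that bookkeeping:

#include <stdio.h>

int main(void)
{
        for (int keylen = 16; keylen <= 32; keylen += 8) {
                int nrounds = 6 + keylen / 4;
                int first_round = 15 - nrounds; /* 5, 3, or 1 */
                int hits[20] = { 0 };
                int noops = 0;

                for (int i = first_round; i <= 14; i++) {
                        for (int s = 2 * (i - 5); s <= 2 * (i - 5) + 1; s++) {
                                if (s >= 0 && s < 20)
                                        hits[s]++;      /* real tweak step */
                                else
                                        noops++;        /* skipped by the \i >= 0 guard */
                        }
                }

                int covered = 1;
                for (int s = 0; s < 20; s++)
                        covered &= (hits[s] == 1);
                printf("AES-%d: steps 0-19 each issued once: %s, guarded no-ops: %d\n",
                       keylen * 8, covered ? "yes" : "no", noops);
        }
        return 0;
}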
@@ -493,32 +534,25 @@
 // length VL, use V* registers and leave \xmm_suffix empty. May clobber V4.
 .macro _aes_crypt enc, xmm_suffix, tweak, data
 _xor3 KEY0\xmm_suffix, \tweak, \data
+cmp $24, KEYLEN
+jl .Laes128\@
+je .Laes192\@
 _vaes_1x \enc, 0, 1, \xmm_suffix, \data
 _vaes_1x \enc, 0, 2, \xmm_suffix, \data
+.Laes192\@:
 _vaes_1x \enc, 0, 3, \xmm_suffix, \data
 _vaes_1x \enc, 0, 4, \xmm_suffix, \data
+.Laes128\@:
 _vaes_1x \enc, 0, 5, \xmm_suffix, \data
 _vaes_1x \enc, 0, 6, \xmm_suffix, \data
 _vaes_1x \enc, 0, 7, \xmm_suffix, \data
 _vaes_1x \enc, 0, 8, \xmm_suffix, \data
 _vaes_1x \enc, 0, 9, \xmm_suffix, \data
-cmp $24, KEYLEN
-jle .Laes_128_or_192\@
 _vaes_1x \enc, 0, 10, \xmm_suffix, \data
 _vaes_1x \enc, 0, 11, \xmm_suffix, \data
 _vaes_1x \enc, 0, 12, \xmm_suffix, \data
 _vaes_1x \enc, 0, 13, \xmm_suffix, \data
 _vaes_1x \enc, 1, 14, \xmm_suffix, \data
-jmp .Laes_done\@
-.Laes_128_or_192\@:
-je .Laes_192\@
-_vaes_1x \enc, 1, 10, \xmm_suffix, \data
-jmp .Laes_done\@
-.Laes_192\@:
-_vaes_1x \enc, 0, 10, \xmm_suffix, \data
-_vaes_1x \enc, 0, 11, \xmm_suffix, \data
-_vaes_1x \enc, 1, 12, \xmm_suffix, \data
-.Laes_done\@:
 _vpxor \tweak, \data, \data
 .endm
 
@@ -528,16 +562,7 @@
 // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
 movl 480(KEY), KEYLEN
 
-// Advance KEY to point to the 7th encryption round key (if encrypting)
-// or the 7th decryption round key (if decrypting). This makes the
-// offset to any round key be in the range [-112, 112], fitting in a
-// signed byte. This shortens VEX-encoded instructions that access the
-// 8th and later round keys which otherwise would need 4-byte offsets.
-.if \enc
-add $7*16, KEY
-.else
-add $(15+7)*16, KEY
-
+.if !\enc
 // When decrypting a message whose length isn't a multiple of the AES
 // block length, exclude the last full block from the main loop by
 // subtracting 16 from LEN. This is needed because ciphertext stealing
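An aside for context, not from the patch: the literal offsets used above and in _setup_round_keys (OFFS of 0 or 240 for the encryption or decryption schedule, and 480(KEY) for the key length) come from the field order of the kernel's struct crypto_aes_ctx. A minimal mirror of that layout, assuming the field order in include/crypto/aes.h, prints 0, 240, and 480:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Mirror of the field order of the kernel's struct crypto_aes_ctx:
 * 60 u32 words of encryption round keys, 60 words of decryption round
 * keys, then the key length in bytes. */
struct crypto_aes_ctx_mirror {
        uint32_t key_enc[60];
        uint32_t key_dec[60];
        uint32_t key_length;
};

int main(void)
{
        printf("key_enc at %zu, key_dec at %zu, key_length at %zu\n",
               offsetof(struct crypto_aes_ctx_mirror, key_enc),
               offsetof(struct crypto_aes_ctx_mirror, key_dec),
               offsetof(struct crypto_aes_ctx_mirror, key_length));
        return 0;
}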
@@ -548,8 +573,8 @@
 .Lxts_init\@:
 .endif
 
-// Cache as many round keys as possible.
-_load_round_keys
+// Setup the pointer to the round keys and cache as many as possible.
+_setup_round_keys \enc
 
 // Compute the first set of tweaks TWEAK[0-3].
 _compute_first_set_of_tweaks
@@ -560,7 +585,7 @@
 .Lmain_loop\@:
 // This is the main loop, en/decrypting 4*VL bytes per iteration.
 
-// XOR each source block with its tweak and the first round key.
+// XOR each source block with its tweak and the zero-th round key.
 .if USE_AVX10
 vmovdqu8 0*VL(SRC), V0
 vmovdqu8 1*VL(SRC), V1
@@ -580,27 +605,27 @@
 vpxor TWEAK2, V2, V2
 vpxor TWEAK3, V3, V3
 .endif
+cmp $24, KEYLEN
+jl .Laes128\@
+je .Laes192\@
 // Do all the AES rounds on the data blocks, interleaved with
 // the computation of the next set of tweaks.
 _vaes_4x \enc, 0, 1
 _vaes_4x \enc, 0, 2
+.Laes192\@:
 _vaes_4x \enc, 0, 3
 _vaes_4x \enc, 0, 4
+.Laes128\@:
 _vaes_4x \enc, 0, 5
 _vaes_4x \enc, 0, 6
 _vaes_4x \enc, 0, 7
 _vaes_4x \enc, 0, 8
 _vaes_4x \enc, 0, 9
-// Try to optimize for AES-256 by keeping the code for AES-128 and
-// AES-192 out-of-line.
-cmp $24, KEYLEN
-jle .Lencrypt_4x_aes_128_or_192\@
 _vaes_4x \enc, 0, 10
 _vaes_4x \enc, 0, 11
 _vaes_4x \enc, 0, 12
 _vaes_4x \enc, 0, 13
 _vaes_4x \enc, 1, 14
-.Lencrypt_4x_done\@:
 
 // XOR in the tweaks again.
 _vpxor TWEAK0, V0, V0
@@ -678,17 +703,6 @@
 jnz .Lcts\@
 jmp .Ldone\@
 
-// Out-of-line handling of AES-128 and AES-192
-.Lencrypt_4x_aes_128_or_192\@:
-jz .Lencrypt_4x_aes_192\@
-_vaes_4x \enc, 1, 10
-jmp .Lencrypt_4x_done\@
-.Lencrypt_4x_aes_192\@:
-_vaes_4x \enc, 0, 10
-_vaes_4x \enc, 0, 11
-_vaes_4x \enc, 1, 12
-jmp .Lencrypt_4x_done\@
-
 .if !\enc
 .Lneed_cts_dec\@:
 sub $16, LEN
@@ -764,38 +778,30 @@
 // u8 iv[AES_BLOCK_SIZE]);
 SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
 vmovdqu (%rsi), %xmm0
-add $7*16, %rdi
-vpxor -7*16(%rdi), %xmm0, %xmm0
+vpxor (%rdi), %xmm0, %xmm0
+movl 480(%rdi), %eax // AES key length
+lea -16(%rdi, %rax, 4), %rdi
+cmp $24, %eax
+jl .Lencrypt_iv_aes128
+je .Lencrypt_iv_aes192
 vaesenc -6*16(%rdi), %xmm0, %xmm0
 vaesenc -5*16(%rdi), %xmm0, %xmm0
+.Lencrypt_iv_aes192:
 vaesenc -4*16(%rdi), %xmm0, %xmm0
 vaesenc -3*16(%rdi), %xmm0, %xmm0
+.Lencrypt_iv_aes128:
 vaesenc -2*16(%rdi), %xmm0, %xmm0
 vaesenc -1*16(%rdi), %xmm0, %xmm0
 vaesenc 0*16(%rdi), %xmm0, %xmm0
 vaesenc 1*16(%rdi), %xmm0, %xmm0
 vaesenc 2*16(%rdi), %xmm0, %xmm0
-cmpl $24, 480-(7*16)(%rdi)
-jle .Lencrypt_iv_aes_128_or_192
 vaesenc 3*16(%rdi), %xmm0, %xmm0
 vaesenc 4*16(%rdi), %xmm0, %xmm0
 vaesenc 5*16(%rdi), %xmm0, %xmm0
 vaesenc 6*16(%rdi), %xmm0, %xmm0
 vaesenclast 7*16(%rdi), %xmm0, %xmm0
-.Lencrypt_iv_done:
 vmovdqu %xmm0, (%rsi)
 RET
-
-// Out-of-line handling of AES-128 and AES-192
-.Lencrypt_iv_aes_128_or_192:
-jz .Lencrypt_iv_aes_192
-vaesenclast 3*16(%rdi), %xmm0, %xmm0
-jmp .Lencrypt_iv_done
-.Lencrypt_iv_aes_192:
-vaesenc 3*16(%rdi), %xmm0, %xmm0
-vaesenc 4*16(%rdi), %xmm0, %xmm0
-vaesenclast 5*16(%rdi), %xmm0, %xmm0
-jmp .Lencrypt_iv_done
 SYM_FUNC_END(aes_xts_encrypt_iv)
 
 // Below are the actual AES-XTS encryption and decryption functions,
