
Commit 1d27e1f

ebiggers authored and herbertx committed
crypto: x86/aes-xts - handle CTS encryption more efficiently
When encrypting a message whose length isn't a multiple of 16 bytes, encrypt the last full block in the main loop. This works because only decryption uses the last two tweaks in reverse order, not encryption.

This improves the performance of encrypting messages whose length isn't a multiple of the AES block length, shrinks the size of aes-xts-avx-x86_64.o by 5.0%, and eliminates two instructions (a test and a not-taken conditional jump) when encrypting a message whose length *is* a multiple of the AES block length.

While it's not super useful to optimize for ciphertext stealing given that it's rarely needed in practice, the other two benefits mentioned above make this optimization worthwhile.

Signed-off-by: Eric Biggers <[email protected]>
Signed-off-by: Herbert Xu <[email protected]>
1 parent 3525fe4 commit 1d27e1f
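
For readers unfamiliar with XTS ciphertext stealing, the following minimal C sketch shows why only decryption needs the last two tweaks in reverse order, which is the property the commit message relies on. The aes_enc_block(), aes_dec_block(), and next_tweak() prototypes are hypothetical placeholders for the real AES block cipher and the GF(2^128) tweak update; this is an illustrative model, not the kernel code.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical stand-ins for the real AES and GF(2^128) primitives. */
void aes_enc_block(const void *key, const uint8_t in[16], uint8_t out[16]);
void aes_dec_block(const void *key, const uint8_t in[16], uint8_t out[16]);
void next_tweak(const uint8_t in[16], uint8_t out[16]);

/* One XTS block: out = tweak ^ AES(key, in ^ tweak). */
static void xts_block(int enc, const void *key, const uint8_t tweak[16],
		      const uint8_t *in, uint8_t *out)
{
	uint8_t tmp[16], e[16];
	int i;

	for (i = 0; i < 16; i++)
		tmp[i] = in[i] ^ tweak[i];
	if (enc)
		aes_enc_block(key, tmp, e);
	else
		aes_dec_block(key, tmp, e);
	for (i = 0; i < 16; i++)
		out[i] = e[i] ^ tweak[i];
}

/*
 * CTS tail: src/dst point at the last full block, followed by a partial
 * block of 'partial' bytes (1 to 15).  't_last' is the tweak for the last
 * full block; the tweak after it is derived here.
 */
static void xts_cts_tail(int enc, const void *key, const uint8_t t_last[16],
			 const uint8_t *src, uint8_t *dst, size_t partial)
{
	uint8_t t_next[16], mid[16], stolen[16];

	next_tweak(t_last, t_next);

	if (enc)
		/* Encryption: the last full block uses tweak N-1 *before*
		 * tweak N, i.e. in order, so this step can be folded into
		 * the main loop -- which is what this commit does. */
		xts_block(1, key, t_last, src, mid);
	else
		/* Decryption: the last full ciphertext block was produced
		 * with tweak N, so it must be undone with tweak N before
		 * tweak N-1 -- the last two tweaks are needed in reverse. */
		xts_block(0, key, t_next, src, mid);

	/* Steal: read the partial src block before writing the partial dst
	 * block so that in-place operation still works. */
	memcpy(stolen, src + 16, partial);
	memcpy(stolen + partial, mid + partial, 16 - partial);
	memcpy(dst + 16, mid, partial);

	/* Second en/decryption with the other tweak yields the last full
	 * output block. */
	xts_block(enc, key, enc ? t_next : t_last, stolen, dst);
}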

File tree

1 file changed: +29 −24 lines changed


arch/x86/crypto/aes-xts-avx-x86_64.S

Lines changed: 29 additions & 24 deletions
@@ -537,12 +537,16 @@
 	add	$7*16, KEY
 .else
 	add	$(15+7)*16, KEY
-.endif
 
-	// Check whether the data length is a multiple of the AES block length.
+	// When decrypting a message whose length isn't a multiple of the AES
+	// block length, exclude the last full block from the main loop by
+	// subtracting 16 from LEN. This is needed because ciphertext stealing
+	// decryption uses the last two tweaks in reverse order. We'll handle
+	// the last full block and the partial block specially at the end.
 	test	$15, LEN
-	jnz	.Lneed_cts\@
+	jnz	.Lneed_cts_dec\@
 .Lxts_init\@:
+.endif
 
 	// Cache as many round keys as possible.
 	_load_round_keys
@@ -685,41 +689,42 @@
 	_vaes_4x	\enc, 1, 12
 	jmp	.Lencrypt_4x_done\@
 
-.Lneed_cts\@:
-	// The data length isn't a multiple of the AES block length, so
-	// ciphertext stealing (CTS) will be needed. Subtract one block from
-	// LEN so that the main loop doesn't process the last full block. The
-	// CTS step will process it specially along with the partial block.
+.if !\enc
+.Lneed_cts_dec\@:
 	sub	$16, LEN
 	jmp	.Lxts_init\@
+.endif
 
 .Lcts\@:
 	// Do ciphertext stealing (CTS) to en/decrypt the last full block and
-	// the partial block. CTS needs two tweaks. TWEAK0_XMM contains the
-	// next tweak; compute the one after that. Decryption uses these two
-	// tweaks in reverse order, so also define aliases to handle that.
-	_next_tweak	TWEAK0_XMM, %xmm0, TWEAK1_XMM
+	// the partial block. TWEAK0_XMM contains the next tweak.
+
 .if \enc
-	.set CTS_TWEAK0, TWEAK0_XMM
-	.set CTS_TWEAK1, TWEAK1_XMM
+	// If encrypting, the main loop already encrypted the last full block to
+	// create the CTS intermediate ciphertext. Prepare for the rest of CTS
+	// by rewinding the pointers and loading the intermediate ciphertext.
+	sub	$16, SRC
+	sub	$16, DST
+	vmovdqu	(DST), %xmm0
 .else
-	.set CTS_TWEAK0, TWEAK1_XMM
-	.set CTS_TWEAK1, TWEAK0_XMM
-.endif
-
-	// En/decrypt the last full block.
+	// If decrypting, the main loop didn't decrypt the last full block
+	// because CTS decryption uses the last two tweaks in reverse order.
+	// Do it now by advancing the tweak and decrypting the last full block.
+	_next_tweak	TWEAK0_XMM, %xmm0, TWEAK1_XMM
 	vmovdqu	(SRC), %xmm0
-	_aes_crypt	\enc, _XMM, CTS_TWEAK0, %xmm0
+	_aes_crypt	\enc, _XMM, TWEAK1_XMM, %xmm0
+.endif
 
 .if USE_AVX10
 	// Create a mask that has the first LEN bits set.
 	mov	$-1, %rax
 	bzhi	LEN, %rax, %rax
 	kmovq	%rax, %k1
 
-	// Swap the first LEN bytes of the above result with the partial block.
-	// Note that to support in-place en/decryption, the load from the src
-	// partial block must happen before the store to the dst partial block.
+	// Swap the first LEN bytes of the en/decryption of the last full block
+	// with the partial block. Note that to support in-place en/decryption,
+	// the load from the src partial block must happen before the store to
+	// the dst partial block.
 	vmovdqa	%xmm0, %xmm1
 	vmovdqu8	16(SRC), %xmm0{%k1}
 	vmovdqu8	%xmm1, 16(DST){%k1}
@@ -750,7 +755,7 @@
 	vpblendvb	%xmm3, %xmm0, %xmm1, %xmm0
 .endif
 	// En/decrypt again and store the last full block.
-	_aes_crypt	\enc, _XMM, CTS_TWEAK1, %xmm0
+	_aes_crypt	\enc, _XMM, TWEAK0_XMM, %xmm0
 	vmovdqu	%xmm0, (DST)
 	jmp	.Ldone\@
 .endm
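
As a rough C-level picture of the control flow the patched assembly implements, here is a sketch that reuses the hypothetical block_t-style helpers from the model above (xts_block() and next_tweak()). It mirrors the SRC/DST/LEN handling only loosely, assumes the message is at least one full block, and is not the kernel implementation.

/*
 * Message-level flow after this commit (len >= 16), using the hypothetical
 * helpers sketched earlier.
 */
static void xts_crypt(int enc, const void *key, const uint8_t iv[16],
		      const uint8_t *src, uint8_t *dst, size_t len)
{
	uint8_t tweak[16], t_next[16], mid[16], stolen[16];
	size_t partial = len & 15;
	size_t main_len = len - partial;
	size_t i;

	memcpy(tweak, iv, 16);

	/* Only decryption holds the last full block back from the main loop,
	 * because CTS decryption needs the last two tweaks in reverse order.
	 * Encryption now runs that block through the main loop as well. */
	if (!enc && partial)
		main_len -= 16;

	/* Main loop: tweaks are consumed in order. */
	for (i = 0; i < main_len; i += 16) {
		xts_block(enc, key, tweak, src + i, dst + i);
		next_tweak(tweak, t_next);
		memcpy(tweak, t_next, 16);
	}

	if (!partial)
		return;

	if (enc) {
		/* The main loop already encrypted the last full block with
		 * tweak N-1; its output is the CTS intermediate ciphertext.
		 * Rewind by one block and reload it; 'tweak' is now tweak N. */
		src += main_len - 16;
		dst += main_len - 16;
		memcpy(mid, dst, 16);
	} else {
		/* Decrypt the last full ciphertext block with tweak N;
		 * tweak N-1 (still in 'tweak') is needed afterwards. */
		src += main_len;
		dst += main_len;
		next_tweak(tweak, t_next);
		xts_block(0, key, t_next, src, mid);
	}

	/* Swap the first 'partial' bytes of the intermediate block with the
	 * partial block (read src before writing dst for in-place use). */
	memcpy(stolen, src + 16, partial);
	memcpy(stolen + partial, mid + partial, 16 - partial);
	memcpy(dst + 16, mid, partial);

	/* En/decrypt again and store the last full output block; in both
	 * cases the tweak left in 'tweak' is the right one. */
	xts_block(enc, key, tweak, stolen, dst);
}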
