Commit 543ea17

ebiggers authored and herbertx committed
crypto: x86/aes-xts - optimize size of instructions operating on lengths
x86_64 has the "interesting" property that the instruction size is generally a bit shorter for instructions that operate on the 32-bit (or less) part of registers, or registers that are in the original set of 8.

This patch adjusts the AES-XTS code to take advantage of that property by changing the LEN parameter from size_t to unsigned int (which is all that's needed and is what the non-AVX implementation uses) and using the %eax register for KEYLEN.

This decreases the size of aes-xts-avx-x86_64.o by 1.2%.

Note that changing the kmovq to kmovd was going to be needed anyway to make the AVX10/256 code really work on CPUs that don't support 512-bit vectors (since the AVX10 spec says that 64-bit opmask instructions will only be supported on processors that support 512-bit vectors).

Signed-off-by: Eric Biggers <[email protected]>
Signed-off-by: Herbert Xu <[email protected]>
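For illustration of the size property (byte sequences as emitted by a typical GAS/objdump toolchain; this listing is not part of the patch): 64-bit operand size costs a REX.W prefix, and touching %r8-%r15 in a legacy-encoded instruction costs a REX prefix, while 32-bit operations on the original eight registers need neither:

	mov	$-1, %rax	# 48 c7 c0 ff ff ff ff   (7 bytes, REX.W)
	mov	$-1, %eax	# b8 ff ff ff ff         (5 bytes)
	sub	%rcx, %rax	# 48 29 c8               (3 bytes, REX.W)
	sub	%ecx, %eax	# 29 c8                  (2 bytes)
	sub	%ecx, %r9d	# 41 29 c9               (3 bytes, REX.B for %r9d)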
1 parent e619723 commit 543ea17

2 files changed: +30 −28 lines

arch/x86/crypto/aes-xts-avx-x86_64.S (21 additions, 19 deletions)
@@ -85,14 +85,16 @@
 	// advanced to point to 7th-from-last round key
 	.set	SRC,		%rsi	// Pointer to next source data
 	.set	DST,		%rdx	// Pointer to next destination data
-	.set	LEN,		%rcx	// Remaining length in bytes
+	.set	LEN,		%ecx	// Remaining length in bytes
+	.set	LEN8,		%cl
+	.set	LEN64,		%rcx
 	.set	TWEAK,		%r8	// Pointer to next tweak

-	// %r9 holds the AES key length in bytes.
-	.set	KEYLEN,		%r9d
-	.set	KEYLEN64,	%r9
+	// %rax holds the AES key length in bytes.
+	.set	KEYLEN,		%eax
+	.set	KEYLEN64,	%rax

-	// %rax and %r10-r11 are available as temporaries.
+	// %r9-r11 are available as temporaries.

 .macro	_define_Vi	i
 .if VL == 16
@@ -565,9 +567,9 @@
 	// subtracting 16 from LEN.  This is needed because ciphertext stealing
 	// decryption uses the last two tweaks in reverse order.  We'll handle
 	// the last full block and the partial block specially at the end.
-	lea	-16(LEN), %rax
-	test	$15, LEN
-	cmovnz	%rax, LEN
+	lea	-16(LEN), %eax
+	test	$15, LEN8
+	cmovnz	%eax, LEN
 .endif

 	// Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
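Aside, for illustration (encodings per GAS; not part of the diff): the switch to LEN8 is what shrinks the test here, since TEST with an 8-bit immediate against a byte register avoids the 32-bit immediate entirely:

	test	$15, %cl	# f6 c1 0f              (3 bytes)
	test	$15, %ecx	# f7 c1 0f 00 00 00     (6 bytes)
	test	$15, %rcx	# 48 f7 c1 0f 00 00 00  (7 bytes)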
@@ -650,7 +652,7 @@
 	// Check for the uncommon case where the data length isn't a multiple of
 	// 4*VL.  Handle it out-of-line in order to optimize for the common
 	// case.  In the common case, just fall through to the ret.
-	test	$4*VL-1, LEN
+	test	$4*VL-1, LEN8
 	jnz	.Lhandle_remainder\@
 .Ldone\@:
 	// Store the next tweak back to *TWEAK to support continuation calls.
@@ -718,9 +720,9 @@

 .if USE_AVX10
 	// Create a mask that has the first LEN bits set.
-	mov	$-1, %rax
-	bzhi	LEN, %rax, %rax
-	kmovq	%rax, %k1
+	mov	$-1, %r9d
+	bzhi	LEN, %r9d, %r9d
+	kmovd	%r9d, %k1

 	// Swap the first LEN bytes of the en/decryption of the last full block
 	// with the partial block.  Note that to support in-place en/decryption,
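For reference, a standalone sketch of what this masked path does (assumes BMI2 bzhi and AVX-512 opmask semantics; LEN is the 1-15 byte partial-block length in %ecx):

	mov	$-1, %r9d		# %r9d = all-ones
	bzhi	%ecx, %r9d, %r9d	# clear bits at index >= LEN, leaving
					# exactly the low LEN bits set
	kmovd	%r9d, %k1		# per-byte opmask for the 16-byte
					# vmovdqu8 load/store below

Only 16 mask bits can matter for an xmm register, so the 32-bit kmovd suffices; as the commit message notes, the 64-bit kmovq form is only guaranteed on processors with 512-bit vector support under AVX10.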
@@ -730,23 +732,23 @@
 	vmovdqu8	16(SRC), %xmm0{%k1}
 	vmovdqu8	%xmm1, 16(DST){%k1}
 .else
-	lea	.Lcts_permute_table(%rip), %rax
+	lea	.Lcts_permute_table(%rip), %r9

 	// Load the src partial block, left-aligned.  Note that to support
 	// in-place en/decryption, this must happen before the store to the dst
 	// partial block.
-	vmovdqu	(SRC, LEN, 1), %xmm1
+	vmovdqu	(SRC, LEN64, 1), %xmm1

 	// Shift the first LEN bytes of the en/decryption of the last full block
 	// to the end of a register, then store it to DST+LEN.  This stores the
 	// dst partial block.  It also writes to the second part of the dst last
 	// full block, but that part is overwritten later.
-	vpshufb	(%rax, LEN, 1), %xmm0, %xmm2
-	vmovdqu	%xmm2, (DST, LEN, 1)
+	vpshufb	(%r9, LEN64, 1), %xmm0, %xmm2
+	vmovdqu	%xmm2, (DST, LEN64, 1)

 	// Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...].
-	sub	LEN, %rax
-	vmovdqu	32(%rax), %xmm3
+	sub	LEN64, %r9
+	vmovdqu	32(%r9), %xmm3

 	// Shift the src partial block to the beginning of its register.
 	vpshufb	%xmm3, %xmm1, %xmm1
@@ -795,7 +797,7 @@ SYM_FUNC_END(aes_xts_encrypt_iv)
 // instantiated from the above macro.  They all have the following prototype:
 //
 // void (*xts_asm_func)(const struct crypto_aes_ctx *key,
-//			const u8 *src, u8 *dst, size_t len,
+//			const u8 *src, u8 *dst, unsigned int len,
 //			u8 tweak[AES_BLOCK_SIZE]);
 //
 // |key| is the data key.  |tweak| contains the next tweak; the encryption of

arch/x86/crypto/aesni-intel_glue.c (9 additions, 9 deletions)
@@ -899,7 +899,7 @@ static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key,
 typedef void (*xts_encrypt_iv_func)(const struct crypto_aes_ctx *tweak_key,
 				    u8 iv[AES_BLOCK_SIZE]);
 typedef void (*xts_crypt_func)(const struct crypto_aes_ctx *key,
-			       const u8 *src, u8 *dst, size_t len,
+			       const u8 *src, u8 *dst, unsigned int len,
 			       u8 tweak[AES_BLOCK_SIZE]);

 /* This handles cases where the source and/or destination span pages. */
@@ -1021,14 +1021,14 @@ static void aesni_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
 }

 static void aesni_xts_encrypt(const struct crypto_aes_ctx *key,
-			      const u8 *src, u8 *dst, size_t len,
+			      const u8 *src, u8 *dst, unsigned int len,
 			      u8 tweak[AES_BLOCK_SIZE])
 {
 	aesni_xts_enc(key, dst, src, len, tweak);
 }

 static void aesni_xts_decrypt(const struct crypto_aes_ctx *key,
-			      const u8 *src, u8 *dst, size_t len,
+			      const u8 *src, u8 *dst, unsigned int len,
 			      u8 tweak[AES_BLOCK_SIZE])
 {
 	aesni_xts_dec(key, dst, src, len, tweak);
@@ -1185,12 +1185,12 @@ asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,

 #define DEFINE_XTS_ALG(suffix, driver_name, priority)			       \
									       \
-asmlinkage void aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key,    \
-					 const u8 *src, u8 *dst, size_t len,   \
-					 u8 tweak[AES_BLOCK_SIZE]);	       \
-asmlinkage void aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key,    \
-					 const u8 *src, u8 *dst, size_t len,   \
-					 u8 tweak[AES_BLOCK_SIZE]);	       \
+asmlinkage void								       \
+aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src,     \
+			 u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \
+asmlinkage void								       \
+aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src,     \
+			 u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \
									       \
 static int xts_encrypt_##suffix(struct skcipher_request *req)		       \
 {									       \
