82 | 82 |
83 | 83 | // Function parameters
84 | 84 | .set KEY, %rdi // Initially points to crypto_aes_ctx, then is
85 | | - // advanced to point directly to 7th round key
| 85 | + // advanced to point to 7th-from-last round key
86 | 86 | .set SRC, %rsi // Pointer to next source data
87 | 87 | .set DST, %rdx // Pointer to next destination data
88 | 88 | .set LEN, %rcx // Remaining length in bytes
89 | 89 | .set TWEAK, %r8 // Pointer to next tweak
90 | 90 |
91 | | -// %r9d holds the AES key length in bytes.
| 91 | +// %r9 holds the AES key length in bytes.
92 | 92 | .set KEYLEN, %r9d
| 93 | +.set KEYLEN64, %r9
93 | 94 |
94 | 95 | // %rax and %r10-r11 are available as temporaries.
95 | 96 |

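For reference, KEY initially points to a struct crypto_aes_ctx; that is where the byte offsets 0, 240, and 480 used later in this patch come from. A layout sketch (paraphrased from include/crypto/aes.h; the typedef is only there to keep the sketch self-contained):

    #include <stdint.h>

    typedef uint32_t u32;

    /* Sketch of the context that KEY (%rdi) points to on entry. */
    struct crypto_aes_ctx {
            u32 key_enc[60];   /* expanded encryption round keys, bytes 0-239   */
            u32 key_dec[60];   /* expanded decryption round keys, bytes 240-479 */
            u32 key_length;    /* 16, 24, or 32; read below via 480(KEY)        */
    };
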
165 | 166 | .set GF_POLY_XMM, %xmm14
166 | 167 | .set GF_POLY, V14
167 | 168 |
168 | | - // V15 holds the first AES round key, copied to all 128-bit lanes.
| 169 | + // V15 holds the key for AES "round 0", copied to all 128-bit lanes.
169 | 170 | .set KEY0_XMM, %xmm15
170 | 171 | .set KEY0, V15
171 | 172 |
172 | 173 | // If 32 SIMD registers are available, then V16-V29 hold the remaining
173 | 174 | // AES round keys, copied to all 128-bit lanes.
| 175 | + //
| 176 | + // AES-128, AES-192, and AES-256 use different numbers of round keys.
| 177 | + // To allow handling all three variants efficiently, we align the round
| 178 | + // keys to the *end* of this register range. I.e., AES-128 uses
| 179 | + // KEY5-KEY14, AES-192 uses KEY3-KEY14, and AES-256 uses KEY1-KEY14.
| 180 | + // (All also use KEY0 for the XOR-only "round" at the beginning.)
174 | 181 | .if USE_AVX10
175 | 182 | .set KEY1_XMM, %xmm16
176 | 183 | .set KEY1, V16

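As a worked example of the end-alignment described in the new comment (an illustration, not part of the patch): AES with a k-byte key runs k/4 + 6 rounds after the initial whitening XOR, and those rounds are packed up against KEY14.

    #include <stdio.h>

    int main(void)
    {
            int keylen[] = { 16, 24, 32 };

            for (int i = 0; i < 3; i++) {
                    int nrounds = keylen[i] / 4 + 6;
                    int first = 15 - nrounds;  /* index of the first cached round key */

                    printf("AES-%d: %d rounds, uses KEY%d-KEY14 (plus KEY0)\n",
                           keylen[i] * 8, nrounds, first);
            }
            return 0;
    }

This prints KEY5, KEY3, and KEY1 as the starting registers for AES-128, AES-192, and AES-256 respectively, matching the comment.
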
340 | 347 | .set PREV_TWEAK, NEXT_TWEAK2
341 | 348 | .set NEXT_TWEAK, NEXT_TWEAK3
342 | 349 | .endif
343 | | -.if \i < 20 && \i % 5 == 0
| 350 | +.if \i >= 0 && \i < 20 && \i % 5 == 0
344 | 351 | vpshufd $0x13, PREV_TWEAK, V5
345 | | -.elseif \i < 20 && \i % 5 == 1
| 352 | +.elseif \i >= 0 && \i < 20 && \i % 5 == 1
346 | 353 | vpaddq PREV_TWEAK, PREV_TWEAK, NEXT_TWEAK
347 | | -.elseif \i < 20 && \i % 5 == 2
| 354 | +.elseif \i >= 0 && \i < 20 && \i % 5 == 2
348 | 355 | vpsrad $31, V5, V5
349 | | -.elseif \i < 20 && \i % 5 == 3
| 356 | +.elseif \i >= 0 && \i < 20 && \i % 5 == 3
350 | 357 | vpand GF_POLY, V5, V5
351 | | -.elseif \i < 20 && \i % 5 == 4
| 358 | +.elseif \i >= 0 && \i < 20 && \i % 5 == 4
352 | 359 | vpxor V5, NEXT_TWEAK, NEXT_TWEAK
353 | 360 | .elseif \i == 1000
354 | 361 | vmovdqa NEXT_TWEAK0, TWEAK0

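The five instructions selected by "\i % 5" are one XTS tweak update (multiplication by x in GF(2^128)) split into single-instruction steps so it can be interleaved with the AES rounds. A scalar sketch of the value each 128-bit lane ends up with, assuming the tweak is held as two little-endian 64-bit words:

    #include <stdint.h>

    /*
     * Multiply a 128-bit XTS tweak by x in GF(2^128), reducing by
     * x^128 + x^7 + x^2 + x + 1 (the 0x87 constant behind GF_POLY).
     * t[0] is the low 64 bits, t[1] the high 64 bits.
     */
    static void xts_mul_x(uint64_t t[2])
    {
            uint64_t carry = t[1] >> 63;            /* bit 127, shifted out below */

            t[1] = (t[1] << 1) | (t[0] >> 63);
            t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
    }
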
364 | 371 | // when VL > 16 (which it is here), the needed shift amounts are byte-aligned,
365 | 372 | // which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts.
366 | 373 | .macro _tweak_step_pclmul i
367 | | -.if \i == 2
| 374 | +.if \i == 0
368 | 375 | vpsrldq $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0
369 | | -.elseif \i == 4
| 376 | +.elseif \i == 2
370 | 377 | vpsrldq $(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1
371 | | -.elseif \i == 6
| 378 | +.elseif \i == 4
372 | 379 | vpsrldq $(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2
373 | | -.elseif \i == 8
| 380 | +.elseif \i == 6
374 | 381 | vpsrldq $(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3
375 | | -.elseif \i == 10
| 382 | +.elseif \i == 8
376 | 383 | vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK0, NEXT_TWEAK0
377 | | -.elseif \i == 12
| 384 | +.elseif \i == 10
378 | 385 | vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK1, NEXT_TWEAK1
379 | | -.elseif \i == 14
| 386 | +.elseif \i == 12
380 | 387 | vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK2, NEXT_TWEAK2
381 | | -.elseif \i == 16
| 388 | +.elseif \i == 14
382 | 389 | vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK3, NEXT_TWEAK3
383 | 390 | .elseif \i == 1000
384 | 391 | vpslldq $(4*VL/16) / 8, TWEAK0, TWEAK0

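The renumbering above only changes which \i values trigger each instruction; the shift amounts themselves are untouched. Evaluating them for the two vector lengths this macro is used with (assuming VL = 32 and VL = 64 bytes, per the "VL > 16" comment) confirms the byte-aligned claim:

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
            int vls[] = { 32, 64 };

            for (int i = 0; i < 2; i++) {
                    int bits = 4 * vls[i] / 16;     /* the 4*VL/16 term above */

                    assert(bits % 8 == 0);          /* byte-aligned, as claimed */
                    printf("VL=%d: vpsrldq by %d bytes, vpslldq by %d bytes\n",
                           vls[i], (128 - bits) / 8, bits / 8);
            }
            return 0;
    }
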
393 | 400 | .endm
394 | 401 |
395 | 402 | // _tweak_step does one step of the computation of the next set of tweaks from
396 | | -// TWEAK[0-3]. To complete all steps, this must be invoked with \i values 0
397 | | -// through at least 19, then 1000 which signals the last step.
| 403 | +// TWEAK[0-3]. To complete all steps, this is invoked with increasing values of
| 404 | +// \i that include at least 0 through 19, then 1000 which signals the last step.
398 | 405 | //
399 | 406 | // This is used to interleave the computation of the next set of tweaks with the
400 | 407 | // AES en/decryptions, which increases performance in some cases.

406 | 413 | .endif
407 | 414 | .endm
408 | 415 |
409 | | -// Load the round keys: just the first one if !USE_AVX10, otherwise all of them.
410 | | -.macro _load_round_keys
411 | | - _vbroadcast128 -7*16(KEY), KEY0
| 416 | +.macro _setup_round_keys enc
| 417 | +
| 418 | + // Select either the encryption round keys or the decryption round keys.
| 419 | +.if \enc
| 420 | + .set OFFS, 0
| 421 | +.else
| 422 | + .set OFFS, 240
| 423 | +.endif
| 424 | +
| 425 | + // Load the round key for "round 0".
| 426 | + _vbroadcast128 OFFS(KEY), KEY0
| 427 | +
| 428 | + // Increment KEY to make it so that 7*16(KEY) is the last round key.
| 429 | + // For AES-128, increment by 3*16, resulting in the 10 round keys (not
| 430 | + // counting the zero-th round key which was just loaded into KEY0) being
| 431 | + // -2*16(KEY) through 7*16(KEY). For AES-192, increment by 5*16 and use
| 432 | + // 12 round keys -4*16(KEY) through 7*16(KEY). For AES-256, increment
| 433 | + // by 7*16 and use 14 round keys -6*16(KEY) through 7*16(KEY).
| 434 | + //
| 435 | + // This rebasing provides two benefits. First, it makes the offset to
| 436 | + // any round key be in the range [-96, 112], fitting in a signed byte.
| 437 | + // This shortens VEX-encoded instructions that access the later round
| 438 | + // keys which otherwise would need 4-byte offsets. Second, it makes it
| 439 | + // easy to do AES-128 and AES-192 by skipping irrelevant rounds at the
| 440 | + // beginning. Skipping rounds at the end doesn't work as well because
| 441 | + // the last round needs different instructions.
| 442 | + //
| 443 | + // An alternative approach would be to roll up all the round loops. We
| 444 | + // don't do that because it isn't compatible with caching the round keys
| 445 | + // in registers which we do when possible (see below), and also because
| 446 | + // it seems unwise to rely *too* heavily on the CPU's branch predictor.
| 447 | + lea OFFS-16(KEY, KEYLEN64, 4), KEY
| 448 | +
| 449 | + // If all 32 SIMD registers are available, cache all the round keys.
412 | 450 | .if USE_AVX10
| 451 | + cmp $24, KEYLEN
| 452 | + jl .Laes128\@
| 453 | + je .Laes192\@
413 | 454 | _vbroadcast128 -6*16(KEY), KEY1
414 | 455 | _vbroadcast128 -5*16(KEY), KEY2
| 456 | +.Laes192\@:
415 | 457 | _vbroadcast128 -4*16(KEY), KEY3
416 | 458 | _vbroadcast128 -3*16(KEY), KEY4
| 459 | +.Laes128\@:
417 | 460 | _vbroadcast128 -2*16(KEY), KEY5
418 | 461 | _vbroadcast128 -1*16(KEY), KEY6
419 | 462 | _vbroadcast128 0*16(KEY), KEY7
420 | 463 | _vbroadcast128 1*16(KEY), KEY8
421 | 464 | _vbroadcast128 2*16(KEY), KEY9
422 | 465 | _vbroadcast128 3*16(KEY), KEY10
423 | | - // Note: if it's AES-128 or AES-192, the last several round keys won't
424 | | - // be used. We do the loads anyway to save a conditional jump.
425 | 466 | _vbroadcast128 4*16(KEY), KEY11
426 | 467 | _vbroadcast128 5*16(KEY), KEY12
427 | 468 | _vbroadcast128 6*16(KEY), KEY13

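The arithmetic behind the new "lea OFFS-16(KEY, KEYLEN64, 4), KEY" can be checked directly: with KEYLEN64 equal to the key length in bytes, KEY advances by 4*keylen - 16 bytes past the start of the selected key schedule, which is exactly the 3*16, 5*16, or 7*16 described in the comment, and which leaves the last round key at 7*16(KEY) in all three cases. A small check (illustration only):

    #include <assert.h>

    static int key_increment(int keylen)
    {
            return 4 * keylen - 16;         /* the lea displacement, minus OFFS */
    }

    int main(void)
    {
            assert(key_increment(16) == 3 * 16);    /* AES-128 */
            assert(key_increment(24) == 5 * 16);    /* AES-192 */
            assert(key_increment(32) == 7 * 16);    /* AES-256 */
            return 0;
    }
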
466 | 507 |
467 | 508 | // Do a single round of AES en/decryption on the blocks in registers V0-V3,
468 | 509 | // using the same key for all blocks. The round key is loaded from the
469 | | -// appropriate register or memory location for round \i. In addition, does step
470 | | -// \i of the computation of the next set of tweaks. May clobber V4.
| 510 | +// appropriate register or memory location for round \i. In addition, does two
| 511 | +// steps of the computation of the next set of tweaks. May clobber V4.
471 | 512 | .macro _vaes_4x enc, last, i
472 | 513 | .if USE_AVX10
473 | | - _tweak_step (2*(\i-1))
| 514 | + _tweak_step (2*(\i-5))
474 | 515 | _vaes \enc, \last, KEY\i, V0
475 | 516 | _vaes \enc, \last, KEY\i, V1
476 | | - _tweak_step (2*(\i-1) + 1)
| 517 | + _tweak_step (2*(\i-5) + 1)
477 | 518 | _vaes \enc, \last, KEY\i, V2
478 | 519 | _vaes \enc, \last, KEY\i, V3
479 | 520 | .else
480 | 521 | _vbroadcast128 (\i-7)*16(KEY), V4
481 | | - _tweak_step (2*(\i-1))
| 522 | + _tweak_step (2*(\i-5))
482 | 523 | _vaes \enc, \last, V4, V0
483 | 524 | _vaes \enc, \last, V4, V1
484 | | - _tweak_step (2*(\i-1) + 1)
| 525 | + _tweak_step (2*(\i-5) + 1)
485 | 526 | _vaes \enc, \last, V4, V2
486 | 527 | _vaes \enc, \last, V4, V3
487 | 528 | .endif

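Why "2*(\i-5)": with the round keys end-aligned, AES-128 enters the sequence at round 5, so its rounds map exactly onto tweak steps 0 through 19. The extra early rounds used by AES-192 and AES-256 produce negative step numbers, which the new "\i >= 0" checks in _tweak_step_mulx (and the unmatched cases in _tweak_step_pclmul) turn into no-ops. A small illustration of the resulting step ranges:

    #include <stdio.h>

    int main(void)
    {
            int bits[] = { 128, 192, 256 };
            int first_round[] = { 5, 3, 1 };        /* first _vaes_4x round used */

            for (int v = 0; v < 3; v++)
                    printf("AES-%d: tweak steps %d through %d\n", bits[v],
                           2 * (first_round[v] - 5), 2 * (14 - 5) + 1);
            return 0;
    }

All three variants cover steps 0 through 19, which is what _tweak_step requires; AES-192 and AES-256 additionally emit steps -4..-1 and -8..-1, which do nothing.
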
493 | 534 | // length VL, use V* registers and leave \xmm_suffix empty. May clobber V4.
494 | 535 | .macro _aes_crypt enc, xmm_suffix, tweak, data
495 | 536 | _xor3 KEY0\xmm_suffix, \tweak, \data
| 537 | + cmp $24, KEYLEN
| 538 | + jl .Laes128\@
| 539 | + je .Laes192\@
496 | 540 | _vaes_1x \enc, 0, 1, \xmm_suffix, \data
497 | 541 | _vaes_1x \enc, 0, 2, \xmm_suffix, \data
| 542 | +.Laes192\@:
498 | 543 | _vaes_1x \enc, 0, 3, \xmm_suffix, \data
499 | 544 | _vaes_1x \enc, 0, 4, \xmm_suffix, \data
| 545 | +.Laes128\@:
500 | 546 | _vaes_1x \enc, 0, 5, \xmm_suffix, \data
501 | 547 | _vaes_1x \enc, 0, 6, \xmm_suffix, \data
502 | 548 | _vaes_1x \enc, 0, 7, \xmm_suffix, \data
503 | 549 | _vaes_1x \enc, 0, 8, \xmm_suffix, \data
504 | 550 | _vaes_1x \enc, 0, 9, \xmm_suffix, \data
505 | | - cmp $24, KEYLEN
506 | | - jle .Laes_128_or_192\@
507 | 551 | _vaes_1x \enc, 0, 10, \xmm_suffix, \data
508 | 552 | _vaes_1x \enc, 0, 11, \xmm_suffix, \data
509 | 553 | _vaes_1x \enc, 0, 12, \xmm_suffix, \data
510 | 554 | _vaes_1x \enc, 0, 13, \xmm_suffix, \data
511 | 555 | _vaes_1x \enc, 1, 14, \xmm_suffix, \data
512 | | - jmp .Laes_done\@
513 | | -.Laes_128_or_192\@:
514 | | - je .Laes_192\@
515 | | - _vaes_1x \enc, 1, 10, \xmm_suffix, \data
516 | | - jmp .Laes_done\@
517 | | -.Laes_192\@:
518 | | - _vaes_1x \enc, 0, 10, \xmm_suffix, \data
519 | | - _vaes_1x \enc, 0, 11, \xmm_suffix, \data
520 | | - _vaes_1x \enc, 1, 12, \xmm_suffix, \data
521 | | -.Laes_done\@:
522 | 556 | _vpxor \tweak, \data, \data
523 | 557 | .endm
524 | 558 |

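The new cmp/jl/je sequence plus the .Laes192\@ and .Laes128\@ labels jump forward into the unrolled round sequence; the C equivalent is a switch statement that falls through. A rough analogue (aes_round and aes_last_round are stand-ins for _vaes_1x, not real kernel helpers); rounds are skipped at the beginning rather than the end because the final round always needs the "last" instruction form (vaesenclast/vaesdeclast):

    #include <stdio.h>

    static void aes_round(int i)      { printf("round %d\n", i); }
    static void aes_last_round(int i) { printf("last round %d\n", i); }

    static void aes_crypt_sketch(int keylen)        /* 16, 24, or 32 */
    {
            switch (keylen) {
            case 32:
                    aes_round(1);
                    aes_round(2);
                    /* fall through */
            case 24:
                    aes_round(3);
                    aes_round(4);
                    /* fall through */
            default:                                /* 16 */
                    for (int i = 5; i <= 13; i++)
                            aes_round(i);
                    aes_last_round(14);
            }
    }

    int main(void)
    {
            aes_crypt_sketch(24);   /* AES-192: rounds 3-13, then the last round, 14 */
            return 0;
    }
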
528 | 562 | // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256).
529 | 563 | movl 480(KEY), KEYLEN
530 | 564 |
531 | | - // Advance KEY to point to the 7th encryption round key (if encrypting)
532 | | - // or the 7th decryption round key (if decrypting). This makes the
533 | | - // offset to any round key be in the range [-112, 112], fitting in a
534 | | - // signed byte. This shortens VEX-encoded instructions that access the
535 | | - // 8th and later round keys which otherwise would need 4-byte offsets.
536 | | -.if \enc
537 | | - add $7*16, KEY
538 | | -.else
539 | | - add $(15+7)*16, KEY
540 | | -
| 565 | +.if !\enc
541 | 566 | // When decrypting a message whose length isn't a multiple of the AES
542 | 567 | // block length, exclude the last full block from the main loop by
543 | 568 | // subtracting 16 from LEN. This is needed because ciphertext stealing

548 | 573 | .Lxts_init\@:
549 | 574 | .endif
550 | 575 |
551 | | - // Cache as many round keys as possible.
552 | | - _load_round_keys
| 576 | + // Setup the pointer to the round keys and cache as many as possible.
| 577 | + _setup_round_keys \enc
553 | 578 |
554 | 579 | // Compute the first set of tweaks TWEAK[0-3].
555 | 580 | _compute_first_set_of_tweaks

560 | 585 | .Lmain_loop\@:
561 | 586 | // This is the main loop, en/decrypting 4*VL bytes per iteration.
562 | 587 |
563 | | - // XOR each source block with its tweak and the first round key.
| 588 | + // XOR each source block with its tweak and the zero-th round key.
564 | 589 | .if USE_AVX10
565 | 590 | vmovdqu8 0*VL(SRC), V0
566 | 591 | vmovdqu8 1*VL(SRC), V1

580 | 605 | vpxor TWEAK2, V2, V2
581 | 606 | vpxor TWEAK3, V3, V3
582 | 607 | .endif
| 608 | + cmp $24, KEYLEN
| 609 | + jl .Laes128\@
| 610 | + je .Laes192\@
583 | 611 | // Do all the AES rounds on the data blocks, interleaved with
584 | 612 | // the computation of the next set of tweaks.
585 | 613 | _vaes_4x \enc, 0, 1
586 | 614 | _vaes_4x \enc, 0, 2
| 615 | +.Laes192\@:
587 | 616 | _vaes_4x \enc, 0, 3
588 | 617 | _vaes_4x \enc, 0, 4
| 618 | +.Laes128\@:
589 | 619 | _vaes_4x \enc, 0, 5
590 | 620 | _vaes_4x \enc, 0, 6
591 | 621 | _vaes_4x \enc, 0, 7
592 | 622 | _vaes_4x \enc, 0, 8
593 | 623 | _vaes_4x \enc, 0, 9
594 | | - // Try to optimize for AES-256 by keeping the code for AES-128 and
595 | | - // AES-192 out-of-line.
596 | | - cmp $24, KEYLEN
597 | | - jle .Lencrypt_4x_aes_128_or_192\@
598 | 624 | _vaes_4x \enc, 0, 10
599 | 625 | _vaes_4x \enc, 0, 11
600 | 626 | _vaes_4x \enc, 0, 12
601 | 627 | _vaes_4x \enc, 0, 13
602 | 628 | _vaes_4x \enc, 1, 14
603 | | -.Lencrypt_4x_done\@:
604 | 629 |
605 | 630 | // XOR in the tweaks again.
606 | 631 | _vpxor TWEAK0, V0, V0

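Stepping back, each main-loop iteration applies the standard XTS composition to every 16-byte block: XOR the block's tweak in, run the block cipher, XOR the same tweak in again (while the next set of tweaks is computed in parallel via _tweak_step). A scalar sketch of one block, with block_cipher() standing in for the full round sequence, including the KEY0 and last-round whitening:

    #include <stdint.h>

    /* Placeholder for the AES rounds done by the vector code above. */
    static void block_cipher(uint8_t block[16])
    {
            (void)block;
    }

    /* One block of XTS: dst = E_K(src ^ tweak) ^ tweak. */
    static void xts_one_block(uint8_t dst[16], const uint8_t src[16],
                              const uint8_t tweak[16])
    {
            uint8_t buf[16];

            for (int i = 0; i < 16; i++)
                    buf[i] = src[i] ^ tweak[i];
            block_cipher(buf);
            for (int i = 0; i < 16; i++)
                    dst[i] = buf[i] ^ tweak[i];
    }
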
678 | 703 | jnz .Lcts\@
679 | 704 | jmp .Ldone\@
680 | 705 |
681 | | - // Out-of-line handling of AES-128 and AES-192
682 | | -.Lencrypt_4x_aes_128_or_192\@:
683 | | - jz .Lencrypt_4x_aes_192\@
684 | | - _vaes_4x \enc, 1, 10
685 | | - jmp .Lencrypt_4x_done\@
686 | | -.Lencrypt_4x_aes_192\@:
687 | | - _vaes_4x \enc, 0, 10
688 | | - _vaes_4x \enc, 0, 11
689 | | - _vaes_4x \enc, 1, 12
690 | | - jmp .Lencrypt_4x_done\@
691 | | -
692 | 706 | .if !\enc
693 | 707 | .Lneed_cts_dec\@:
694 | 708 | sub $16, LEN

764 | 778 | // u8 iv[AES_BLOCK_SIZE]);
765 | 779 | SYM_TYPED_FUNC_START(aes_xts_encrypt_iv)
766 | 780 | vmovdqu (%rsi), %xmm0
767 | | - add $7*16, %rdi
768 | | - vpxor -7*16(%rdi), %xmm0, %xmm0
| 781 | + vpxor (%rdi), %xmm0, %xmm0
| 782 | + movl 480(%rdi), %eax // AES key length
| 783 | + lea -16(%rdi, %rax, 4), %rdi
| 784 | + cmp $24, %eax
| 785 | + jl .Lencrypt_iv_aes128
| 786 | + je .Lencrypt_iv_aes192
769 | 787 | vaesenc -6*16(%rdi), %xmm0, %xmm0
770 | 788 | vaesenc -5*16(%rdi), %xmm0, %xmm0
| 789 | +.Lencrypt_iv_aes192:
771 | 790 | vaesenc -4*16(%rdi), %xmm0, %xmm0
772 | 791 | vaesenc -3*16(%rdi), %xmm0, %xmm0
| 792 | +.Lencrypt_iv_aes128:
773 | 793 | vaesenc -2*16(%rdi), %xmm0, %xmm0
774 | 794 | vaesenc -1*16(%rdi), %xmm0, %xmm0
775 | 795 | vaesenc 0*16(%rdi), %xmm0, %xmm0
776 | 796 | vaesenc 1*16(%rdi), %xmm0, %xmm0
777 | 797 | vaesenc 2*16(%rdi), %xmm0, %xmm0
778 | | - cmpl $24, 480-(7*16)(%rdi)
779 | | - jle .Lencrypt_iv_aes_128_or_192
780 | 798 | vaesenc 3*16(%rdi), %xmm0, %xmm0
781 | 799 | vaesenc 4*16(%rdi), %xmm0, %xmm0
782 | 800 | vaesenc 5*16(%rdi), %xmm0, %xmm0
783 | 801 | vaesenc 6*16(%rdi), %xmm0, %xmm0
784 | 802 | vaesenclast 7*16(%rdi), %xmm0, %xmm0
785 | | -.Lencrypt_iv_done:
786 | 803 | vmovdqu %xmm0, (%rsi)
787 | 804 | RET
788 | | -
789 | | - // Out-of-line handling of AES-128 and AES-192
790 | | -.Lencrypt_iv_aes_128_or_192:
791 | | - jz .Lencrypt_iv_aes_192
792 | | - vaesenclast 3*16(%rdi), %xmm0, %xmm0
793 | | - jmp .Lencrypt_iv_done
794 | | -.Lencrypt_iv_aes_192:
795 | | - vaesenc 3*16(%rdi), %xmm0, %xmm0
796 | | - vaesenc 4*16(%rdi), %xmm0, %xmm0
797 | | - vaesenclast 5*16(%rdi), %xmm0, %xmm0
798 | | - jmp .Lencrypt_iv_done
799 | 805 | SYM_FUNC_END(aes_xts_encrypt_iv)
800 | 806 |
801 | 807 | // Below are the actual AES-XTS encryption and decryption functions,

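For context, aes_xts_encrypt_iv encrypts the 16-byte IV in place under the XTS tweak key to produce the initial tweak. A hypothetical caller's-eye sketch: the real prototype is in the comment just above this hunk (only its tail is visible here), so the parameter names and exact types below are inferred from the %rdi/%rsi usage rather than quoted from the kernel headers.

    #include <stdint.h>

    struct crypto_aes_ctx;                  /* see the layout sketch earlier */

    /* Illustrative declaration: %rdi = tweak-key context, %rsi = IV buffer. */
    extern void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key,
                                   uint8_t iv[16]);

    /* The first XTS tweak is simply the IV encrypted under the tweak key. */
    static inline void xts_make_first_tweak(const struct crypto_aes_ctx *tweak_key,
                                            uint8_t iv[16])
    {
            aes_xts_encrypt_iv(tweak_key, iv);
    }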