diff --git a/crypto/fipsmodule/CMakeLists.txt b/crypto/fipsmodule/CMakeLists.txt index a61919e701..d367a1bd99 100644 --- a/crypto/fipsmodule/CMakeLists.txt +++ b/crypto/fipsmodule/CMakeLists.txt @@ -47,6 +47,7 @@ if(ARCH STREQUAL "x86_64") vpaes-x86_64.${ASM_EXT} x86_64-mont5.${ASM_EXT} x86_64-mont.${ASM_EXT} + keccak1600-x86_64-avx512vl.${ASM_EXT} ) endif() @@ -174,6 +175,7 @@ if(PERL_EXECUTABLE) perlasm(x86_64-mont5.${ASM_EXT} bn/asm/x86_64-mont5.pl) perlasm(x86_64-mont.${ASM_EXT} bn/asm/x86_64-mont.pl) perlasm(x86-mont.${ASM_EXT} bn/asm/x86-mont.pl) + perlasm(keccak1600-x86_64-avx512vl.${ASM_EXT} sha/asm/keccak1600-x86_64-avx512vl.pl) endif() # clang-6 (and older) knows how to compile AVX512 assembly instructions, diff --git a/crypto/fipsmodule/sha/asm/keccak1600-x86_64-avx512vl.pl b/crypto/fipsmodule/sha/asm/keccak1600-x86_64-avx512vl.pl new file mode 100755 index 0000000000..a283821d45 --- /dev/null +++ b/crypto/fipsmodule/sha/asm/keccak1600-x86_64-avx512vl.pl @@ -0,0 +1,531 @@ +#! /usr/bin/env perl +# Copyright (C) 2025 Intel Corporation +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# This implementation is identical to the liboqs version +# (https://github.com/open-quantum-safe/liboqs/blob/main/src/common/sha3/avx512vl_low/KeccakP-1600-AVX512VL.S) +# and adapted (simplified) to fit to the existing absorb and squeeze API's in C. +# +###################################################################### +# The main building block of this code is keccak_1600_permute function. +# It is implemented using AVX512VL instruction set, AVX512F and AVX512DQ extensions in particular. +# +# This function, as is, can work on 1 to 4 independent states at the same time. +# +# YMM registers 0 to 24 are used as Keccak state registers. +# + +# The first two arguments should always be the flavour and output file path. +if ($#ARGV < 1) { + die "Not enough arguments provided. " . + "Two arguments are necessary: the flavour and the output file path."; +} + +$flavour = shift; +$output = shift; + +$win64=0; +$win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; +$dir=$1; + +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or + die "can't locate x86_64-xlate.pl"; + +$avx512vl = 1; +for (@ARGV) { + $avx512vl = 0 if (/-DMY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX/); +} + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +#====================================================================== + + +# Loads one 64-bit register from four state structures into single ymm +# ymmN = state[0][N] | state[1][N] | state[2][N] | state[3][N] +sub load_reg_x4 +{ + my ($base,$reg_index) = @_; + my $offset = $reg_index * 8; + my $lane = 25 * 8; + + $code.=<<___ ; + vmovq $offset+$lane*0($base), %xmm$reg_index + vpinsrq \$1, $offset+$lane*1($base), %xmm$reg_index, %xmm$reg_index + vmovq $offset+$lane*2($base), %xmm25 + vpinsrq \$1, $offset+$lane*3($base), %xmm25, %xmm25 + vinserti32x4 \$1, %xmm25, %ymm$reg_index, %ymm$reg_index +___ +} + +# Loads four 64-bit registers from four state structures into four ymm registers +# Same as load_reg_x4 but more efficient transposition. 
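+# Lanes N..N+3 are read from each of the four states (states sit 25 lanes,
+# i.e. 200 bytes, apart) and transposed so that ymm(N+i) ends up holding
+# lane N+i of states 0..3 in its four 64-bit elements.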
+sub load_4regs_x4 +{ + my ($base,$reg_index) = @_; + my $offset = $reg_index * 8; + my $r0 = $reg_index; + my $r1 = $reg_index + 1; + my $r2 = $reg_index + 2; + my $r3 = $reg_index + 3; + my $lane = 25 * 8; + + $code.=<<___ ; + vmovdqu64 $offset+$lane*0($base), %ymm25 + vmovdqu64 $offset+$lane*1($base), %ymm26 + vmovdqu64 $offset+$lane*2($base), %ymm27 + vmovdqu64 $offset+$lane*3($base), %ymm28 + vpunpcklqdq %ymm26, %ymm25, %ymm29 # A0 B0 A2 B2 + vpunpckhqdq %ymm26, %ymm25, %ymm30 # A1 B1 A3 B3 + vpunpcklqdq %ymm28, %ymm27, %ymm25 # C0 D0 C2 D2 + vpunpckhqdq %ymm28, %ymm27, %ymm26 # C1 D1 C3 D3 + vshufi64x2 \$0, %ymm25, %ymm29, %ymm$r0 # A0 B0 C0 D0 + vshufi64x2 \$0, %ymm26, %ymm30, %ymm$r1 # A1 B1 C1 D1 + vshufi64x2 \$3, %ymm25, %ymm29, %ymm$r2 # A2 B2 C2 D2 + vshufi64x2 \$3, %ymm26, %ymm30, %ymm$r3 # A3 B3 C3 D3 +___ +} + +# Stores one 64-bit register into four state structures from a single ymm +# state[0][N] = ymmN & (2^64-1) +# state[1][N] = (ymmN >> 64) & (2^64-1) +# state[2][N] = (ymmN >> 128) & (2^64-1) +# state[3][N] = (ymmN >> 192) & (2^64-1) +sub save_reg_x4 +{ + my ($base,$reg_index) = @_; + my $offset = $reg_index * 8; + my $lane = 25 * 8; + + $code.=<<___ ; + vextracti32x4 \$1, %ymm$reg_index, %xmm25 + vmovq %xmm$reg_index, $offset+$lane*0($base) + vpextrq \$1, %xmm$reg_index, $offset+$lane*1($base) + vmovq %xmm25, $offset+$lane*2($base) + vpextrq \$1, %xmm25, $offset+$lane*3($base) +___ +} + +# Stores four 64-bit registers into four state structures from four ymm registers +# Same as store_reg_x4 but more efficient transposition. +sub save_4regs_x4 +{ + my ($base,$reg_index) = @_; + my $offset = $reg_index * 8; + my $r0 = $reg_index; + my $r1 = $reg_index + 1; + my $r2 = $reg_index + 2; + my $r3 = $reg_index + 3; + my $lane = 25 * 8; + + $code.=<<___ ; + vpunpcklqdq %ymm$r1, %ymm$r0, %ymm25 # A0 A1 C0 C1 + vpunpckhqdq %ymm$r1, %ymm$r0, %ymm26 # B0 B1 D0 D1 + vpunpcklqdq %ymm$r3, %ymm$r2, %ymm27 # A2 A3 C2 C3 + vpunpckhqdq %ymm$r3, %ymm$r2, %ymm28 # B2 B3 D2 D3 + vshufi64x2 \$0, %ymm27, %ymm25, %ymm$r0 # A0 A1 A2 A3 + vshufi64x2 \$0, %ymm28, %ymm26, %ymm$r1 # B0 B1 B2 B3 + vshufi64x2 \$3, %ymm27, %ymm25, %ymm$r2 # C0 C1 C2 C3 + vshufi64x2 \$3, %ymm28, %ymm26, %ymm$r3 # D0 D1 D2 D3 + vmovdqu64 %ymm$r0, $offset+$lane*0($base) + vmovdqu64 %ymm$r1, $offset+$lane*1($base) + vmovdqu64 %ymm$r2, $offset+$lane*2($base) + vmovdqu64 %ymm$r3, $offset+$lane*3($base) +___ +} + +# Stores XMM6-XMM15 on the stack frame on Windows +sub save_xmm6_xmm15 +{ + if ($win64) { + $code .= "sub \$10*16, %rsp\n"; + for (my $j = 0; $j < 10; $j++) { + my $r = 6 + $j; + $code .= "vmovdqu %xmm$r, $j*16(%rsp)\n"; + } + } +} + +# Restores XMM6-XMM15 from the stack on Windows +sub restore_xmm6_xmm15 +{ + if ($win64) { + for (my $j = 0; $j < 10; $j++) { + my $r = 6 + $j; + $code .= "vmovdqu $j*16(%rsp), %xmm$r\n"; + } + $code .= "add \$10*16, %rsp\n"; + } +} + +if ($avx512vl) { + + $arg1="%rdi"; # 1st arg + $roundn="%r10d"; + $tblptr="%r11"; + +$code.=<<___ ; +#ifndef MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX + +.text + +# Perform Keccak permutation +# +# YMM registers 0 to 24 are used as Keccak state registers. +# This function, as is, can work on 1 to 4 independent states at the same time. +# +# There is no clear boundary between Theta, Rho, Pi, Chi and Iota steps. +# Instructions corresponding to these steps overlap for better efficiency. 
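+# (For reference, one round is: Theta - fold the column parities back into
+# every lane, Rho - rotate each lane by a fixed offset, Pi - permute the
+# lanes across the 5x5 state, Chi - combine each lane non-linearly with its
+# two neighbours at x+1 and x+2, Iota - XOR the round constant into lane 0.)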
+# +# ymm0-ymm24 [in/out] Keccak state registers (one SIMD per one state register) +# ymm25-ymm31 [clobbered] temporary SIMD registers +# $roundn [clobbered] used for round tracking +# $tblptr [clobbered] used for access to SHA3 constant table + +.align 32 +keccak_1600_permute: +.cfi_startproc + endbranch + movl \$24, $roundn # 24 rounds + leaq iotas_avx512(%rip), $tblptr # Load the address of the SHA3 round constants + +.align 32 +.Lkeccak_rnd_loop: + # Theta step + + # Compute column parities + # C[5] = [0, 0, 0, 0, 0] + # for x in 0 to 4: + # C[x] = state[x][0] XOR state[x][1] XOR state[x][2] XOR state[x][3] XOR state[x][4] + + vmovdqa64 %ymm0, %ymm25 + vpternlogq \$0x96, %ymm5, %ymm10, %ymm25 + vmovdqa64 %ymm1, %ymm26 + vpternlogq \$0x96, %ymm11, %ymm6, %ymm26 + vmovdqa64 %ymm2, %ymm27 + vpternlogq \$0x96, %ymm12, %ymm7, %ymm27 + + vmovdqa64 %ymm3, %ymm28 + vpternlogq \$0x96, %ymm13, %ymm8, %ymm28 + vmovdqa64 %ymm4, %ymm29 + vpternlogq \$0x96, %ymm14, %ymm9, %ymm29 + vpternlogq \$0x96, %ymm20, %ymm15, %ymm25 + + vpternlogq \$0x96, %ymm21, %ymm16, %ymm26 + vpternlogq \$0x96, %ymm22, %ymm17, %ymm27 + vpternlogq \$0x96, %ymm23, %ymm18, %ymm28 + + # Start computing D values and keep computing column parity + # D[5] = [0, 0, 0, 0, 0] + # for x in 0 to 4: + # D[x] = C[(x+4) mod 5] XOR ROTATE_LEFT(C[(x+1) mod 5], 1) + + vprolq \$1, %ymm26, %ymm30 + vprolq \$1, %ymm27, %ymm31 + vpternlogq \$0x96, %ymm24, %ymm19, %ymm29 + + # Continue computing D values and apply Theta + # for x in 0 to 4: + # for y in 0 to 4: + # state[x][y] = state[x][y] XOR D[x] + + vpternlogq \$0x96, %ymm30, %ymm29, %ymm0 + vpternlogq \$0x96, %ymm30, %ymm29, %ymm10 + vpternlogq \$0x96, %ymm30, %ymm29, %ymm20 + + vpternlogq \$0x96, %ymm30, %ymm29, %ymm5 + vpternlogq \$0x96, %ymm30, %ymm29, %ymm15 + vprolq \$1, %ymm28, %ymm30 + + vpternlogq \$0x96, %ymm31, %ymm25, %ymm6 + vpternlogq \$0x96, %ymm31, %ymm25, %ymm16 + vpternlogq \$0x96, %ymm31, %ymm25, %ymm1 + + vpternlogq \$0x96, %ymm31, %ymm25, %ymm11 + vpternlogq \$0x96, %ymm31, %ymm25, %ymm21 + vprolq \$1, %ymm29, %ymm31 + + vpbroadcastq ($tblptr), %ymm29 # Load the round constant into ymm29 (Iota) + addq \$8, $tblptr # Increment the pointer to the next round constant + + vpternlogq \$0x96, %ymm30, %ymm26, %ymm12 + vpternlogq \$0x96, %ymm30, %ymm26, %ymm7 + vpternlogq \$0x96, %ymm30, %ymm26, %ymm22 + + vpternlogq \$0x96, %ymm30, %ymm26, %ymm17 + vpternlogq \$0x96, %ymm30, %ymm26, %ymm2 + vprolq \$1, %ymm25, %ymm30 + + # Rho step + # Keep applying Theta and start Rho step + # + # ROTATION_OFFSETS[5][5] = [ + # [0, 1, 62, 28, 27], + # [36, 44, 6, 55, 20], + # [3, 10, 43, 25, 39], + # [41, 45, 15, 21, 8], + # [18, 2, 61, 56, 14] ] + # + # for x in 0 to 4: + # for y in 0 to 4: + # state[x][y] = ROTATE_LEFT(state[x][y], ROTATION_OFFSETS[x][y]) + + vpternlogq \$0x96, %ymm31, %ymm27, %ymm3 + vpternlogq \$0x96, %ymm31, %ymm27, %ymm13 + vpternlogq \$0x96, %ymm31, %ymm27, %ymm23 + + vprolq \$44, %ymm6, %ymm6 + vpternlogq \$0x96, %ymm31, %ymm27, %ymm18 + vpternlogq \$0x96, %ymm31, %ymm27, %ymm8 + + vprolq \$43, %ymm12, %ymm12 + vprolq \$21, %ymm18, %ymm18 + vpternlogq \$0x96, %ymm30, %ymm28, %ymm24 + + vprolq \$14, %ymm24, %ymm24 + vprolq \$28, %ymm3, %ymm3 + vpternlogq \$0x96, %ymm30, %ymm28, %ymm9 + + vprolq \$20, %ymm9, %ymm9 + vprolq \$3, %ymm10, %ymm10 + vpternlogq \$0x96, %ymm30, %ymm28, %ymm19 + + vprolq \$45, %ymm16, %ymm16 + vprolq \$61, %ymm22, %ymm22 + vpternlogq \$0x96, %ymm30, %ymm28, %ymm4 + + vprolq \$1, %ymm1, %ymm1 + vprolq \$6, %ymm7, %ymm7 + vpternlogq \$0x96, 
%ymm30, %ymm28, %ymm14 + + # Continue with Rho and start Pi and Chi steps at the same time + # Ternary logic 0xD2 is used for Chi step + # + # for x in 0 to 4: + # for y in 0 to 4: + # state[x][y] = state[x][y] XOR ((NOT state[(x+1) mod 5][y]) AND state[(x+2) mod 5][y]) + + vprolq \$25, %ymm13, %ymm13 + vprolq \$8, %ymm19, %ymm19 + vmovdqa64 %ymm0, %ymm30 + vpternlogq \$0xD2, %ymm12, %ymm6, %ymm30 + + vprolq \$18, %ymm20, %ymm20 + vprolq \$27, %ymm4, %ymm4 + vpxorq %ymm29, %ymm30, %ymm30 # Iota step + + vprolq \$36, %ymm5, %ymm5 + vprolq \$10, %ymm11, %ymm11 + vmovdqa64 %ymm6, %ymm31 + vpternlogq \$0xD2, %ymm18, %ymm12, %ymm31 + + vprolq \$15, %ymm17, %ymm17 + vprolq \$56, %ymm23, %ymm23 + vpternlogq \$0xD2, %ymm24, %ymm18, %ymm12 + + vprolq \$62, %ymm2, %ymm2 + vprolq \$55, %ymm8, %ymm8 + vpternlogq \$0xD2, %ymm0, %ymm24, %ymm18 + + vprolq \$39, %ymm14, %ymm14 + vprolq \$41, %ymm15, %ymm15 + vpternlogq \$0xD2, %ymm6, %ymm0, %ymm24 + vmovdqa64 %ymm30, %ymm0 + vmovdqa64 %ymm31, %ymm6 + + vprolq \$2, %ymm21, %ymm21 + vmovdqa64 %ymm3, %ymm30 + vpternlogq \$0xD2, %ymm10, %ymm9, %ymm30 + vmovdqa64 %ymm9, %ymm31 + vpternlogq \$0xD2, %ymm16, %ymm10, %ymm31 + + vpternlogq \$0xD2, %ymm22, %ymm16, %ymm10 + vpternlogq \$0xD2, %ymm3, %ymm22, %ymm16 + vpternlogq \$0xD2, %ymm9, %ymm3, %ymm22 + vmovdqa64 %ymm30, %ymm3 + vmovdqa64 %ymm31, %ymm9 + + vmovdqa64 %ymm1, %ymm30 + vpternlogq \$0xD2, %ymm13, %ymm7, %ymm30 + vmovdqa64 %ymm7, %ymm31 + vpternlogq \$0xD2, %ymm19, %ymm13, %ymm31 + vpternlogq \$0xD2, %ymm20, %ymm19, %ymm13 + + vpternlogq \$0xD2, %ymm1, %ymm20, %ymm19 + vpternlogq \$0xD2, %ymm7, %ymm1, %ymm20 + vmovdqa64 %ymm30, %ymm1 + vmovdqa64 %ymm31, %ymm7 + vmovdqa64 %ymm4, %ymm30 + vpternlogq \$0xD2, %ymm11, %ymm5, %ymm30 + + vmovdqa64 %ymm5, %ymm31 + vpternlogq \$0xD2, %ymm17, %ymm11, %ymm31 + vpternlogq \$0xD2, %ymm23, %ymm17, %ymm11 + vpternlogq \$0xD2, %ymm4, %ymm23, %ymm17 + + vpternlogq \$0xD2, %ymm5, %ymm4, %ymm23 + vmovdqa64 %ymm30, %ymm4 + vmovdqa64 %ymm31, %ymm5 + vmovdqa64 %ymm2, %ymm30 + vpternlogq \$0xD2, %ymm14, %ymm8, %ymm30 + vmovdqa64 %ymm8, %ymm31 + vpternlogq \$0xD2, %ymm15, %ymm14, %ymm31 + + vpternlogq \$0xD2, %ymm21, %ymm15, %ymm14 + vpternlogq \$0xD2, %ymm2, %ymm21, %ymm15 + vpternlogq \$0xD2, %ymm8, %ymm2, %ymm21 + vmovdqa64 %ymm30, %ymm2 + vmovdqa64 %ymm31, %ymm8 + + # Complete the steps and update state registers in ymm0 to ymm24 + vmovdqa64 %ymm3, %ymm30 + vmovdqa64 %ymm18, %ymm3 + vmovdqa64 %ymm17, %ymm18 + vmovdqa64 %ymm11, %ymm17 + vmovdqa64 %ymm7, %ymm11 + vmovdqa64 %ymm10, %ymm7 + vmovdqa64 %ymm1, %ymm10 + vmovdqa64 %ymm6, %ymm1 + vmovdqa64 %ymm9, %ymm6 + vmovdqa64 %ymm22, %ymm9 + vmovdqa64 %ymm14, %ymm22 + vmovdqa64 %ymm20, %ymm14 + vmovdqa64 %ymm2, %ymm20 + vmovdqa64 %ymm12, %ymm2 + vmovdqa64 %ymm13, %ymm12 + vmovdqa64 %ymm19, %ymm13 + vmovdqa64 %ymm23, %ymm19 + vmovdqa64 %ymm15, %ymm23 + vmovdqa64 %ymm4, %ymm15 + vmovdqa64 %ymm24, %ymm4 + vmovdqa64 %ymm21, %ymm24 + vmovdqa64 %ymm8, %ymm21 + vmovdqa64 %ymm16, %ymm8 + vmovdqa64 %ymm5, %ymm16 + vmovdqa64 %ymm30, %ymm5 + + decl $roundn # Decrement the round counter + jnz .Lkeccak_rnd_loop # Jump to the start of the loop if r13d is not zero + ret +.cfi_endproc + +.globl KeccakF1600_avx512vl +.type KeccakF1600_avx512vl,\@function,1 +.align 32 +KeccakF1600_avx512vl: +.cfi_startproc + endbranch +___ + # save xmm6-xmm15 if required + &save_xmm6_xmm15(); + + # load the state + for (my $i = 0; $i < 25; $i++) { + $code .= "vmovq 8*$i($arg1), %xmm$i\n"; + } + + # perform permute on the state + $code.=<<___ ; + + call 
keccak_1600_permute + +___ + + # save updated state + for (my $i = 0; $i < 25; $i++) { + $code .= "vmovq %xmm$i, 8*$i($arg1)\n"; + } + + # restore xmm6-xmm15 if required + &restore_xmm6_xmm15(); + + $code.=<<___ ; + vzeroupper + ret +.cfi_endproc + +.globl KeccakF1600_x4_avx512vl +.type KeccakF1600_x4_avx512vl,\@function,1 +.align 32 +KeccakF1600_x4_avx512vl: +.cfi_startproc + endbranch +___ + # save xmm6-xmm15 if required + &save_xmm6_xmm15(); + + # load and transpose four states + for (my $i = 0; ($i + 4) < 25; $i += 4) { + &load_4regs_x4($arg1, $i); + } + &load_reg_x4($arg1, 24); + + # perform permute on the four states + + $code.=<<___ ; + + call keccak_1600_permute + +___ + + # transpose and save four updated states + for (my $i = 0; ($i + 4) < 25; $i += 4) { + &save_4regs_x4($arg1, $i); + } + &save_reg_x4($arg1, 24); + + # restore xmm6-xmm15 if required + &restore_xmm6_xmm15(); + + $code.=<<___ ; + vzeroupper + ret +.cfi_endproc + +.section .rodata + +.align 64 +iotas_avx512: +.quad 0x0000000000000001, 0x0000000000008082 +.quad 0x800000000000808a, 0x8000000080008000 +.quad 0x000000000000808b, 0x0000000080000001 +.quad 0x8000000080008081, 0x8000000000008009 +.quad 0x000000000000008a, 0x0000000000000088 +.quad 0x0000000080008009, 0x000000008000000a +.quad 0x000000008000808b, 0x800000000000008b +.quad 0x8000000000008089, 0x8000000000008003 +.quad 0x8000000000008002, 0x8000000000000080 +.quad 0x000000000000800a, 0x800000008000000a +.quad 0x8000000080008081, 0x8000000000008080 +.quad 0x0000000080000001, 0x8000000080008008 + +#endif + +.text +___ +} else { +$code.=<<___ ; +.text +.globl KeccakF1600_avx512vl +.type KeccakF1600_avx512vl,\@function,1 +.globl KeccakF1600_x4_avx512vl +.type KeccakF1600_x4_avx512vl,\@function,1 +KeccakF1600_avx512vl: +KeccakF1600_x4_avx512vl: +.cfi_startproc + ret +.cfi_endproc +___ +} + +print $code; + +close STDOUT or die "error closing STDOUT: $!"; diff --git a/crypto/fipsmodule/sha/internal.h b/crypto/fipsmodule/sha/internal.h index db5ecb2e06..af858f04b0 100644 --- a/crypto/fipsmodule/sha/internal.h +++ b/crypto/fipsmodule/sha/internal.h @@ -356,6 +356,8 @@ void sha512_block_data_order_nohw(uint64_t state[8], const uint8_t *data, #define KECCAK1600_S2N_BIGNUM_ASM #include "../../../third_party/s2n-bignum/s2n-bignum_aws-lc.h" #endif +void KeccakF1600_x4_avx512vl(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS]); +void KeccakF1600_avx512vl(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS]); #endif #endif diff --git a/crypto/fipsmodule/sha/keccak1600.c b/crypto/fipsmodule/sha/keccak1600.c index f97d03a6a9..febad68394 100644 --- a/crypto/fipsmodule/sha/keccak1600.c +++ b/crypto/fipsmodule/sha/keccak1600.c @@ -236,18 +236,22 @@ void KeccakF1600(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS]); // KeccakF1600_XORBytes XORs |len| bytes from |inp| into the Keccak state |A|. // |len| must be a multiple of 8. 
-static void KeccakF1600_XORBytes(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS], const uint8_t *inp, size_t len) { +static void KeccakF1600_XORBytes(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS], const uint8_t *inp, const size_t len) { assert(len <= SHA3_MAX_BLOCKSIZE); assert((len % 8) == 0); uint64_t *A_flat = (uint64_t *)A; - size_t w = len / 8; + const size_t w = len / 8; for (size_t i = 0; i < w; i++) { +#if defined(OPENSSL_X86_64) + const uint64_t Ai = *((const uint64_t *) inp); +#else uint64_t Ai = (uint64_t)inp[0] | (uint64_t)inp[1] << 8 | (uint64_t)inp[2] << 16 | (uint64_t)inp[3] << 24 | (uint64_t)inp[4] << 32 | (uint64_t)inp[5] << 40 | (uint64_t)inp[6] << 48 | (uint64_t)inp[7] << 56; +#endif inp += 8; A_flat[i] ^= Ai; } @@ -275,17 +279,12 @@ static void KeccakF1600_ExtractBytes(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS assert(len <= SHA3_MAX_BLOCKSIZE); size_t i = 0; - while (len != 0) { + while (len >= 8) { +#if defined(OPENSSL_X86_64) + *((uint64_t *) out) = A_flat[i]; +#else uint64_t Ai = A_flat[i]; - if (len < 8) { - for (size_t j = 0; j < len; j++) { - *out++ = (uint8_t)Ai; - Ai >>= 8; - } - return; - } - out[0] = (uint8_t)(Ai); out[1] = (uint8_t)(Ai >> 8); out[2] = (uint8_t)(Ai >> 16); @@ -294,10 +293,20 @@ static void KeccakF1600_ExtractBytes(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS out[5] = (uint8_t)(Ai >> 40); out[6] = (uint8_t)(Ai >> 48); out[7] = (uint8_t)(Ai >> 56); +#endif out += 8; len -= 8; i++; } + + if (len > 0) { + uint64_t Ai = A_flat[i]; + + for (size_t j = 0; j < len; j++) { + *out++ = (uint8_t)Ai; + Ai >>= 8; + } + } } void Keccak1600_Squeeze(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS], uint8_t *out, size_t len, size_t r, int padded) { @@ -318,7 +327,8 @@ void Keccak1600_Squeeze(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS], uint8_t *o #if defined(KECCAK1600_ASM) -// Scalar implementation from OpenSSL provided by keccak1600-armv8.pl +// Scalar implementation from OpenSSL provided by keccak1600-armv8.pl or +// SIMD implementation provided by keccak-x86_64-avx512vl.pl extern void KeccakF1600_hw(uint64_t state[25]); #if defined(OPENSSL_AARCH64) @@ -372,7 +382,17 @@ void KeccakF1600(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS]) { KeccakF1600_hw((uint64_t *) A); #elif defined(OPENSSL_X86_64) - sha3_keccak_f1600((uint64_t *)A, iotas); + // Dispatch logic for Keccak-x4 on x86_64: + // + // 1. If ASM is disabled, use the C implementation. + // 2. If ASM is enabled: + // - For systems with AVX512VL support use KeccakF1600_avx512vl(). + // Otherwise fall back to the C implementation. + if (CRYPTO_is_AVX512_capable()) { + KeccakF1600_avx512vl(A); + } else { + sha3_keccak_f1600((uint64_t *)A, iotas); + } #endif } @@ -425,6 +445,7 @@ static void Keccak1600_x4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS]) { // which is a straightforward implementation using the SHA3 extension. // - Otherwise, fall back to four times the 1-fold Keccak implementation // (which has its own dispatch logic). + #if defined(KECCAK1600_S2N_BIGNUM_ASM) && defined(OPENSSL_AARCH64) if (CRYPTO_is_Neoverse_N1()) { keccak_log_dispatch(13); // kFlag_sha3_keccak4_f1600_alt @@ -449,6 +470,19 @@ static void Keccak1600_x4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS]) { #endif #endif + // Dispatch logic for Keccak-x4 on x86_64: + // + // 1. If ASM is disabled, use 4x the C implementation. + // 2. If ASM is enabled: + // - For systems with AVX512VL support use KeccakF1600_x4_avx512vl(). + // Otherwise fall back to 4x the C implementation. 
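+  //
+  // KeccakF1600_x4_avx512vl() expects the four states back to back in memory
+  // (uint64_t[4][KECCAK1600_ROWS][KECCAK1600_ROWS], i.e. 200 bytes per
+  // state), which matches the 0/200/400/600-byte strides in the assembly.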
+#if defined(OPENSSL_X86_64) + if (CRYPTO_is_AVX512_capable()) { + KeccakF1600_x4_avx512vl(A); + return; + } +#endif + // Fallback: 4x individual KeccakF1600 calls (each with their own dispatch) KeccakF1600(A[0]); KeccakF1600(A[1]); diff --git a/generated-src/linux-x86_64/crypto/fipsmodule/keccak1600-x86_64-avx512vl.S b/generated-src/linux-x86_64/crypto/fipsmodule/keccak1600-x86_64-avx512vl.S new file mode 100644 index 0000000000..68d9730c00 --- /dev/null +++ b/generated-src/linux-x86_64/crypto/fipsmodule/keccak1600-x86_64-avx512vl.S @@ -0,0 +1,503 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +#ifndef MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX + +.text + + + + + + + + + + + + + + +.align 32 +keccak_1600_permute: +.cfi_startproc +.byte 243,15,30,250 + movl $24,%r10d + leaq iotas_avx512(%rip),%r11 + +.align 32 +.Lkeccak_rnd_loop: + + + + + + + + vmovdqa64 %ymm0,%ymm25 + vpternlogq $0x96,%ymm5,%ymm10,%ymm25 + vmovdqa64 %ymm1,%ymm26 + vpternlogq $0x96,%ymm11,%ymm6,%ymm26 + vmovdqa64 %ymm2,%ymm27 + vpternlogq $0x96,%ymm12,%ymm7,%ymm27 + + vmovdqa64 %ymm3,%ymm28 + vpternlogq $0x96,%ymm13,%ymm8,%ymm28 + vmovdqa64 %ymm4,%ymm29 + vpternlogq $0x96,%ymm14,%ymm9,%ymm29 + vpternlogq $0x96,%ymm20,%ymm15,%ymm25 + + vpternlogq $0x96,%ymm21,%ymm16,%ymm26 + vpternlogq $0x96,%ymm22,%ymm17,%ymm27 + vpternlogq $0x96,%ymm23,%ymm18,%ymm28 + + + + + + + vprolq $1,%ymm26,%ymm30 + vprolq $1,%ymm27,%ymm31 + vpternlogq $0x96,%ymm24,%ymm19,%ymm29 + + + + + + + vpternlogq $0x96,%ymm30,%ymm29,%ymm0 + vpternlogq $0x96,%ymm30,%ymm29,%ymm10 + vpternlogq $0x96,%ymm30,%ymm29,%ymm20 + + vpternlogq $0x96,%ymm30,%ymm29,%ymm5 + vpternlogq $0x96,%ymm30,%ymm29,%ymm15 + vprolq $1,%ymm28,%ymm30 + + vpternlogq $0x96,%ymm31,%ymm25,%ymm6 + vpternlogq $0x96,%ymm31,%ymm25,%ymm16 + vpternlogq $0x96,%ymm31,%ymm25,%ymm1 + + vpternlogq $0x96,%ymm31,%ymm25,%ymm11 + vpternlogq $0x96,%ymm31,%ymm25,%ymm21 + vprolq $1,%ymm29,%ymm31 + + vpbroadcastq (%r11),%ymm29 + addq $8,%r11 + + vpternlogq $0x96,%ymm30,%ymm26,%ymm12 + vpternlogq $0x96,%ymm30,%ymm26,%ymm7 + vpternlogq $0x96,%ymm30,%ymm26,%ymm22 + + vpternlogq $0x96,%ymm30,%ymm26,%ymm17 + vpternlogq $0x96,%ymm30,%ymm26,%ymm2 + vprolq $1,%ymm25,%ymm30 + + + + + + + + + + + + + + + + vpternlogq $0x96,%ymm31,%ymm27,%ymm3 + vpternlogq $0x96,%ymm31,%ymm27,%ymm13 + vpternlogq $0x96,%ymm31,%ymm27,%ymm23 + + vprolq $44,%ymm6,%ymm6 + vpternlogq $0x96,%ymm31,%ymm27,%ymm18 + vpternlogq $0x96,%ymm31,%ymm27,%ymm8 + + vprolq $43,%ymm12,%ymm12 + vprolq $21,%ymm18,%ymm18 + vpternlogq $0x96,%ymm30,%ymm28,%ymm24 + + vprolq $14,%ymm24,%ymm24 + vprolq $28,%ymm3,%ymm3 + vpternlogq $0x96,%ymm30,%ymm28,%ymm9 + + vprolq $20,%ymm9,%ymm9 + vprolq $3,%ymm10,%ymm10 + vpternlogq $0x96,%ymm30,%ymm28,%ymm19 + + vprolq $45,%ymm16,%ymm16 + vprolq $61,%ymm22,%ymm22 + vpternlogq $0x96,%ymm30,%ymm28,%ymm4 + + vprolq $1,%ymm1,%ymm1 + vprolq $6,%ymm7,%ymm7 + vpternlogq $0x96,%ymm30,%ymm28,%ymm14 + + + + + + + + + vprolq $25,%ymm13,%ymm13 + vprolq $8,%ymm19,%ymm19 + vmovdqa64 %ymm0,%ymm30 + vpternlogq $0xD2,%ymm12,%ymm6,%ymm30 + + vprolq $18,%ymm20,%ymm20 + vprolq $27,%ymm4,%ymm4 + vpxorq %ymm29,%ymm30,%ymm30 + + vprolq $36,%ymm5,%ymm5 + vprolq $10,%ymm11,%ymm11 + vmovdqa64 %ymm6,%ymm31 + vpternlogq $0xD2,%ymm18,%ymm12,%ymm31 + + vprolq $15,%ymm17,%ymm17 + vprolq $56,%ymm23,%ymm23 + vpternlogq $0xD2,%ymm24,%ymm18,%ymm12 + + vprolq $62,%ymm2,%ymm2 + vprolq $55,%ymm8,%ymm8 + 
vpternlogq $0xD2,%ymm0,%ymm24,%ymm18 + + vprolq $39,%ymm14,%ymm14 + vprolq $41,%ymm15,%ymm15 + vpternlogq $0xD2,%ymm6,%ymm0,%ymm24 + vmovdqa64 %ymm30,%ymm0 + vmovdqa64 %ymm31,%ymm6 + + vprolq $2,%ymm21,%ymm21 + vmovdqa64 %ymm3,%ymm30 + vpternlogq $0xD2,%ymm10,%ymm9,%ymm30 + vmovdqa64 %ymm9,%ymm31 + vpternlogq $0xD2,%ymm16,%ymm10,%ymm31 + + vpternlogq $0xD2,%ymm22,%ymm16,%ymm10 + vpternlogq $0xD2,%ymm3,%ymm22,%ymm16 + vpternlogq $0xD2,%ymm9,%ymm3,%ymm22 + vmovdqa64 %ymm30,%ymm3 + vmovdqa64 %ymm31,%ymm9 + + vmovdqa64 %ymm1,%ymm30 + vpternlogq $0xD2,%ymm13,%ymm7,%ymm30 + vmovdqa64 %ymm7,%ymm31 + vpternlogq $0xD2,%ymm19,%ymm13,%ymm31 + vpternlogq $0xD2,%ymm20,%ymm19,%ymm13 + + vpternlogq $0xD2,%ymm1,%ymm20,%ymm19 + vpternlogq $0xD2,%ymm7,%ymm1,%ymm20 + vmovdqa64 %ymm30,%ymm1 + vmovdqa64 %ymm31,%ymm7 + vmovdqa64 %ymm4,%ymm30 + vpternlogq $0xD2,%ymm11,%ymm5,%ymm30 + + vmovdqa64 %ymm5,%ymm31 + vpternlogq $0xD2,%ymm17,%ymm11,%ymm31 + vpternlogq $0xD2,%ymm23,%ymm17,%ymm11 + vpternlogq $0xD2,%ymm4,%ymm23,%ymm17 + + vpternlogq $0xD2,%ymm5,%ymm4,%ymm23 + vmovdqa64 %ymm30,%ymm4 + vmovdqa64 %ymm31,%ymm5 + vmovdqa64 %ymm2,%ymm30 + vpternlogq $0xD2,%ymm14,%ymm8,%ymm30 + vmovdqa64 %ymm8,%ymm31 + vpternlogq $0xD2,%ymm15,%ymm14,%ymm31 + + vpternlogq $0xD2,%ymm21,%ymm15,%ymm14 + vpternlogq $0xD2,%ymm2,%ymm21,%ymm15 + vpternlogq $0xD2,%ymm8,%ymm2,%ymm21 + vmovdqa64 %ymm30,%ymm2 + vmovdqa64 %ymm31,%ymm8 + + + vmovdqa64 %ymm3,%ymm30 + vmovdqa64 %ymm18,%ymm3 + vmovdqa64 %ymm17,%ymm18 + vmovdqa64 %ymm11,%ymm17 + vmovdqa64 %ymm7,%ymm11 + vmovdqa64 %ymm10,%ymm7 + vmovdqa64 %ymm1,%ymm10 + vmovdqa64 %ymm6,%ymm1 + vmovdqa64 %ymm9,%ymm6 + vmovdqa64 %ymm22,%ymm9 + vmovdqa64 %ymm14,%ymm22 + vmovdqa64 %ymm20,%ymm14 + vmovdqa64 %ymm2,%ymm20 + vmovdqa64 %ymm12,%ymm2 + vmovdqa64 %ymm13,%ymm12 + vmovdqa64 %ymm19,%ymm13 + vmovdqa64 %ymm23,%ymm19 + vmovdqa64 %ymm15,%ymm23 + vmovdqa64 %ymm4,%ymm15 + vmovdqa64 %ymm24,%ymm4 + vmovdqa64 %ymm21,%ymm24 + vmovdqa64 %ymm8,%ymm21 + vmovdqa64 %ymm16,%ymm8 + vmovdqa64 %ymm5,%ymm16 + vmovdqa64 %ymm30,%ymm5 + + decl %r10d + jnz .Lkeccak_rnd_loop + .byte 0xf3,0xc3 +.cfi_endproc + +.globl KeccakF1600_avx512vl +.hidden KeccakF1600_avx512vl +.type KeccakF1600_avx512vl,@function +.align 32 +KeccakF1600_avx512vl: +.cfi_startproc +.byte 243,15,30,250 + vmovq 0(%rdi),%xmm0 + vmovq 8(%rdi),%xmm1 + vmovq 16(%rdi),%xmm2 + vmovq 24(%rdi),%xmm3 + vmovq 32(%rdi),%xmm4 + vmovq 40(%rdi),%xmm5 + vmovq 48(%rdi),%xmm6 + vmovq 56(%rdi),%xmm7 + vmovq 64(%rdi),%xmm8 + vmovq 72(%rdi),%xmm9 + vmovq 80(%rdi),%xmm10 + vmovq 88(%rdi),%xmm11 + vmovq 96(%rdi),%xmm12 + vmovq 104(%rdi),%xmm13 + vmovq 112(%rdi),%xmm14 + vmovq 120(%rdi),%xmm15 + vmovq 128(%rdi),%xmm16 + vmovq 136(%rdi),%xmm17 + vmovq 144(%rdi),%xmm18 + vmovq 152(%rdi),%xmm19 + vmovq 160(%rdi),%xmm20 + vmovq 168(%rdi),%xmm21 + vmovq 176(%rdi),%xmm22 + vmovq 184(%rdi),%xmm23 + vmovq 192(%rdi),%xmm24 + + call keccak_1600_permute + + vmovq %xmm0,0(%rdi) + vmovq %xmm1,8(%rdi) + vmovq %xmm2,16(%rdi) + vmovq %xmm3,24(%rdi) + vmovq %xmm4,32(%rdi) + vmovq %xmm5,40(%rdi) + vmovq %xmm6,48(%rdi) + vmovq %xmm7,56(%rdi) + vmovq %xmm8,64(%rdi) + vmovq %xmm9,72(%rdi) + vmovq %xmm10,80(%rdi) + vmovq %xmm11,88(%rdi) + vmovq %xmm12,96(%rdi) + vmovq %xmm13,104(%rdi) + vmovq %xmm14,112(%rdi) + vmovq %xmm15,120(%rdi) + vmovq %xmm16,128(%rdi) + vmovq %xmm17,136(%rdi) + vmovq %xmm18,144(%rdi) + vmovq %xmm19,152(%rdi) + vmovq %xmm20,160(%rdi) + vmovq %xmm21,168(%rdi) + vmovq %xmm22,176(%rdi) + vmovq %xmm23,184(%rdi) + vmovq %xmm24,192(%rdi) + vzeroupper + .byte 0xf3,0xc3 
+.cfi_endproc + +.globl KeccakF1600_x4_avx512vl +.hidden KeccakF1600_x4_avx512vl +.type KeccakF1600_x4_avx512vl,@function +.align 32 +KeccakF1600_x4_avx512vl: +.cfi_startproc +.byte 243,15,30,250 + vmovdqu64 0+0(%rdi),%ymm25 + vmovdqu64 0+200(%rdi),%ymm26 + vmovdqu64 0+400(%rdi),%ymm27 + vmovdqu64 0+600(%rdi),%ymm28 + vpunpcklqdq %ymm26,%ymm25,%ymm29 + vpunpckhqdq %ymm26,%ymm25,%ymm30 + vpunpcklqdq %ymm28,%ymm27,%ymm25 + vpunpckhqdq %ymm28,%ymm27,%ymm26 + vshufi64x2 $0,%ymm25,%ymm29,%ymm0 + vshufi64x2 $0,%ymm26,%ymm30,%ymm1 + vshufi64x2 $3,%ymm25,%ymm29,%ymm2 + vshufi64x2 $3,%ymm26,%ymm30,%ymm3 + vmovdqu64 32+0(%rdi),%ymm25 + vmovdqu64 32+200(%rdi),%ymm26 + vmovdqu64 32+400(%rdi),%ymm27 + vmovdqu64 32+600(%rdi),%ymm28 + vpunpcklqdq %ymm26,%ymm25,%ymm29 + vpunpckhqdq %ymm26,%ymm25,%ymm30 + vpunpcklqdq %ymm28,%ymm27,%ymm25 + vpunpckhqdq %ymm28,%ymm27,%ymm26 + vshufi64x2 $0,%ymm25,%ymm29,%ymm4 + vshufi64x2 $0,%ymm26,%ymm30,%ymm5 + vshufi64x2 $3,%ymm25,%ymm29,%ymm6 + vshufi64x2 $3,%ymm26,%ymm30,%ymm7 + vmovdqu64 64+0(%rdi),%ymm25 + vmovdqu64 64+200(%rdi),%ymm26 + vmovdqu64 64+400(%rdi),%ymm27 + vmovdqu64 64+600(%rdi),%ymm28 + vpunpcklqdq %ymm26,%ymm25,%ymm29 + vpunpckhqdq %ymm26,%ymm25,%ymm30 + vpunpcklqdq %ymm28,%ymm27,%ymm25 + vpunpckhqdq %ymm28,%ymm27,%ymm26 + vshufi64x2 $0,%ymm25,%ymm29,%ymm8 + vshufi64x2 $0,%ymm26,%ymm30,%ymm9 + vshufi64x2 $3,%ymm25,%ymm29,%ymm10 + vshufi64x2 $3,%ymm26,%ymm30,%ymm11 + vmovdqu64 96+0(%rdi),%ymm25 + vmovdqu64 96+200(%rdi),%ymm26 + vmovdqu64 96+400(%rdi),%ymm27 + vmovdqu64 96+600(%rdi),%ymm28 + vpunpcklqdq %ymm26,%ymm25,%ymm29 + vpunpckhqdq %ymm26,%ymm25,%ymm30 + vpunpcklqdq %ymm28,%ymm27,%ymm25 + vpunpckhqdq %ymm28,%ymm27,%ymm26 + vshufi64x2 $0,%ymm25,%ymm29,%ymm12 + vshufi64x2 $0,%ymm26,%ymm30,%ymm13 + vshufi64x2 $3,%ymm25,%ymm29,%ymm14 + vshufi64x2 $3,%ymm26,%ymm30,%ymm15 + vmovdqu64 128+0(%rdi),%ymm25 + vmovdqu64 128+200(%rdi),%ymm26 + vmovdqu64 128+400(%rdi),%ymm27 + vmovdqu64 128+600(%rdi),%ymm28 + vpunpcklqdq %ymm26,%ymm25,%ymm29 + vpunpckhqdq %ymm26,%ymm25,%ymm30 + vpunpcklqdq %ymm28,%ymm27,%ymm25 + vpunpckhqdq %ymm28,%ymm27,%ymm26 + vshufi64x2 $0,%ymm25,%ymm29,%ymm16 + vshufi64x2 $0,%ymm26,%ymm30,%ymm17 + vshufi64x2 $3,%ymm25,%ymm29,%ymm18 + vshufi64x2 $3,%ymm26,%ymm30,%ymm19 + vmovdqu64 160+0(%rdi),%ymm25 + vmovdqu64 160+200(%rdi),%ymm26 + vmovdqu64 160+400(%rdi),%ymm27 + vmovdqu64 160+600(%rdi),%ymm28 + vpunpcklqdq %ymm26,%ymm25,%ymm29 + vpunpckhqdq %ymm26,%ymm25,%ymm30 + vpunpcklqdq %ymm28,%ymm27,%ymm25 + vpunpckhqdq %ymm28,%ymm27,%ymm26 + vshufi64x2 $0,%ymm25,%ymm29,%ymm20 + vshufi64x2 $0,%ymm26,%ymm30,%ymm21 + vshufi64x2 $3,%ymm25,%ymm29,%ymm22 + vshufi64x2 $3,%ymm26,%ymm30,%ymm23 + vmovq 192+0(%rdi),%xmm24 + vpinsrq $1,192+200(%rdi),%xmm24,%xmm24 + vmovq 192+400(%rdi),%xmm25 + vpinsrq $1,192+600(%rdi),%xmm25,%xmm25 + vinserti32x4 $1,%xmm25,%ymm24,%ymm24 + + call keccak_1600_permute + + vpunpcklqdq %ymm1,%ymm0,%ymm25 + vpunpckhqdq %ymm1,%ymm0,%ymm26 + vpunpcklqdq %ymm3,%ymm2,%ymm27 + vpunpckhqdq %ymm3,%ymm2,%ymm28 + vshufi64x2 $0,%ymm27,%ymm25,%ymm0 + vshufi64x2 $0,%ymm28,%ymm26,%ymm1 + vshufi64x2 $3,%ymm27,%ymm25,%ymm2 + vshufi64x2 $3,%ymm28,%ymm26,%ymm3 + vmovdqu64 %ymm0,0+0(%rdi) + vmovdqu64 %ymm1,0+200(%rdi) + vmovdqu64 %ymm2,0+400(%rdi) + vmovdqu64 %ymm3,0+600(%rdi) + vpunpcklqdq %ymm5,%ymm4,%ymm25 + vpunpckhqdq %ymm5,%ymm4,%ymm26 + vpunpcklqdq %ymm7,%ymm6,%ymm27 + vpunpckhqdq %ymm7,%ymm6,%ymm28 + vshufi64x2 $0,%ymm27,%ymm25,%ymm4 + vshufi64x2 $0,%ymm28,%ymm26,%ymm5 + vshufi64x2 $3,%ymm27,%ymm25,%ymm6 + vshufi64x2 $3,%ymm28,%ymm26,%ymm7 + 
vmovdqu64 %ymm4,32+0(%rdi) + vmovdqu64 %ymm5,32+200(%rdi) + vmovdqu64 %ymm6,32+400(%rdi) + vmovdqu64 %ymm7,32+600(%rdi) + vpunpcklqdq %ymm9,%ymm8,%ymm25 + vpunpckhqdq %ymm9,%ymm8,%ymm26 + vpunpcklqdq %ymm11,%ymm10,%ymm27 + vpunpckhqdq %ymm11,%ymm10,%ymm28 + vshufi64x2 $0,%ymm27,%ymm25,%ymm8 + vshufi64x2 $0,%ymm28,%ymm26,%ymm9 + vshufi64x2 $3,%ymm27,%ymm25,%ymm10 + vshufi64x2 $3,%ymm28,%ymm26,%ymm11 + vmovdqu64 %ymm8,64+0(%rdi) + vmovdqu64 %ymm9,64+200(%rdi) + vmovdqu64 %ymm10,64+400(%rdi) + vmovdqu64 %ymm11,64+600(%rdi) + vpunpcklqdq %ymm13,%ymm12,%ymm25 + vpunpckhqdq %ymm13,%ymm12,%ymm26 + vpunpcklqdq %ymm15,%ymm14,%ymm27 + vpunpckhqdq %ymm15,%ymm14,%ymm28 + vshufi64x2 $0,%ymm27,%ymm25,%ymm12 + vshufi64x2 $0,%ymm28,%ymm26,%ymm13 + vshufi64x2 $3,%ymm27,%ymm25,%ymm14 + vshufi64x2 $3,%ymm28,%ymm26,%ymm15 + vmovdqu64 %ymm12,96+0(%rdi) + vmovdqu64 %ymm13,96+200(%rdi) + vmovdqu64 %ymm14,96+400(%rdi) + vmovdqu64 %ymm15,96+600(%rdi) + vpunpcklqdq %ymm17,%ymm16,%ymm25 + vpunpckhqdq %ymm17,%ymm16,%ymm26 + vpunpcklqdq %ymm19,%ymm18,%ymm27 + vpunpckhqdq %ymm19,%ymm18,%ymm28 + vshufi64x2 $0,%ymm27,%ymm25,%ymm16 + vshufi64x2 $0,%ymm28,%ymm26,%ymm17 + vshufi64x2 $3,%ymm27,%ymm25,%ymm18 + vshufi64x2 $3,%ymm28,%ymm26,%ymm19 + vmovdqu64 %ymm16,128+0(%rdi) + vmovdqu64 %ymm17,128+200(%rdi) + vmovdqu64 %ymm18,128+400(%rdi) + vmovdqu64 %ymm19,128+600(%rdi) + vpunpcklqdq %ymm21,%ymm20,%ymm25 + vpunpckhqdq %ymm21,%ymm20,%ymm26 + vpunpcklqdq %ymm23,%ymm22,%ymm27 + vpunpckhqdq %ymm23,%ymm22,%ymm28 + vshufi64x2 $0,%ymm27,%ymm25,%ymm20 + vshufi64x2 $0,%ymm28,%ymm26,%ymm21 + vshufi64x2 $3,%ymm27,%ymm25,%ymm22 + vshufi64x2 $3,%ymm28,%ymm26,%ymm23 + vmovdqu64 %ymm20,160+0(%rdi) + vmovdqu64 %ymm21,160+200(%rdi) + vmovdqu64 %ymm22,160+400(%rdi) + vmovdqu64 %ymm23,160+600(%rdi) + vextracti32x4 $1,%ymm24,%xmm25 + vmovq %xmm24,192+0(%rdi) + vpextrq $1,%xmm24,192+200(%rdi) + vmovq %xmm25,192+400(%rdi) + vpextrq $1,%xmm25,192+600(%rdi) + vzeroupper + .byte 0xf3,0xc3 +.cfi_endproc + +.section .rodata + +.align 64 +iotas_avx512: +.quad 0x0000000000000001, 0x0000000000008082 +.quad 0x800000000000808a, 0x8000000080008000 +.quad 0x000000000000808b, 0x0000000080000001 +.quad 0x8000000080008081, 0x8000000000008009 +.quad 0x000000000000008a, 0x0000000000000088 +.quad 0x0000000080008009, 0x000000008000000a +.quad 0x000000008000808b, 0x800000000000008b +.quad 0x8000000000008089, 0x8000000000008003 +.quad 0x8000000000008002, 0x8000000000000080 +.quad 0x000000000000800a, 0x800000008000000a +.quad 0x8000000080008081, 0x8000000000008080 +.quad 0x0000000080000001, 0x8000000080008008 + +#endif + +.text +#endif diff --git a/generated-src/mac-x86_64/crypto/fipsmodule/keccak1600-x86_64-avx512vl.S b/generated-src/mac-x86_64/crypto/fipsmodule/keccak1600-x86_64-avx512vl.S new file mode 100644 index 0000000000..342ae509f5 --- /dev/null +++ b/generated-src/mac-x86_64/crypto/fipsmodule/keccak1600-x86_64-avx512vl.S @@ -0,0 +1,503 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +#ifndef MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX + +.text + + + + + + + + + + + + + + +.p2align 5 +keccak_1600_permute: + +.byte 243,15,30,250 + movl $24,%r10d + leaq iotas_avx512(%rip),%r11 + +.p2align 5 +L$keccak_rnd_loop: + + + + + + + + vmovdqa64 %ymm0,%ymm25 + vpternlogq $0x96,%ymm5,%ymm10,%ymm25 + vmovdqa64 %ymm1,%ymm26 + vpternlogq $0x96,%ymm11,%ymm6,%ymm26 + vmovdqa64 %ymm2,%ymm27 + vpternlogq $0x96,%ymm12,%ymm7,%ymm27 + + vmovdqa64 %ymm3,%ymm28 + vpternlogq $0x96,%ymm13,%ymm8,%ymm28 + vmovdqa64 %ymm4,%ymm29 + vpternlogq $0x96,%ymm14,%ymm9,%ymm29 + vpternlogq $0x96,%ymm20,%ymm15,%ymm25 + + vpternlogq $0x96,%ymm21,%ymm16,%ymm26 + vpternlogq $0x96,%ymm22,%ymm17,%ymm27 + vpternlogq $0x96,%ymm23,%ymm18,%ymm28 + + + + + + + vprolq $1,%ymm26,%ymm30 + vprolq $1,%ymm27,%ymm31 + vpternlogq $0x96,%ymm24,%ymm19,%ymm29 + + + + + + + vpternlogq $0x96,%ymm30,%ymm29,%ymm0 + vpternlogq $0x96,%ymm30,%ymm29,%ymm10 + vpternlogq $0x96,%ymm30,%ymm29,%ymm20 + + vpternlogq $0x96,%ymm30,%ymm29,%ymm5 + vpternlogq $0x96,%ymm30,%ymm29,%ymm15 + vprolq $1,%ymm28,%ymm30 + + vpternlogq $0x96,%ymm31,%ymm25,%ymm6 + vpternlogq $0x96,%ymm31,%ymm25,%ymm16 + vpternlogq $0x96,%ymm31,%ymm25,%ymm1 + + vpternlogq $0x96,%ymm31,%ymm25,%ymm11 + vpternlogq $0x96,%ymm31,%ymm25,%ymm21 + vprolq $1,%ymm29,%ymm31 + + vpbroadcastq (%r11),%ymm29 + addq $8,%r11 + + vpternlogq $0x96,%ymm30,%ymm26,%ymm12 + vpternlogq $0x96,%ymm30,%ymm26,%ymm7 + vpternlogq $0x96,%ymm30,%ymm26,%ymm22 + + vpternlogq $0x96,%ymm30,%ymm26,%ymm17 + vpternlogq $0x96,%ymm30,%ymm26,%ymm2 + vprolq $1,%ymm25,%ymm30 + + + + + + + + + + + + + + + + vpternlogq $0x96,%ymm31,%ymm27,%ymm3 + vpternlogq $0x96,%ymm31,%ymm27,%ymm13 + vpternlogq $0x96,%ymm31,%ymm27,%ymm23 + + vprolq $44,%ymm6,%ymm6 + vpternlogq $0x96,%ymm31,%ymm27,%ymm18 + vpternlogq $0x96,%ymm31,%ymm27,%ymm8 + + vprolq $43,%ymm12,%ymm12 + vprolq $21,%ymm18,%ymm18 + vpternlogq $0x96,%ymm30,%ymm28,%ymm24 + + vprolq $14,%ymm24,%ymm24 + vprolq $28,%ymm3,%ymm3 + vpternlogq $0x96,%ymm30,%ymm28,%ymm9 + + vprolq $20,%ymm9,%ymm9 + vprolq $3,%ymm10,%ymm10 + vpternlogq $0x96,%ymm30,%ymm28,%ymm19 + + vprolq $45,%ymm16,%ymm16 + vprolq $61,%ymm22,%ymm22 + vpternlogq $0x96,%ymm30,%ymm28,%ymm4 + + vprolq $1,%ymm1,%ymm1 + vprolq $6,%ymm7,%ymm7 + vpternlogq $0x96,%ymm30,%ymm28,%ymm14 + + + + + + + + + vprolq $25,%ymm13,%ymm13 + vprolq $8,%ymm19,%ymm19 + vmovdqa64 %ymm0,%ymm30 + vpternlogq $0xD2,%ymm12,%ymm6,%ymm30 + + vprolq $18,%ymm20,%ymm20 + vprolq $27,%ymm4,%ymm4 + vpxorq %ymm29,%ymm30,%ymm30 + + vprolq $36,%ymm5,%ymm5 + vprolq $10,%ymm11,%ymm11 + vmovdqa64 %ymm6,%ymm31 + vpternlogq $0xD2,%ymm18,%ymm12,%ymm31 + + vprolq $15,%ymm17,%ymm17 + vprolq $56,%ymm23,%ymm23 + vpternlogq $0xD2,%ymm24,%ymm18,%ymm12 + + vprolq $62,%ymm2,%ymm2 + vprolq $55,%ymm8,%ymm8 + vpternlogq $0xD2,%ymm0,%ymm24,%ymm18 + + vprolq $39,%ymm14,%ymm14 + vprolq $41,%ymm15,%ymm15 + vpternlogq $0xD2,%ymm6,%ymm0,%ymm24 + vmovdqa64 %ymm30,%ymm0 + vmovdqa64 %ymm31,%ymm6 + + vprolq $2,%ymm21,%ymm21 + vmovdqa64 %ymm3,%ymm30 + vpternlogq $0xD2,%ymm10,%ymm9,%ymm30 + vmovdqa64 %ymm9,%ymm31 + vpternlogq $0xD2,%ymm16,%ymm10,%ymm31 + + vpternlogq $0xD2,%ymm22,%ymm16,%ymm10 + vpternlogq $0xD2,%ymm3,%ymm22,%ymm16 + vpternlogq $0xD2,%ymm9,%ymm3,%ymm22 + vmovdqa64 %ymm30,%ymm3 + vmovdqa64 %ymm31,%ymm9 + + vmovdqa64 %ymm1,%ymm30 + vpternlogq $0xD2,%ymm13,%ymm7,%ymm30 + vmovdqa64 %ymm7,%ymm31 + vpternlogq $0xD2,%ymm19,%ymm13,%ymm31 + vpternlogq $0xD2,%ymm20,%ymm19,%ymm13 + + 
vpternlogq $0xD2,%ymm1,%ymm20,%ymm19 + vpternlogq $0xD2,%ymm7,%ymm1,%ymm20 + vmovdqa64 %ymm30,%ymm1 + vmovdqa64 %ymm31,%ymm7 + vmovdqa64 %ymm4,%ymm30 + vpternlogq $0xD2,%ymm11,%ymm5,%ymm30 + + vmovdqa64 %ymm5,%ymm31 + vpternlogq $0xD2,%ymm17,%ymm11,%ymm31 + vpternlogq $0xD2,%ymm23,%ymm17,%ymm11 + vpternlogq $0xD2,%ymm4,%ymm23,%ymm17 + + vpternlogq $0xD2,%ymm5,%ymm4,%ymm23 + vmovdqa64 %ymm30,%ymm4 + vmovdqa64 %ymm31,%ymm5 + vmovdqa64 %ymm2,%ymm30 + vpternlogq $0xD2,%ymm14,%ymm8,%ymm30 + vmovdqa64 %ymm8,%ymm31 + vpternlogq $0xD2,%ymm15,%ymm14,%ymm31 + + vpternlogq $0xD2,%ymm21,%ymm15,%ymm14 + vpternlogq $0xD2,%ymm2,%ymm21,%ymm15 + vpternlogq $0xD2,%ymm8,%ymm2,%ymm21 + vmovdqa64 %ymm30,%ymm2 + vmovdqa64 %ymm31,%ymm8 + + + vmovdqa64 %ymm3,%ymm30 + vmovdqa64 %ymm18,%ymm3 + vmovdqa64 %ymm17,%ymm18 + vmovdqa64 %ymm11,%ymm17 + vmovdqa64 %ymm7,%ymm11 + vmovdqa64 %ymm10,%ymm7 + vmovdqa64 %ymm1,%ymm10 + vmovdqa64 %ymm6,%ymm1 + vmovdqa64 %ymm9,%ymm6 + vmovdqa64 %ymm22,%ymm9 + vmovdqa64 %ymm14,%ymm22 + vmovdqa64 %ymm20,%ymm14 + vmovdqa64 %ymm2,%ymm20 + vmovdqa64 %ymm12,%ymm2 + vmovdqa64 %ymm13,%ymm12 + vmovdqa64 %ymm19,%ymm13 + vmovdqa64 %ymm23,%ymm19 + vmovdqa64 %ymm15,%ymm23 + vmovdqa64 %ymm4,%ymm15 + vmovdqa64 %ymm24,%ymm4 + vmovdqa64 %ymm21,%ymm24 + vmovdqa64 %ymm8,%ymm21 + vmovdqa64 %ymm16,%ymm8 + vmovdqa64 %ymm5,%ymm16 + vmovdqa64 %ymm30,%ymm5 + + decl %r10d + jnz L$keccak_rnd_loop + .byte 0xf3,0xc3 + + +.globl _KeccakF1600_avx512vl +.private_extern _KeccakF1600_avx512vl + +.p2align 5 +_KeccakF1600_avx512vl: + +.byte 243,15,30,250 + vmovq 0(%rdi),%xmm0 + vmovq 8(%rdi),%xmm1 + vmovq 16(%rdi),%xmm2 + vmovq 24(%rdi),%xmm3 + vmovq 32(%rdi),%xmm4 + vmovq 40(%rdi),%xmm5 + vmovq 48(%rdi),%xmm6 + vmovq 56(%rdi),%xmm7 + vmovq 64(%rdi),%xmm8 + vmovq 72(%rdi),%xmm9 + vmovq 80(%rdi),%xmm10 + vmovq 88(%rdi),%xmm11 + vmovq 96(%rdi),%xmm12 + vmovq 104(%rdi),%xmm13 + vmovq 112(%rdi),%xmm14 + vmovq 120(%rdi),%xmm15 + vmovq 128(%rdi),%xmm16 + vmovq 136(%rdi),%xmm17 + vmovq 144(%rdi),%xmm18 + vmovq 152(%rdi),%xmm19 + vmovq 160(%rdi),%xmm20 + vmovq 168(%rdi),%xmm21 + vmovq 176(%rdi),%xmm22 + vmovq 184(%rdi),%xmm23 + vmovq 192(%rdi),%xmm24 + + call keccak_1600_permute + + vmovq %xmm0,0(%rdi) + vmovq %xmm1,8(%rdi) + vmovq %xmm2,16(%rdi) + vmovq %xmm3,24(%rdi) + vmovq %xmm4,32(%rdi) + vmovq %xmm5,40(%rdi) + vmovq %xmm6,48(%rdi) + vmovq %xmm7,56(%rdi) + vmovq %xmm8,64(%rdi) + vmovq %xmm9,72(%rdi) + vmovq %xmm10,80(%rdi) + vmovq %xmm11,88(%rdi) + vmovq %xmm12,96(%rdi) + vmovq %xmm13,104(%rdi) + vmovq %xmm14,112(%rdi) + vmovq %xmm15,120(%rdi) + vmovq %xmm16,128(%rdi) + vmovq %xmm17,136(%rdi) + vmovq %xmm18,144(%rdi) + vmovq %xmm19,152(%rdi) + vmovq %xmm20,160(%rdi) + vmovq %xmm21,168(%rdi) + vmovq %xmm22,176(%rdi) + vmovq %xmm23,184(%rdi) + vmovq %xmm24,192(%rdi) + vzeroupper + .byte 0xf3,0xc3 + + +.globl _KeccakF1600_x4_avx512vl +.private_extern _KeccakF1600_x4_avx512vl + +.p2align 5 +_KeccakF1600_x4_avx512vl: + +.byte 243,15,30,250 + vmovdqu64 0+0(%rdi),%ymm25 + vmovdqu64 0+200(%rdi),%ymm26 + vmovdqu64 0+400(%rdi),%ymm27 + vmovdqu64 0+600(%rdi),%ymm28 + vpunpcklqdq %ymm26,%ymm25,%ymm29 + vpunpckhqdq %ymm26,%ymm25,%ymm30 + vpunpcklqdq %ymm28,%ymm27,%ymm25 + vpunpckhqdq %ymm28,%ymm27,%ymm26 + vshufi64x2 $0,%ymm25,%ymm29,%ymm0 + vshufi64x2 $0,%ymm26,%ymm30,%ymm1 + vshufi64x2 $3,%ymm25,%ymm29,%ymm2 + vshufi64x2 $3,%ymm26,%ymm30,%ymm3 + vmovdqu64 32+0(%rdi),%ymm25 + vmovdqu64 32+200(%rdi),%ymm26 + vmovdqu64 32+400(%rdi),%ymm27 + vmovdqu64 32+600(%rdi),%ymm28 + vpunpcklqdq %ymm26,%ymm25,%ymm29 + vpunpckhqdq 
%ymm26,%ymm25,%ymm30 + vpunpcklqdq %ymm28,%ymm27,%ymm25 + vpunpckhqdq %ymm28,%ymm27,%ymm26 + vshufi64x2 $0,%ymm25,%ymm29,%ymm4 + vshufi64x2 $0,%ymm26,%ymm30,%ymm5 + vshufi64x2 $3,%ymm25,%ymm29,%ymm6 + vshufi64x2 $3,%ymm26,%ymm30,%ymm7 + vmovdqu64 64+0(%rdi),%ymm25 + vmovdqu64 64+200(%rdi),%ymm26 + vmovdqu64 64+400(%rdi),%ymm27 + vmovdqu64 64+600(%rdi),%ymm28 + vpunpcklqdq %ymm26,%ymm25,%ymm29 + vpunpckhqdq %ymm26,%ymm25,%ymm30 + vpunpcklqdq %ymm28,%ymm27,%ymm25 + vpunpckhqdq %ymm28,%ymm27,%ymm26 + vshufi64x2 $0,%ymm25,%ymm29,%ymm8 + vshufi64x2 $0,%ymm26,%ymm30,%ymm9 + vshufi64x2 $3,%ymm25,%ymm29,%ymm10 + vshufi64x2 $3,%ymm26,%ymm30,%ymm11 + vmovdqu64 96+0(%rdi),%ymm25 + vmovdqu64 96+200(%rdi),%ymm26 + vmovdqu64 96+400(%rdi),%ymm27 + vmovdqu64 96+600(%rdi),%ymm28 + vpunpcklqdq %ymm26,%ymm25,%ymm29 + vpunpckhqdq %ymm26,%ymm25,%ymm30 + vpunpcklqdq %ymm28,%ymm27,%ymm25 + vpunpckhqdq %ymm28,%ymm27,%ymm26 + vshufi64x2 $0,%ymm25,%ymm29,%ymm12 + vshufi64x2 $0,%ymm26,%ymm30,%ymm13 + vshufi64x2 $3,%ymm25,%ymm29,%ymm14 + vshufi64x2 $3,%ymm26,%ymm30,%ymm15 + vmovdqu64 128+0(%rdi),%ymm25 + vmovdqu64 128+200(%rdi),%ymm26 + vmovdqu64 128+400(%rdi),%ymm27 + vmovdqu64 128+600(%rdi),%ymm28 + vpunpcklqdq %ymm26,%ymm25,%ymm29 + vpunpckhqdq %ymm26,%ymm25,%ymm30 + vpunpcklqdq %ymm28,%ymm27,%ymm25 + vpunpckhqdq %ymm28,%ymm27,%ymm26 + vshufi64x2 $0,%ymm25,%ymm29,%ymm16 + vshufi64x2 $0,%ymm26,%ymm30,%ymm17 + vshufi64x2 $3,%ymm25,%ymm29,%ymm18 + vshufi64x2 $3,%ymm26,%ymm30,%ymm19 + vmovdqu64 160+0(%rdi),%ymm25 + vmovdqu64 160+200(%rdi),%ymm26 + vmovdqu64 160+400(%rdi),%ymm27 + vmovdqu64 160+600(%rdi),%ymm28 + vpunpcklqdq %ymm26,%ymm25,%ymm29 + vpunpckhqdq %ymm26,%ymm25,%ymm30 + vpunpcklqdq %ymm28,%ymm27,%ymm25 + vpunpckhqdq %ymm28,%ymm27,%ymm26 + vshufi64x2 $0,%ymm25,%ymm29,%ymm20 + vshufi64x2 $0,%ymm26,%ymm30,%ymm21 + vshufi64x2 $3,%ymm25,%ymm29,%ymm22 + vshufi64x2 $3,%ymm26,%ymm30,%ymm23 + vmovq 192+0(%rdi),%xmm24 + vpinsrq $1,192+200(%rdi),%xmm24,%xmm24 + vmovq 192+400(%rdi),%xmm25 + vpinsrq $1,192+600(%rdi),%xmm25,%xmm25 + vinserti32x4 $1,%xmm25,%ymm24,%ymm24 + + call keccak_1600_permute + + vpunpcklqdq %ymm1,%ymm0,%ymm25 + vpunpckhqdq %ymm1,%ymm0,%ymm26 + vpunpcklqdq %ymm3,%ymm2,%ymm27 + vpunpckhqdq %ymm3,%ymm2,%ymm28 + vshufi64x2 $0,%ymm27,%ymm25,%ymm0 + vshufi64x2 $0,%ymm28,%ymm26,%ymm1 + vshufi64x2 $3,%ymm27,%ymm25,%ymm2 + vshufi64x2 $3,%ymm28,%ymm26,%ymm3 + vmovdqu64 %ymm0,0+0(%rdi) + vmovdqu64 %ymm1,0+200(%rdi) + vmovdqu64 %ymm2,0+400(%rdi) + vmovdqu64 %ymm3,0+600(%rdi) + vpunpcklqdq %ymm5,%ymm4,%ymm25 + vpunpckhqdq %ymm5,%ymm4,%ymm26 + vpunpcklqdq %ymm7,%ymm6,%ymm27 + vpunpckhqdq %ymm7,%ymm6,%ymm28 + vshufi64x2 $0,%ymm27,%ymm25,%ymm4 + vshufi64x2 $0,%ymm28,%ymm26,%ymm5 + vshufi64x2 $3,%ymm27,%ymm25,%ymm6 + vshufi64x2 $3,%ymm28,%ymm26,%ymm7 + vmovdqu64 %ymm4,32+0(%rdi) + vmovdqu64 %ymm5,32+200(%rdi) + vmovdqu64 %ymm6,32+400(%rdi) + vmovdqu64 %ymm7,32+600(%rdi) + vpunpcklqdq %ymm9,%ymm8,%ymm25 + vpunpckhqdq %ymm9,%ymm8,%ymm26 + vpunpcklqdq %ymm11,%ymm10,%ymm27 + vpunpckhqdq %ymm11,%ymm10,%ymm28 + vshufi64x2 $0,%ymm27,%ymm25,%ymm8 + vshufi64x2 $0,%ymm28,%ymm26,%ymm9 + vshufi64x2 $3,%ymm27,%ymm25,%ymm10 + vshufi64x2 $3,%ymm28,%ymm26,%ymm11 + vmovdqu64 %ymm8,64+0(%rdi) + vmovdqu64 %ymm9,64+200(%rdi) + vmovdqu64 %ymm10,64+400(%rdi) + vmovdqu64 %ymm11,64+600(%rdi) + vpunpcklqdq %ymm13,%ymm12,%ymm25 + vpunpckhqdq %ymm13,%ymm12,%ymm26 + vpunpcklqdq %ymm15,%ymm14,%ymm27 + vpunpckhqdq %ymm15,%ymm14,%ymm28 + vshufi64x2 $0,%ymm27,%ymm25,%ymm12 + vshufi64x2 $0,%ymm28,%ymm26,%ymm13 + vshufi64x2 $3,%ymm27,%ymm25,%ymm14 
+ vshufi64x2 $3,%ymm28,%ymm26,%ymm15 + vmovdqu64 %ymm12,96+0(%rdi) + vmovdqu64 %ymm13,96+200(%rdi) + vmovdqu64 %ymm14,96+400(%rdi) + vmovdqu64 %ymm15,96+600(%rdi) + vpunpcklqdq %ymm17,%ymm16,%ymm25 + vpunpckhqdq %ymm17,%ymm16,%ymm26 + vpunpcklqdq %ymm19,%ymm18,%ymm27 + vpunpckhqdq %ymm19,%ymm18,%ymm28 + vshufi64x2 $0,%ymm27,%ymm25,%ymm16 + vshufi64x2 $0,%ymm28,%ymm26,%ymm17 + vshufi64x2 $3,%ymm27,%ymm25,%ymm18 + vshufi64x2 $3,%ymm28,%ymm26,%ymm19 + vmovdqu64 %ymm16,128+0(%rdi) + vmovdqu64 %ymm17,128+200(%rdi) + vmovdqu64 %ymm18,128+400(%rdi) + vmovdqu64 %ymm19,128+600(%rdi) + vpunpcklqdq %ymm21,%ymm20,%ymm25 + vpunpckhqdq %ymm21,%ymm20,%ymm26 + vpunpcklqdq %ymm23,%ymm22,%ymm27 + vpunpckhqdq %ymm23,%ymm22,%ymm28 + vshufi64x2 $0,%ymm27,%ymm25,%ymm20 + vshufi64x2 $0,%ymm28,%ymm26,%ymm21 + vshufi64x2 $3,%ymm27,%ymm25,%ymm22 + vshufi64x2 $3,%ymm28,%ymm26,%ymm23 + vmovdqu64 %ymm20,160+0(%rdi) + vmovdqu64 %ymm21,160+200(%rdi) + vmovdqu64 %ymm22,160+400(%rdi) + vmovdqu64 %ymm23,160+600(%rdi) + vextracti32x4 $1,%ymm24,%xmm25 + vmovq %xmm24,192+0(%rdi) + vpextrq $1,%xmm24,192+200(%rdi) + vmovq %xmm25,192+400(%rdi) + vpextrq $1,%xmm25,192+600(%rdi) + vzeroupper + .byte 0xf3,0xc3 + + +.section __DATA,__const + +.p2align 6 +iotas_avx512: +.quad 0x0000000000000001, 0x0000000000008082 +.quad 0x800000000000808a, 0x8000000080008000 +.quad 0x000000000000808b, 0x0000000080000001 +.quad 0x8000000080008081, 0x8000000000008009 +.quad 0x000000000000008a, 0x0000000000000088 +.quad 0x0000000080008009, 0x000000008000000a +.quad 0x000000008000808b, 0x800000000000008b +.quad 0x8000000000008089, 0x8000000000008003 +.quad 0x8000000000008002, 0x8000000000000080 +.quad 0x000000000000800a, 0x800000008000000a +.quad 0x8000000080008081, 0x8000000000008080 +.quad 0x0000000080000001, 0x8000000080008008 + +#endif + +.text +#endif diff --git a/generated-src/win-x86_64/crypto/fipsmodule/keccak1600-x86_64-avx512vl.asm b/generated-src/win-x86_64/crypto/fipsmodule/keccak1600-x86_64-avx512vl.asm new file mode 100644 index 0000000000..8a671caf1a --- /dev/null +++ b/generated-src/win-x86_64/crypto/fipsmodule/keccak1600-x86_64-avx512vl.asm @@ -0,0 +1,573 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%include "openssl/boringssl_prefix_symbols_nasm.inc" +%ifndef MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX + +section .text code align=64 + + + + + + + + + + + + + + + +ALIGN 32 +keccak_1600_permute: + +DB 243,15,30,250 + mov r10d,24 + lea r11,[iotas_avx512] + +ALIGN 32 +$L$keccak_rnd_loop: + + + + + + + + vmovdqa64 ymm25,ymm0 + vpternlogq ymm25,ymm10,ymm5,0x96 + vmovdqa64 ymm26,ymm1 + vpternlogq ymm26,ymm6,ymm11,0x96 + vmovdqa64 ymm27,ymm2 + vpternlogq ymm27,ymm7,ymm12,0x96 + + vmovdqa64 ymm28,ymm3 + vpternlogq ymm28,ymm8,ymm13,0x96 + vmovdqa64 ymm29,ymm4 + vpternlogq ymm29,ymm9,ymm14,0x96 + vpternlogq ymm25,ymm15,ymm20,0x96 + + vpternlogq ymm26,ymm16,ymm21,0x96 + vpternlogq ymm27,ymm17,ymm22,0x96 + vpternlogq ymm28,ymm18,ymm23,0x96 + + + + + + + vprolq ymm30,ymm26,1 + vprolq ymm31,ymm27,1 + vpternlogq ymm29,ymm19,ymm24,0x96 + + + + + + + vpternlogq ymm0,ymm29,ymm30,0x96 + vpternlogq ymm10,ymm29,ymm30,0x96 + vpternlogq ymm20,ymm29,ymm30,0x96 + + vpternlogq ymm5,ymm29,ymm30,0x96 + vpternlogq ymm15,ymm29,ymm30,0x96 + vprolq ymm30,ymm28,1 + + vpternlogq ymm6,ymm25,ymm31,0x96 + vpternlogq ymm16,ymm25,ymm31,0x96 + vpternlogq ymm1,ymm25,ymm31,0x96 + + vpternlogq ymm11,ymm25,ymm31,0x96 + vpternlogq ymm21,ymm25,ymm31,0x96 + vprolq ymm31,ymm29,1 + + vpbroadcastq ymm29,QWORD[r11] + add r11,8 + + vpternlogq ymm12,ymm26,ymm30,0x96 + vpternlogq ymm7,ymm26,ymm30,0x96 + vpternlogq ymm22,ymm26,ymm30,0x96 + + vpternlogq ymm17,ymm26,ymm30,0x96 + vpternlogq ymm2,ymm26,ymm30,0x96 + vprolq ymm30,ymm25,1 + + + + + + + + + + + + + + + + vpternlogq ymm3,ymm27,ymm31,0x96 + vpternlogq ymm13,ymm27,ymm31,0x96 + vpternlogq ymm23,ymm27,ymm31,0x96 + + vprolq ymm6,ymm6,44 + vpternlogq ymm18,ymm27,ymm31,0x96 + vpternlogq ymm8,ymm27,ymm31,0x96 + + vprolq ymm12,ymm12,43 + vprolq ymm18,ymm18,21 + vpternlogq ymm24,ymm28,ymm30,0x96 + + vprolq ymm24,ymm24,14 + vprolq ymm3,ymm3,28 + vpternlogq ymm9,ymm28,ymm30,0x96 + + vprolq ymm9,ymm9,20 + vprolq ymm10,ymm10,3 + vpternlogq ymm19,ymm28,ymm30,0x96 + + vprolq ymm16,ymm16,45 + vprolq ymm22,ymm22,61 + vpternlogq ymm4,ymm28,ymm30,0x96 + + vprolq ymm1,ymm1,1 + vprolq ymm7,ymm7,6 + vpternlogq ymm14,ymm28,ymm30,0x96 + + + + + + + + + vprolq ymm13,ymm13,25 + vprolq ymm19,ymm19,8 + vmovdqa64 ymm30,ymm0 + vpternlogq ymm30,ymm6,ymm12,0xD2 + + vprolq ymm20,ymm20,18 + vprolq ymm4,ymm4,27 + vpxorq ymm30,ymm30,ymm29 + + vprolq ymm5,ymm5,36 + vprolq ymm11,ymm11,10 + vmovdqa64 ymm31,ymm6 + vpternlogq ymm31,ymm12,ymm18,0xD2 + + vprolq ymm17,ymm17,15 + vprolq ymm23,ymm23,56 + vpternlogq ymm12,ymm18,ymm24,0xD2 + + vprolq ymm2,ymm2,62 + vprolq ymm8,ymm8,55 + vpternlogq ymm18,ymm24,ymm0,0xD2 + + vprolq ymm14,ymm14,39 + vprolq ymm15,ymm15,41 + vpternlogq ymm24,ymm0,ymm6,0xD2 + vmovdqa64 ymm0,ymm30 + vmovdqa64 ymm6,ymm31 + + vprolq ymm21,ymm21,2 + vmovdqa64 ymm30,ymm3 + vpternlogq ymm30,ymm9,ymm10,0xD2 + vmovdqa64 ymm31,ymm9 + vpternlogq ymm31,ymm10,ymm16,0xD2 + + vpternlogq ymm10,ymm16,ymm22,0xD2 + vpternlogq ymm16,ymm22,ymm3,0xD2 + vpternlogq ymm22,ymm3,ymm9,0xD2 + vmovdqa64 ymm3,ymm30 + vmovdqa64 ymm9,ymm31 + + vmovdqa64 ymm30,ymm1 + vpternlogq ymm30,ymm7,ymm13,0xD2 + vmovdqa64 ymm31,ymm7 + vpternlogq ymm31,ymm13,ymm19,0xD2 + vpternlogq ymm13,ymm19,ymm20,0xD2 + + vpternlogq ymm19,ymm20,ymm1,0xD2 + vpternlogq ymm20,ymm1,ymm7,0xD2 + vmovdqa64 ymm1,ymm30 + vmovdqa64 ymm7,ymm31 + vmovdqa64 ymm30,ymm4 + vpternlogq ymm30,ymm5,ymm11,0xD2 + + vmovdqa64 ymm31,ymm5 + vpternlogq 
ymm31,ymm11,ymm17,0xD2 + vpternlogq ymm11,ymm17,ymm23,0xD2 + vpternlogq ymm17,ymm23,ymm4,0xD2 + + vpternlogq ymm23,ymm4,ymm5,0xD2 + vmovdqa64 ymm4,ymm30 + vmovdqa64 ymm5,ymm31 + vmovdqa64 ymm30,ymm2 + vpternlogq ymm30,ymm8,ymm14,0xD2 + vmovdqa64 ymm31,ymm8 + vpternlogq ymm31,ymm14,ymm15,0xD2 + + vpternlogq ymm14,ymm15,ymm21,0xD2 + vpternlogq ymm15,ymm21,ymm2,0xD2 + vpternlogq ymm21,ymm2,ymm8,0xD2 + vmovdqa64 ymm2,ymm30 + vmovdqa64 ymm8,ymm31 + + + vmovdqa64 ymm30,ymm3 + vmovdqa64 ymm3,ymm18 + vmovdqa64 ymm18,ymm17 + vmovdqa64 ymm17,ymm11 + vmovdqa64 ymm11,ymm7 + vmovdqa64 ymm7,ymm10 + vmovdqa64 ymm10,ymm1 + vmovdqa64 ymm1,ymm6 + vmovdqa64 ymm6,ymm9 + vmovdqa64 ymm9,ymm22 + vmovdqa64 ymm22,ymm14 + vmovdqa64 ymm14,ymm20 + vmovdqa64 ymm20,ymm2 + vmovdqa64 ymm2,ymm12 + vmovdqa64 ymm12,ymm13 + vmovdqa64 ymm13,ymm19 + vmovdqa64 ymm19,ymm23 + vmovdqa64 ymm23,ymm15 + vmovdqa64 ymm15,ymm4 + vmovdqa64 ymm4,ymm24 + vmovdqa64 ymm24,ymm21 + vmovdqa64 ymm21,ymm8 + vmovdqa64 ymm8,ymm16 + vmovdqa64 ymm16,ymm5 + vmovdqa64 ymm5,ymm30 + + dec r10d + jnz NEAR $L$keccak_rnd_loop + DB 0F3h,0C3h ;repret + + +global KeccakF1600_avx512vl + +ALIGN 32 +KeccakF1600_avx512vl: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_KeccakF1600_avx512vl: + mov rdi,rcx + + + +DB 243,15,30,250 + sub rsp,10*16 + vmovdqu XMMWORD[rsp],xmm6 + vmovdqu XMMWORD[16+rsp],xmm7 + vmovdqu XMMWORD[32+rsp],xmm8 + vmovdqu XMMWORD[48+rsp],xmm9 + vmovdqu XMMWORD[64+rsp],xmm10 + vmovdqu XMMWORD[80+rsp],xmm11 + vmovdqu XMMWORD[96+rsp],xmm12 + vmovdqu XMMWORD[112+rsp],xmm13 + vmovdqu XMMWORD[128+rsp],xmm14 + vmovdqu XMMWORD[144+rsp],xmm15 + vmovq xmm0,QWORD[rdi] + vmovq xmm1,QWORD[8+rdi] + vmovq xmm2,QWORD[16+rdi] + vmovq xmm3,QWORD[24+rdi] + vmovq xmm4,QWORD[32+rdi] + vmovq xmm5,QWORD[40+rdi] + vmovq xmm6,QWORD[48+rdi] + vmovq xmm7,QWORD[56+rdi] + vmovq xmm8,QWORD[64+rdi] + vmovq xmm9,QWORD[72+rdi] + vmovq xmm10,QWORD[80+rdi] + vmovq xmm11,QWORD[88+rdi] + vmovq xmm12,QWORD[96+rdi] + vmovq xmm13,QWORD[104+rdi] + vmovq xmm14,QWORD[112+rdi] + vmovq xmm15,QWORD[120+rdi] + vmovq xmm16,QWORD[128+rdi] + vmovq xmm17,QWORD[136+rdi] + vmovq xmm18,QWORD[144+rdi] + vmovq xmm19,QWORD[152+rdi] + vmovq xmm20,QWORD[160+rdi] + vmovq xmm21,QWORD[168+rdi] + vmovq xmm22,QWORD[176+rdi] + vmovq xmm23,QWORD[184+rdi] + vmovq xmm24,QWORD[192+rdi] + + call keccak_1600_permute + + vmovq QWORD[rdi],xmm0 + vmovq QWORD[8+rdi],xmm1 + vmovq QWORD[16+rdi],xmm2 + vmovq QWORD[24+rdi],xmm3 + vmovq QWORD[32+rdi],xmm4 + vmovq QWORD[40+rdi],xmm5 + vmovq QWORD[48+rdi],xmm6 + vmovq QWORD[56+rdi],xmm7 + vmovq QWORD[64+rdi],xmm8 + vmovq QWORD[72+rdi],xmm9 + vmovq QWORD[80+rdi],xmm10 + vmovq QWORD[88+rdi],xmm11 + vmovq QWORD[96+rdi],xmm12 + vmovq QWORD[104+rdi],xmm13 + vmovq QWORD[112+rdi],xmm14 + vmovq QWORD[120+rdi],xmm15 + vmovq QWORD[128+rdi],xmm16 + vmovq QWORD[136+rdi],xmm17 + vmovq QWORD[144+rdi],xmm18 + vmovq QWORD[152+rdi],xmm19 + vmovq QWORD[160+rdi],xmm20 + vmovq QWORD[168+rdi],xmm21 + vmovq QWORD[176+rdi],xmm22 + vmovq QWORD[184+rdi],xmm23 + vmovq QWORD[192+rdi],xmm24 + vmovdqu xmm6,XMMWORD[rsp] + vmovdqu xmm7,XMMWORD[16+rsp] + vmovdqu xmm8,XMMWORD[32+rsp] + vmovdqu xmm9,XMMWORD[48+rsp] + vmovdqu xmm10,XMMWORD[64+rsp] + vmovdqu xmm11,XMMWORD[80+rsp] + vmovdqu xmm12,XMMWORD[96+rsp] + vmovdqu xmm13,XMMWORD[112+rsp] + vmovdqu xmm14,XMMWORD[128+rsp] + vmovdqu xmm15,XMMWORD[144+rsp] + add rsp,10*16 + vzeroupper + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + + +global KeccakF1600_x4_avx512vl + +ALIGN 32 
+KeccakF1600_x4_avx512vl: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_KeccakF1600_x4_avx512vl: + mov rdi,rcx + + + +DB 243,15,30,250 + sub rsp,10*16 + vmovdqu XMMWORD[rsp],xmm6 + vmovdqu XMMWORD[16+rsp],xmm7 + vmovdqu XMMWORD[32+rsp],xmm8 + vmovdqu XMMWORD[48+rsp],xmm9 + vmovdqu XMMWORD[64+rsp],xmm10 + vmovdqu XMMWORD[80+rsp],xmm11 + vmovdqu XMMWORD[96+rsp],xmm12 + vmovdqu XMMWORD[112+rsp],xmm13 + vmovdqu XMMWORD[128+rsp],xmm14 + vmovdqu XMMWORD[144+rsp],xmm15 + vmovdqu64 ymm25,YMMWORD[((0+0))+rdi] + vmovdqu64 ymm26,YMMWORD[((0+200))+rdi] + vmovdqu64 ymm27,YMMWORD[((0+400))+rdi] + vmovdqu64 ymm28,YMMWORD[((0+600))+rdi] + vpunpcklqdq ymm29,ymm25,ymm26 + vpunpckhqdq ymm30,ymm25,ymm26 + vpunpcklqdq ymm25,ymm27,ymm28 + vpunpckhqdq ymm26,ymm27,ymm28 + vshufi64x2 ymm0,ymm29,ymm25,0 + vshufi64x2 ymm1,ymm30,ymm26,0 + vshufi64x2 ymm2,ymm29,ymm25,3 + vshufi64x2 ymm3,ymm30,ymm26,3 + vmovdqu64 ymm25,YMMWORD[((32+0))+rdi] + vmovdqu64 ymm26,YMMWORD[((32+200))+rdi] + vmovdqu64 ymm27,YMMWORD[((32+400))+rdi] + vmovdqu64 ymm28,YMMWORD[((32+600))+rdi] + vpunpcklqdq ymm29,ymm25,ymm26 + vpunpckhqdq ymm30,ymm25,ymm26 + vpunpcklqdq ymm25,ymm27,ymm28 + vpunpckhqdq ymm26,ymm27,ymm28 + vshufi64x2 ymm4,ymm29,ymm25,0 + vshufi64x2 ymm5,ymm30,ymm26,0 + vshufi64x2 ymm6,ymm29,ymm25,3 + vshufi64x2 ymm7,ymm30,ymm26,3 + vmovdqu64 ymm25,YMMWORD[((64+0))+rdi] + vmovdqu64 ymm26,YMMWORD[((64+200))+rdi] + vmovdqu64 ymm27,YMMWORD[((64+400))+rdi] + vmovdqu64 ymm28,YMMWORD[((64+600))+rdi] + vpunpcklqdq ymm29,ymm25,ymm26 + vpunpckhqdq ymm30,ymm25,ymm26 + vpunpcklqdq ymm25,ymm27,ymm28 + vpunpckhqdq ymm26,ymm27,ymm28 + vshufi64x2 ymm8,ymm29,ymm25,0 + vshufi64x2 ymm9,ymm30,ymm26,0 + vshufi64x2 ymm10,ymm29,ymm25,3 + vshufi64x2 ymm11,ymm30,ymm26,3 + vmovdqu64 ymm25,YMMWORD[((96+0))+rdi] + vmovdqu64 ymm26,YMMWORD[((96+200))+rdi] + vmovdqu64 ymm27,YMMWORD[((96+400))+rdi] + vmovdqu64 ymm28,YMMWORD[((96+600))+rdi] + vpunpcklqdq ymm29,ymm25,ymm26 + vpunpckhqdq ymm30,ymm25,ymm26 + vpunpcklqdq ymm25,ymm27,ymm28 + vpunpckhqdq ymm26,ymm27,ymm28 + vshufi64x2 ymm12,ymm29,ymm25,0 + vshufi64x2 ymm13,ymm30,ymm26,0 + vshufi64x2 ymm14,ymm29,ymm25,3 + vshufi64x2 ymm15,ymm30,ymm26,3 + vmovdqu64 ymm25,YMMWORD[((128+0))+rdi] + vmovdqu64 ymm26,YMMWORD[((128+200))+rdi] + vmovdqu64 ymm27,YMMWORD[((128+400))+rdi] + vmovdqu64 ymm28,YMMWORD[((128+600))+rdi] + vpunpcklqdq ymm29,ymm25,ymm26 + vpunpckhqdq ymm30,ymm25,ymm26 + vpunpcklqdq ymm25,ymm27,ymm28 + vpunpckhqdq ymm26,ymm27,ymm28 + vshufi64x2 ymm16,ymm29,ymm25,0 + vshufi64x2 ymm17,ymm30,ymm26,0 + vshufi64x2 ymm18,ymm29,ymm25,3 + vshufi64x2 ymm19,ymm30,ymm26,3 + vmovdqu64 ymm25,YMMWORD[((160+0))+rdi] + vmovdqu64 ymm26,YMMWORD[((160+200))+rdi] + vmovdqu64 ymm27,YMMWORD[((160+400))+rdi] + vmovdqu64 ymm28,YMMWORD[((160+600))+rdi] + vpunpcklqdq ymm29,ymm25,ymm26 + vpunpckhqdq ymm30,ymm25,ymm26 + vpunpcklqdq ymm25,ymm27,ymm28 + vpunpckhqdq ymm26,ymm27,ymm28 + vshufi64x2 ymm20,ymm29,ymm25,0 + vshufi64x2 ymm21,ymm30,ymm26,0 + vshufi64x2 ymm22,ymm29,ymm25,3 + vshufi64x2 ymm23,ymm30,ymm26,3 + vmovq xmm24,QWORD[((192+0))+rdi] + vpinsrq xmm24,xmm24,QWORD[((192+200))+rdi],1 + vmovq xmm25,QWORD[((192+400))+rdi] + vpinsrq xmm25,xmm25,QWORD[((192+600))+rdi],1 + vinserti32x4 ymm24,ymm24,xmm25,1 + + call keccak_1600_permute + + vpunpcklqdq ymm25,ymm0,ymm1 + vpunpckhqdq ymm26,ymm0,ymm1 + vpunpcklqdq ymm27,ymm2,ymm3 + vpunpckhqdq ymm28,ymm2,ymm3 + vshufi64x2 ymm0,ymm25,ymm27,0 + vshufi64x2 ymm1,ymm26,ymm28,0 + vshufi64x2 ymm2,ymm25,ymm27,3 + vshufi64x2 ymm3,ymm26,ymm28,3 + vmovdqu64 
YMMWORD[(0+0)+rdi],ymm0 + vmovdqu64 YMMWORD[(0+200)+rdi],ymm1 + vmovdqu64 YMMWORD[(0+400)+rdi],ymm2 + vmovdqu64 YMMWORD[(0+600)+rdi],ymm3 + vpunpcklqdq ymm25,ymm4,ymm5 + vpunpckhqdq ymm26,ymm4,ymm5 + vpunpcklqdq ymm27,ymm6,ymm7 + vpunpckhqdq ymm28,ymm6,ymm7 + vshufi64x2 ymm4,ymm25,ymm27,0 + vshufi64x2 ymm5,ymm26,ymm28,0 + vshufi64x2 ymm6,ymm25,ymm27,3 + vshufi64x2 ymm7,ymm26,ymm28,3 + vmovdqu64 YMMWORD[(32+0)+rdi],ymm4 + vmovdqu64 YMMWORD[(32+200)+rdi],ymm5 + vmovdqu64 YMMWORD[(32+400)+rdi],ymm6 + vmovdqu64 YMMWORD[(32+600)+rdi],ymm7 + vpunpcklqdq ymm25,ymm8,ymm9 + vpunpckhqdq ymm26,ymm8,ymm9 + vpunpcklqdq ymm27,ymm10,ymm11 + vpunpckhqdq ymm28,ymm10,ymm11 + vshufi64x2 ymm8,ymm25,ymm27,0 + vshufi64x2 ymm9,ymm26,ymm28,0 + vshufi64x2 ymm10,ymm25,ymm27,3 + vshufi64x2 ymm11,ymm26,ymm28,3 + vmovdqu64 YMMWORD[(64+0)+rdi],ymm8 + vmovdqu64 YMMWORD[(64+200)+rdi],ymm9 + vmovdqu64 YMMWORD[(64+400)+rdi],ymm10 + vmovdqu64 YMMWORD[(64+600)+rdi],ymm11 + vpunpcklqdq ymm25,ymm12,ymm13 + vpunpckhqdq ymm26,ymm12,ymm13 + vpunpcklqdq ymm27,ymm14,ymm15 + vpunpckhqdq ymm28,ymm14,ymm15 + vshufi64x2 ymm12,ymm25,ymm27,0 + vshufi64x2 ymm13,ymm26,ymm28,0 + vshufi64x2 ymm14,ymm25,ymm27,3 + vshufi64x2 ymm15,ymm26,ymm28,3 + vmovdqu64 YMMWORD[(96+0)+rdi],ymm12 + vmovdqu64 YMMWORD[(96+200)+rdi],ymm13 + vmovdqu64 YMMWORD[(96+400)+rdi],ymm14 + vmovdqu64 YMMWORD[(96+600)+rdi],ymm15 + vpunpcklqdq ymm25,ymm16,ymm17 + vpunpckhqdq ymm26,ymm16,ymm17 + vpunpcklqdq ymm27,ymm18,ymm19 + vpunpckhqdq ymm28,ymm18,ymm19 + vshufi64x2 ymm16,ymm25,ymm27,0 + vshufi64x2 ymm17,ymm26,ymm28,0 + vshufi64x2 ymm18,ymm25,ymm27,3 + vshufi64x2 ymm19,ymm26,ymm28,3 + vmovdqu64 YMMWORD[(128+0)+rdi],ymm16 + vmovdqu64 YMMWORD[(128+200)+rdi],ymm17 + vmovdqu64 YMMWORD[(128+400)+rdi],ymm18 + vmovdqu64 YMMWORD[(128+600)+rdi],ymm19 + vpunpcklqdq ymm25,ymm20,ymm21 + vpunpckhqdq ymm26,ymm20,ymm21 + vpunpcklqdq ymm27,ymm22,ymm23 + vpunpckhqdq ymm28,ymm22,ymm23 + vshufi64x2 ymm20,ymm25,ymm27,0 + vshufi64x2 ymm21,ymm26,ymm28,0 + vshufi64x2 ymm22,ymm25,ymm27,3 + vshufi64x2 ymm23,ymm26,ymm28,3 + vmovdqu64 YMMWORD[(160+0)+rdi],ymm20 + vmovdqu64 YMMWORD[(160+200)+rdi],ymm21 + vmovdqu64 YMMWORD[(160+400)+rdi],ymm22 + vmovdqu64 YMMWORD[(160+600)+rdi],ymm23 + vextracti32x4 xmm25,ymm24,1 + vmovq QWORD[(192+0)+rdi],xmm24 + vpextrq XMMWORD[(192+200)+rdi],xmm24,1 + vmovq QWORD[(192+400)+rdi],xmm25 + vpextrq XMMWORD[(192+600)+rdi],xmm25,1 + vmovdqu xmm6,XMMWORD[rsp] + vmovdqu xmm7,XMMWORD[16+rsp] + vmovdqu xmm8,XMMWORD[32+rsp] + vmovdqu xmm9,XMMWORD[48+rsp] + vmovdqu xmm10,XMMWORD[64+rsp] + vmovdqu xmm11,XMMWORD[80+rsp] + vmovdqu xmm12,XMMWORD[96+rsp] + vmovdqu xmm13,XMMWORD[112+rsp] + vmovdqu xmm14,XMMWORD[128+rsp] + vmovdqu xmm15,XMMWORD[144+rsp] + add rsp,10*16 + vzeroupper + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + + +section .rdata rdata align=8 + +ALIGN 64 +iotas_avx512: + DQ 0x0000000000000001,0x0000000000008082 + DQ 0x800000000000808a,0x8000000080008000 + DQ 0x000000000000808b,0x0000000080000001 + DQ 0x8000000080008081,0x8000000000008009 + DQ 0x000000000000008a,0x0000000000000088 + DQ 0x0000000080008009,0x000000008000000a + DQ 0x000000008000808b,0x800000000000008b + DQ 0x8000000000008089,0x8000000000008003 + DQ 0x8000000000008002,0x8000000000000080 + DQ 0x000000000000800a,0x800000008000000a + DQ 0x8000000080008081,0x8000000000008080 + DQ 0x0000000080000001,0x8000000080008008 + +%endif + +section .text + +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif
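
For reviewers, a minimal caller-side sketch of the dispatch this change introduces. Only CRYPTO_is_AVX512_capable(), KeccakF1600(), KeccakF1600_x4_avx512vl() and the uint64_t[4][KECCAK1600_ROWS][KECCAK1600_ROWS] batched layout come from this diff; the standalone declarations and the permute_all4() helper name are illustrative only and would normally come from crypto/fipsmodule/sha/internal.h.

// Sketch only: mirrors the new Keccak1600_x4() dispatch on x86_64.
// In the real tree these declarations come from the fipsmodule headers;
// they are repeated here just to keep the example self-contained.
#include <stdint.h>

#define KECCAK1600_ROWS 5

extern int CRYPTO_is_AVX512_capable(void);
extern void KeccakF1600(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS]);
extern void KeccakF1600_x4_avx512vl(
    uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS]);

// Hypothetical helper: permute four independent Keccak states.
static void permute_all4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS]) {
  if (CRYPTO_is_AVX512_capable()) {
    // One call permutes all four contiguous 200-byte states in parallel.
    KeccakF1600_x4_avx512vl(A);
    return;
  }
  // Fallback: four single-state permutations, each with its own dispatch.
  for (int i = 0; i < 4; i++) {
    KeccakF1600(A[i]);
  }
}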