;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2024 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;; Optimized P+Q parity generation of N source vectors using AVX512+GFNI
;;; int pq_gen_avx512_gfni(int vects, int len, void **array)

;;; Generates P+Q parity vectors from N (vects-2) sources in an array of
;;; pointers (**array). The last two pointers are the P and Q destinations
;;; respectively. Vectors must be aligned to 64 bytes if NO_NT_LDST is not
;;; defined. Length must be a 32-byte multiple.
%include "reg_sizes.asm"

%ifdef HAVE_AS_KNOWS_AVX512

;; System V AMD64 ABI: args in rdi, rsi, rdx, rcx, r8, r9; all zmm regs
;; are caller-saved, so no save/restore is needed.
%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0   rdi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx
 %define arg4   r8
 %define arg5   r9
 %define tmp    r11
 %define tmp3   arg4
 %define return rax
 %define func(x) x: endbranch
 %define FUNC_SAVE
 %define FUNC_RESTORE
%endif

;; Microsoft x64 ABI: args in rcx, rdx, r8, r9; xmm6-xmm15 are
;; callee-saved, and this routine uses zmm6/ymm6, so the low 128 bits
;; (xmm6) must be preserved across the call.
%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9
 %define tmp    r11
 %define tmp3   r10
 %define return rax
 %define stack_size 1*16 + 8	; must be an odd multiple of 8 to keep rsp 16-aligned
 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
	alloc_stack	stack_size
	vmovdqa	[rsp + 0*16], xmm6	; save callee-saved xmm6 (low lane of gfmatrix)
	end_prolog
 %endmacro

 %macro FUNC_RESTORE 0
	vmovdqa	xmm6, [rsp + 0*16]
	add	rsp, stack_size
 %endmacro
%endif

%define vec arg0		; offset (in pointers) to the last source vector
%define len arg1		; length in bytes; must be a multiple of 32
%define ptr arg3		; scratch: current source/parity pointer
%define pos rax			; current byte offset into every vector

;; zmm working set for the 128-bytes-per-iteration main loop
%define xp1 zmm0		; P accumulator, bytes [pos, pos+64)
%define xq1 zmm1		; Q accumulator, bytes [pos, pos+64)
%define xs1 zmm2		; source data,   bytes [pos, pos+64)

%define xp2 zmm3		; P accumulator, bytes [pos+64, pos+128)
%define xq2 zmm4		; Q accumulator, bytes [pos+64, pos+128)
%define xs2 zmm5		; source data,   bytes [pos+64, pos+128)

%define gfmatrix zmm6		; GF(2^8) affine matrix: multiply-by-2 mod 0x11d

;; ymm views of the same registers for the 32-byte tail loop
%define xp1y ymm0
%define xq1y ymm1
%define xs1y ymm2

%define gfmatrixy ymm6

%define NO_NT_LDST
;;; Use non-temporal load/store when NO_NT_LDST is NOT defined
%ifdef NO_NT_LDST
 %define XLDR vmovdqu8		; unaligned loads/stores allowed
 %define XSTR vmovdqu8
%else
 %define XLDR vmovntdqa		; non-temporal; requires 64-byte alignment
 %define XSTR vmovntdq
%endif

; GF(2^8) affine matrix implementing multiply-by-2 modulo 0x11d for
; vgf2p8affineqb. Conceptually it is the identity matrix shifted by one
; row (a left shift of each byte), with the reduction polynomial 0x11d
; folded into the first column:
;   0 1 0 0 0 0 0 0
;   0 0 1 0 0 0 0 0
;   0 0 0 1 0 0 0 0
;   0 0 0 0 1 0 0 0
;   1 0 0 0 0 1 0 0
;   1 0 0 0 0 0 1 0
;   0 0 0 0 0 0 0 1
;   1 0 0 0 0 0 0 0
; Each db byte below encodes one matrix column as a bitmask, and the
; 8-byte row is repeated 8 times to fill a full 64-byte zmm register.
default rel
align 64
gf_matrix:
db 0x40, 0x20, 0x10, 0x88, 0x84, 0x82, 0x01, 0x80
db 0x40, 0x20, 0x10, 0x88, 0x84, 0x82, 0x01, 0x80
db 0x40, 0x20, 0x10, 0x88, 0x84, 0x82, 0x01, 0x80
db 0x40, 0x20, 0x10, 0x88, 0x84, 0x82, 0x01, 0x80
db 0x40, 0x20, 0x10, 0x88, 0x84, 0x82, 0x01, 0x80
db 0x40, 0x20, 0x10, 0x88, 0x84, 0x82, 0x01, 0x80
db 0x40, 0x20, 0x10, 0x88, 0x84, 0x82, 0x01, 0x80
db 0x40, 0x20, 0x10, 0x88, 0x84, 0x82, 0x01, 0x80


[bits 64]
section .text

;-----------------------------------------------------------------------
; int pq_gen_avx512_gfni(int vects, int len, void **array)
; In:    arg0 = vects (sources + P + Q; must be >= 4)
;        arg1 = len   (bytes per vector; must be a multiple of 32)
;        arg2 = array (pointers: vects-2 sources, then P, then Q)
; Out:   rax = 0 on success, 1 on bad vects/len
; Clobb: rax, rcx, r11, zmm0-zmm6, flags (xmm6 saved/restored on win64)
;-----------------------------------------------------------------------
align 16
mk_global pq_gen_avx512_gfni, function
func(pq_gen_avx512_gfni)
	FUNC_SAVE
	sub	vec, 3			;Keep vec as offset to last source
	jng	return_fail		;Must have at least 2 sources (vects >= 4)
	cmp	len, 0
	je	return_pass		;Zero length: nothing to do
	test	len, (32-1)		;Length must be a 32-byte multiple
	jnz	return_fail

	vmovdqa64 gfmatrix, [rel gf_matrix]

	xor	pos, pos
	cmp	len, 128
	jl	loop32			;Too short for the 128-byte loop

len_aligned_32bytes:
	sub	len, 2*64		;len now points at the last 128-byte block

loop128:
	;; Per 128-byte stripe: Horner's scheme over the sources, from the
	;; last source down to source 0:  P ^= s;  Q = (Q ^ s) * 2
	mov	ptr, [arg2+vec*8]	;Fetch last source pointer
	mov	tmp, vec		;tmp counts back down to source 0
	XLDR	xs1, [ptr+pos]		;Preload last source, first 64 bytes
	XLDR	xs2, [ptr+pos+64]	;Preload last source, second 64 bytes
	vpxorq	xp1, xp1, xp1		;p1 = 0
	vpxorq	xp2, xp2, xp2		;p2 = 0
	vpxorq	xq1, xq1, xq1		;q1 = 0
	vpxorq	xq2, xq2, xq2		;q2 = 0

next_vect:
	sub	tmp, 1			;Inner loop for each source vector
	mov	ptr, [arg2+tmp*8]	;Get pointer to next source
	vpxorq	xq1, xq1, xs1		;q1 ^= s1
	vpxorq	xq2, xq2, xs2		;q2 ^= s2
	vpxorq	xp1, xp1, xs1		;p1 ^= s1
	vpxorq	xp2, xp2, xs2		;p2 ^= s2
	XLDR	xs1, [ptr+pos]		;Load next source, first 64 bytes
	XLDR	xs2, [ptr+pos+64]	;Load next source, second 64 bytes
	vgf2p8affineqb	xq1, xq1, gfmatrix, 0x00	;q1 *= 2 in GF(2^8)
	vgf2p8affineqb	xq2, xq2, gfmatrix, 0x00	;q2 *= 2 in GF(2^8)
	jg	next_vect		;Flags still from "sub tmp, 1" above:
					;mov and vector ops do not touch RFLAGS

	mov	ptr, [arg2+8+vec*8]	;Get address of P parity vector
	mov	tmp, [arg2+(2*8)+vec*8]	;Get address of Q parity vector
	vpxorq	xp1, xp1, xs1		;p1 ^= s1[0] - source 0 already loaded
	vpxorq	xq1, xq1, xs1		;q1 ^= 1 * s1[0] (no final doubling)
	vpxorq	xp2, xp2, xs2		;p2 ^= s2[0]
	vpxorq	xq2, xq2, xs2		;q2 ^= 1 * s2[0]
	XSTR	[ptr+pos], xp1		;Write parity P1 vector
	XSTR	[ptr+pos+64], xp2	;Write parity P2 vector
	XSTR	[tmp+pos], xq1		;Write parity Q1 vector
	XSTR	[tmp+pos+64], xq2	;Write parity Q2 vector
	add	pos, 2*64
	cmp	pos, len
	jle	loop128

	;; ------------------------------
	;; Handle the remaining 32, 64 or 96 bytes in 32-byte steps
	add	len, 2*64		;Restore true length
	cmp	pos, len
	je	return_pass

loop32:
	mov	ptr, [arg2+vec*8]	;Fetch last source pointer
	mov	tmp, vec		;tmp counts back down to source 0
	XLDR	xs1y, [ptr+pos]		;Preload last source (32 bytes)
	vpxorq	xp1y, xp1y, xp1y	;p = 0
	vpxorq	xq1y, xq1y, xq1y	;q = 0

next_vect32:
	sub	tmp, 1			;Inner loop for each source vector
	mov	ptr, [arg2+tmp*8]	;Get pointer to next source
	vpxorq	xq1y, xq1y, xs1y	;q ^= s
	vgf2p8affineqb	xq1y, xq1y, gfmatrixy, 0x00	;q *= 2 in GF(2^8)
	vpxorq	xp1y, xp1y, xs1y	;p ^= s
	XLDR	xs1y, [ptr+pos]		;Load next source (32 bytes)
	jg	next_vect32		;Flags still from "sub tmp, 1" above

	mov	ptr, [arg2+8+vec*8]	;Get address of P parity vector
	mov	tmp, [arg2+(2*8)+vec*8]	;Get address of Q parity vector
	vpxorq	xp1y, xp1y, xs1y	;p ^= s[0] - source 0 already loaded
	vpxorq	xq1y, xq1y, xs1y	;q ^= 1 * s[0]
	XSTR	[ptr+pos], xp1y		;Write parity P vector
	XSTR	[tmp+pos], xq1y		;Write parity Q vector
	add	pos, 32
	cmp	pos, len
	jl	loop32


return_pass:
	mov	return, 0
	FUNC_RESTORE
	ret

return_fail:
	mov	return, 1
	FUNC_RESTORE
	ret

endproc_frame

%endif  ; ifdef HAVE_AS_KNOWS_AVX512