;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2018 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "reg_sizes.asm"

%ifidn __OUTPUT_FORMAT__, elf64
 %define arg0   rdi
 %define arg1   rsi
 %define arg2   rdx
 %define arg3   rcx
 %define arg4   r8
 %define arg5   r9
 %define tmp    r11
 %define tmpb   r11b
 %define tmp3   arg4
 %define return rax
 %define func(x) x: endbranch
 %define FUNC_SAVE
 %define FUNC_RESTORE
%endif

%ifidn __OUTPUT_FORMAT__, win64
 %define arg0   rcx
 %define arg1   rdx
 %define arg2   r8
 %define arg3   r9
 %define tmp    r11
 %define tmpb   r11b
 %define tmp3   r10
 %define return rax
 %define func(x) proc_frame x
 %macro FUNC_SAVE 0
        end_prolog
 %endmacro
 %macro FUNC_RESTORE 0
 %endmacro
%endif

%define src  arg0
%define len  arg1
%define tmp0 arg2
%define tmp1 arg3

%use smartalign
ALIGNMODE P6
default rel

[bits 64]
section .text
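
; mem_zero_detect_avx512: report whether a memory region contains any non-zero
; byte. Per the register defines above, src (arg0) is the buffer pointer and
; len (arg1) is its length in bytes; the result is returned in rax/eax and is
; 0 when the whole buffer is zero, non-zero otherwise (see the setnz at the
; single exit path).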
align 32        ; maximize uop-cache coverage
mk_global mem_zero_detect_avx512, function
func(mem_zero_detect_avx512)
        FUNC_SAVE
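        ; Align the head: build a byte mask covering the first
        ; (64 - src % 64) bytes, capped at len, read them with a zero-masked
        ; vmovdqu8, then advance src/len to the next cache-line boundary.
        ; tmp0 is set to 1 if any non-zero byte was seen in this head chunk.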
        or      tmp1, -1                ; all ones mask
        mov     eax, DWORD(src)
        and     eax, 63
        neg     rax
        add     rax, 64                 ; 64 - eax
        cmp     rax, len
        cmovae  eax, DWORD(len)
        bzhi    tmp1, tmp1, rax         ; alignment mask
        kmovq   k1, tmp1
        vmovdqu8 zmm0{k1}{z}, [src]
        add     src, rax                ; align to cacheline
        sub     len, rax
        vptestmb k1, zmm0, zmm0
        xor     DWORD(tmp0), DWORD(tmp0)
        ktestq  k1, k1
        setnz   BYTE(tmp0)
        mov     DWORD(tmp3), DWORD(len)
        xor     eax, eax
        shr     len, 7                  ; len/128
        setz    al
        add     eax, DWORD(tmp0)
        jnz     .mem_z_small_block      ; non-zero byte in head, or fewer than 128 bytes left

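; Main loop: src is now 64-byte aligned, so use aligned loads and fold two
; cache lines (128 bytes) per iteration with vporq. vptestmb turns any
; non-zero byte of the OR into a mask bit; the loop continues only while no
; non-zero byte has been seen and full 128-byte blocks remain.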
align 16
.mem_z_loop:
        vmovdqa64 zmm0, [src]
        vporq   zmm0, zmm0, [src+64]
        xor     tmp1, tmp1
        sub     len, 1
        setz    BYTE(tmp1)              ; tmp1 = 1 on the last 128-byte block
        add     src, 128
        vptestmb k1, zmm0, zmm0
        kmovq   tmp0, k1                ; non-zero-byte mask for this block
        add     tmp1, tmp0              ; add so the jz below can macro-fuse
        jz      .mem_z_loop

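; Tail: fewer than 128 bytes remain (or a non-zero byte was already found).
; Re-derive the remaining length from tmp3 and read it with two zero-masked
; loads: k1 covers min(len, 64) bytes and k2 covers the len % 64 bytes of the
; second cache line (zero when len < 64).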
align 16
.mem_z_small_block:
        ; len < 128
        xor     eax, eax
        lea     tmp1, [rax-1]           ; 0xFFFFFF...
        mov     DWORD(len), DWORD(tmp3)
        and     DWORD(len), 127         ; len % 128
        and     DWORD(tmp3), 63         ; len % 64
        bzhi    tmp, tmp1, tmp3         ; (len % 64)-bit mask
        cmp     DWORD(len), 64
        cmovb   tmp1, tmp               ; len < 64: first load uses the short mask
        cmovb   tmp, rax                ; len < 64: second load is empty
        kmovq   k1, tmp1
        kmovq   k2, tmp
        vmovdqu8 zmm0{k1}{z}, [src]
        vmovdqu8 zmm1{k2}{z}, [src+64]
        vporq   zmm0, zmm0, zmm1
        vptestmb k1, zmm0, zmm0
        kmovq   tmp1, k1
        or      tmp0, tmp1              ; combine with the head/loop result
        setnz   al                      ; eax is still zero
        FUNC_RESTORE
        ret

endproc_frame