Skip to content

Commit e3783f2

Browse files
Nicola Torracca authored and gbtucker committed
Add AVX512 implementation of mem_zero_detect().
Change-Id: I60fe0846d783787198b6a44a090fd9fe17c1807f Signed-off-by: Nicola Torracca <[email protected]>
1 parent d3cfb2f commit e3783f2

File tree

4 files changed

+148
-3
lines changed

4 files changed

+148
-3
lines changed

Makefile.nmake

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,8 +158,9 @@ objs = \
158158
bin\igzip_gen_icf_map_lh1_04.obj \
159159
bin\igzip_set_long_icf_fg_04.obj \
160160
bin\igzip_set_long_icf_fg_06.obj \
161-
bin\mem_zero_detect_avx.obj \
161+
bin\mem_zero_detect_avx512.obj \
162162
bin\mem_zero_detect_avx2.obj \
163+
bin\mem_zero_detect_avx.obj \
163164
bin\mem_zero_detect_sse.obj \
164165
bin\mem_multibinary.obj
165166

mem/Makefile.am

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,9 @@ lsrc += mem/mem_zero_detect_base.c
3434
lsrc_base_aliases += mem/mem_zero_detect_base_aliases.c
3535
lsrc_ppc64le += mem/mem_zero_detect_base_aliases.c
3636

37-
lsrc_x86_64 += mem/mem_zero_detect_avx.asm \
37+
lsrc_x86_64 += mem/mem_zero_detect_avx512.asm \
3838
mem/mem_zero_detect_avx2.asm \
39+
mem/mem_zero_detect_avx.asm \
3940
mem/mem_zero_detect_sse.asm \
4041
mem/mem_multibinary.asm
4142

mem/mem_multibinary.asm

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,12 @@
3333
default rel
3434
[bits 64]
3535

36+
extern mem_zero_detect_avx512
3637
extern mem_zero_detect_avx2
3738
extern mem_zero_detect_avx
3839
extern mem_zero_detect_sse
3940
extern mem_zero_detect_base
4041

4142
mbin_interface isal_zero_detect
4243

43-
mbin_dispatch_init5 isal_zero_detect, mem_zero_detect_base, mem_zero_detect_sse, mem_zero_detect_avx, mem_zero_detect_avx2
44+
mbin_dispatch_init6 isal_zero_detect, mem_zero_detect_base, mem_zero_detect_sse, mem_zero_detect_avx, mem_zero_detect_avx2, mem_zero_detect_avx512

mem/mem_zero_detect_avx512.asm

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2018 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

%include "reg_sizes.asm"

; Per-ABI register aliases so the body below is ABI-agnostic.
%ifidn __OUTPUT_FORMAT__, elf64
%define arg0  rdi
%define arg1  rsi
%define arg2  rdx
%define arg3  rcx
%define arg4  r8
%define arg5  r9
%define tmp   r11
%define tmpb  r11b
%define tmp3  arg4
%define return rax
%define func(x) x: endbranch
%define FUNC_SAVE
%define FUNC_RESTORE
%endif

%ifidn __OUTPUT_FORMAT__, win64
%define arg0  rcx
%define arg1  rdx
%define arg2  r8
%define arg3  r9
%define tmp   r11
%define tmpb  r11b
%define tmp3  r10
%define return rax
%define func(x) proc_frame x
%macro FUNC_SAVE 0
	end_prolog
%endmacro
%macro FUNC_RESTORE 0
%endmacro
%endif

%define src  arg0
%define len  arg1
%define tmp0 arg2
%define tmp1 arg3

%use smartalign
ALIGNMODE P6
default rel

[bits 64]
section .text
align 32	; maximize mu-ops cache coverage
;-----------------------------------------------------------------------
; int mem_zero_detect_avx512(void *src, size_t len)
; In:    src = buffer to scan, len = length in bytes
; Out:   eax = 0 if all len bytes are zero, non-zero otherwise
; Uses:  AVX-512BW/BMI2 (vmovdqu8/vptestmb with k-masks, bzhi, ktestq).
; Strategy: masked-load the unaligned head up to the next 64 B boundary,
; then OR 128 B per iteration from cacheline-aligned addresses, then
; masked-load the <128 B tail; any nonzero byte sets the result.
; NOTE(review): no vzeroupper before ret — presumably safe because only
; zmm (EVEX) ops are used; confirm against the project's AVX/SSE
; transition policy.
;-----------------------------------------------------------------------
mk_global mem_zero_detect_avx512, function
func(mem_zero_detect_avx512)
	FUNC_SAVE
	; --- head: handle bytes up to the first 64 B boundary with a k-mask
	or	tmp1, -1		; all ones mask
	mov	eax, DWORD(src)
	and	eax, 63			; rax = src % 64 (misalignment)
	neg	rax
	add	rax, 64			; 64 - eax = bytes to the boundary
	cmp	rax, len
	cmovae	eax, DWORD(len)		; head size = min(64 - src%64, len)
	bzhi	tmp1, tmp1, rax		; alignment mask: low rax bits set
	kmovq	k1, tmp1
	vmovdqu8 zmm0{k1}{z}, [src]	; masked load never touches bytes past the mask
	add	src, rax		; align to cacheline
	sub	len, rax
	vptestmb k1, zmm0, zmm0		; k1 bit set per nonzero byte
	xor	DWORD(tmp0), DWORD(tmp0)
	ktestq	k1, k1
	setnz	BYTE(tmp0)		; tmp0 = 1 if head had a nonzero byte
	mov	DWORD(tmp3), DWORD(len)	; save remaining length for the tail
	xor	eax, eax
	shr	len, 7			; len/128 = full double-cacheline blocks
	setz	al			; al = 1 if no full block remains
	add	eax, DWORD(tmp0)	; skip the loop on nonzero head or short input
	jnz	.mem_z_small_block


	align 16
.mem_z_loop:
	; 128 B per iteration; loads are 64 B-aligned after the head fixup.
	vmovdqa64 zmm0, [src]
	vporq	zmm0, zmm0,[src+64]
	xor	tmp1,tmp1
	sub	len, 1
	setz	BYTE(tmp1)		; tmp1 = 1 on final iteration
	add	src, 128
	vptestmb k1, zmm0, zmm0
	kmovq	tmp0, k1		; tmp0 = nonzero-byte bitmap of this block
	add	tmp1, tmp0		; for macrofusion.
	jz	.mem_z_loop		; continue while block zero AND blocks remain

	align 16
.mem_z_small_block:
	;len < 128
	; Build two k-masks covering the tail: k1 for bytes [0,64),
	; k2 for bytes [64,128) — the upper one is zero when tail < 64.
	xor	eax, eax
	lea	tmp1, [rax-1]		; 0xFFFFFF...
	mov	DWORD(len), DWORD(tmp3)
	and	DWORD(len), 127		; len % 128
	and	DWORD(tmp3),63		; len % 64
	bzhi	tmp, tmp1, tmp3; mask
	cmp	DWORD(len), 64
	cmovb	tmp1, tmp		; tail < 64: partial mask in low half
	cmovb	tmp, rax		;            upper half mask = 0
	kmovq	k1, tmp1
	kmovq	k2, tmp
	vmovdqu8 zmm0{k1}{z}, [src]
	vmovdqu8 zmm1{k2}{z}, [src+64]
	vporq	zmm0, zmm0, zmm1
	vptestmb k1, zmm0, zmm0
	kmovq	tmp1, k1
	or	tmp0, tmp1		; combine loop/head result with tail result
	setnz	al			; eax is still zero
	FUNC_RESTORE
	ret


endproc_frame

0 commit comments

Comments
 (0)