Skip to content

Commit 87908c9

Browse files
committed
mem: Move new mem_zero_detect function to avx2
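The new mem_zero_detect function fails on AVX-only machines because it uses AVX2 integer instructions on ymm registers (vpor, vpcmpeqb, vpmovmskb), so it is moved to an AVX2 variant. Change-Id: I3bca49bff886f9c130c89e8c74b31110e9bac76b Signed-off-by: Greg Tucker <[email protected]>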
1 parent 0e65117 commit 87908c9

File tree

5 files changed

+277
-85
lines changed

5 files changed

+277
-85
lines changed

Makefile.nmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,7 @@ objs = \
159159
bin\igzip_set_long_icf_fg_04.obj \
160160
bin\igzip_set_long_icf_fg_06.obj \
161161
bin\mem_zero_detect_avx.obj \
162+
bin\mem_zero_detect_avx2.obj \
162163
bin\mem_zero_detect_sse.obj \
163164
bin\mem_multibinary.obj
164165

mem/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ lsrc_base_aliases += mem/mem_zero_detect_base_aliases.c
3535
lsrc_ppc64le += mem/mem_zero_detect_base_aliases.c
3636

3737
lsrc_x86_64 += mem/mem_zero_detect_avx.asm \
38+
mem/mem_zero_detect_avx2.asm \
3839
mem/mem_zero_detect_sse.asm \
3940
mem/mem_multibinary.asm
4041

mem/mem_multibinary.asm

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,11 @@
3333
default rel
3434
[bits 64]
3535

36+
extern mem_zero_detect_avx2
3637
extern mem_zero_detect_avx
3738
extern mem_zero_detect_sse
3839
extern mem_zero_detect_base
3940

4041
mbin_interface isal_zero_detect
4142

42-
mbin_dispatch_init5 isal_zero_detect, mem_zero_detect_base, mem_zero_detect_sse, mem_zero_detect_avx, mem_zero_detect_avx
43+
mbin_dispatch_init5 isal_zero_detect, mem_zero_detect_base, mem_zero_detect_sse, mem_zero_detect_avx, mem_zero_detect_avx2

mem/mem_zero_detect_avx.asm

Lines changed: 105 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -64,105 +64,126 @@
6464

6565
%define src arg0
6666
%define len arg1
67-
%define tmp0 arg2
68-
%define tmp1 arg3
67+
%define ptr arg2
68+
%define pos return
6969

70-
%use smartalign
71-
ALIGNMODE P6
7270
default rel
7371

7472
[bits 64]
7573
section .text
76-
align 32 ; maximize mu-ops cache usage
74+
75+
align 16
7776
mk_global mem_zero_detect_avx, function
7877
func(mem_zero_detect_avx)
7978
FUNC_SAVE
80-
cmp len, 127
81-
jbe .mem_z_small_block
82-
; check the first 128 bytes
83-
vpxor xmm2, xmm2, xmm2
84-
vmovdqu ymm0, [src]
85-
vpor ymm0, ymm0, [src+32]
86-
vmovdqu ymm1, [src+64]
87-
vpor ymm1, ymm1, [src+96]
88-
vpor ymm0, ymm0, ymm1
89-
vpcmpeqb ymm0, ymm2, ymm0
90-
vpmovmskb DWORD(tmp0), ymm0
91-
not DWORD(tmp0)
92-
mov DWORD(tmp1), DWORD(len)
93-
and DWORD(tmp1), 127
94-
add src, tmp1
95-
xor eax, eax
96-
shr len, 7 ; len/128
97-
test len, len; break partial flag stall
98-
setz al ; if len < 128, eax != 0
99-
add eax, DWORD(tmp0) ; jump if (edx OR eax) !=0, use add for macrofusion
100-
jnz .return
101-
xor eax, eax
79+
mov pos, 0
80+
sub len, 4*32
81+
jle .mem_z_small_block
10282

103-
align 16
10483
.mem_z_loop:
105-
vmovdqu ymm0, [src]
106-
vpor ymm0, ymm0,[src+32]
107-
vmovdqu ymm1, [src+64]
108-
vpor ymm1, ymm1, [src+96]
109-
add src, 128
110-
xor DWORD(tmp1), DWORD(tmp1)
111-
sub len, 1
112-
setz BYTE(tmp1)
113-
vpor ymm0, ymm0, ymm1
114-
vpcmpeqb ymm0, ymm2, ymm0
115-
vpmovmskb DWORD(tmp0), ymm0
116-
not DWORD(tmp0)
117-
add DWORD(tmp1), DWORD(tmp0)
118-
jz .mem_z_loop
119-
120-
.return:
121-
xor eax, eax
122-
test tmp0, tmp0
123-
setnz al
84+
vmovdqu ymm0, [src+pos]
85+
vmovdqu ymm1, [src+pos+1*32]
86+
vmovdqu ymm2, [src+pos+2*32]
87+
vmovdqu ymm3, [src+pos+3*32]
88+
vptest ymm0, ymm0
89+
jnz .return_fail
90+
vptest ymm1, ymm1
91+
jnz .return_fail
92+
vptest ymm2, ymm2
93+
jnz .return_fail
94+
vptest ymm3, ymm3
95+
jnz .return_fail
96+
add pos, 4*32
97+
cmp pos, len
98+
jl .mem_z_loop
99+
100+
.mem_z_last_block:
101+
vmovdqu ymm0, [src+len]
102+
vmovdqu ymm1, [src+len+1*32]
103+
vmovdqu ymm2, [src+len+2*32]
104+
vmovdqu ymm3, [src+len+3*32]
105+
vptest ymm0, ymm0
106+
jnz .return_fail
107+
vptest ymm1, ymm1
108+
jnz .return_fail
109+
vptest ymm2, ymm2
110+
jnz .return_fail
111+
vptest ymm3, ymm3
112+
jnz .return_fail
113+
114+
.return_pass:
115+
mov return, 0
124116
FUNC_RESTORE
125117
ret
126118

127119

128-
align 16
129120
.mem_z_small_block:
130-
;len < 128
131-
xor DWORD(tmp0), DWORD(tmp0)
132-
movzx DWORD(tmp1), BYTE(len)
133-
cmp DWORD(len), 16
134-
jb .mem_z_small_check_zero
135-
;17 < len < 128
136-
shr DWORD(len), 4
137-
xor eax, eax ; alignment
138-
.mem_z_small_block_loop:
139-
xor eax, eax
140-
mov tmp0, [src]
141-
or tmp0, [src+8]
142-
sub DWORD(len), 1
143-
setz al
144-
add src, 16
145-
add rax, tmp0
146-
jz .mem_z_small_block_loop
147-
148-
test tmp0, tmp0
149-
jnz .return_small
150-
movzx DWORD(len), BYTE(tmp1)
151-
152-
.mem_z_small_check_zero:
153-
xor DWORD(tmp0), DWORD(tmp0)
154-
and DWORD(len), 15
155-
jz .return_small
156-
.mem_z_small_byte_loop:
157-
movzx eax, byte [src]
158-
add src, 1
159-
or DWORD(tmp0), eax
160-
sub DWORD(len), 1
161-
jnz .mem_z_small_byte_loop
162-
.return_small:
163-
xor eax, eax
164-
test tmp0, tmp0
165-
setnz al
121+
add len, 4*32
122+
cmp len, 2*32
123+
jl .mem_z_lt64
124+
vmovdqu ymm0, [src]
125+
vmovdqu ymm1, [src+32]
126+
vmovdqu ymm2, [src+len-2*32]
127+
vmovdqu ymm3, [src+len-1*32]
128+
vptest ymm0, ymm0
129+
jnz .return_fail
130+
vptest ymm1, ymm1
131+
jnz .return_fail
132+
vptest ymm2, ymm2
133+
jnz .return_fail
134+
vptest ymm3, ymm3
135+
jnz .return_fail
136+
jmp .return_pass
137+
138+
.mem_z_lt64:
139+
cmp len, 32
140+
jl .mem_z_lt32
141+
vmovdqu ymm0, [src]
142+
vmovdqu ymm1, [src+len-32]
143+
vptest ymm0, ymm0
144+
jnz .return_fail
145+
vptest ymm1, ymm1
146+
jnz .return_fail
147+
jmp .return_pass
148+
149+
150+
.mem_z_lt32:
151+
cmp len, 16
152+
jl .mem_z_lt16
153+
vmovdqu xmm0, [src]
154+
vmovdqu xmm1, [src+len-16]
155+
vptest xmm0, xmm0
156+
jnz .return_fail
157+
vptest xmm1, xmm1
158+
jnz .return_fail
159+
jmp .return_pass
160+
161+
162+
.mem_z_lt16:
163+
cmp len, 8
164+
jl .mem_z_lt8
165+
mov tmp, [src]
166+
mov tmp3,[src+len-8]
167+
or tmp, tmp3
168+
test tmp, tmp
169+
jnz .return_fail
170+
jmp .return_pass
171+
172+
.mem_z_lt8:
173+
cmp len, 0
174+
je .return_pass
175+
.mem_z_1byte_loop:
176+
mov tmpb, [src+pos]
177+
cmp tmpb, 0
178+
jnz .return_fail
179+
add pos, 1
180+
cmp pos, len
181+
jl .mem_z_1byte_loop
182+
jmp .return_pass
183+
184+
.return_fail:
185+
mov return, 1
186+
FUNC_RESTORE
166187
ret
167188

168189
endproc_frame

0 commit comments

Comments
 (0)