
Commit 0e65117

Nicola Torracca authored and gbtucker committed
mem_zero_detect_avx: OR multiple vectors and test for non-zero on the result
Micro-optimizations: vpcmpeqb+vpmovmskb is faster than vptest according to uops.info; make usually-untaken branches target forward; reduce the number of data-dependent branches and the code size.

Change-Id: Ie70b4bc99685368e5131f23344348bfaf7c27d3e
Signed-off-by: Nicola Torracca <[email protected]>
1 parent f980b36 commit 0e65117

1 file changed (+84, −105 lines)


mem/mem_zero_detect_avx.asm

Lines changed: 84 additions & 105 deletions
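The message above claims that one vpcmpeqb+vpmovmskb pair beats a chain of vptest+jnz pairs; the diff below realizes that by OR-ing four 32-byte loads into a single accumulator and testing once. For readers more at home in C, here is a minimal intrinsics sketch of that combine-then-test pattern. It is illustrative only, not isa-l code: the function name and the fixed 128-byte block are assumptions, and it needs AVX2 (build with -mavx2).

#include <immintrin.h>
#include <stdint.h>

/* Illustrative sketch: returns 1 iff all 128 bytes at p are zero. */
static int block128_is_zero(const uint8_t *p)
{
        /* OR four 32-byte loads into one accumulator (branch-free)... */
        __m256i acc = _mm256_loadu_si256((const __m256i *)(p + 0));
        acc = _mm256_or_si256(acc, _mm256_loadu_si256((const __m256i *)(p + 32)));
        __m256i hi  = _mm256_loadu_si256((const __m256i *)(p + 64));
        hi  = _mm256_or_si256(hi, _mm256_loadu_si256((const __m256i *)(p + 96)));
        acc = _mm256_or_si256(acc, hi);

        /* ...then pay for a single test: the byte compare sets 0xFF in
         * every lane that equals zero (vpcmpeqb), and the movemask
         * collects one bit per lane (vpmovmskb); the mask is all ones
         * iff the whole accumulator is zero. */
        __m256i eq = _mm256_cmpeq_epi8(acc, _mm256_setzero_si256());
        return _mm256_movemask_epi8(eq) == -1;
}

The only data-dependent branch left is whatever the caller does with the return value, which is exactly the "reduce the number of data-dependent branches" point in the commit message.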
@@ -64,126 +64,105 @@
 
 %define src     arg0
 %define len     arg1
-%define ptr     arg2
-%define pos     return
+%define tmp0    arg2
+%define tmp1    arg3
 
+%use smartalign
+ALIGNMODE P6
 default rel
 
 [bits 64]
 section .text
-
-align 16
+align 32                ; maximize mu-ops cache usage
 mk_global mem_zero_detect_avx, function
 func(mem_zero_detect_avx)
         FUNC_SAVE
-        mov     pos, 0
-        sub     len, 4*32
-        jle     .mem_z_small_block
+        cmp     len, 127
+        jbe     .mem_z_small_block
+        ; check the first 128 bytes
+        vpxor   xmm2, xmm2, xmm2
+        vmovdqu ymm0, [src]
+        vpor    ymm0, ymm0, [src+32]
+        vmovdqu ymm1, [src+64]
+        vpor    ymm1, ymm1, [src+96]
+        vpor    ymm0, ymm0, ymm1
+        vpcmpeqb ymm0, ymm2, ymm0
+        vpmovmskb DWORD(tmp0), ymm0
+        not     DWORD(tmp0)
+        mov     DWORD(tmp1), DWORD(len)
+        and     DWORD(tmp1), 127
+        add     src, tmp1
+        xor     eax, eax
+        shr     len, 7                  ; len/128
+        test    len, len                ; break partial-flag stall
+        setz    al                      ; if len < 128, eax != 0
+        add     eax, DWORD(tmp0)        ; jump if (tmp0 OR eax) != 0, use add for macrofusion
+        jnz     .return
+        xor     eax, eax
 
+align 16
 .mem_z_loop:
-        vmovdqu ymm0, [src+pos]
-        vmovdqu ymm1, [src+pos+1*32]
-        vmovdqu ymm2, [src+pos+2*32]
-        vmovdqu ymm3, [src+pos+3*32]
-        vptest  ymm0, ymm0
-        jnz     .return_fail
-        vptest  ymm1, ymm1
-        jnz     .return_fail
-        vptest  ymm2, ymm2
-        jnz     .return_fail
-        vptest  ymm3, ymm3
-        jnz     .return_fail
-        add     pos, 4*32
-        cmp     pos, len
-        jl      .mem_z_loop
-
-.mem_z_last_block:
-        vmovdqu ymm0, [src+len]
-        vmovdqu ymm1, [src+len+1*32]
-        vmovdqu ymm2, [src+len+2*32]
-        vmovdqu ymm3, [src+len+3*32]
-        vptest  ymm0, ymm0
-        jnz     .return_fail
-        vptest  ymm1, ymm1
-        jnz     .return_fail
-        vptest  ymm2, ymm2
-        jnz     .return_fail
-        vptest  ymm3, ymm3
-        jnz     .return_fail
-
-.return_pass:
-        mov     return, 0
+        vmovdqu ymm0, [src]
+        vpor    ymm0, ymm0, [src+32]
+        vmovdqu ymm1, [src+64]
+        vpor    ymm1, ymm1, [src+96]
+        add     src, 128
+        xor     DWORD(tmp1), DWORD(tmp1)
+        sub     len, 1
+        setz    BYTE(tmp1)
+        vpor    ymm0, ymm0, ymm1
+        vpcmpeqb ymm0, ymm2, ymm0
+        vpmovmskb DWORD(tmp0), ymm0
+        not     DWORD(tmp0)
+        add     DWORD(tmp1), DWORD(tmp0)
+        jz      .mem_z_loop
+
+.return:
+        xor     eax, eax
+        test    tmp0, tmp0
+        setnz   al
         FUNC_RESTORE
         ret
 
 
+align 16
 .mem_z_small_block:
-        add     len, 4*32
-        cmp     len, 2*32
-        jl      .mem_z_lt64
-        vmovdqu ymm0, [src]
-        vmovdqu ymm1, [src+32]
-        vmovdqu ymm2, [src+len-2*32]
-        vmovdqu ymm3, [src+len-1*32]
-        vptest  ymm0, ymm0
-        jnz     .return_fail
-        vptest  ymm1, ymm1
-        jnz     .return_fail
-        vptest  ymm2, ymm2
-        jnz     .return_fail
-        vptest  ymm3, ymm3
-        jnz     .return_fail
-        jmp     .return_pass
-
-.mem_z_lt64:
-        cmp     len, 32
-        jl      .mem_z_lt32
-        vmovdqu ymm0, [src]
-        vmovdqu ymm1, [src+len-32]
-        vptest  ymm0, ymm0
-        jnz     .return_fail
-        vptest  ymm1, ymm1
-        jnz     .return_fail
-        jmp     .return_pass
-
-
-.mem_z_lt32:
-        cmp     len, 16
-        jl      .mem_z_lt16
-        vmovdqu xmm0, [src]
-        vmovdqu xmm1, [src+len-16]
-        vptest  xmm0, xmm0
-        jnz     .return_fail
-        vptest  xmm1, xmm1
-        jnz     .return_fail
-        jmp     .return_pass
-
-
-.mem_z_lt16:
-        cmp     len, 8
-        jl      .mem_z_lt8
-        mov     tmp, [src]
-        mov     tmp3, [src+len-8]
-        or      tmp, tmp3
-        test    tmp, tmp
-        jnz     .return_fail
-        jmp     .return_pass
-
-.mem_z_lt8:
-        cmp     len, 0
-        je      .return_pass
-.mem_z_1byte_loop:
-        mov     tmpb, [src+pos]
-        cmp     tmpb, 0
-        jnz     .return_fail
-        add     pos, 1
-        cmp     pos, len
-        jl      .mem_z_1byte_loop
-        jmp     .return_pass
-
-.return_fail:
-        mov     return, 1
-        FUNC_RESTORE
+        ; len < 128
+        xor     DWORD(tmp0), DWORD(tmp0)
+        movzx   DWORD(tmp1), BYTE(len)
+        cmp     DWORD(len), 16
+        jb      .mem_z_small_check_zero
+        ; 16 <= len < 128
+        shr     DWORD(len), 4
+        xor     eax, eax                ; alignment
+.mem_z_small_block_loop:
+        xor     eax, eax
+        mov     tmp0, [src]
+        or      tmp0, [src+8]
+        sub     DWORD(len), 1
+        setz    al
+        add     src, 16
+        add     rax, tmp0
+        jz      .mem_z_small_block_loop
+
+        test    tmp0, tmp0
+        jnz     .return_small
+        movzx   DWORD(len), BYTE(tmp1)
+
+.mem_z_small_check_zero:
+        xor     DWORD(tmp0), DWORD(tmp0)
+        and     DWORD(len), 15
+        jz      .return_small
+.mem_z_small_byte_loop:
+        movzx   eax, byte [src]
+        add     src, 1
+        or      DWORD(tmp0), eax
+        sub     DWORD(len), 1
+        jnz     .mem_z_small_byte_loop
+.return_small:
+        xor     eax, eax
+        test    tmp0, tmp0
+        setnz   al
         ret
 
 endproc_frame
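Two notes on the new structure, for context. First, the prologue tests the first 128 bytes unconditionally and then advances src by len mod 128, so the main loop always consumes whole 128-byte blocks and re-reads a few head bytes instead of branching on a tail. Second, the exported contract is unchanged: the function returns 0 when the buffer is entirely zero and non-zero otherwise. A hedged caller-side sketch follows; the C prototype here is an assumption for illustration (check isa-l's headers for the real declaration):

#include <stdio.h>
#include <stddef.h>

int mem_zero_detect_avx(void *mem, size_t len);  /* assumed prototype */

int main(void)
{
        static unsigned char buf[4096];  /* static, so zero-initialized */

        printf("%d\n", mem_zero_detect_avx(buf, sizeof(buf)));  /* expect 0 */
        buf[100] = 1;
        printf("%d\n", mem_zero_detect_avx(buf, sizeof(buf)));  /* expect non-zero */
        return 0;
}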
