```diff
 %define src  arg0
 %define len  arg1
-%define ptr  arg2
-%define pos  return
+%define tmp0 arg2
+%define tmp1 arg3

+%use smartalign
+ALIGNMODE P6
 default rel

 [bits 64]
 section .text
-
-align 16
+align 32                        ; maximize mu-ops cache usage
 mk_global mem_zero_detect_avx, function
 func(mem_zero_detect_avx)
         FUNC_SAVE
-        mov     pos, 0
-        sub     len, 4*32
-        jle     .mem_z_small_block
+        cmp     len, 127
+        jbe     .mem_z_small_block
+        ; check the first 128 bytes
+        vpxor   xmm2, xmm2, xmm2
+        vmovdqu ymm0, [src]
+        vpor    ymm0, ymm0, [src+32]
+        vmovdqu ymm1, [src+64]
+        vpor    ymm1, ymm1, [src+96]
+        vpor    ymm0, ymm0, ymm1
+        vpcmpeqb ymm0, ymm2, ymm0
+        vpmovmskb DWORD(tmp0), ymm0
+        not     DWORD(tmp0)
+        mov     DWORD(tmp1), DWORD(len)
+        and     DWORD(tmp1), 127
+        add     src, tmp1
+        xor     eax, eax
+        shr     len, 7                  ; len/128
+        test    len, len                ; break partial flag stall
+        setz    al                      ; if len < 128, eax != 0
+        add     eax, DWORD(tmp0)        ; jump if (edx OR eax) != 0, use add for macrofusion
+        jnz     .return
+        xor     eax, eax
```
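The rewritten head check OR-reduces the first 128 bytes into one ymm register, compares the result against zero with `vpcmpeqb`, and inverts the `vpmovmskb` mask, so `tmp0` is nonzero exactly when a nonzero byte was seen. A minimal C intrinsics sketch of the same idea (assuming AVX2 and at least 128 readable bytes at `src`; the function name is hypothetical):

```c
#include <immintrin.h>
#include <stdint.h>

/* Hypothetical C equivalent of the 128-byte head check.
 * Returns 0 iff all 128 bytes starting at src are zero. */
static inline uint32_t or_mask_128(const uint8_t *src)
{
        __m256i v0 = _mm256_loadu_si256((const __m256i *)(src + 0));
        v0 = _mm256_or_si256(v0, _mm256_loadu_si256((const __m256i *)(src + 32)));
        __m256i v1 = _mm256_loadu_si256((const __m256i *)(src + 64));
        v1 = _mm256_or_si256(v1, _mm256_loadu_si256((const __m256i *)(src + 96)));
        v0 = _mm256_or_si256(v0, v1);                /* OR-reduce all 128 bytes */
        __m256i eq = _mm256_cmpeq_epi8(v0, _mm256_setzero_si256());
        return ~(uint32_t)_mm256_movemask_epi8(eq);  /* vpmovmskb + not */
}
```

Note the `and DWORD(tmp1), 127` / `add src, tmp1` pair: after the head check, `src` is advanced by `len % 128` so the remaining `len / 128` blocks end exactly at the buffer's end. The head check and the first loop iteration may deliberately overlap.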
```diff

+align 16
 .mem_z_loop:
-        vmovdqu ymm0, [src+pos]
-        vmovdqu ymm1, [src+pos+1*32]
-        vmovdqu ymm2, [src+pos+2*32]
-        vmovdqu ymm3, [src+pos+3*32]
-        vptest  ymm0, ymm0
-        jnz     .return_fail
-        vptest  ymm1, ymm1
-        jnz     .return_fail
-        vptest  ymm2, ymm2
-        jnz     .return_fail
-        vptest  ymm3, ymm3
-        jnz     .return_fail
-        add     pos, 4*32
-        cmp     pos, len
-        jl      .mem_z_loop
-
-.mem_z_last_block:
-        vmovdqu ymm0, [src+len]
-        vmovdqu ymm1, [src+len+1*32]
-        vmovdqu ymm2, [src+len+2*32]
-        vmovdqu ymm3, [src+len+3*32]
-        vptest  ymm0, ymm0
-        jnz     .return_fail
-        vptest  ymm1, ymm1
-        jnz     .return_fail
-        vptest  ymm2, ymm2
-        jnz     .return_fail
-        vptest  ymm3, ymm3
-        jnz     .return_fail
-
-.return_pass:
-        mov     return, 0
+        vmovdqu ymm0, [src]
+        vpor    ymm0, ymm0, [src+32]
+        vmovdqu ymm1, [src+64]
+        vpor    ymm1, ymm1, [src+96]
+        add     src, 128
+        xor     DWORD(tmp1), DWORD(tmp1)
+        sub     len, 1
+        setz    BYTE(tmp1)
+        vpor    ymm0, ymm0, ymm1
+        vpcmpeqb ymm0, ymm2, ymm0
+        vpmovmskb DWORD(tmp0), ymm0
+        not     DWORD(tmp0)
+        add     DWORD(tmp1), DWORD(tmp0)
+        jz      .mem_z_loop
+
+.return:
+        xor     eax, eax
+        test    tmp0, tmp0
+        setnz   al
         FUNC_RESTORE
         ret
```
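The new main loop fuses its exit test into a single branch: `sub len, 1` / `setz` yields 1 on the final block, the inverted compare mask is nonzero when data was found, and one `add` + `jz` (a macro-fusible pair on recent Intel cores) tests both conditions at once. A C sketch of that control flow, reusing the hypothetical `or_mask_128()` above (plain `|` is used here to keep the C version free of unsigned wraparound):

```c
/* Hypothetical sketch of the loop's fused exit condition. */
static int scan_blocks(const uint8_t *src, uint64_t blocks /* = len >> 7 */)
{
        uint32_t mask, last;
        do {
                mask = or_mask_128(src);   /* 0 iff this 128-byte block is zero */
                src += 128;
                last = (--blocks == 0);    /* sub len, 1 ; setz BYTE(tmp1) */
        } while ((mask | last) == 0);      /* add DWORD(tmp1), DWORD(tmp0) ; jz */
        return mask != 0;                  /* .return: test tmp0 ; setnz al */
}
```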
```diff


+align 16
 .mem_z_small_block:
-        add     len, 4*32
-        cmp     len, 2*32
-        jl      .mem_z_lt64
-        vmovdqu ymm0, [src]
-        vmovdqu ymm1, [src+32]
-        vmovdqu ymm2, [src+len-2*32]
-        vmovdqu ymm3, [src+len-1*32]
-        vptest  ymm0, ymm0
-        jnz     .return_fail
-        vptest  ymm1, ymm1
-        jnz     .return_fail
-        vptest  ymm2, ymm2
-        jnz     .return_fail
-        vptest  ymm3, ymm3
-        jnz     .return_fail
-        jmp     .return_pass
-
-.mem_z_lt64:
-        cmp     len, 32
-        jl      .mem_z_lt32
-        vmovdqu ymm0, [src]
-        vmovdqu ymm1, [src+len-32]
-        vptest  ymm0, ymm0
-        jnz     .return_fail
-        vptest  ymm1, ymm1
-        jnz     .return_fail
-        jmp     .return_pass
-
-.mem_z_lt32:
-        cmp     len, 16
-        jl      .mem_z_lt16
-        vmovdqu xmm0, [src]
-        vmovdqu xmm1, [src+len-16]
-        vptest  xmm0, xmm0
-        jnz     .return_fail
-        vptest  xmm1, xmm1
-        jnz     .return_fail
-        jmp     .return_pass
-
-.mem_z_lt16:
-        cmp     len, 8
-        jl      .mem_z_lt8
-        mov     tmp, [src]
-        mov     tmp3, [src+len-8]
-        or      tmp, tmp3
-        test    tmp, tmp
-        jnz     .return_fail
-        jmp     .return_pass
-
-.mem_z_lt8:
-        cmp     len, 0
-        je      .return_pass
-.mem_z_1byte_loop:
-        mov     tmpb, [src+pos]
-        cmp     tmpb, 0
-        jnz     .return_fail
-        add     pos, 1
-        cmp     pos, len
-        jl      .mem_z_1byte_loop
-        jmp     .return_pass
-
-.return_fail:
-        mov     return, 1
-        FUNC_RESTORE
+        ; len < 128
+        xor     DWORD(tmp0), DWORD(tmp0)
+        movzx   DWORD(tmp1), BYTE(len)
+        cmp     DWORD(len), 16
+        jb      .mem_z_small_check_zero
+        ; 16 <= len < 128
+        shr     DWORD(len), 4
+        xor     eax, eax                ; alignment
+.mem_z_small_block_loop:
+        xor     eax, eax
+        mov     tmp0, [src]
+        or      tmp0, [src+8]
+        sub     DWORD(len), 1
+        setz    al
+        add     src, 16
+        add     rax, tmp0
+        jz      .mem_z_small_block_loop
+
+        test    tmp0, tmp0
+        jnz     .return_small
+        movzx   DWORD(len), BYTE(tmp1)
+
+.mem_z_small_check_zero:
+        xor     DWORD(tmp0), DWORD(tmp0)
+        and     DWORD(len), 15
+        jz      .return_small
+.mem_z_small_byte_loop:
+        movzx   eax, byte [src]
+        add     src, 1
+        or      DWORD(tmp0), eax
+        sub     DWORD(len), 1
+        jnz     .mem_z_small_byte_loop
+.return_small:
+        xor     eax, eax
+        test    tmp0, tmp0
+        setnz   al
         ret

 endproc_frame
```
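The small path (`len < 128`) replaces the old cascade of overlapping `vptest` checks with a 16-byte scalar OR loop followed by a byte loop for the remaining `len % 16` bytes; the low byte of `len` is parked in `tmp1` so the tail count survives the first loop clobbering `len`. A rough C sketch under those assumptions (names hypothetical):

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical sketch of the len < 128 path. */
static int small_is_nonzero(const uint8_t *src, size_t len)
{
        uint64_t acc = 0;
        for (size_t n = len >> 4; n; n--, src += 16) {
                uint64_t a, b;
                memcpy(&a, src, 8);        /* mov tmp0, [src]   */
                memcpy(&b, src + 8, 8);    /* or  tmp0, [src+8] */
                if ((acc = a | b) != 0)
                        return 1;          /* early out on nonzero data */
        }
        for (len &= 15; len; len--)        /* byte tail, len % 16 bytes */
                acc |= *src++;
        return acc != 0;
}
```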
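Both return paths keep the existing pass/fail convention (`0` for all-zero, `1` otherwise). A sketch of the expected call semantics; the prototype is an assumption modeled on ISA-L's `mem_zero_detect()` API:

```c
#include <stdio.h>
#include <string.h>

/* Assumed prototype: returns 0 iff the buffer contains only zero bytes. */
int mem_zero_detect_avx(void *src, size_t len);

int main(void)
{
        unsigned char buf[300] = {0};
        printf("%d\n", mem_zero_detect_avx(buf, sizeof(buf)));  /* expect 0 */
        buf[257] = 1;
        printf("%d\n", mem_zero_detect_avx(buf, sizeof(buf)));  /* expect 1 */
        return 0;
}
```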