@@ -64,105 +64,126 @@
 
 %define src   arg0
 %define len   arg1
-%define tmp0  arg2
-%define tmp1  arg3
+%define ptr   arg2
+%define pos   return
 
-%use smartalign
-ALIGNMODE P6
 default rel
 
 [bits 64]
 section .text
-align 32        ; maximize mu-ops cache usage
+
+align 16
 mk_global  mem_zero_detect_avx, function
 func(mem_zero_detect_avx)
         FUNC_SAVE
-        cmp     len, 127
-        jbe     .mem_z_small_block
-        ; check the first 128 bytes
-        vpxor   xmm2, xmm2, xmm2
-        vmovdqu ymm0, [src]
-        vpor    ymm0, ymm0, [src+32]
-        vmovdqu ymm1, [src+64]
-        vpor    ymm1, ymm1, [src+96]
-        vpor    ymm0, ymm0, ymm1
-        vpcmpeqb   ymm0, ymm2, ymm0
-        vpmovmskb  DWORD(tmp0), ymm0
-        not     DWORD(tmp0)
-        mov     DWORD(tmp1), DWORD(len)
-        and     DWORD(tmp1), 127
-        add     src, tmp1
-        xor     eax, eax
-        shr     len, 7                  ; len/128
-        test    len, len                ; break partial flag stall
-        setz    al                      ; if len < 128, eax != 0
-        add     eax, DWORD(tmp0)        ; jump if (edx OR eax) !=0, use add for macrofusion
-        jnz     .return
-        xor     eax, eax
+        mov     pos, 0
+        sub     len, 4*32
+        jle     .mem_z_small_block
 
-align 16
 .mem_z_loop:
-        vmovdqu ymm0, [src]
-        vpor    ymm0, ymm0,[src+32]
-        vmovdqu ymm1, [src+64]
-        vpor    ymm1, ymm1, [src+96]
-        add     src, 128
-        xor     DWORD(tmp1), DWORD(tmp1)
-        sub     len, 1
-        setz    BYTE(tmp1)
-        vpor    ymm0, ymm0, ymm1
-        vpcmpeqb   ymm0, ymm2, ymm0
-        vpmovmskb  DWORD(tmp0), ymm0
-        not     DWORD(tmp0)
-        add     DWORD(tmp1), DWORD(tmp0)
-        jz      .mem_z_loop
-
-.return:
-        xor     eax, eax
-        test    tmp0, tmp0
-        setnz   al
+        vmovdqu ymm0, [src+pos]
+        vmovdqu ymm1, [src+pos+1*32]
+        vmovdqu ymm2, [src+pos+2*32]
+        vmovdqu ymm3, [src+pos+3*32]
+        vptest  ymm0, ymm0
+        jnz     .return_fail
+        vptest  ymm1, ymm1
+        jnz     .return_fail
+        vptest  ymm2, ymm2
+        jnz     .return_fail
+        vptest  ymm3, ymm3
+        jnz     .return_fail
+        add     pos, 4*32
+        cmp     pos, len
+        jl      .mem_z_loop
+
+.mem_z_last_block:
+        vmovdqu ymm0, [src+len]
+        vmovdqu ymm1, [src+len+1*32]
+        vmovdqu ymm2, [src+len+2*32]
+        vmovdqu ymm3, [src+len+3*32]
+        vptest  ymm0, ymm0
+        jnz     .return_fail
+        vptest  ymm1, ymm1
+        jnz     .return_fail
+        vptest  ymm2, ymm2
+        jnz     .return_fail
+        vptest  ymm3, ymm3
+        jnz     .return_fail
+
+.return_pass:
+        mov     return, 0
         FUNC_RESTORE
         ret
 
 
-align 16
 .mem_z_small_block:
-        ;len < 128
-        xor     DWORD(tmp0), DWORD(tmp0)
-        movzx   DWORD(tmp1), BYTE(len)
-        cmp     DWORD(len), 16
-        jb      .mem_z_small_check_zero
-        ;17 < len < 128
-        shr     DWORD(len), 4
-        xor     eax, eax                ; alignment
-.mem_z_small_block_loop:
-        xor     eax, eax
-        mov     tmp0, [src]
-        or      tmp0, [src+8]
-        sub     DWORD(len), 1
-        setz    al
-        add     src, 16
-        add     rax, tmp0
-        jz      .mem_z_small_block_loop
-
-        test    tmp0, tmp0
-        jnz     .return_small
-        movzx   DWORD(len), BYTE(tmp1)
-
-.mem_z_small_check_zero:
-        xor     DWORD(tmp0), DWORD(tmp0)
-        and     DWORD(len), 15
-        jz      .return_small
-.mem_z_small_byte_loop:
-        movzx   eax, byte [src]
-        add     src, 1
-        or      DWORD(tmp0), eax
-        sub     DWORD(len), 1
-        jnz     .mem_z_small_byte_loop
-.return_small:
-        xor     eax, eax
-        test    tmp0, tmp0
-        setnz   al
+        add     len, 4*32
+        cmp     len, 2*32
+        jl      .mem_z_lt64
+        vmovdqu ymm0, [src]
+        vmovdqu ymm1, [src+32]
+        vmovdqu ymm2, [src+len-2*32]
+        vmovdqu ymm3, [src+len-1*32]
+        vptest  ymm0, ymm0
+        jnz     .return_fail
+        vptest  ymm1, ymm1
+        jnz     .return_fail
+        vptest  ymm2, ymm2
+        jnz     .return_fail
+        vptest  ymm3, ymm3
+        jnz     .return_fail
+        jmp     .return_pass
+
+.mem_z_lt64:
+        cmp     len, 32
+        jl      .mem_z_lt32
+        vmovdqu ymm0, [src]
+        vmovdqu ymm1, [src+len-32]
+        vptest  ymm0, ymm0
+        jnz     .return_fail
+        vptest  ymm1, ymm1
+        jnz     .return_fail
+        jmp     .return_pass
+
+
+.mem_z_lt32:
+        cmp     len, 16
+        jl      .mem_z_lt16
+        vmovdqu xmm0, [src]
+        vmovdqu xmm1, [src+len-16]
+        vptest  xmm0, xmm0
+        jnz     .return_fail
+        vptest  xmm1, xmm1
+        jnz     .return_fail
+        jmp     .return_pass
+
+
+.mem_z_lt16:
+        cmp     len, 8
+        jl      .mem_z_lt8
+        mov     tmp, [src]
+        mov     tmp3, [src+len-8]
+        or      tmp, tmp3
+        test    tmp, tmp
+        jnz     .return_fail
+        jmp     .return_pass
+
+.mem_z_lt8:
+        cmp     len, 0
+        je      .return_pass
+.mem_z_1byte_loop:
+        mov     tmpb, [src+pos]
+        cmp     tmpb, 0
+        jnz     .return_fail
+        add     pos, 1
+        cmp     pos, len
+        jl      .mem_z_1byte_loop
+        jmp     .return_pass
+
+.return_fail:
+        mov     return, 1
+        FUNC_RESTORE
         ret
 
 endproc_frame
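The rewrite drops the vpcmpeqb/vpmovmskb mask bookkeeping and ORs nothing at all: it simply vptest-checks four 32-byte loads per 128-byte block in the main loop, re-tests the final 128 bytes with loads that may overlap the last loop iteration, and handles short buffers with a size cascade (64/32/16/8 bytes, then a byte loop). For reference, below is a minimal scalar C sketch of the same overlapping-window idea; the function name mem_zero_detect_ref and the 16-byte block size are illustrative assumptions, not part of this patch or the ISA-L API.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Scalar sketch (hypothetical, not from the patch): scan fixed-size blocks,
 * then re-check the tail with an overlapping load instead of a byte loop.
 * Returns 0 if buf[0..len) is all zero, non-zero otherwise. */
static int mem_zero_detect_ref(const void *buf, size_t len)
{
        const uint8_t *p = (const uint8_t *)buf;
        size_t pos = 0;
        uint64_t a, b;

        if (len >= 16) {
                /* Main loop over whole 16-byte blocks (mirrors .mem_z_loop). */
                for (; pos + 16 <= len; pos += 16) {
                        memcpy(&a, p + pos, 8);
                        memcpy(&b, p + pos + 8, 8);
                        if (a | b)
                                return 1;
                }
                /* Overlapping tail: the last 16 bytes, possibly re-reading data
                 * already covered above (mirrors .mem_z_last_block). */
                memcpy(&a, p + len - 16, 8);
                memcpy(&b, p + len - 8, 8);
                return (a | b) != 0;
        }

        /* Buffers shorter than one block (mirrors .mem_z_1byte_loop). */
        for (; pos < len; pos++)
                if (p[pos])
                        return 1;
        return 0;
}

The overlapping final load is what removes the old per-iteration counter and mask logic: correctness no longer depends on len being a multiple of the block size, only on len being at least one block long.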