|
1 | 1 | #=
|
2 | 2 | UTF8Str type
|
3 | 3 |
|
4 |
| -Copyright 2017-2018 Gandalf Software, Inc., Scott P. Jones, |
| 4 | +Copyright 2017-2020 Gandalf Software, Inc., Scott P. Jones, |
5 | 5 | and other contributors to the Julia language
|
6 | 6 |
|
7 | 7 | Licensed under MIT License, see LICENSE.md
|
@@ -89,10 +89,19 @@ xor 80 then << 1 then |
|
89 | 89 | 11 -> 01 -> 1
|
90 | 90 | =#
|
91 | 91 |
|
# Build a BigChunk mask from a UInt mask: the mask is converted to BigChunk,
# shifted left by the bit width of UInt, and OR'ed with the original mask.
# NOTE(review): presumably BigChunk is twice as wide as UInt, so the result
# repeats `msk` in both halves - confirm against the BigChunk definition.
@inline function _widen_mask(msk::UInt)
    upper = (msk % BigChunk) << (8 * sizeof(UInt))
    upper | msk
end
| 93 | + |
# Mask with the top bit of every byte set; the 32-bit literal is used
# when CHUNKSZ is 4, otherwise the 64-bit one.
const hi_mask = (CHUNKSZ == 4) ? 0x80808080 : 0x8080808080808080

# The same mask passed through _widen_mask, for use with BigChunk values.
const big_hi_mask = _widen_mask(hi_mask)
| 96 | + |
# Count the bytes of `v` whose two top bits are `10`, given a mask `msk`
# that has the top bit of each byte set (e.g. hi_mask).
# Per byte: flip the top bit, OR bit 6 into bit 7, flip the top bit again;
# the top bit is then set only for bytes of the form 0b10xxxxxx.
@inline function _count_cont(v, msk)
    flipped = xor(v, msk)
    merged = (flipped << 1) | flipped
    count_ones(xor(merged, msk) & msk)
end
# Return a mask with the top bit set for every byte of `v` that is NOT of
# the form 0b10xxxxxx, given a mask `msk` with the top bit of each byte set.
# This is the per-byte complement (within `msk`) of what _count_cont counts.
@inline function msk_lead(v, msk)
    flipped = xor(v, msk)
    merged = (flipped << 1) | flipped
    cont_bits = xor(merged, msk) & msk
    xor(cont_bits, msk)
end
| 99 | + |
# UInt-sized convenience methods: apply the two-argument forms with hi_mask.
@inline function _count_cont(v::UInt)
    _count_cont(v, hi_mask)
end

@inline function msk_lead(v::UInt)
    msk_lead(v, hi_mask)
end
93 | 102 |
|
94 |
| -@inline _count_cont(v) = (v = xor(v, hi_mask); count_ones(xor(((v << 1) | v), hi_mask) & hi_mask)) |
95 |
| -@inline msk_lead(v) = (v = xor(v, hi_mask); xor(xor(((v << 1) | v), hi_mask) & hi_mask, hi_mask)) |
# BigChunk-sized convenience method: apply the two-argument form with big_hi_mask.
@inline function _count_cont(v::BigChunk)
    _count_cont(v, big_hi_mask)
end
# BigChunk-sized convenience method for msk_lead, using big_hi_mask.
# The two-argument helper is named `msk_lead` (no leading underscore), so the
# wrapper must forward to that name; forwarding to `_msk_lead(v, msk)` would
# fail with a MethodError because no such two-argument method is defined here.
@inline msk_lead(v::BigChunk) = msk_lead(v, big_hi_mask)

# Kept under the original (underscore) name so existing callers keep working.
@inline _msk_lead(v::BigChunk) = msk_lead(v, big_hi_mask)
96 | 105 |
|
97 | 106 | @inline function _align_len_utf8(pnt, cnt, v)
|
98 | 107 | len = 0
|
@@ -141,29 +150,83 @@ end
|
141 | 150 | _check_mask_al(pnt, cnt, msk, v)
|
142 | 151 | end
|
143 | 152 |
|
# Keep only the low `cnt mod sizeof(T)` bytes of `v`, zeroing the rest.
# When `cnt` is a multiple of sizeof(T) the whole value is returned unchanged.
#
# Note on precedence: in Julia `&` binds tighter than binary `-`, so the mask
# `(one(T) << bits) - one(T)` must be fully parenthesized before it is AND'ed
# with `v`; otherwise the expression becomes `(v & (one(T) << bits)) - 1`.
@inline function _mask_bytes(v::T, cnt) where {T}
    # number of valid bytes in a partially filled chunk (0 means "all of them")
    partial = (cnt & (sizeof(T) - 1)) % UInt
    # ifelse evaluates both arms; for partial == 0 the mask arm is simply unused
    ifelse(partial == 0, v, v & ((one(T) << (partial << 3)) - one(T)))
end
| 156 | + |
# Load one value of type T (the type of `msk`) from `ptr`, AND it with `msk`,
# discard the bytes beyond `cnt` via _mask_bytes, and report whether nothing
# is left set.
# Note that a full sizeof(T) bytes are always loaded, even when `cnt` is smaller.
@inline function chk_chunk(ptr, msk::T, cnt) where {T}
    chunk = unsafe_load(reinterpret(Ptr{T}, ptr))
    iszero(_mask_bytes(chunk & msk, cnt))
end
| 159 | + |
# Return true if none of the bits selected by `msk` are set in the `cnt` bytes
# starting at `ptr`, false otherwise.
#   ptr - pointer to the first byte to examine
#   cnt - number of bytes to examine (callers in this file pass cnt > 0)
#   msk - UInt-sized mask of the bits that must be clear (widened internally)
@inline function _check_block_al(ptr, cnt, msk)
    # First check very frequent cases of short strings
    # (on 64-bit machines, 1-8 bytes, 9-16 bytes, and 17-24)
    # taking advantage of the knowledge of how String types are stored in Julia,
    # i.e. UInt length, immediately followed by the string data, aligned on sizeof(UInt)*2
    cnt <= CHUNKSZ && return chk_chunk(ptr, msk, cnt)
    bigmsk = _widen_mask(msk)
    cnt <= BIGCHUNKSZ && return chk_chunk(ptr, bigmsk, cnt)
    # Check the first UInt-sized chunk on its own
    (unsafe_load(_pntchunk(ptr)) & msk) == 0 || return false
    # From here on, `cnt` counts only the bytes that follow the first chunk,
    # so every remaining load must start at `rest`, not at `ptr`
    cnt -= CHUNKSZ
    rest = ptr + CHUNKSZ
    cnt <= BIGCHUNKSZ && return chk_chunk(rest, bigmsk, cnt)
    pnt = _pntbigchunk(rest)
    fin = _pntbigchunk(rest + cnt)
    v = unsafe_load(pnt) & bigmsk
    # `v` always holds the most recently loaded chunk; it is tested one
    # iteration late so that the final chunk can be byte-masked below
    while (pnt += BIGCHUNKSZ) < fin
        v == 0 || return false
        v = unsafe_load(pnt) & bigmsk
    end
    # Final (possibly partial) chunk: ignore bytes beyond the end of the data
    iszero(_mask_bytes(v, cnt))
end
| 180 | + |
# Variant of the block check for a start address that need not be aligned:
# return true if none of the bits selected by `msk` are set in the `cnt` bytes
# starting at `beg`.
# The address is rounded down using BIGCHUNKMSK, the bytes in front of `beg`
# in the first loaded chunk are cleared, and `cnt` is extended by that offset
# so that it is measured from the rounded-down address.
@inline function _check_block_ul(beg, cnt, msk)
    addr = reinterpret(UInt, beg)
    pnt = _pntbigchunk(addr & ~BIGCHUNKMSK)
    v = unsafe_load(pnt)
    offset = addr & BIGCHUNKMSK
    if offset != 0
        # drop the leading bytes that precede `beg`
        v &= ~_big_mask_bytes(offset)
        cnt += offset
    end
    fin = _pntbigchunk(pnt + cnt)
    bigmsk = _widen_mask(msk)
    pnt += BIGCHUNKSZ
    while pnt < fin
        (v & bigmsk) == 0 || return false
        v = unsafe_load(pnt)
        pnt += BIGCHUNKSZ
    end
    # last chunk: keep only the bytes that belong to the data when it is partial
    tail = (cnt & BIGCHUNKMSK) == 0 ? v : (v & _big_mask_bytes(cnt))
    (tail & bigmsk) == 0
end
| 197 | + |
# Word-sized masks used to test many code units at once.
#   _ascii_mask(T) - bits that are set only by code units of type T above 0x7f
#   _latin_mask(T) - bits that are set only by code units of type T above 0xff
#   _bmp_mask_32   - bits that are set only by 32-bit code units above 0xffff
# Each literal must have the width of UInt, because the masks are passed on to
# helpers that take a `UInt` mask.
_ascii_mask(::Type{UInt8}) = hi_mask

# Select the literals by the size of UInt in bytes. (Comparing the type itself
# with an integer, `UInt == 4`, is always false and would pick the 64-bit
# literals on every platform.)
@static if sizeof(UInt) == 4
    _ascii_mask(::Type{UInt16}) = 0xff80_ff80
    _ascii_mask(::Type{UInt32}) = 0xffffff80

    _latin_mask(::Type{UInt16}) = 0xff00_ff00
    _latin_mask(::Type{UInt32}) = 0xffffff00

    const _bmp_mask_32 = 0xffff0000
else
    _ascii_mask(::Type{UInt16}) = 0xff80_ff80_ff80_ff80
    _ascii_mask(::Type{UInt32}) = 0xffffff80_ffffff80

    _latin_mask(::Type{UInt16}) = 0xff00_ff00_ff00_ff00
    _latin_mask(::Type{UInt32}) = 0xffffff00_ffffff00

    const _bmp_mask_32 = 0xffff0000_ffff0000
end
| 216 | + |
# Character set encodings accepted by the `is_ascii` methods for Str and
# SubString{<:Str} below.
const ASCII_Union =
    Union{UTF8CSE, LatinCSE, Binary_CSEs,
          UTF16CSE, UCS2CSE,
          Text2CSE, Text4CSE, UTF32CSE}
| 218 | + |
# is_ascii for a substring of a Str: an empty substring is ASCII; otherwise the
# bytes are checked with _check_block_ul (the variant that adjusts for the start
# address) against the ASCII mask for the code unit type of `C`.
function is_ascii(str::SubString{<:Str{C}}) where {C<:ASCII_Union}
    cnt = sizeof(str)
    cnt == 0 && return true
    # keep `str` rooted while its raw pointer is in use
    @preserve str _check_block_ul(pointer(str), cnt, _ascii_mask(codeunit(C)))
end
157 | 222 |
|
# is_ascii for a vector of code units: an empty vector is ASCII; otherwise the
# bytes are checked with _check_block_ul against the ASCII mask for `T`.
# The object kept alive by @preserve must be the argument whose pointer is
# taken, i.e. `vec` (there is no variable named `str` in this method).
is_ascii(vec::Vector{T}) where {T<:CodeUnitTypes} =
    (cnt = sizeof(vec)) == 0 ||
    (@preserve vec _check_block_ul(pointer(vec), cnt, _ascii_mask(T)))
161 | 226 |
|
162 |
| -is_ascii(str::Str{C}) where {C<:Union{UTF8_CSEs,LatinCSE,Binary_CSEs,UTF16CSE,UCS2CSE, |
163 |
| - Text2CSE,Text4CSE,UTF32CSE}} = |
164 |
| - (cnt = sizeof(str)) == 0 ? true : |
165 |
| - @preserve str _check_mask_al(reinterpret(Ptr{UInt}, pointer(str)), cnt, |
166 |
| - _ascii_mask(codeunit(C))) |
# is_ascii for a whole Str: an empty string is ASCII; otherwise the bytes are
# checked with _check_block_al against the ASCII mask for the code unit type
# of `C`.
function is_ascii(str::Str{C}) where {C<:ASCII_Union}
    cnt = sizeof(str)
    cnt == 0 && return true
    # keep `str` rooted while its raw pointer is in use
    @preserve str _check_block_al(pointer(str), cnt, _ascii_mask(codeunit(C)))
end
167 | 230 |
|
168 | 231 | # Todo! Here you need to see that 0b11yyyyxx at least 1 y must be set,
|
169 | 232 | # which indicates a non-Latin1 character
|
|
0 commit comments