Skip to content

Commit 275d95b

Browse files
committed
Further optimizations, for length, is_latin, is_bmp
1 parent 4e53204 commit 275d95b

File tree

4 files changed

+105
-95
lines changed

4 files changed

+105
-95
lines changed

src/StrBase.jl

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,15 @@ using ModuleInterfaceTools
2323

2424
@api develop! check_string, unsafe_check_string, fast_check_string, skipascii, skipbmp,
2525
countmask, count_chars, _count_mask_al, _count_mask_ul, count_latin,
26-
_copysub, _cvtsize, _repeat, empty_str, _data, _pntchunk, _str,
26+
_copysub, _cvtsize, _repeat, empty_str, _data, _mask_bytes,
27+
_pntchunk, _pntbigchunk, _str,
2728
ValidatedStyle, MutableStyle, EqualsStyle, CanContain
2829

2930
@api develop LineCounts, CharTypes, CharStat, maxbit, calcstats, check_continuation,
3031
UTF_LONG, UTF_LATIN1, UTF_UNICODE2, UTF_UNICODE3, UTF_UNICODE4, UTF_SURROGATE,
31-
UTF_INVALID, CHUNKSZ, CHUNKMSK,
32+
UTF_INVALID, CHUNKSZ, CHUNKMSK, BIGCHUNKSZ, BIGCHUNKMSK,
3233
_memcmp, _memcpy, _memset, _fwd_memchr, _rev_memchr,
33-
empty_string, _calcpnt, _mask_bytes, _allocate,
34+
BigChunk, empty_string, _calcpnt, _allocate, SingleCU, MultiCU,
3435
MS_UTF8, MS_UTF16, MS_UTF32, MS_SubUTF32, MS_Latin, MS_ByteStr, MS_RawUTF8,
3536
_wrap_substr, _empty_sub,
3637
AccessType, UInt16_U, UInt32_U, UInt16_S, UInt32_S, UInt16_US, UInt32_US,

src/types.jl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,13 @@ ncodeunits(s::Str{<:Quad_CSEs}) = sizeof(s) >>> 2
144144

145145
@inline _big_mask_bytes(n) = ((1%BigChunk) << ((n & BIGCHUNKMSK) << 3)) - 0x1
146146

147+
# Keep only the low `cnt % sizeof(T)` bytes of `v`, clearing the bytes above them.
# When `cnt` is a multiple of `sizeof(T)` the whole of `v` is significant and it is
# returned unchanged.
@inline function _mask_bytes(v::T, cnt) where {T}
    # Bit shifts bind tighter than `%` in Julia, so the conversion has to be parenthesized
    # (otherwise this would parse as `x % (UInt << 3)`)
    shft = ((cnt & (sizeof(T) - 1)) % UInt) << 3
    # ifelse keeps this branch-free; both alternatives are cheap to compute
    return ifelse(shft == 0, v, v & ~(typemax(T) << shft))
end
151+
152+
@inline _widen_mask(msk::UInt) = ((msk%BigChunk) << (8*sizeof(UInt))) | msk
153+
147154
# Support for SubString of Str
148155

149156
Base.SubString(str::Str{C}) where {C<:SubSet_CSEs} =

src/utf16.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ const _hi_bit_16 = CHUNKSZ == 4 ? 0x8000_8000 : 0x8000_8000_8000_8000
2222
len += count_ones(v)
2323
v = _get_lead(pnt)
2424
end
25-
len + count_ones((cnt & CHUNKMSK) == 0 ? v : (v & _mask_bytes(cnt)))
25+
len + count_ones(_mask_bytes(v, cnt))
2626
end
2727

2828
_length_al(::MultiCU, ::Type{UTF16CSE}, beg::Ptr{UInt16}, cnt::Int) =
@@ -100,7 +100,7 @@ end
100100
v == 0 || return false
101101
v = _get_masked(pnt)
102102
end
103-
((cnt & CHUNKMSK) == 0 ? v : (v & _mask_bytes(cnt))) == 0
103+
_mask_bytes(v, cnt) == 0
104104
end
105105
@inline _check_bmp_utf16_al(pnt, cnt) = _check_bmp_utf16_al(pnt, cnt, unsafe_load(pnt))
106106

@@ -116,11 +116,11 @@ end
116116
end
117117

118118
is_bmp(str::Str{UTF16CSE}) =
119-
(cnt = sizeof(str)) == 0 ? true :
119+
(cnt = sizeof(str)) == 0 ||
120120
@preserve str _check_bmp_utf16_al(reinterpret(Ptr{UInt}, pointer(str)), cnt)
121121

122122
is_bmp(str::SubString{<:Str{UTF16CSE}}) =
123-
(cnt = sizeof(str)) == 0 ? true : @preserve str _check_bmp_utf16_ul(pointer(str), cnt)
123+
(cnt = sizeof(str)) == 0 || @preserve str _check_bmp_utf16_ul(pointer(str), cnt)
124124

125125
is_bmp(str::MaybeSub{<:Str{<:UCS2_CSEs}}) = true
126126

src/utf8.jl

Lines changed: 90 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -89,110 +89,101 @@ xor 80 then << 1 then |
8989
11 -> 01 -> 1
9090
=#
9191

92-
@inline _widen_mask(msk::UInt) = ((msk%BigChunk) << (8*sizeof(UInt))) | msk
93-
9492
const hi_mask = CHUNKSZ == 4 ? 0x8080_8080 : 0x8080_8080_8080_8080
9593
const big_hi_mask = _widen_mask(hi_mask)
9694

97-
@inline _count_cont(v, msk) = (v = xor(v, msk); count_ones(xor(((v << 1) | v), msk) & msk))
95+
@inline get_high_mask(::UInt) = hi_mask
96+
@inline get_high_mask(::BigChunk) = big_hi_mask
97+
9898
@inline msk_lead(v, msk) = (v = xor(v, msk); xor(xor(((v << 1) | v), msk) & msk, msk))
9999

100-
@inline _count_cont(v::UInt) = _count_cont(v, hi_mask)
101-
@inline msk_lead(v::UInt) = msk_lead(v, hi_mask)
100+
@inline msk_lead(v) = msk_lead(v, get_high_mask(v))
102101

103-
@inline _count_cont(v::BigChunk) = _count_cont(v, big_hi_mask)
104-
@inline _msk_lead(v::BigChunk) = _msk_lead(v, big_hi_mask)
102+
@inline get_lead(T, ptr) = msk_lead(unsafe_load(reinterpret(Ptr{T}, ptr)))
105103

106-
@inline function _align_len_utf8(pnt, cnt, v)
107-
len = 0
108-
fin = pnt + cnt
109-
v = msk_lead(v)
110-
while (pnt += CHUNKSZ) < fin
104+
@inline count_masked(v, cnt) = count_ones(_mask_bytes(v, cnt))
105+
106+
# Sum, over the `cnt` bytes of UTF-8 data starting at `beg`, the bits that `get_lead`
# leaves set (presumably one per non-continuation byte, i.e. one per character -
# confirm against `msk_lead`)
function _length_al(::MultiCU, ::Type{UTF8CSE}, beg::Ptr{UInt8}, cnt::Int)
    # First check very frequent cases of short strings
    # (on 64-bit machines, 1-8 bytes, 9-16 bytes, and 17-24)
    # taking advantage of the knowledge of how String types are stored in Julia,
    # i.e. UInt length, immediately followed by the string data, aligned on sizeof(UInt)*2
    if cnt <= BIGCHUNKSZ
        cnt <= CHUNKSZ && return count_masked(get_lead(UInt, beg), cnt)
        return count_masked(get_lead(BigChunk, beg), cnt)
    end
    # The first chunk is counted on its own, the remainder a big chunk at a time
    total = count_ones(get_lead(UInt, beg))
    cnt -= CHUNKSZ
    cur = _pntbigchunk(beg + CHUNKSZ)
    leads = get_lead(BigChunk, cur)
    cnt <= BIGCHUNKSZ && return total + count_masked(leads, cnt)
    stop = _pntbigchunk(beg + CHUNKSZ + cnt)
    cur += BIGCHUNKSZ
    while cur < stop
        total += count_ones(leads)
        leads = get_lead(BigChunk, cur)
        cur += BIGCHUNKSZ
    end
    # Only the bytes of the last big chunk that belong to the data are counted
    return total + count_masked(leads, cnt)
end
116128

117-
_length_al(::MultiCU, ::Type{UTF8CSE}, beg::Ptr{UInt8}, cnt::Int) =
118-
(pnt = reinterpret(Ptr{UInt}, beg); _align_len_utf8(pnt, cnt, unsafe_load(pnt)))
119-
120-
function _length(::MultiCU, ::Type{UTF8CSE}, beg::Ptr{UInt8}, cnt::Int)
129+
# Variant of the UTF-8 length computation for data that need not start on a big chunk
# boundary: loading starts at the enclosing BigChunk-aligned address
function _length_ul(::MultiCU, ::Type{UTF8CSE}, beg::Ptr{UInt8}, cnt::Int)
    addr = reinterpret(UInt, beg)
    cur = reinterpret(Ptr{BigChunk}, addr & ~BIGCHUNKMSK)
    chunk = unsafe_load(cur)
    skip = addr & BIGCHUNKMSK
    if skip != 0
        # Overwrite the bytes in front of `beg` with the high-bit pattern
        # (presumably so that they look like continuation bytes and are not counted)
        pad = _big_mask_bytes(skip)
        chunk = (chunk & ~pad) | (pad & big_hi_mask)
        # The byte count is now measured from the aligned address
        cnt += skip
    end
    total = 0
    stop = cur + cnt
    leads = msk_lead(chunk)
    cur += BIGCHUNKSZ
    while cur < stop
        total += count_ones(leads)
        leads = msk_lead(unsafe_load(cur))
        cur += BIGCHUNKSZ
    end
    # Bytes of the last big chunk beyond the end of the data are masked off
    return total + count_masked(leads, cnt)
end
152147

153-
@inline _mask_bytes(v::T, cnt) where {T} =
154-
ifelse((cnt & (sizeof(T)-1)%UInt) == 0,
155-
v, T(v & (one(T) << ((cnt & (sizeof(T)-1)%UInt) << 3)) - 1))
156-
157-
@inline chk_chunk(ptr, msk::T, cnt) where {T} =
158-
iszero(_mask_bytes(unsafe_load(reinterpret(Ptr{T}, ptr)) & msk, cnt))
148+
@inline get_chunk(ptr, msk::T, cnt) where {T} =
149+
_mask_bytes(unsafe_load(reinterpret(Ptr{T}, ptr)) & msk, cnt)
159150

160-
@inline function _check_block_al(ptr, cnt, msk)
151+
# Return true if none of the bits selected by `msk` (repeated across each chunk) are set
# in the first `cnt` bytes starting at `ptr`
@inline function _check_mask_al(ptr, cnt, msk)
    # First check very frequent cases of short strings
    # (on 64-bit machines, 1-8 bytes, 9-16 bytes, and 17-24)
    # taking advantage of the knowledge of how String types are stored in Julia,
    # i.e. UInt length, immediately followed by the string data, aligned on sizeof(UInt)*2
    cnt <= CHUNKSZ && return get_chunk(ptr, msk, cnt) == 0
    bigmsk = _widen_mask(msk)
    cnt <= BIGCHUNKSZ && return get_chunk(ptr, bigmsk, cnt) == 0
    # Longer input: test the first chunk by itself, then continue with big chunks
    (unsafe_load(_pntchunk(ptr)) & msk) == 0 || return false
    cnt -= CHUNKSZ
    # `cnt` now counts the bytes that follow the first chunk, so everything below has
    # to read from `rest`; reading from `ptr` again would leave the final bytes untested
    rest = ptr + CHUNKSZ
    cnt <= BIGCHUNKSZ && return get_chunk(rest, bigmsk, cnt) == 0
    pnt = _pntbigchunk(rest)
    fin = _pntbigchunk(rest + cnt)
    v = unsafe_load(pnt)
    while (pnt += BIGCHUNKSZ) < fin
        (v & bigmsk) == 0 || return false
        v = unsafe_load(pnt)
    end
    # Ignore the bytes of the last big chunk that lie beyond the end of the data
    return _mask_bytes(v & bigmsk, cnt) == 0
end
180171

181-
@inline function _check_block_ul(beg, cnt, msk)
172+
# Return true if none of the bits selected by `msk` are set in the `cnt` bytes starting
# at `beg`, which need not be aligned: loading starts at the enclosing
# BigChunk-aligned address
@inline function _check_mask_ul(beg, cnt, msk)
    wide = _widen_mask(msk)
    addr = reinterpret(UInt, beg)
    cur = _pntbigchunk(addr & ~BIGCHUNKMSK)
    chunk = unsafe_load(cur)
    skip = addr & BIGCHUNKMSK
    if skip != 0
        # Clear the bytes in front of `beg`, and measure the count from the aligned address
        chunk &= ~_big_mask_bytes(skip)
        cnt += skip
    end
    stop = cur + cnt
    cur += BIGCHUNKSZ
    while cur < stop
        (chunk & wide) == 0 || return false
        chunk = unsafe_load(cur)
        cur += BIGCHUNKSZ
    end
    # Bytes of the last big chunk beyond the end of the data are masked off
    return _mask_bytes(chunk & wide, cnt) == 0
end
197188

198189
_ascii_mask(::Type{UInt8}) = hi_mask
@@ -226,88 +217,99 @@ is_ascii(vec::Vector{T}) where {T<:CodeUnitTypes} =
226217

227218
is_ascii(str::Str{C}) where {C<:ASCII_Union} =
228219
(cnt = sizeof(str)) == 0 ||
229-
(@preserve str _check_block_al(pointer(str), cnt, _ascii_mask(codeunit(C))))
220+
(@preserve str _check_mask_al(pointer(str), cnt, _ascii_mask(codeunit(C))))
230221

231222
# For a lead byte of the form 0b11yyyyxx, this checks whether at least 1 y is set,
232223
# which indicates a non-Latin1 character
233-
_all_latin(val) = ((val & (val<<1) & (val<<2 | (val<<3) | (val<<4) | (val<<5))) & hi_mask) == 0
224+
_all_latin(val) =
225+
((val & (val<<1) & (val<<2 | (val<<3) | (val<<4) | (val<<5))) & get_high_mask(val)) == 0
234226

235-
@inline function _check_latin_utf8_al(pnt, cnt, v)
227+
# Return true if `_all_latin` holds for every UInt-sized chunk of the `cnt` bytes
# starting at `beg`
@inline function _check_latin_utf8_al(beg, cnt)
    cur = reinterpret(Ptr{UInt}, beg)
    stop = cur + cnt
    chunk = unsafe_load(cur)
    cur += CHUNKSZ
    while cur < stop
        _all_latin(chunk) || return false
        chunk = unsafe_load(cur)
        cur += CHUNKSZ
    end
    # Bytes of the last chunk beyond the end of the data are masked off
    return _all_latin(_mask_bytes(chunk, cnt))
end
243-
@inline _check_latin_utf8_al(pnt, cnt) = _check_latin_utf8_al(pnt, cnt, unsafe_load(pnt))
244237

245238
# Return true if `_all_latin` holds for the `cnt` bytes starting at `beg`, which need
# not be aligned: loading starts at the enclosing BigChunk-aligned address
@inline function _check_latin_utf8_ul(beg, cnt)
    addr = reinterpret(UInt, beg)
    cur = reinterpret(Ptr{BigChunk}, addr & ~BIGCHUNKMSK)
    chunk = unsafe_load(cur)
    skip = addr & BIGCHUNKMSK
    if skip != 0
        # Clear the bytes in front of `beg`, and measure the count from the aligned address
        chunk &= ~_big_mask_bytes(skip)
        cnt += skip
    end
    stop = cur + cnt
    cur += BIGCHUNKSZ
    while cur < stop
        _all_latin(chunk) || return false
        chunk = unsafe_load(cur)
        cur += BIGCHUNKSZ
    end
    # Bytes of the last big chunk beyond the end of the data are masked off
    return _all_latin(_mask_bytes(chunk, cnt))
end
255253

256254
is_latin(str::Str{UTF8CSE}) =
257-
(siz = sizeof(str)) == 0 ? true :
258-
@preserve str _check_latin_utf8_al(reinterpret(Ptr{UInt}, pointer(str)), siz)
255+
(siz = sizeof(str)) == 0 || @preserve str _check_latin_utf8_al(pointer(str), siz)
259256

260257
is_latin(str::SubString{<:Str{UTF8CSE}}) =
261-
(cnt = sizeof(str)) == 0 ? true : @preserve str _check_latin_utf8_ul(pointer(str), cnt)
258+
(cnt = sizeof(str)) == 0 || @preserve str _check_latin_utf8_ul(pointer(str), cnt)
262259

263260
is_latin(vec::Vector{T}) where {T<:Union{UInt16,UInt32}} =
264-
(cnt = sizeof(vec)) == 0 ? true :
261+
(cnt = sizeof(vec)) == 0 ||
265262
@preserve vec _check_mask_ul(pointer(vec), cnt, _latin_mask(T))
266263

267264
is_latin(str::SubString{<:Str{C}}) where {C<:Union{Word_CSEs,Quad_CSEs}} =
268-
(cnt = sizeof(str)) == 0 ? true :
265+
(cnt = sizeof(str)) == 0 ||
269266
@preserve str _check_mask_ul(pointer(str), cnt, _latin_mask(codeunit(C)))
270267

271268
is_latin(str::Str{C}) where {C<:Union{Word_CSEs,Quad_CSEs}} =
272-
(cnt = sizeof(str)) == 0 ? true :
269+
(cnt = sizeof(str)) == 0 ||
273270
@preserve str _check_mask_al(pointer(str), cnt, _latin_mask(codeunit(C)))
274271

275272
# All 4 top bits must be 1 (i.e. 0xfx) for this to be non-BMP
276-
_all_bmp(val) = ((val | (val<<1) | (val<<2) | (val<<3)) & hi_mask) == 0
273+
# The shifted copies bring bits 6, 5 and 4 of each byte up to its high bit; ANDing them
# with the original leaves the high bit set only for bytes whose 4 top bits are all 1
_all_bmp(val) = ((val & (val<<1) & (val<<2) & (val<<3)) & get_high_mask(val)) == 0
277274

278-
@inline function _check_bmp_utf8_al(pnt, cnt, v)
275+
# Return true if `_all_bmp` holds for every UInt-sized chunk of the `cnt` bytes
# starting at `beg`
@inline function _check_bmp_utf8_al(beg, cnt)
    cur = reinterpret(Ptr{UInt}, beg)
    stop = cur + cnt
    chunk = unsafe_load(cur)
    cur += CHUNKSZ
    while cur < stop
        _all_bmp(chunk) || return false
        chunk = unsafe_load(cur)
        cur += CHUNKSZ
    end
    # Bytes of the last chunk beyond the end of the data are masked off
    return _all_bmp(_mask_bytes(chunk, cnt))
end
286-
@inline _check_bmp_utf8_al(pnt, cnt) = _check_bmp_utf8_al(pnt, cnt, unsafe_load(pnt))
287285

288286
# Return true if `_all_bmp` holds for the `cnt` bytes starting at `beg`, which need
# not be aligned: loading starts at the enclosing BigChunk-aligned address
@inline function _check_bmp_utf8_ul(beg, cnt)
    addr = reinterpret(UInt, beg)
    cur = reinterpret(Ptr{BigChunk}, addr & ~BIGCHUNKMSK)
    chunk = unsafe_load(cur)
    skip = addr & BIGCHUNKMSK
    if skip != 0
        # Clear the bytes in front of `beg`, and measure the count from the aligned address
        chunk &= ~_big_mask_bytes(skip)
        cnt += skip
    end
    stop = cur + cnt
    cur += BIGCHUNKSZ
    while cur < stop
        _all_bmp(chunk) || return false
        chunk = unsafe_load(cur)
        cur += BIGCHUNKSZ
    end
    # Bytes of the last big chunk beyond the end of the data are masked off
    return _all_bmp(_mask_bytes(chunk, cnt))
end
298301

299302
is_bmp(str::Str{UTF8CSE}) =
300-
(cnt = sizeof(str)) == 0 ? true :
301-
@preserve str _check_bmp_utf8_al(reinterpret(Ptr{UInt}, pointer(str)), cnt)
303+
(cnt = sizeof(str)) == 0 || @preserve str _check_bmp_utf8_al(pointer(str), cnt)
302304

303305
is_bmp(str::SubString{<:Str{UTF8CSE}}) =
304-
(cnt = sizeof(str)) == 0 ? true : @preserve str _check_bmp_utf8_ul(pointer(str), cnt)
306+
(cnt = sizeof(str)) == 0 || @preserve str _check_bmp_utf8_ul(pointer(str), cnt)
305307

306308
is_bmp(str::SubString{<:Str{<:Union{Text4CSE,UTF32CSE}}}) =
307-
(cnt = sizeof(str)) == 0 ? true : @preserve str _check_mask_ul(pointer(str), cnt, _bmp_mask_32)
309+
(cnt = sizeof(str)) == 0 || @preserve str _check_mask_ul(pointer(str), cnt, _bmp_mask_32)
308310

309311
is_bmp(str::Str{<:Union{Text4CSE,UTF32CSE}}) =
310-
(cnt = sizeof(str)) == 0 ? true : @preserve str _check_mask_al(pointer(str), cnt, _bmp_mask_32)
312+
(cnt = sizeof(str)) == 0 || @preserve str _check_mask_al(pointer(str), cnt, _bmp_mask_32)
311313

312314
is_unicode(str::MS_UTF8) = true
313315

0 commit comments

Comments
 (0)