Skip to content

Commit 98fedf9

Browse files
committed
Optimizations of UTF-16 length, is_bmp
1 parent 275d95b commit 98fedf9

File tree

9 files changed

+106
-64
lines changed

9 files changed

+106
-64
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,6 @@ test = ["Test", "Random"]
2727
julia = "1"
2828
ModuleInterfaceTools = "1"
2929
MurmurHash3 = "^1.0.3"
30-
StrAPI = "1"
30+
StrAPI = "1.1"
3131
ChrBase = "^1.0.1"
3232
CharSetEncodings = "1"

src/compare.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ end
5454
while pnt < fin
5555
str_done(b, pos) && return 1
5656
c1, pnt = _nextcp(C, pnt)
57-
ch, pos = str_next(b, pos)
57+
ch, pos = iterate(b, pos)
5858
c2 = ch%UInt32
5959
c1 == c2 || return ifelse(c1 < c2, -1, 1)
6060
end
@@ -93,7 +93,7 @@ function _cpeq(a::MaybeSub{T}, b) where {C<:CSE, T<:Str{C}}
9393
while pnt < fin
9494
str_done(b, pos) && return false
9595
c1, pnt = _nextcp(C, pnt)
96-
ch, pos = str_next(b, pos)
96+
ch, pos = iterate(b, pos)
9797
c1 == codepoint(ch) || return false
9898
end
9999
true

src/core.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Core functions
33
44
5-
Copyright 2017-2018 Gandalf Software, Inc., Scott P. Jones, and others (see Julia contributors)
5+
Copyright 2017-2020 Gandalf Software, Inc., Scott P. Jones, and others (see Julia contributors)
66
Licensed under MIT License, see LICENSE.md
77
88
Inspired by / derived from code in Julia
@@ -33,7 +33,7 @@ _nextcp(::Type{T}, pnt) where {T} = _nextcpfun(EncodingStyle(T), T, pnt)
3333

3434
# Use more generic length check
3535
@inline _length_check(str::SubString{<:Str{C}}, cnt) where {C<:CSE} =
36-
_length(MultiCU(), C, pointer(str), cnt)
36+
@preserve str _length_ul(MultiCU(), C, pointer(str), cnt)
3737

3838
# Go directly to aligned length check
3939
@inline _length_check(str::Str{C}, cnt) where {C<:CSE} =
@@ -42,7 +42,7 @@ _nextcp(::Type{T}, pnt) where {T} = _nextcpfun(EncodingStyle(T), T, pnt)
4242
@inline _length(::MultiCU, str::MaybeSub{T}) where {T<:Str} =
4343
(cnt = ncodeunits(str); cnt < 2 ? Int(cnt > 0) : @preserve str _length_check(str, cnt))
4444

45-
@inline _length(::SingleCU, ::Type{<:CSE}, ::Ptr{<:CodeUnitTypes}, cnt::Int) = cnt
45+
@inline _length_ul(::SingleCU, ::Type{<:CSE}, ::Ptr{<:CodeUnitTypes}, cnt::Int) = cnt
4646

4747
@inline _length(::MultiCU, str::Str{RawUTF8CSE}) = length(str.data)
4848
@inline _length(::MultiCU, str::Str{RawUTF8CSE}, i::Int, j::Int) = length(str.data, i, j)
@@ -55,7 +55,7 @@ _nextcp(::Type{T}, pnt) where {T} = _nextcpfun(EncodingStyle(T), T, pnt)
5555
0 <= j < lim || boundserr(str, j)
5656
end
5757
(cnt = j - i + 1) <= 0 ? 0 :
58-
@preserve str _length(cs, cse(str), bytoff(pointer(str), i - 1), cnt)
58+
@preserve str _length_ul(cs, cse(str), bytoff(pointer(str), i - 1), cnt)
5959
end
6060

6161
@inline _thisind(::SingleCU, str, len, pnt, pos) = Int(pos)

src/search.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ function find(::Type{D}, needle::AbstractString, str::AbstractString,
193193
@inbounds is_valid(str, pos) || index_error(str, pos)
194194
(tlen = ncodeunits(needle)) == 0 && return pos:pos-1
195195
(cmp = CanContain(str, needle)) === NoCompare() && return _not_found
196-
@inbounds ch, nxt = str_next(needle, 1)
196+
@inbounds ch, nxt = iterate(needle, 1)
197197
is_valid(eltype(str), ch) || return _not_found
198198
# Check if single character
199199
if nxt > tlen
@@ -209,7 +209,7 @@ function find(::Type{T}, needle::AbstractString, str::AbstractString) where {T<:
209209
pos = T === First ? 1 : thisind(str, slen)
210210
(tlen = ncodeunits(needle)) == 0 && return pos:(pos-1)
211211
(cmp = CanContain(str, needle)) === NoCompare() && return _not_found
212-
@inbounds ch, nxt = str_next(needle, 1)
212+
@inbounds ch, nxt = iterate(needle, 1)
213213
is_valid(eltype(str), ch) || return _not_found
214214
# Check if single character
215215
if nxt > tlen
@@ -302,8 +302,8 @@ end
302302
"""Compare two strings, starting at nxtstr and nxtsub"""
303303
@inline function _cmp_str(str, strpos, endpos, sub, subpos, endsub)
304304
while strpos <= endpos
305-
c, strnxt = str_next(str, strpos)
306-
d, subpos = str_next(sub, subpos)
305+
c, strnxt = iterate(str, strpos)
306+
d, subpos = iterate(sub, subpos)
307307
c == d || break
308308
subpos > endsub && return strpos
309309
strpos = strnxt

src/support.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -264,7 +264,7 @@ function unsafe_check_string(str::T;
264264
totalchar = latin1byte = num2byte = num3byte = num4byte = invalids = 0
265265
pos = 1
266266
@inbounds while !str_done(str, pos)
267-
chr, nxt = str_next(str, pos)
267+
chr, nxt = iterate(str, pos)
268268
ch = chr%UInt32
269269
totalchar += 1
270270
if ch > 0x7f
@@ -288,7 +288,7 @@ function unsafe_check_string(str::T;
288288
break
289289
end
290290
# next character *must* be a trailing surrogate character
291-
chr, nxt = str_next(str, nxt)
291+
chr, nxt = iterate(str, nxt)
292292
if !is_surrogate_trail(chr)
293293
accept_invalids || strerror(StrErrors.NOT_TRAIL, pos, chr)
294294
invalids += 1

src/types.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ ncodeunits(s::Str{<:Quad_CSEs}) = sizeof(s) >>> 2
145145
@inline _big_mask_bytes(n) = ((1%BigChunk) << ((n & BIGCHUNKMSK) << 3)) - 0x1
146146

147147
@inline function _mask_bytes(v::T, cnt) where {T}
148-
shft = (cnt & (sizeof(T) - 1))%UInt << 3
148+
shft = ((cnt & (sizeof(T) - 1))%UInt) << 3
149149
ifelse(shft == 0, v, v & ~(typemax(T) << shft))
150150
end
151151

src/utf16.jl

Lines changed: 62 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -10,34 +10,59 @@ Based in (small) part on code for UTF16String that used to be in Julia
1010
const _trail_mask = CHUNKSZ == 4 ? 0xdc00_dc00 : 0xdc00_dc00_dc00_dc00
1111
const _hi_bit_16 = CHUNKSZ == 4 ? 0x8000_8000 : 0x8000_8000_8000_8000
1212

13-
@inline _mask_surr(v) = xor((v | v<<1 | v<<2 | v<<3 | v<<4 | v<<5) & _hi_bit_16, _hi_bit_16)
14-
@inline _get_masked(v::UInt) = _mask_surr(xor(v, _trail_mask))
15-
@inline _get_masked(qpnt::Ptr{UInt}) = _get_masked(unsafe_load(qpnt))
16-
@inline _get_lead(qpnt) = xor(_get_masked(qpnt), _hi_bit_16)
13+
const _big_trail_mask = _widen_mask(_trail_mask)
14+
# Wide counterpart of `_hi_bit_16`, built with `_widen_mask` the same way
# `_big_trail_mask` is built from `_trail_mask` (the constant must not refer to itself).
const _big_hi_bit_16 = _widen_mask(_hi_bit_16)
1715

18-
@inline function _align_len_utf16(pnt, cnt, v)
19-
len = 0
16+
@inline _mask_surr(v, msk) = xor((v | v<<1 | v<<2 | v<<3 | v<<4 | v<<5) & msk, msk)
17+
18+
# `_mask_surr` takes the high-bit mask explicitly; a UInt chunk uses the UInt-sized `_hi_bit_16`.
@inline _get_masked(v::UInt) = _mask_surr(xor(v, _trail_mask), _hi_bit_16)
19+
# `_mask_surr` takes the high-bit mask explicitly; a BigChunk uses the wide `_big_hi_bit_16`.
@inline _get_masked(v::BigChunk) = _mask_surr(xor(v, _big_trail_mask), _big_hi_bit_16)
20+
@inline _get_masked(qpnt::Ptr) = _get_masked(unsafe_load(qpnt))
21+
22+
@inline _get_lead(qpnt::Ptr{UInt}) = xor(_get_masked(qpnt), _hi_bit_16)
23+
@inline _get_lead(qpnt::Ptr{BigChunk}) = xor(_get_masked(qpnt), _big_hi_bit_16)
24+
25+
# Length computation for UTF-16 data starting at `beg`, `cnt` code units long.
# Sums `count_ones` of the `_get_lead` flags over the data, masking the final chunk with
# `_mask_bytes` so that bytes beyond the end of the data do not contribute.
# NOTE(review): relies on `beg` being readable in CHUNKSZ / BIGCHUNKSZ units (see the
# storage remark below) and presumably on `cnt > 0` — confirm against callers.
@inline function _length_al(::MultiCU, ::Type{UTF16CSE}, beg::Ptr{UInt16}, cnt::Int)
    # First check very frequent cases of short strings
    # (on 64-bit machines, 1-8 bytes, 9-16 bytes, and 17-24)
    # taking advantage of the knowledge of how String types are stored in Julia,
    # i.e. UInt length, immediate followed by the string data, aligned on sizeof(UInt)*2
    cnt <<= 1 # from here on `cnt` is compared against the chunk sizes (UInt16 units doubled)
    if cnt <= CHUNKSZ
        # Mask the flags (not the pointer load) down to the `cnt` valid bytes
        return count_ones(_mask_bytes(_get_lead(_pntchunk(beg)), cnt))
    elseif cnt <= BIGCHUNKSZ
        return count_ones(_mask_bytes(_get_lead(_pntbigchunk(beg)), cnt))
    end
    # Longer data: consume one small chunk, then continue in big chunks
    len = count_ones(_get_lead(_pntchunk(beg)))
    cnt -= CHUNKSZ
    pnt = _pntbigchunk(beg + CHUNKSZ)
    v = _get_lead(pnt)
    cnt <= BIGCHUNKSZ && return len + count_ones(_mask_bytes(v, cnt))
    fin = pnt + cnt
    while (pnt += BIGCHUNKSZ) < fin
        len += count_ones(v)
        v = _get_lead(pnt)
    end
    # `v` holds the final (possibly partial) chunk
    return len + count_ones(_mask_bytes(v, cnt))
end
2748

28-
_length_al(::MultiCU, ::Type{UTF16CSE}, beg::Ptr{UInt16}, cnt::Int) =
29-
(pnt = reinterpret(Ptr{UInt}, beg); _align_len_utf16(pnt, cnt<<1, _get_lead(pnt)))
30-
31-
function _length(::MultiCU, ::Type{UTF16CSE}, beg::Ptr{UInt16}, cnt::Int)
49+
function _length_ul(::MultiCU, ::Type{UTF16CSE}, beg::Ptr{UInt16}, cnt::Int)
3250
align = reinterpret(UInt, beg)
33-
pnt = reinterpret(Ptr{UInt}, align & ~CHUNKMSK)
51+
pnt = reinterpret(Ptr{BigChunk}, align & ~BIGCHUNKMSK)
52+
cnt <<= 1
3453
v = _get_lead(pnt)
35-
if (align &= CHUNKMSK) != 0
36-
msk = _mask_bytes(align)
37-
v = (v & ~msk) | (msk & _trail_mask)
38-
cnt += (align>>>1)
54+
if (align &= BIGCHUNKMSK) != 0
55+
msk = _big_mask_bytes(align)
56+
v = (v & ~msk) | (msk & _big_trail_mask)
57+
cnt += align
3958
end
40-
_align_len_utf16(pnt, cnt<<1, v)
59+
len = 0
60+
fin = pnt + cnt
61+
while (pnt += BIGCHUNKSZ) < fin
62+
len += count_ones(v)
63+
v = _get_lead(pnt)
64+
end
65+
len + count_ones(_mask_bytes(v, cnt))
4166
end
4267

4368
function _nextind(::MultiCU, str::MS_UTF16, pos::Int, nchar::Int)
@@ -93,31 +118,41 @@ function is_bmp(str::MS_UTF16)
93118
end
94119
end
95120

96-
@inline function _check_bmp_utf16_al(pnt, cnt, v)
121+
# Check `cnt` bytes of UTF-16 data at `beg`: returns `true` when `_get_masked` is zero for
# every chunk of the data (the final chunk is masked with `_mask_bytes` so bytes beyond
# the end are ignored), `false` as soon as a chunk has a flag set.
# NOTE(review): presumably a set flag marks a trailing surrogate, i.e. a non-BMP
# character — confirm against `_mask_surr`.
@inline function _check_bmp_utf16_al(beg, cnt)
    # Short data fits in a single small or big chunk
    cnt <= CHUNKSZ && return _mask_bytes(_get_masked(_pntchunk(beg)), cnt) == 0
    cnt <= BIGCHUNKSZ && return _mask_bytes(_get_masked(_pntbigchunk(beg)), cnt) == 0
    # Longer data: check the first small chunk, then continue in big chunks after it
    _get_masked(_pntchunk(beg)) == 0 || return false
    cnt -= CHUNKSZ
    # The remaining bytes start CHUNKSZ past `beg`; reading from `beg` again here would
    # leave the last CHUNKSZ bytes unchecked
    pnt = _pntbigchunk(beg + CHUNKSZ)
    v = _get_masked(pnt)
    cnt <= BIGCHUNKSZ && return _mask_bytes(v, cnt) == 0
    fin = pnt + cnt
    while (pnt += BIGCHUNKSZ) < fin
        v == 0 || return false
        v = _get_masked(pnt)
    end
    # `v` holds the final (possibly partial) chunk
    return _mask_bytes(v, cnt) == 0
end
105-
@inline _check_bmp_utf16_al(pnt, cnt) = _check_bmp_utf16_al(pnt, cnt, unsafe_load(pnt))
106136

107137
@inline function _check_bmp_utf16_ul(beg, cnt)
108138
align = reinterpret(UInt, beg)
109-
pnt = reinterpret(Ptr{UInt}, align & ~CHUNKMSK)
139+
pnt = reinterpret(Ptr{BigChunk}, align & ~BIGCHUNKMSK)
110140
v = unsafe_load(pnt)
111-
if (align &= CHUNKMSK) != 0
112-
v &= ~_mask_bytes(align)
141+
if (align &= BIGCHUNKMSK) != 0
142+
v &= ~_big_mask_bytes(align)
113143
cnt += align
114144
end
115-
_check_bmp_utf16_al(pnt, cnt, v)
145+
v = _get_masked(v)
146+
fin = pnt + cnt
147+
while (pnt += BIGCHUNKSZ) < fin
148+
v == 0 || return false
149+
v = _get_masked(pnt)
150+
end
151+
_mask_bytes(v, cnt) == 0
116152
end
117153

118154
is_bmp(str::Str{UTF16CSE}) =
119-
(cnt = sizeof(str)) == 0 ||
120-
@preserve str _check_bmp_utf16_al(reinterpret(Ptr{UInt}, pointer(str)), cnt)
155+
(cnt = sizeof(str)) == 0 || @preserve str _check_bmp_utf16_al(pointer(str), cnt)
121156

122157
is_bmp(str::SubString{<:Str{UTF16CSE}}) =
123158
(cnt = sizeof(str)) == 0 || @preserve str _check_bmp_utf16_ul(pointer(str), cnt)

src/utf8.jl

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -209,11 +209,11 @@ const ASCII_Union = Union{UTF8CSE,LatinCSE,Binary_CSEs,UTF16CSE,UCS2CSE,Text2CSE
209209

210210
is_ascii(str::SubString{<:Str{C}}) where {C<:ASCII_Union} =
211211
(cnt = sizeof(str)) == 0 ||
212-
(@preserve str _check_block_ul(pointer(str), cnt, _ascii_mask(codeunit(C))))
212+
(@preserve str _check_mask_ul(pointer(str), cnt, _ascii_mask(codeunit(C))))
213213

214214
# ASCII check over the raw bytes of a code-unit vector; an empty vector is trivially ASCII.
# `@preserve` must name the argument whose pointer is taken (`vec`) so it stays rooted
# while `_check_mask_ul` reads through the raw pointer.
is_ascii(vec::Vector{T}) where {T<:CodeUnitTypes} =
    (cnt = sizeof(vec)) == 0 ||
    (@preserve vec _check_mask_ul(pointer(vec), cnt, _ascii_mask(T)))
217217

218218
is_ascii(str::Str{C}) where {C<:ASCII_Union} =
219219
(cnt = sizeof(str)) == 0 ||
@@ -225,10 +225,16 @@ _all_latin(val) =
225225
((val & (val<<1) & (val<<2 | (val<<3) | (val<<4) | (val<<5))) & get_high_mask(val)) == 0
226226

227227
@inline function _check_latin_utf8_al(beg, cnt)
228-
pnt = reinterpret(Ptr{UInt}, beg)
229-
fin = pnt + cnt
228+
cnt <= CHUNKSZ && return _all_latin(_mask_bytes(unsafe_load(_pntchunk(ptr)), cnt))
229+
bigmsk = _widen_mask(msk)
230+
cnt <= BIGCHUNKSZ && return _all_latin(_mask_bytes(unsafe_load(_pntbigchunk(ptr)), cnt))
231+
_all_latin(unsafe_load(_pntchunk(ptr))) || return false
232+
cnt -= CHUNKSZ
233+
cnt <= BIGCHUNKSZ && return _all_latin(_mask_bytes(unsafe_load(_pntbigchunk(ptr)), cnt))
234+
pnt = _pntbigchunk(ptr + CHUNKSZ)
230235
v = unsafe_load(pnt)
231-
while (pnt += CHUNKSZ) < fin
236+
fin = pnt + cnt
237+
while (pnt += BIGCHUNKSZ) < fin
232238
_all_latin(v) || return false
233239
v = unsafe_load(pnt)
234240
end
@@ -273,10 +279,16 @@ is_latin(str::Str{C}) where {C<:Union{Word_CSEs,Quad_CSEs}} =
273279
_all_bmp(val) = ((val | (val<<1) | (val<<2) | (val<<3)) & get_high_mask(val)) == 0
274280

275281
@inline function _check_bmp_utf8_al(beg, cnt)
276-
pnt = reinterpret(Ptr{UInt}, beg)
277-
fin = pnt + cnt
282+
cnt <= CHUNKSZ && return _all_bmp(_mask_bytes(unsafe_load(_pntchunk(ptr)), cnt))
283+
bigmsk = _widen_mask(msk)
284+
cnt <= BIGCHUNKSZ && return _all_bmp(_mask_bytes(unsafe_load(_pntbigchunk(ptr)), cnt))
285+
_all_bmp(unsafe_load(_pntchunk(ptr))) || return false
286+
cnt -= CHUNKSZ
287+
cnt <= BIGCHUNKSZ && return _all_bmp(_mask_bytes(unsafe_load(_pntbigchunk(ptr)), cnt))
288+
pnt = _pntbigchunk(ptr + CHUNKSZ)
289+
fin = _pntbigchunk(ptr + CHUNKSZ + cnt)
278290
v = unsafe_load(pnt)
279-
while (pnt += CHUNKSZ) < fin
291+
while (pnt += BIGCHUNKSZ) < fin
280292
_all_bmp(v) || return false
281293
v = unsafe_load(pnt)
282294
end
@@ -452,9 +464,9 @@ _iterate(::MultiCU, ::Type{T}, str::SubString{<:Str{RawUTF8CSE}}, pos::Int) wher
452464
end
453465

454466
_next(::MultiCU, ::Type{T}, str::Str{RawUTF8CSE}, pos::Int) where {T} =
455-
str_next(str.data, pos)
467+
iterate(str.data, pos)
456468
_next(::MultiCU, ::Type{T}, str::SubString{<:Str{RawUTF8CSE}}, pos::Int) where {T} =
457-
str_next(SubString(str.string.data, str.offset + pos, str.offset + ncodeunits(str)), 1)
469+
iterate(SubString(str.string.data, str.offset + pos, str.offset + ncodeunits(str)), 1)
458470

459471
## overload methods for efficiency ##
460472

test/basic.jl

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -292,8 +292,8 @@ let
292292

293293
@test lastindex(srep) == 7
294294

295-
@test str_next(srep, 3) == ('β',5)
296-
@test str_next(srep, 7) == ('β',9)
295+
@test iterate(srep, 3) == ('β',5)
296+
@test iterate(srep, 7) == ('β',9)
297297

298298
@test srep[7] == 'β'
299299
@test_throws StringIndexError srep[8]
@@ -327,8 +327,8 @@ end
327327
@test_throws MethodError codeunit(tstr, true)
328328
@test_throws MethodError isvalid(tstr, 1)
329329
@test_throws MethodError isvalid(tstr, true)
330-
@test_throws MethodError str_next(tstr, 1)
331-
@test_throws MethodError str_next(tstr, true)
330+
@test_throws MethodError iterate(tstr, 1)
331+
@test_throws MethodError iterate(tstr, true)
332332
@test_throws MethodError lastindex(tstr)
333333

334334
gstr = GenericString("12")
@@ -598,7 +598,7 @@ end
598598
for st in ("Hello", "Σ", "こんにちは", "😊😁")
599599
local s
600600
s = ST(st)
601-
@test str_next(s, lastindex(s))[2] > sizeof(s)
601+
@test iterate(s, lastindex(s))[2] > sizeof(s)
602602
@test nextind(s, lastindex(s)) > sizeof(s)
603603
end
604604
end
@@ -902,7 +902,7 @@ function testbin(::Type{ST}) where {ST}
902902
b"\xf8\x9f\x98\x84", b"\xf8\x9f\x98\x84z")),
903903
s in lst
904904
st = ST(s)
905-
@test str_next(st, 1)[2] == 2
905+
@test iterate(st, 1)[2] == 2
906906
@test nextind(st, 1) == 2
907907
end
908908

@@ -917,7 +917,7 @@ function testbin(::Type{ST}) where {ST}
917917
(s, r) in lst
918918
st = ST(s)
919919
(ST === BinaryStr || ST === Text1Str) && (r = 2)
920-
@test str_next(st, 1)[2] == r
920+
@test iterate(st, 1)[2] == r
921921
@test nextind(st, 1) == r
922922
end
923923
end
@@ -937,12 +937,7 @@ end
937937
@test String(sym) == string(Char(0xdcdb))
938938
@test Meta.lower(Main, sym) === sym
939939
res = string(Meta.parse(string(Char(0xdcdb)," = 1"),1,raise=false)[1])
940-
@static if VERSION v"1.5.0-DEV.460"
941-
@test res == "\$(Expr(:error, \"invalid UTF-8 sequence\"))"
942-
else
943-
@test startswith(res, "\$(Expr(:error, \"invalid character \\\"\\udcdb\\\"")
944-
@test endswith(res, "\"))")
945-
end
940+
@test res == "\$(Expr(:error, \"invalid UTF-8 sequence\"))"
946941
end
947942

948943
@testset "invalid code point" begin

0 commit comments

Comments
 (0)