Skip to content

Commit 4e53204

Browse files
committed
Optimization for some string operations, such as isascii
1 parent 027da44 commit 4e53204

File tree

6 files changed

+106
-66
lines changed

6 files changed

+106
-66
lines changed

.drone.yml

Lines changed: 0 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -11,30 +11,3 @@ steps:
1111
image: julia:1.5
1212
commands:
1313
- "julia --project=. --check-bounds=yes --color=yes -e 'using InteractiveUtils; versioninfo(verbose=true); using Pkg; Pkg.build(); Pkg.test(coverage=true)'"
14-
---
15-
kind: pipeline
16-
name: linux - arm64 - Julia 1.0
17-
18-
platform:
19-
os: linux
20-
arch: arm64
21-
22-
steps:
23-
- name: build
24-
image: julia:1.0
25-
commands:
26-
- "julia --project=. --check-bounds=yes --color=yes -e 'using InteractiveUtils; versioninfo(verbose=true); using Pkg; Pkg.build(); Pkg.test(coverage=true)'"
27-
28-
---
29-
kind: pipeline
30-
name: linux - arm - Julia 1.0
31-
32-
platform:
33-
os: linux
34-
arch: arm
35-
36-
steps:
37-
- name: build
38-
image: julia:1.0
39-
commands:
40-
- "julia --project=. --check-bounds=yes --color=yes -e 'using InteractiveUtils; versioninfo(verbose=true); using Pkg; Pkg.build(); Pkg.test(coverage=true)'"

Project.toml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ authors = ["ScottPJones <[email protected]>"]
44
keywords = ["Strings"]
55
license = "MIT"
66
uuid = "e79e7a6a-7bb1-5a4d-9d64-da657b06f53a"
7-
version = "1.0.4"
7+
version = "1.0.5"
88

99
[deps]
1010
Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
@@ -24,9 +24,9 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
2424
test = ["Test", "Random"]
2525

2626
[compat]
27-
julia = "^1.0.0"
28-
ModuleInterfaceTools = "^1.0.0"
27+
julia = "1"
28+
ModuleInterfaceTools = "1"
2929
MurmurHash3 = "^1.0.3"
30-
StrAPI = "^1.0.0"
30+
StrAPI = "1"
3131
ChrBase = "^1.0.1"
32-
CharSetEncodings = "^1.0.0"
32+
CharSetEncodings = "1"

src/latin.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ end
167167

168168
function convert(::Type{<:Str{C}}, vec::Vector{CU}) where {C<:Latin_CSEs,CU<:CodeUnitTypes}
169169
# handle zero length string quickly
170-
(len = length(vec)) == 0 && return _empty_str(C)
170+
(len = length(vec)) == 0 && return empty_str(C)
171171
@preserve vec begin
172172
pnt = pointer(vec)
173173
# get number of bytes to allocate

src/search.jl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,10 @@ found(::Type{<:AbstractString}, v) = v != 0
121121
find_result(::Type{<:AbstractString}, v) = v
122122

123123
nothing_sentinel(i) = first(i) == 0 ? nothing : i
124+
# Methods specialized on an AbstractChar needle. Each forwards to find(...) with the
# matching direction tag and lets nothing_sentinel turn a zero result into `nothing`.
# NOTE(review): presumably added to avoid dispatch ambiguity with Base's own
# AbstractChar methods - confirm.
Base.findfirst(a::AbstractChar, b::Str) = nothing_sentinel(find(First, a, b))
125+
Base.findlast(a::AbstractChar, b::Str) = nothing_sentinel(find(Last, a, b))
126+
Base.findnext(a::AbstractChar, b::Str, i) = nothing_sentinel(find(Fwd, a, b, i))
127+
Base.findprev(a::AbstractChar, b::Str, i) = nothing_sentinel(find(Rev, a, b, i))
124128
Base.findfirst(a, b::Str) = nothing_sentinel(find(First, a, b))
125129
Base.findlast(a, b::Str) = nothing_sentinel(find(Last, a, b))
126130
Base.findnext(a, b::Str, i) = nothing_sentinel(find(Fwd, a, b, i))

src/types.jl

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -30,18 +30,6 @@ _mskdn32(v, m, s) = _msk32(v, m) >>> s
3030
(::Type{Str})(::Type{C}, v::String) where {C<:CSE} = Str(C, v, nothing, nothing, nothing)
3131
(::Type{Str})(::Type{C}, v::Str) where {C<:CSE} = Str(C, v.data, nothing, nothing, nothing)
3232

33-
# Handle change from endof -> lastindex
34-
@static if !isdefined(Base, :lastindex)
35-
lastindex(str::AbstractString) = Base.endof(str)
36-
lastindex(arr::AbstractArray) = Base.endof(arr)
37-
Base.endof(str::Str) = lastindex(str)
38-
end
39-
@static if !isdefined(Base, :firstindex)
40-
firstindex(str::AbstractString) = 1
41-
# AbstractVector might be an OffsetArray
42-
firstindex(str::Vector) = 1
43-
end
44-
4533
# Definition of built-in Str types
4634

4735
const empty_string = ""
@@ -131,8 +119,18 @@ pointer(s::Str{<:Quad_CSEs}) = reinterpret(Ptr{UInt32}, pointer(s.data))
131119
const CHUNKSZ = sizeof(UInt) # used for fast processing of strings
132120
const CHUNKMSK = (CHUNKSZ-1)%UInt
133121

134-
_pntchunk(s::Union{String,Vector{UInt8}}) = reinterpret(Ptr{UInt}, pointer(s))
135-
_pntchunk(s::Str) = reinterpret(Ptr{UInt}, pointer(s.data))
122+
# Reinterpret a raw address (UInt) or pointer as a pointer to UInt-sized chunks.
# String, Vector{UInt8} and Str arguments are converted with pointer(...) first
# (for Str, the pointer of the underlying `data` field is used).
_pntchunk(p::Union{UInt,Ptr}) = reinterpret(Ptr{UInt}, p)
123+
_pntchunk(s::Union{String,Vector{UInt8}}) = _pntchunk(pointer(s))
124+
_pntchunk(s::Str) = _pntchunk(pointer(s.data))
125+
126+
# Type and mask for even faster string handling
127+
# BigChunk is the unsigned type twice as wide as UInt:
# UInt64 when UInt is UInt32, otherwise UInt128.
const BigChunk = UInt === UInt32 ? UInt64 : UInt128
128+
const BIGCHUNKSZ = sizeof(BigChunk)  # size of a BigChunk in bytes
129+
const BIGCHUNKMSK = (BIGCHUNKSZ-1)%UInt  # low-bit mask: offset of a byte count within a BigChunk
130+
131+
# Same as _pntchunk, but the result points to BigChunk-sized units.
# String, Vector{UInt8} and Str arguments are converted with pointer(...) first.
_pntbigchunk(p::Union{UInt,Ptr}) = reinterpret(Ptr{BigChunk}, p)
132+
_pntbigchunk(s::Union{String,Vector{UInt8}}) = _pntbigchunk(pointer(s))
133+
_pntbigchunk(s::Str) = _pntbigchunk(pointer(s.data))
136134

137135
"""Length of string in codeunits"""
138136
ncodeunits(s::Str) = sizeof(s)
@@ -144,6 +142,8 @@ ncodeunits(s::Str{<:Quad_CSEs}) = sizeof(s) >>> 2
144142

145143
@inline _mask_bytes(n) = ((1%UInt) << ((n & CHUNKMSK) << 3)) - 0x1
146144

145+
# BigChunk mask with the low (n mod BIGCHUNKSZ) bytes set.
# The mask is zero when n is a multiple of BIGCHUNKSZ.
@inline function _big_mask_bytes(n)
    nbits = (n & BIGCHUNKMSK) << 3   # byte offset within a BigChunk, converted to bits
    return (one(BigChunk) << nbits) - 0x1
end
146+
147147
# Support for SubString of Str
148148

149149
Base.SubString(str::Str{C}) where {C<:SubSet_CSEs} =

src/utf8.jl

Lines changed: 82 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#=
22
UTF8Str type
33
4-
Copyright 2017-2018 Gandalf Software, Inc., Scott P. Jones,
4+
Copyright 2017-2020 Gandalf Software, Inc., Scott P. Jones,
55
and other contributors to the Julia language
66
77
Licensed under MIT License, see LICENSE.md
@@ -89,10 +89,19 @@ xor 80 then << 1 then |
8989
11 -> 01 -> 1
9090
=#
9191

92+
# Replicate a UInt-sized mask into both halves of a BigChunk
@inline function _widen_mask(msk::UInt)
    upper = BigChunk(msk) << (sizeof(UInt) << 3)   # mask moved into the high half
    return upper | msk
end
93+
9294
const hi_mask = CHUNKSZ == 4 ? 0x8080_8080 : 0x8080_8080_8080_8080
95+
const big_hi_mask = _widen_mask(hi_mask)
96+
97+
# Count the positions selected by `msk` whose bit is set in `v` while the bit
# directly below it is clear (the 10xxxxxx pattern when msk is 0x80 in every byte).
@inline function _count_cont(v, msk)
    flipped = xor(v, msk)                     # invert the masked (high) bits
    merged = (flipped << 1) | flipped         # masked bit now holds (bit below) | (inverted bit)
    return count_ones(xor(merged, msk) & msk)
end
98+
# Return `msk` with the bit cleared at every position where `v` has the masked bit set
# and the bit directly below it clear (the 10xxxxxx pattern when msk is 0x80 per byte);
# all other masked positions stay set.
@inline function msk_lead(v, msk)
    flipped = xor(v, msk)                          # invert the masked (high) bits
    cont = xor((flipped << 1) | flipped, msk) & msk  # bits set only for the 10 pattern
    return xor(cont, msk)
end
99+
100+
# UInt-sized convenience methods: apply the two-argument forms with hi_mask
@inline _count_cont(v::UInt) = _count_cont(v, hi_mask)
101+
@inline msk_lead(v::UInt) = msk_lead(v, hi_mask)
93102

94-
@inline _count_cont(v) = (v = xor(v, hi_mask); count_ones(xor(((v << 1) | v), hi_mask) & hi_mask))
95-
@inline msk_lead(v) = (v = xor(v, hi_mask); xor(xor(((v << 1) | v), hi_mask) & hi_mask, hi_mask))
103+
# BigChunk-sized convenience methods: apply the two-argument forms with big_hi_mask.
# The second method is named msk_lead (not _msk_lead) so that it extends the same
# function as the UInt method and forwards to the existing msk_lead(v, msk).
@inline _count_cont(v::BigChunk) = _count_cont(v, big_hi_mask)
104+
@inline msk_lead(v::BigChunk) = msk_lead(v, big_hi_mask)
96105

97106
@inline function _align_len_utf8(pnt, cnt, v)
98107
len = 0
@@ -141,29 +150,83 @@ end
141150
_check_mask_al(pnt, cnt, msk, v)
142151
end
143152

153+
# Keep only the low (cnt mod sizeof(T)) bytes of `v`, zeroing the rest.
# When cnt is a multiple of sizeof(T), `v` is returned unchanged.
# The result always has type T.
@inline function _mask_bytes(v::T, cnt) where {T}
    rem = (cnt & (sizeof(T) - 1)) % UInt
    # `&` binds tighter than `-` in Julia, so the mask must be parenthesized:
    # (one(T) << bits) - 1 is built first, then applied to v.
    lowmask = (one(T) << (rem << 3)) - one(T)
    # ifelse keeps this branch-free; both arguments are cheap to evaluate
    return ifelse(rem == 0, v, v & lowmask)
end
156+
157+
# Load one value of msk's type from `ptr` and report whether none of the bits in
# `msk` are set, after _mask_bytes has restricted the value according to `cnt`.
@inline function chk_chunk(ptr, msk::T, cnt) where {T}
    word = unsafe_load(reinterpret(Ptr{T}, ptr))
    return iszero(_mask_bytes(word & msk, cnt))
end
159+
160+
# Return true when no bit of `msk` is set anywhere in the `cnt` bytes starting at `ptr`.
#   ptr - pointer to the start of the data
#   cnt - size of the data in bytes (the visible is_ascii caller only passes cnt > 0)
#   msk - UInt-sized mask of bits that must all be zero
@inline function _check_block_al(ptr, cnt, msk)
    # First check very frequent cases of short strings
    # (on 64-bit machines, 1-8 bytes, 9-16 bytes, and 17-24)
    # taking advantage of the knowledge of how String types are stored in Julia,
    # i.e. UInt length, immediately followed by the string data, aligned on sizeof(UInt)*2
    cnt <= CHUNKSZ && return chk_chunk(ptr, msk, cnt)
    bigmsk = _widen_mask(msk)
    cnt <= BIGCHUNKSZ && return chk_chunk(ptr, bigmsk, cnt)
    # Check the first UInt-sized chunk on its own
    (unsafe_load(_pntchunk(ptr)) & msk) == 0 || return false
    cnt -= CHUNKSZ
    # The first chunk is already verified, so the remaining cnt bytes start at
    # ptr + CHUNKSZ (checking from ptr again would skip the final CHUNKSZ bytes)
    cnt <= BIGCHUNKSZ && return chk_chunk(ptr + CHUNKSZ, bigmsk, cnt)
    pnt = _pntbigchunk(ptr + CHUNKSZ)
    fin = _pntbigchunk(ptr + CHUNKSZ + cnt)
    v = unsafe_load(pnt) & bigmsk
    # Every BigChunk except the last must be entirely clear
    while (pnt += BIGCHUNKSZ) < fin
        v == 0 || return false
        v = unsafe_load(pnt) & bigmsk
    end
    # The last BigChunk may be partial: only its first (cnt mod BIGCHUNKSZ) bytes count
    return iszero(_mask_bytes(v, cnt))
end
180+
181+
# Return true when no bit of `msk` is set in the `cnt` bytes starting at `beg`.
# Unlike _check_block_al, `beg` is rounded down to a BIGCHUNKSZ boundary and the
# bytes before `beg` in the first BigChunk are masked off.
#   beg - pointer to the start of the data
#   cnt - size of the data in bytes
#   msk - UInt-sized mask of bits that must all be zero (widened with _widen_mask)
@inline function _check_block_ul(beg, cnt, msk)
182+
align = reinterpret(UInt, beg)
183+
pnt = _pntbigchunk(align & ~BIGCHUNKMSK)  # round the address down to a BigChunk boundary
184+
v = unsafe_load(pnt)
185+
if (align &= BIGCHUNKMSK) != 0  # align is now the byte offset of beg within its BigChunk
186+
v &= ~_big_mask_bytes(align)  # clear the bytes that precede beg
187+
cnt += align  # count is now measured from the rounded-down pointer
188+
end
189+
fin = _pntbigchunk(pnt + cnt)
190+
bigmsk = _widen_mask(msk)
191+
while (pnt += BIGCHUNKSZ) < fin
192+
(v & bigmsk) == 0 || return false
193+
v = unsafe_load(pnt)
194+
end
195+
((cnt & BIGCHUNKMSK) == 0 ? v : (v & _big_mask_bytes(cnt))) & bigmsk == 0  # last chunk: ignore bytes past the end
196+
end
197+
144198
_ascii_mask(::Type{UInt8}) = hi_mask
145-
_ascii_mask(::Type{UInt16}) = 0xff80_ff80_ff80_ff80
146-
_ascii_mask(::Type{UInt32}) = 0xffffff80_ffffff80
199+
@static if CHUNKSZ == 4  # 32-bit word size (CHUNKSZ is sizeof(UInt)); `UInt == 4` compared a type to a number and was always false
200+
_ascii_mask(::Type{UInt16}) = 0xff80_ff80
201+
_ascii_mask(::Type{UInt32}) = 0xffffff80
147202

148-
_latin_mask(::Type{UInt16}) = 0xff00_ff00_ff00_ff00
149-
_latin_mask(::Type{UInt32}) = 0xffffff00_ffffff00
203+
_latin_mask(::Type{UInt16}) = 0xff00_ff00
204+
_latin_mask(::Type{UInt32}) = 0xffffff00
150205

151-
const _bmp_mask_32 = 0xffff0000_ffff0000
206+
const _bmp_mask_32 = 0xffff0000
207+
else
208+
_ascii_mask(::Type{UInt16}) = 0xff80_ff80_ff80_ff80
209+
_ascii_mask(::Type{UInt32}) = 0xffffff80_ffffff80
152210

153-
is_ascii(str::SubString{<:Str{C}}) where {C<:Union{UTF8CSE,LatinCSE,Binary_CSEs,UTF16CSE,UCS2CSE,
154-
Text2CSE,Text4CSE,UTF32CSE}} =
155-
(cnt = sizeof(str)) == 0 ? true :
156-
@preserve str _check_mask_ul(pointer(str), cnt, _ascii_mask(codeunit(C)))
211+
_latin_mask(::Type{UInt16}) = 0xff00_ff00_ff00_ff00
212+
_latin_mask(::Type{UInt32}) = 0xffffff00_ffffff00
213+
214+
const _bmp_mask_32 = 0xffff0000_ffff0000
215+
end
216+
217+
const ASCII_Union = Union{UTF8CSE,LatinCSE,Binary_CSEs,UTF16CSE,UCS2CSE,Text2CSE,Text4CSE,UTF32CSE}
218+
219+
# is_ascii for a SubString of a Str: true for an empty substring, otherwise the result
# of _check_block_ul over its bytes with the ASCII mask for C's code unit type
is_ascii(str::SubString{<:Str{C}}) where {C<:ASCII_Union} =
220+
(cnt = sizeof(str)) == 0 ||
221+
(@preserve str _check_block_ul(pointer(str), cnt, _ascii_mask(codeunit(C))))
157222

158223
# is_ascii for a vector of code units: true for an empty vector, otherwise the result
# of _check_block_ul over its bytes with the ASCII mask for element type T.
# The vector itself (`vec`) is the object that must be preserved while its raw
# pointer is in use; this method has no variable named `str`.
is_ascii(vec::Vector{T}) where {T<:CodeUnitTypes} =
159-
(cnt = sizeof(vec)) == 0 ? true :
160-
@preserve str _check_mask_ul(pointer(vec), cnt, _ascii_mask(T))
224+
(cnt = sizeof(vec)) == 0 ||
225+
(@preserve vec _check_block_ul(pointer(vec), cnt, _ascii_mask(T)))
161226

162-
is_ascii(str::Str{C}) where {C<:Union{UTF8_CSEs,LatinCSE,Binary_CSEs,UTF16CSE,UCS2CSE,
163-
Text2CSE,Text4CSE,UTF32CSE}} =
164-
(cnt = sizeof(str)) == 0 ? true :
165-
@preserve str _check_mask_al(reinterpret(Ptr{UInt}, pointer(str)), cnt,
166-
_ascii_mask(codeunit(C)))
227+
# is_ascii for a Str: true for an empty string, otherwise the result of
# _check_block_al over its bytes with the ASCII mask for C's code unit type
is_ascii(str::Str{C}) where {C<:ASCII_Union} =
228+
(cnt = sizeof(str)) == 0 ||
229+
(@preserve str _check_block_al(pointer(str), cnt, _ascii_mask(codeunit(C))))
167230

168231
# Todo! Here you need to see that 0b11yyyyxx at least 1 y must be set,
169232
# which indicates a non-Latin1 character

0 commit comments

Comments
 (0)