Skip to content

Commit f923c26

Browse files
authored
Merge pull request #4 from JuliaString/fix32
Fix some issues on 32-bit platforms
2 parents addd123 + a49bdeb commit f923c26

File tree

9 files changed

+46
-41
lines changed

9 files changed

+46
-41
lines changed

.travis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ os:
55
- osx
66
julia:
77
- 1.0
8-
- 1.1
8+
- 1.3
99
- nightly
1010
notifications:
1111
email: false

Project.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ authors = ["ScottPJones <[email protected]>"]
44
keywords = ["Strings"]
55
license = "MIT"
66
uuid = "e79e7a6a-7bb1-5a4d-9d64-da657b06f53a"
7-
version = "1.0.0"
7+
version = "1.0.2"
88

99
[deps]
1010
Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
@@ -26,7 +26,7 @@ test = ["Test", "Random"]
2626
[compat]
2727
julia = "^1.0.0"
2828
ModuleInterfaceTools = "≥ 1.0.0"
29-
MurmurHash3 = "≥ 1.0.0"
29+
MurmurHash3 = "≥ 1.0.3"
3030
StrAPI = "≥ 1.0.0"
3131
ChrBase = "≥ 1.0.0"
3232
CharSetEncodings = "≥ 1.0.0"

appveyor.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
environment:
22
matrix:
3-
- julia_version: 1.0
43
- julia_version: 1
4+
- julia_version: 1.3
55
- julia_version: latest
66

77
platform:

src/StrBase.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ __precompile__(true)
22
"""
33
StrBase package
44
5-
Copyright 2017-2018 Gandalf Software, Inc., Scott P. Jones,
5+
Copyright 2017-2020 Gandalf Software, Inc., Scott P. Jones,
66
and other contributors to the Julia language
77
Licensed under MIT License, see LICENSE.md
88
Based partly on code in LegacyStrings that used to be part of Julia
@@ -23,7 +23,7 @@ using ModuleInterfaceTools
2323

2424
@api develop! check_string, unsafe_check_string, fast_check_string, skipascii, skipbmp,
2525
countmask, count_chars, _count_mask_al, _count_mask_ul, count_latin,
26-
_copysub, _cvtsize, _repeat, empty_str, _data, _pnt64, _str,
26+
_copysub, _cvtsize, _repeat, empty_str, _data, _pntchunk, _str,
2727
ValidatedStyle, MutableStyle, EqualsStyle, CanContain
2828

2929
@api develop LineCounts, CharTypes, CharStat, maxbit, calcstats, check_continuation,

src/support.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,7 @@ end
313313

314314
@inline function alignpnt(beg::Ptr)
315315
align = reinterpret(UInt, beg)
316-
align, reinterpret(Ptr{UInt64}, align & (~CHUNKMSK)%UInt)
316+
align, reinterpret(Ptr{UInt}, align & (~CHUNKMSK)%UInt)
317317
end
318318

319319
@inline function skipascii(beg::Ptr{UInt8}, fin::Ptr{UInt8})
@@ -387,7 +387,7 @@ end
387387
const _bmp_mask = 0xd800_d800_d800_d800
388388
@inline _mask_allsurr(v) = xor((v | v<<1 | v<<2 | v<<3 | v<<4) & _hi_bit_16, _hi_bit_16)
389389

390-
@inline _get_bmp_mask(v::UInt64) = _mask_allsurr(xor(v, _bmp_mask))
390+
@inline _get_bmp_mask(v::UInt) = _mask_allsurr(xor(v, _bmp_mask))
391391

392392
const msk_ascii_16 = 0xff80ff80ff80ff80
393393
const msk_latin_16 = 0xff00ff00ff00ff00

src/types.jl

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
11
#=
22
Basic types for strings
33
4-
Copyright 2017-2018 Gandalf Software, Inc., Scott P. Jones
4+
Copyright 2017-2020 Gandalf Software, Inc., Scott P. Jones
55
Licensed under MIT License, see LICENSE.md
66
=#
7-
const STR_KEEP_NUL = true # keep nul byte placed by String
87

98
# Note: this is still in transition to expressing character set, encoding
109
# and optional cached info for hashes, UTF-8/UTF-16 encodings, subsets, etc.
@@ -74,18 +73,20 @@ typemin(::Type{T}) where {T<:Str} = empty_str(T)
7473
typemin(::T) where {T<:Str} = empty_str(T)
7574

7675
"""Union type for fast dispatching"""
77-
const UniStr = Union{ASCIIStr, _LatinStr, _UCS2Str, _UTF32Str}
76+
#const UniStr = Union{ASCIIStr, _LatinStr, _UCS2Str, _UTF32Str}
77+
const UniCSE = Union{ASCIICSE, _LatinCSE, _UCS2CSE, _UTF32CSE}
78+
const UniStr = Str{<:UniCSE, Nothing, Nothing, Nothing}
7879
show(io::IO, ::Type{UniStr}) = print(io, :UniStr)
7980

8081
# Display BinaryCSE as if String
8182
show(io::IO, str::T) where {T<:Str{BinaryCSE}} = show(io, str.data)
8283
show(io::IO, str::SubString{T}) where {T<:Str{BinaryCSE}} =
8384
@inbounds show(io, SubString(str.string.data, str.offset+1, str.offset+lastindex(str)))
8485

85-
_allocate(len) = Base._string_n((len+STR_KEEP_NUL-1)%Csize_t)
86+
_allocate(len) = Base._string_n(len%Csize_t)
8687

8788
function _allocate(::Type{T}, len) where {T <: CodeUnitTypes}
88-
buf = _allocate((len+STR_KEEP_NUL-1) * sizeof(T))
89+
buf = _allocate(len * sizeof(T))
8990
buf, reinterpret(Ptr{T}, pointer(buf))
9091
end
9192

@@ -115,7 +116,7 @@ promote_rule(::Type{String}, ::Type{<:Str}) = String
115116
promote_rule(::Type{<:Str{S}}, ::Type{<:Str{T}}) where {S,T} =
116117
(P = promote_rule(S,T)) === Union{} ? Union{} : Str{P}
117118

118-
sizeof(s::Str) = sizeof(s.data) + 1 - STR_KEEP_NUL
119+
sizeof(s::Str) = sizeof(s.data)
119120

120121
"""Codeunits of string as a Vector"""
121122
_data(s::Vector{UInt8}) = s
@@ -127,19 +128,19 @@ pointer(s::Str{<:Byte_CSEs}) = pointer(s.data)
127128
pointer(s::Str{<:Word_CSEs}) = reinterpret(Ptr{UInt16}, pointer(s.data))
128129
pointer(s::Str{<:Quad_CSEs}) = reinterpret(Ptr{UInt32}, pointer(s.data))
129130

130-
const CHUNKSZ = sizeof(UInt64) # used for fast processing of strings
131-
const CHUNKMSK = (CHUNKSZ-1)%UInt64
131+
const CHUNKSZ = sizeof(UInt) # used for fast processing of strings
132+
const CHUNKMSK = (CHUNKSZ-1)%UInt
132133

133-
_pnt64(s::Union{String,Vector{UInt8}}) = reinterpret(Ptr{UInt64}, pointer(s))
134-
_pnt64(s::Str) = reinterpret(Ptr{UInt64}, pointer(s.data))
134+
_pntchunk(s::Union{String,Vector{UInt8}}) = reinterpret(Ptr{UInt}, pointer(s))
135+
_pntchunk(s::Str) = reinterpret(Ptr{UInt}, pointer(s.data))
135136

136137
"""Length of string in codeunits"""
137138
ncodeunits(s::Str) = sizeof(s)
138139
ncodeunits(s::Str{<:Word_CSEs}) = sizeof(s) >>> 1
139140
ncodeunits(s::Str{<:Quad_CSEs}) = sizeof(s) >>> 2
140141

141142
# For convenience
142-
@inline _calcpnt(str, siz) = (pnt = _pnt64(str) - CHUNKSZ; (pnt, pnt + siz))
143+
@inline _calcpnt(str, siz) = (pnt = _pntchunk(str) - CHUNKSZ; (pnt, pnt + siz))
143144

144145
@inline _mask_bytes(n) = ((1%UInt) << ((n & CHUNKMSK) << 3)) - 0x1
145146

src/utf16.jl

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,18 @@
11
#=
22
UTF16Str and UCS2Str types (UTF-16 encoding and pure BMP UCS-2)
33
4-
Copyright 2017-2018 Gandalf Software, Inc., Scott P. Jones,
4+
Copyright 2017-2020 Gandalf Software, Inc., Scott P. Jones,
55
and other contributors to the Julia language
66
Licensed under MIT License, see LICENSE.md
77
Based in (small) part on code for UTF16String that used to be in Julia
88
=#
99

10-
const _trail_mask = 0xdc00_dc00_dc00_dc00
11-
const _hi_bit_16 = 0x8000_8000_8000_8000
10+
const _trail_mask = CHUNKSZ == 4 ? 0xdc00_dc00 : 0xdc00_dc00_dc00_dc00
11+
const _hi_bit_16 = CHUNKSZ == 4 ? 0x8000_8000 : 0x8000_8000_8000_8000
1212

1313
@inline _mask_surr(v) = xor((v | v<<1 | v<<2 | v<<3 | v<<4 | v<<5) & _hi_bit_16, _hi_bit_16)
14-
@inline _get_masked(v::UInt64) = _mask_surr(xor(v, _trail_mask))
15-
@inline _get_masked(qpnt::Ptr{UInt64}) = _get_masked(unsafe_load(qpnt))
14+
@inline _get_masked(v::UInt) = _mask_surr(xor(v, _trail_mask))
15+
@inline _get_masked(qpnt::Ptr{UInt}) = _get_masked(unsafe_load(qpnt))
1616
@inline _get_lead(qpnt) = xor(_get_masked(qpnt), _hi_bit_16)
1717

1818
@inline function _align_len_utf16(pnt, cnt, v)
@@ -26,11 +26,11 @@ const _hi_bit_16 = 0x8000_8000_8000_8000
2626
end
2727

2828
_length_al(::MultiCU, ::Type{UTF16CSE}, beg::Ptr{UInt16}, cnt::Int) =
29-
(pnt = reinterpret(Ptr{UInt64}, beg); _align_len_utf16(pnt, cnt<<1, _get_lead(pnt)))
29+
(pnt = reinterpret(Ptr{UInt}, beg); _align_len_utf16(pnt, cnt<<1, _get_lead(pnt)))
3030

3131
function _length(::MultiCU, ::Type{UTF16CSE}, beg::Ptr{UInt16}, cnt::Int)
3232
align = reinterpret(UInt, beg)
33-
pnt = reinterpret(Ptr{UInt64}, align & ~CHUNKMSK)
33+
pnt = reinterpret(Ptr{UInt}, align & ~CHUNKMSK)
3434
v = _get_lead(pnt)
3535
if (align &= CHUNKMSK) != 0
3636
msk = _mask_bytes(align)
@@ -83,7 +83,7 @@ function is_bmp(str::MS_UTF16)
8383
(siz = sizeof(str)) == 0 && return true
8484
# Todo: handle unaligned for ARM32
8585
@preserve str begin
86-
siz < CHUNKSZ && return (_get_masked(_pnt64(str)) & _mask_bytes(siz)) == 0
86+
siz < CHUNKSZ && return (_get_masked(_pntchunk(str)) & _mask_bytes(siz)) == 0
8787

8888
pnt, fin = _calcpnt(str, siz)
8989
while (pnt += CHUNKSZ) <= fin
@@ -106,7 +106,7 @@ end
106106

107107
@inline function _check_bmp_utf16_ul(beg, cnt)
108108
align = reinterpret(UInt, beg)
109-
pnt = reinterpret(Ptr{UInt64}, align & ~CHUNKMSK)
109+
pnt = reinterpret(Ptr{UInt}, align & ~CHUNKMSK)
110110
v = unsafe_load(pnt)
111111
if (align &= CHUNKMSK) != 0
112112
v &= ~_mask_bytes(align)
@@ -117,7 +117,7 @@ end
117117

118118
is_bmp(str::Str{UTF16CSE}) =
119119
(cnt = sizeof(str)) == 0 ? true :
120-
@preserve str _check_bmp_utf16_al(reinterpret(Ptr{UInt64}, pointer(str)), cnt)
120+
@preserve str _check_bmp_utf16_al(reinterpret(Ptr{UInt}, pointer(str)), cnt)
121121

122122
is_bmp(str::SubString{<:Str{UTF16CSE}}) =
123123
(cnt = sizeof(str)) == 0 ? true : @preserve str _check_bmp_utf16_ul(pointer(str), cnt)
@@ -316,8 +316,8 @@ function is_valid(::Type{<:Str{UTF16CSE}}, data::AbstractArray{UInt16})
316316
@inbounds return pos > len || !is_surrogate_codeunit(get_codeunit(data[pos + 1]))
317317
end
318318

319-
# This can be sped up, to check 4 words at a time, only checking for unpaired
320-
# or out of order surrogates when one is found in the UInt64
319+
# This can be sped up, to check 2/4 words at a time, only checking for unpaired
320+
# or out of order surrogates when one is found in the UInt
321321
function is_valid(::Type{<:Str{UTF16CSE}}, pnt::Ptr{UInt16}, len)
322322
len == 0 && return true
323323
fin = bytoff(pnt, len - 1)

src/utf8.jl

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ xor 80 then << 1 then |
8989
11 -> 01 -> 1
9090
=#
9191

92-
const hi_mask = 0x8080_8080_8080_8080
92+
const hi_mask = CHUNKSZ == 4 ? 0x8080_8080 : 0x8080_8080_8080_8080
9393

9494
@inline _count_cont(v) = (v = xor(v, hi_mask); count_ones(xor(((v << 1) | v), hi_mask) & hi_mask))
9595
@inline msk_lead(v) = (v = xor(v, hi_mask); xor(xor(((v << 1) | v), hi_mask) & hi_mask, hi_mask))
@@ -106,11 +106,11 @@ const hi_mask = 0x8080_8080_8080_8080
106106
end
107107

108108
_length_al(::MultiCU, ::Type{UTF8CSE}, beg::Ptr{UInt8}, cnt::Int) =
109-
(pnt = reinterpret(Ptr{UInt64}, beg); _align_len_utf8(pnt, cnt, unsafe_load(pnt)))
109+
(pnt = reinterpret(Ptr{UInt}, beg); _align_len_utf8(pnt, cnt, unsafe_load(pnt)))
110110

111111
function _length(::MultiCU, ::Type{UTF8CSE}, beg::Ptr{UInt8}, cnt::Int)
112112
align = reinterpret(UInt, beg)
113-
pnt = reinterpret(Ptr{UInt64}, align & ~CHUNKMSK)
113+
pnt = reinterpret(Ptr{UInt}, align & ~CHUNKMSK)
114114
v = unsafe_load(pnt)
115115
if (align &= CHUNKMSK) != 0
116116
msk = _mask_bytes(align)
@@ -132,7 +132,7 @@ end
132132

133133
@inline function _check_mask_ul(beg, cnt, msk)
134134
align = reinterpret(UInt, beg)
135-
pnt = reinterpret(Ptr{UInt64}, align & ~CHUNKMSK)
135+
pnt = reinterpret(Ptr{UInt}, align & ~CHUNKMSK)
136136
v = unsafe_load(pnt)
137137
if (align &= CHUNKMSK) != 0
138138
v &= ~_mask_bytes(align)
@@ -162,7 +162,7 @@ is_ascii(vec::Vector{T}) where {T<:CodeUnitTypes} =
162162
is_ascii(str::Str{C}) where {C<:Union{UTF8_CSEs,LatinCSE,Binary_CSEs,UTF16CSE,UCS2CSE,
163163
Text2CSE,Text4CSE,UTF32CSE}} =
164164
(cnt = sizeof(str)) == 0 ? true :
165-
@preserve str _check_mask_al(reinterpret(Ptr{UInt64}, pointer(str)), cnt,
165+
@preserve str _check_mask_al(reinterpret(Ptr{UInt}, pointer(str)), cnt,
166166
_ascii_mask(codeunit(C)))
167167

168168
# Todo! Here you need to see that 0b11yyyyxx at least 1 y must be set,
@@ -181,7 +181,7 @@ end
181181

182182
@inline function _check_latin_utf8_ul(beg, cnt)
183183
align = reinterpret(UInt, beg)
184-
pnt = reinterpret(Ptr{UInt64}, align & ~CHUNKMSK)
184+
pnt = reinterpret(Ptr{UInt}, align & ~CHUNKMSK)
185185
v = unsafe_load(pnt)
186186
if (align &= CHUNKMSK) != 0
187187
v &= ~_mask_bytes(align)
@@ -192,7 +192,7 @@ end
192192

193193
is_latin(str::Str{UTF8CSE}) =
194194
(siz = sizeof(str)) == 0 ? true :
195-
@preserve str _check_latin_utf8_al(reinterpret(Ptr{UInt64}, pointer(str)), siz)
195+
@preserve str _check_latin_utf8_al(reinterpret(Ptr{UInt}, pointer(str)), siz)
196196

197197
is_latin(str::SubString{<:Str{UTF8CSE}}) =
198198
(cnt = sizeof(str)) == 0 ? true : @preserve str _check_latin_utf8_ul(pointer(str), cnt)
@@ -224,7 +224,7 @@ end
224224

225225
@inline function _check_bmp_utf8_ul(beg, cnt)
226226
align = reinterpret(UInt, beg)
227-
pnt = reinterpret(Ptr{UInt64}, align & ~CHUNKMSK)
227+
pnt = reinterpret(Ptr{UInt}, align & ~CHUNKMSK)
228228
v = unsafe_load(pnt)
229229
if (align &= CHUNKMSK) != 0
230230
v &= ~_mask_bytes(align)
@@ -235,7 +235,7 @@ end
235235

236236
is_bmp(str::Str{UTF8CSE}) =
237237
(cnt = sizeof(str)) == 0 ? true :
238-
@preserve str _check_bmp_utf8_al(reinterpret(Ptr{UInt64}, pointer(str)), cnt)
238+
@preserve str _check_bmp_utf8_al(reinterpret(Ptr{UInt}, pointer(str)), cnt)
239239

240240
is_bmp(str::SubString{<:Str{UTF8CSE}}) =
241241
(cnt = sizeof(str)) == 0 ? true : @preserve str _check_bmp_utf8_ul(pointer(str), cnt)

test/basic.jl

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -966,6 +966,7 @@ end
966966
end
967967
end
968968

969+
969970
@testset "CESU-8 sequences" begin
970971
## UTF-8 tests
971972

@@ -974,7 +975,9 @@ end
974975
for hichar = 0xd800:0xdbff, lochar = 0xdc00:0xdfff
975976
seq = string(Char(hichar), Char(lochar))
976977
# Normal conversion throws an error
977-
@test_throws StringError utf8(seq)
978+
if sizeof(Int) != 4 # Avoid horrible 1000x performance issue on 32-bit platforms
979+
@test_throws StringError utf8(seq)
980+
end
978981
# Unsafe conversions return invalid strings as Text*Str
979982
@test typeof(unsafe_str(seq)) == Text1Str
980983
# With accept_surrogates flag, return converted to valid string (_UTF32Str)
@@ -985,6 +988,7 @@ end
985988

986989
end
987990

991+
988992
@testset "Reverse of UTF8" begin
989993
# Reverse of UTF8Str
990994
@test reverse(UTF8Str("")) == ""

0 commit comments

Comments
 (0)