Merge pull request #4 from JuliaString/fix32

ScottPJones · web-flow · commit f923c26440dc · 2020-01-25T17:52:44.000-05:00
Fix some issues on 32-bit platforms
diff --git a/.travis.yml b/.travis.yml
@@ -5,7 +5,7 @@ os:
   - osx
 julia:
   - 1.0
-  - 1.1
+  - 1.3
   - nightly
 notifications:
   email: false
diff --git a/Project.toml b/Project.toml
@@ -4,7 +4,7 @@ authors  = ["ScottPJones <scottjones@alum.mit.edu>"]
 keywords = ["Strings"]
 license  = "MIT"
 uuid     = "e79e7a6a-7bb1-5a4d-9d64-da657b06f53a"
-version = "1.0.0"
+version = "1.0.2"
 
 [deps]
 Unicode = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
@@ -26,7 +26,7 @@ test = ["Test", "Random"]
 [compat]
 julia = "^1.0.0"
 ModuleInterfaceTools = "≥ 1.0.0"
-MurmurHash3 = "≥ 1.0.0"
+MurmurHash3 = "≥ 1.0.3"
 StrAPI = "≥ 1.0.0"
 ChrBase = "≥ 1.0.0"
 CharSetEncodings = "≥ 1.0.0"
diff --git a/appveyor.yml b/appveyor.yml
@@ -1,7 +1,7 @@
 environment:
   matrix:
-  - julia_version: 1.0
   - julia_version: 1
+  - julia_version: 1.3
   - julia_version: latest
 
 platform:
diff --git a/src/StrBase.jl b/src/StrBase.jl
@@ -2,7 +2,7 @@ __precompile__(true)
 """
 StrBase package
 
-Copyright 2017-2018 Gandalf Software, Inc., Scott P. Jones,
+Copyright 2017-2020 Gandalf Software, Inc., Scott P. Jones,
 and other contributors to the Julia language
 Licensed under MIT License, see LICENSE.md
 Based partly on code in LegacyStrings that used to be part of Julia
@@ -23,7 +23,7 @@ using ModuleInterfaceTools
 
 @api develop! check_string, unsafe_check_string, fast_check_string, skipascii, skipbmp,
               countmask, count_chars, _count_mask_al, _count_mask_ul, count_latin,
-              _copysub, _cvtsize, _repeat, empty_str, _data, _pnt64, _str,
+              _copysub, _cvtsize, _repeat, empty_str, _data, _pntchunk, _str,
               ValidatedStyle, MutableStyle, EqualsStyle, CanContain
 
 @api develop LineCounts, CharTypes, CharStat, maxbit, calcstats, check_continuation,
diff --git a/src/support.jl b/src/support.jl
@@ -313,7 +313,7 @@ end
 
 @inline function alignpnt(beg::Ptr)
     align = reinterpret(UInt, beg)
-    align, reinterpret(Ptr{UInt64}, align & (~CHUNKMSK)%UInt)
+    align, reinterpret(Ptr{UInt}, align & (~CHUNKMSK)%UInt)
 end
 
 @inline function skipascii(beg::Ptr{UInt8}, fin::Ptr{UInt8})
@@ -387,7 +387,7 @@ end
 const _bmp_mask = 0xd800_d800_d800_d800
 @inline _mask_allsurr(v)  = xor((v | v<<1 | v<<2 | v<<3 | v<<4) & _hi_bit_16, _hi_bit_16)
 
-@inline _get_bmp_mask(v::UInt64) = _mask_allsurr(xor(v, _bmp_mask))
+@inline _get_bmp_mask(v::UInt) = _mask_allsurr(xor(v, _bmp_mask))
 
 const msk_ascii_16 = 0xff80ff80ff80ff80
 const msk_latin_16 = 0xff00ff00ff00ff00
diff --git a/src/types.jl b/src/types.jl
@@ -1,10 +1,9 @@
 #=
 Basic types for strings
 
-Copyright 2017-2018 Gandalf Software, Inc., Scott P. Jones
+Copyright 2017-2020 Gandalf Software, Inc., Scott P. Jones
 Licensed under MIT License, see LICENSE.md
 =#
-const STR_KEEP_NUL    = true  # keep nul byte placed by String
 
 # Note: this is still in transition to expressing character set, encoding
 # and optional cached info for hashes, UTF-8/UTF-16 encodings, subsets, etc.
@@ -74,18 +73,20 @@ typemin(::Type{T}) where {T<:Str} = empty_str(T)
 typemin(::T) where {T<:Str} = empty_str(T)
 
 """Union type for fast dispatching"""
-const UniStr = Union{ASCIIStr, _LatinStr, _UCS2Str, _UTF32Str}
+#const UniStr = Union{ASCIIStr, _LatinStr, _UCS2Str, _UTF32Str}
+const UniCSE = Union{ASCIICSE, _LatinCSE, _UCS2CSE, _UTF32CSE}
+const UniStr = Str{<:UniCSE, Nothing, Nothing, Nothing}
 show(io::IO, ::Type{UniStr}) = print(io, :UniStr)
 
 # Display BinaryCSE as if String
 show(io::IO, str::T) where {T<:Str{BinaryCSE}} = show(io, str.data)
 show(io::IO, str::SubString{T}) where {T<:Str{BinaryCSE}} =
     @inbounds show(io, SubString(str.string.data, str.offset+1, str.offset+lastindex(str)))
 
-_allocate(len) = Base._string_n((len+STR_KEEP_NUL-1)%Csize_t)
+_allocate(len) = Base._string_n(len%Csize_t)
 
 function _allocate(::Type{T}, len) where {T <: CodeUnitTypes}
-    buf = _allocate((len+STR_KEEP_NUL-1) * sizeof(T))
+    buf = _allocate(len * sizeof(T))
     buf, reinterpret(Ptr{T}, pointer(buf))
 end
 
@@ -115,7 +116,7 @@ promote_rule(::Type{String}, ::Type{<:Str}) = String
 promote_rule(::Type{<:Str{S}}, ::Type{<:Str{T}}) where {S,T} =
     (P = promote_rule(S,T)) === Union{} ? Union{} : Str{P}
 
-sizeof(s::Str) = sizeof(s.data) + 1 - STR_KEEP_NUL
+sizeof(s::Str) = sizeof(s.data)
 
 """Codeunits of string as a Vector"""
 _data(s::Vector{UInt8}) = s
@@ -127,19 +128,19 @@ pointer(s::Str{<:Byte_CSEs}) = pointer(s.data)
 pointer(s::Str{<:Word_CSEs}) = reinterpret(Ptr{UInt16}, pointer(s.data))
 pointer(s::Str{<:Quad_CSEs}) = reinterpret(Ptr{UInt32}, pointer(s.data))
 
-const CHUNKSZ = sizeof(UInt64) # used for fast processing of strings
-const CHUNKMSK = (CHUNKSZ-1)%UInt64
+const CHUNKSZ = sizeof(UInt) # used for fast processing of strings
+const CHUNKMSK = (CHUNKSZ-1)%UInt
 
-_pnt64(s::Union{String,Vector{UInt8}}) = reinterpret(Ptr{UInt64}, pointer(s))
-_pnt64(s::Str) = reinterpret(Ptr{UInt64}, pointer(s.data))
+_pntchunk(s::Union{String,Vector{UInt8}}) = reinterpret(Ptr{UInt}, pointer(s))
+_pntchunk(s::Str) = reinterpret(Ptr{UInt}, pointer(s.data))
 
 """Length of string in codeunits"""
 ncodeunits(s::Str)              = sizeof(s)
 ncodeunits(s::Str{<:Word_CSEs}) = sizeof(s) >>> 1
 ncodeunits(s::Str{<:Quad_CSEs}) = sizeof(s) >>> 2
 
 # For convenience
-@inline _calcpnt(str, siz) = (pnt = _pnt64(str) - CHUNKSZ;  (pnt, pnt + siz))
+@inline _calcpnt(str, siz) = (pnt = _pntchunk(str) - CHUNKSZ;  (pnt, pnt + siz))
 
 @inline _mask_bytes(n) = ((1%UInt) << ((n & CHUNKMSK) << 3)) - 0x1
 
diff --git a/src/utf16.jl b/src/utf16.jl
@@ -1,18 +1,18 @@
 #=
 UTF16Str and UCS2Str types (UTF-16 encoding and pure BMP UCS-2)
 
-Copyright 2017-2018 Gandalf Software, Inc., Scott P. Jones,
+Copyright 2017-2020 Gandalf Software, Inc., Scott P. Jones,
 and other contributors to the Julia language
 Licensed under MIT License, see LICENSE.md
 Based in (small) part on code for UTF16String that used to be in Julia
 =#
 
-const _trail_mask = 0xdc00_dc00_dc00_dc00
-const _hi_bit_16  = 0x8000_8000_8000_8000
+const _trail_mask = CHUNKSZ == 4 ? 0xdc00_dc00 : 0xdc00_dc00_dc00_dc00
+const _hi_bit_16  = CHUNKSZ == 4 ? 0x8000_8000 : 0x8000_8000_8000_8000
 
 @inline _mask_surr(v)  = xor((v | v<<1 | v<<2 | v<<3 | v<<4 | v<<5) & _hi_bit_16, _hi_bit_16)
-@inline _get_masked(v::UInt64) = _mask_surr(xor(v, _trail_mask))
-@inline _get_masked(qpnt::Ptr{UInt64}) = _get_masked(unsafe_load(qpnt))
+@inline _get_masked(v::UInt) = _mask_surr(xor(v, _trail_mask))
+@inline _get_masked(qpnt::Ptr{UInt}) = _get_masked(unsafe_load(qpnt))
 @inline _get_lead(qpnt) = xor(_get_masked(qpnt), _hi_bit_16)
 
 @inline function _align_len_utf16(pnt, cnt, v)
@@ -26,11 +26,11 @@ const _hi_bit_16  = 0x8000_8000_8000_8000
 end
 
 _length_al(::MultiCU, ::Type{UTF16CSE}, beg::Ptr{UInt16}, cnt::Int) =
-    (pnt = reinterpret(Ptr{UInt64}, beg); _align_len_utf16(pnt, cnt<<1, _get_lead(pnt)))
+    (pnt = reinterpret(Ptr{UInt}, beg); _align_len_utf16(pnt, cnt<<1, _get_lead(pnt)))
 
 function _length(::MultiCU, ::Type{UTF16CSE}, beg::Ptr{UInt16}, cnt::Int)
     align = reinterpret(UInt, beg)
-    pnt = reinterpret(Ptr{UInt64}, align & ~CHUNKMSK)
+    pnt = reinterpret(Ptr{UInt}, align & ~CHUNKMSK)
     v = _get_lead(pnt)
     if (align &= CHUNKMSK) != 0
         msk = _mask_bytes(align)
@@ -83,7 +83,7 @@ function is_bmp(str::MS_UTF16)
     (siz = sizeof(str)) == 0 && return true
     # Todo: handle unaligned for ARM32
     @preserve str begin
-        siz < CHUNKSZ && return (_get_masked(_pnt64(str)) & _mask_bytes(siz)) == 0
+        siz < CHUNKSZ && return (_get_masked(_pntchunk(str)) & _mask_bytes(siz)) == 0
 
         pnt, fin = _calcpnt(str, siz)
         while (pnt += CHUNKSZ) <= fin
@@ -106,7 +106,7 @@ end
 
 @inline function _check_bmp_utf16_ul(beg, cnt)
     align = reinterpret(UInt, beg)
-    pnt = reinterpret(Ptr{UInt64}, align & ~CHUNKMSK)
+    pnt = reinterpret(Ptr{UInt}, align & ~CHUNKMSK)
     v = unsafe_load(pnt)
     if (align &= CHUNKMSK) != 0
         v &= ~_mask_bytes(align)
@@ -117,7 +117,7 @@ end
 
 is_bmp(str::Str{UTF16CSE}) =
     (cnt = sizeof(str)) == 0 ? true :
-    @preserve str _check_bmp_utf16_al(reinterpret(Ptr{UInt64}, pointer(str)), cnt)
+    @preserve str _check_bmp_utf16_al(reinterpret(Ptr{UInt}, pointer(str)), cnt)
 
 is_bmp(str::SubString{<:Str{UTF16CSE}}) =
     (cnt = sizeof(str)) == 0 ? true : @preserve str _check_bmp_utf16_ul(pointer(str), cnt)
@@ -316,8 +316,8 @@ function is_valid(::Type{<:Str{UTF16CSE}}, data::AbstractArray{UInt16})
     @inbounds return pos > len || !is_surrogate_codeunit(get_codeunit(data[pos + 1]))
 end
 
-# This can be sped up, to check 4 words at a time, only checking for unpaired
-# or out of order surrogates when one is found in the UInt64
+# This can be sped up, to check 2/4 words at a time, only checking for unpaired
+# or out of order surrogates when one is found in the UInt
 function is_valid(::Type{<:Str{UTF16CSE}}, pnt::Ptr{UInt16}, len)
     len == 0 && return true
     fin = bytoff(pnt, len - 1)
diff --git a/src/utf8.jl b/src/utf8.jl
@@ -89,7 +89,7 @@ xor 80 then << 1 then |
 11 -> 01 -> 1
 =#
 
-const hi_mask = 0x8080_8080_8080_8080
+const hi_mask = CHUNKSZ == 4 ? 0x8080_8080 : 0x8080_8080_8080_8080
 
 @inline _count_cont(v) = (v = xor(v, hi_mask); count_ones(xor(((v << 1) | v), hi_mask) & hi_mask))
 @inline msk_lead(v) = (v = xor(v, hi_mask); xor(xor(((v << 1) | v), hi_mask) & hi_mask, hi_mask))
@@ -106,11 +106,11 @@ const hi_mask = 0x8080_8080_8080_8080
 end
 
 _length_al(::MultiCU, ::Type{UTF8CSE}, beg::Ptr{UInt8}, cnt::Int) =
-    (pnt = reinterpret(Ptr{UInt64}, beg); _align_len_utf8(pnt, cnt, unsafe_load(pnt)))
+    (pnt = reinterpret(Ptr{UInt}, beg); _align_len_utf8(pnt, cnt, unsafe_load(pnt)))
 
 function _length(::MultiCU, ::Type{UTF8CSE}, beg::Ptr{UInt8}, cnt::Int)
     align = reinterpret(UInt, beg)
-    pnt = reinterpret(Ptr{UInt64}, align & ~CHUNKMSK)
+    pnt = reinterpret(Ptr{UInt}, align & ~CHUNKMSK)
     v = unsafe_load(pnt)
     if (align &= CHUNKMSK) != 0
         msk = _mask_bytes(align)
@@ -132,7 +132,7 @@ end
 
 @inline function _check_mask_ul(beg, cnt, msk)
     align = reinterpret(UInt, beg)
-    pnt = reinterpret(Ptr{UInt64}, align & ~CHUNKMSK)
+    pnt = reinterpret(Ptr{UInt}, align & ~CHUNKMSK)
     v = unsafe_load(pnt)
     if (align &= CHUNKMSK) != 0
         v &= ~_mask_bytes(align)
@@ -162,7 +162,7 @@ is_ascii(vec::Vector{T}) where {T<:CodeUnitTypes} =
 is_ascii(str::Str{C}) where {C<:Union{UTF8_CSEs,LatinCSE,Binary_CSEs,UTF16CSE,UCS2CSE,
                                       Text2CSE,Text4CSE,UTF32CSE}} =
     (cnt = sizeof(str)) == 0 ? true :
-    @preserve str _check_mask_al(reinterpret(Ptr{UInt64}, pointer(str)), cnt,
+    @preserve str _check_mask_al(reinterpret(Ptr{UInt}, pointer(str)), cnt,
                                  _ascii_mask(codeunit(C)))
 
 # Todo! Here you need to see that 0b11yyyyxx at least 1 y must be set,
@@ -181,7 +181,7 @@ end
 
 @inline function _check_latin_utf8_ul(beg, cnt)
     align = reinterpret(UInt, beg)
-    pnt = reinterpret(Ptr{UInt64}, align & ~CHUNKMSK)
+    pnt = reinterpret(Ptr{UInt}, align & ~CHUNKMSK)
     v = unsafe_load(pnt)
     if (align &= CHUNKMSK) != 0
         v &= ~_mask_bytes(align)
@@ -192,7 +192,7 @@ end
 
 is_latin(str::Str{UTF8CSE}) =
     (siz = sizeof(str)) == 0 ? true :
-    @preserve str _check_latin_utf8_al(reinterpret(Ptr{UInt64}, pointer(str)), siz)
+    @preserve str _check_latin_utf8_al(reinterpret(Ptr{UInt}, pointer(str)), siz)
 
 is_latin(str::SubString{<:Str{UTF8CSE}}) =
     (cnt = sizeof(str)) == 0 ? true : @preserve str _check_latin_utf8_ul(pointer(str), cnt)
@@ -224,7 +224,7 @@ end
 
 @inline function _check_bmp_utf8_ul(beg, cnt)
     align = reinterpret(UInt, beg)
-    pnt = reinterpret(Ptr{UInt64}, align & ~CHUNKMSK)
+    pnt = reinterpret(Ptr{UInt}, align & ~CHUNKMSK)
     v = unsafe_load(pnt)
     if (align &= CHUNKMSK) != 0
         v &= ~_mask_bytes(align)
@@ -235,7 +235,7 @@ end
 
 is_bmp(str::Str{UTF8CSE}) =
     (cnt = sizeof(str)) == 0 ? true :
-    @preserve str _check_bmp_utf8_al(reinterpret(Ptr{UInt64}, pointer(str)), cnt)
+    @preserve str _check_bmp_utf8_al(reinterpret(Ptr{UInt}, pointer(str)), cnt)
 
 is_bmp(str::SubString{<:Str{UTF8CSE}}) =
     (cnt = sizeof(str)) == 0 ? true : @preserve str _check_bmp_utf8_ul(pointer(str), cnt)
diff --git a/test/basic.jl b/test/basic.jl
@@ -966,6 +966,7 @@ end
     end
 end
 
+
 @testset "CESU-8 sequences" begin
     ## UTF-8 tests
 
@@ -974,7 +975,9 @@ end
         for hichar = 0xd800:0xdbff, lochar = 0xdc00:0xdfff
             seq = string(Char(hichar), Char(lochar))
             # Normal conversion throws an error
-            @test_throws StringError utf8(seq)
+            if sizeof(Int) != 4 # Avoid horrible 1000x performance issue on 32-bit platforms
+                @test_throws StringError utf8(seq)
+            end
             # Unsafe conversions return invalid strings as Text*Str
             @test typeof(unsafe_str(seq)) == Text1Str
             # With accept_surrogates flag, return converted to valid string (_UTF32Str)
@@ -985,6 +988,7 @@ end
 
 end
 
+
 @testset "Reverse of UTF8" begin
     # Reverse of UTF8Str
     @test reverse(UTF8Str("")) == ""