JuliaString
diff --git a/‎src/StrBase.jl
Lines changed: 1 addition & 0 deletions b/‎src/StrBase.jl
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/casefold.jl
Lines changed: 92 additions & 60 deletions b/‎src/casefold.jl
Lines changed: 92 additions & 60 deletions
diff --git a/‎src/charcase.jl
Lines changed: 103 additions & 0 deletions b/‎src/charcase.jl
Lines changed: 103 additions & 0 deletions
diff --git a/‎src/chars.jl
Lines changed: 1 addition & 0 deletions b/‎src/chars.jl
Lines changed: 1 addition & 0 deletions
@@ -48,6 +48,7 @@ include("types.jl")
 @static V6_COMPAT && include("compat.jl")
 @static NEW_ITERATE && include("fixparse.jl")
 include("chars.jl")
+include("charcase.jl")
 include("access.jl")
 include("traits.jl")
 include("utf8proc.jl")
 
@@ -5,31 +5,16 @@ Copyright 2017-2018 Gandalf Software, Inc., Scott P. Jones
 Licensed under MIT License, see LICENSE.md
 =#
 
-_wide_lower_l(c) = ifelse(c > (V6_COMPAT ? 0xdf : 0xde), c != 0xf7, c == 0xb5)
-
-@inline _wide_lower_ch(ch) =
-    ch <= 0x7f ? _islower_a(ch) : (ch > 0xff ? _islower_u(ch) : _wide_lower_l(ch))
-
-@inline _isupper_ch(ch) =
-    ch <= 0x7f ? _isupper_a(ch) : (ch > 0xff ? _isupper_u(ch) : _isupper_l(ch))
-
-_wide_lower_latin(ch) = (ch == 0xb5) | (ch == 0xff) | (!V6_COMPAT && (ch == 0xdf))
-
-_wide_out_upper(ch) =
-    ifelse(ch == 0xb5, 0x39c,
-           ifelse(ch == 0xff, 0x178, ifelse(!V6_COMPAT && ch == 0xdf, 0x1e9e, ch%UInt16)))
-
-
 function uppercase_first(str::MaybeSub{S}) where {C<:ASCIICSE,S<:Str{C}}
     (len = ncodeunits(str)) == 0 && return str
     @preserve str begin
         pnt = pointer(str)
         ch = get_codeunit(pnt)
         _islower_a(ch) || return str
-        out = _allocate(len)
+        buf, out = _allocate(UInt8, len)
         unsafe_copyto!(out, pnt, len)
         set_codeunit!(out, ch - 0x20)
-        Str(C, out)
+        Str(C, buf)
     end
 end
 
@@ -39,10 +24,10 @@ function lowercase_first(str::MaybeSub{S}) where {C<:ASCIICSE,S<:Str{C}}
         pnt = pointer(str)
         ch = get_codeunit(pnt)
         _isupper_a(ch) || return str
-        out = _allocate(len)
+        buf, out = _allocate(UInt8, len)
         unsafe_copyto!(out, pnt, len)
         set_codeunit!(out, ch + 0x20)
-        Str(C, out)
+        Str(C, buf)
     end
 end
 
@@ -119,7 +104,7 @@ function uppercase_first(str::MaybeSub{S}) where {C<:LatinCSE,S<:Str{C}}
         _can_upper(ch) || return str
         buf, out = _allocate(UInt8, len)
         set_codeunit!(out, ch - 0x20)
-        len > 1 && unsafe_copyto!(out, pnt+1, len-1)
+        len > 1 && unsafe_copyto!(out + 1, pnt+1, len-1)
         Str(C, buf)
     end
 end
@@ -130,19 +115,16 @@ function uppercase_first(str::MaybeSub{S}) where {C<:_LatinCSE,S<:Str{C}}
     @preserve str begin
         pnt = pointer(str)
         ch = get_codeunit(pnt)
-        if _can_upper(ch)
-            buf, out8 = _allocate(UInt8, len)
-            set_codeunit!(out8, ch - 0x20)
-            len > 1 && unsafe_copyto!(out8, pnt+1, len-1)
-            Str(C, buf)
-        elseif _wide_lower_latin(ch)
+        if _wide_lower_latin(ch)
             buf, out = _allocate(UInt16, len)
+            _widen!(out, pnt, pnt + len)
             set_codeunit!(out, _wide_out_upper(ch))
-            # Perform the widen operation on the rest (should be done via SIMD)
-            @inbounds for i = 2:len
-                set_codeunit!(out += 2, get_codeunit(pnt += 2)%UInt16)
-            end
             Str(_UCS2CSE, buf)
+        elseif _can_upper(ch)
+            buf8, out8 = _allocate(UInt8, len)
+            len > 1 && unsafe_copyto!(out8, pnt, len)
+            set_codeunit!(out8, ch - 0x20)
+            Str(_LatinCSE, buf8)
         else
             str
         end
@@ -154,10 +136,10 @@ function lowercase_first(str::MaybeSub{S}) where {C<:Latin_CSEs,S<:Str{C}}
     @preserve str begin
         pnt = pointer(str)
         ch = get_codeunit(pnt)
-        _isupper(ch) || return str
+        _isupper_al(ch) || return str
         buf, out = _allocate(UInt8, len)
         set_codeunit!(out, ch + 0x20)
-        len > 1 && unsafe_copyto!(out, pnt+1, len-1)
+        len > 1 && unsafe_copyto!(out+1, pnt+1, len-1)
         Str(C, buf)
     end
 end
@@ -261,14 +243,17 @@ function lowercase(str::MaybeSub{S}) where {C<:Latin_CSEs,S<:Str{C}}
     str
 end
 
+# Delegates to `_check_mask_ul` using the UInt16 Latin mask.
+# Note the argument order here: length first, pointer second.
+function _is_latin_ucs2(len, pnt)
+    return _check_mask_ul(pnt, len, _latin_mask(UInt16))
+end
+
 # result must have at least one character > 0xff, so if the only character(s)
 # > 0xff became <= 0xff, then the result may need to be narrowed and returned as _LatinStr
 
 function _lower(::Type{C}, beg, off, len) where {C<:_UCS2CSE}
     CU = codeunit(C)
     buf, out = _allocate(CU, len)
     unsafe_copyto!(out, beg, len)
-    fin = out + (len*sizeof(CU))
+    lenw = len*sizeof(CU)
+    fin = out + lenw
     out += off
     flg = false
     while out < fin
@@ -277,18 +262,19 @@ function _lower(::Type{C}, beg, off, len) where {C<:_UCS2CSE}
             _isupper_a(ch) && set_codeunit!(out, ch += 0x20)
         elseif ch <= 0xff
             _isupper_l(ch) && set_codeunit!(out, ch += 0x20)
-        elseif _isupper_u(ch)
-            ch = _lowercase_u(ch)
-            flg = ch <= 0xff
-            set_codeunit!(out, ch)
+        elseif ch <= 0xffff
+            if _can_lower_bmp(ch)
+                ch = _lower_bmp(ch)
+                flg = ch <= 0xff
+                set_codeunit!(out, ch)
+            end
         end
         out += sizeof(CU)
     end
-    if flg && is_latin(buf)
-        out = pointer(buf)
-        buf = _allocate(len)
-        _narrow!(pointer(buf), out, out + len)
-        Str(_LatinCSE, buf)
+    if flg && (src = reinterpret(Ptr{UInt16}, pointer(buf)); _is_latin_ucs2(lenw, src))
+        buf8 = _allocate(len)
+        _narrow!(pointer(buf8), src, src + lenw)
+        Str(_LatinCSE, buf8)
     else
         Str(C, buf)
     end
@@ -302,25 +288,75 @@ function _lower(::Type{C}, beg, off, len) where {C<:Union{UCS2CSE,UTF32_CSEs}}
     out += off
     while out < fin
         ch = get_codeunit(out)
-        if ch <= 0x7f
-            _isupper_a(ch) && set_codeunit!(out, ch += 0x20)
-        elseif ch <= 0xff
-            _isupper_l(ch) && set_codeunit!(out, ch += 0x20)
-        elseif _isupper_u(ch)
-            set_codeunit!(out, _lowercase_u(ch))
+        if ch <= 0xff
+            _isupper_al(ch) && set_codeunit!(out, ch += 0x20)
+        elseif ch <= 0xffff
+            _can_lower_bmp(ch) && set_codeunit!(out, _lower_bmp(ch))
+        elseif ch <= 0x1ffff
+            _can_lower_slp(ch) && set_codeunit!(out, _lower_slp(ch))
         end
         out += sizeof(CU)
     end
     Str(C, buf)
 end
 
+# Lowercase the first character of a `_UCS2CSE` string.
+# Returns `str` unchanged when it is empty or its first character cannot be lowercased.
+# When the first character was above 0xff, its lowercase form is <= 0xff, and the rest of
+# the string passes the Latin mask check, the result is narrowed to a `_LatinCSE` string.
+function lowercase_first(str::MaybeSub{S}) where {C<:_UCS2CSE,S<:Str{C}}
+    (len = ncodeunits(str)) == 0 && return str
+    @preserve str begin
+        pnt = pointer(str)
+        ch = get_codeunit(pnt)
+        (ch <= 0xff ? _isupper_al(ch) : ch <= 0xffff ? _can_lower_bmp(ch) :
+         ch <= 0x1ffff && _can_lower_slp(ch)) ||
+         return str
+        cl = _lower_ch(ch)
+        # Pointer arithmetic is in bytes, so offsets/lengths are scaled by the code unit
+        # size (the same way `_lower` uses `len*sizeof(CU)`)
+        cusize = sizeof(codeunit(C))
+        lenw = len*cusize
+        if ch > 0xff && cl <= 0xff &&
+            (len == 1 || _check_mask_ul(pnt + cusize, lenw - cusize, _latin_mask(UInt16)))
+            buf8, out8 = _allocate(UInt8, len)
+            # Narrow only the tail; the first code unit is written separately below
+            len > 1 && _narrow!(out8 + 1, pnt + cusize, pnt + lenw)
+            set_codeunit!(out8, cl)
+            Str(_LatinCSE, buf8)
+        else
+            buf, out = _allocate(codeunit(C), len)
+            len > 1 && unsafe_copyto!(out, pnt, len)
+            set_codeunit!(out, cl)
+            Str(C, buf)
+        end
+    end
+end
+
+# Return a copy of `str` whose first code unit is replaced by `_title_ch` of it,
+# or `str` itself when it is empty or the first code unit is already unchanged by `_title_ch`.
+function uppercase_first(str::MaybeSub{S}) where {C<:Union{UCS2_CSEs,UTF32_CSEs},S<:Str{C}}
+    cnt = ncodeunits(str)
+    cnt == 0 && return str
+    @preserve str begin
+        src = pointer(str)
+        first_cu = get_codeunit(src)
+        titled = _title_ch(first_cu)
+        if first_cu == titled
+            str
+        else
+            buf, dst = _allocate(codeunit(C), cnt)
+            # Copy everything, then overwrite the first code unit
+            if cnt > 1
+                unsafe_copyto!(dst, src, cnt)
+            end
+            set_codeunit!(dst, titled)
+            Str(C, buf)
+        end
+    end
+end
+
+# Return a copy of `str` whose first code unit is replaced by `_lower_ch` of it,
+# or `str` itself when it is empty or `_can_lower_ch` rejects the first code unit.
+function lowercase_first(str::MaybeSub{S}) where {C<:Union{UCS2CSE,UTF32_CSEs},S<:Str{C}}
+    cnt = ncodeunits(str)
+    cnt == 0 && return str
+    @preserve str begin
+        src = pointer(str)
+        first_cu = get_codeunit(src)
+        if _can_lower_ch(first_cu)
+            buf, dst = _allocate(codeunit(C), cnt)
+            # Copy everything, then overwrite the first code unit
+            if cnt > 1
+                unsafe_copyto!(dst, src, cnt)
+            end
+            set_codeunit!(dst, _lower_ch(first_cu))
+            Str(C, buf)
+        else
+            str
+        end
+    end
+end
+
 function lowercase(str::MaybeSub{S}) where {C<:Union{UCS2_CSEs,UTF32_CSEs},S<:Str{C}}
     @preserve str begin
         CU = codeunit(C)
         pnt = beg = pointer(str)
         fin = beg + sizeof(str)
         while pnt < fin
-            _isupper_ch(get_codeunit(pnt)) && return _lower(C, beg, pnt-beg, ncodeunits(str))
+            _can_lower_ch(get_codeunit(pnt)) && return _lower(C, beg, pnt-beg, ncodeunits(str))
             pnt += sizeof(CU)
         end
     end
@@ -337,16 +373,12 @@ function _upper(::Type{C}, beg, off, len) where {C<:Union{UCS2_CSEs,UTF32_CSEs}}
         ch = get_codeunit(out)
         if ch <= 0x7f
             _islower_a(ch) && set_codeunit!(out, ch -= 0x20)
-        elseif ch > 0xff
-            _islower_u(ch) && set_codeunit!(out, _uppercase_u(ch))
-        elseif _can_upper(ch)
-            set_codeunit!(out, ch -= 0x20)
-        elseif ch == 0xb5
-            set_codeunit!(out, 0x39c)
-        elseif ch == 0xff
-            set_codeunit!(out, 0x178)
-        elseif !V6_COMPAT && ch == 0xdf
-            set_codeunit!(out, 0x1e9e)
+        elseif ch <= 0xff
+            set_codeunit!(out, _uppercase_l(ch))
+        elseif ch <= 0xffff
+            _can_upper_bmp(ch) && set_codeunit!(out, _upper_bmp(ch))
+        elseif ch <= 0x1ffff
+            _can_upper_slp(ch) && set_codeunit!(out, _upper_slp(ch))
         end
         out += sizeof(CU)
     end
@@ -359,7 +391,7 @@ function uppercase(str::MaybeSub{S}) where {C<:Union{UCS2_CSEs,UTF32_CSEs},S<:St
         pnt = beg = pointer(str)
         fin = beg + sizeof(str)
         while pnt < fin
-            _wide_lower_ch(get_codeunit(pnt)) && return _upper(C, beg, pnt-beg, ncodeunits(str))
+            _can_upper_ch(get_codeunit(pnt)) && return _upper(C, beg, pnt-beg, ncodeunits(str))
             pnt += sizeof(CU)
         end
         str
 
@@ -0,0 +1,103 @@
+#=
+Case folding for Unicode characters
+
+Copyright 2018 Gandalf Software, Inc., Scott P. Jones
+Licensed under MIT License, see LICENSE.md
+=#
+
+module CaseTables  # holds the case tables returned by case_tables()
+include("maketables.jl")  # presumably defines case_tables(), which is called below
+
+const ct, tupvec, offvec, bitvec, sizvecl, sizvecu = case_tables()  # destructure the result into constants
+end # module CaseTables
+
+using .CaseTables
+
+const ct = CaseTables.ct
+
+using ModuleInterfaceTools
+@api extend ChrBase
+
+# True for 0xb5, and for any value above the threshold (0xdf when V6_COMPAT, else 0xde)
+# other than 0xf7
+function _can_upper_lat(c)
+    threshold = V6_COMPAT ? 0xdf : 0xde
+    return c > threshold ? c != 0xf7 : c == 0xb5
+end
+
+# True for 0xb5 and 0xff, plus 0xdf when V6_COMPAT is false
+function _wide_lower_latin(ch)
+    return ch == 0xb5 || ch == 0xff || (!V6_COMPAT && ch == 0xdf)
+end
+
+# Map 0xb5 -> 0x39c and 0xff -> 0x178 (and 0xdf -> 0x1e9e when V6_COMPAT is false);
+# any other value is returned converted with `% UInt16`
+function _wide_out_upper(ch)
+    ch == 0xb5 && return 0x39c
+    ch == 0xff && return 0x178
+    (!V6_COMPAT && ch == 0xdf) && return 0x1e9e
+    return ch%UInt16
+end
+
+# Three-level membership test for `ch`:
+#   1. bit `(ch >>> 9) & 0x7f` of `mask` must be set,
+#   2. `tab[(ch >>> 9) + 1]` must be a non-zero index into `CaseTables.bitvec`,
+#   3. bit `ch & 0x1f` of word `((ch >>> 5) & 0xf) + 1` in that bit vector must be set.
+@inline function _check_tab(mask, tab, ch)
+    blk = (ch >>> 9)
+    ((mask >>> (blk & 0x7f)) & 1) == 0 && return false
+    off = tab[blk+1]
+    off == 0 && return false
+    word = CaseTables.bitvec[off][((ch >>> 5) & 0xf) + 1]
+    return (word & (UInt32(1) << (ch & 0x1f))) != 0
+end
+
+# Look up the mapping of `ch` through `CaseTables.offvec` / `CaseTables.tupvec`.
+# A zero `off`, or a zero entry in the selected `offvec` row, means "no mapping" and
+# `ch` is returned as is; otherwise the table value is added to `base`.
+@inline function _get_tab(off, ch, base)
+    off == 0 && return ch
+    sub = CaseTables.offvec[off][((ch >>> 5) & 0x1f) + 1]
+    sub == 0 && return ch
+    return base + CaseTables.tupvec[sub][(ch & 0x1f) + 1]
+end
+
+# Mapping lookup with base 0x0000: bit `ch >>> 9` of `mask` gates the lookup, and the
+# entry of `tab` is selected by `ch >>> 10`
+@inline function _get_tab_bmp(mask, tab, ch)
+    blk = (ch >>> 9)
+    ((mask >>> blk) & 1) == 0 && return ch
+    return _get_tab(tab[(blk>>1)+1], ch, 0x0000)
+end
+# Mapping lookup with base 0x10000: bit `(ch >>> 9) & 0x7f` of `mask` gates the lookup,
+# and the entry of `tab` is selected by `ch >>> 10`
+@inline function _get_tab_slp(mask, tab, ch)
+    blk = (ch >>> 9)
+    ((mask >>> (blk & 0x7f)) & 1) == 0 && return ch
+    return _get_tab(tab[(blk>>1)+1], ch, 0x10000)
+end
+
+@inline _upper_lat(ch) = _get_tab(ct.u_tab[1], ch, 0x0000)  # always uses the first u_tab entry
+
+@inline _upper_bmp(ch) = _get_tab_bmp(ct.can_u_flg, ct.u_tab, ch)  # gated by can_u_flg
+@inline _lower_bmp(ch) = _get_tab_bmp(ct.can_l_flg, ct.l_tab, ch)  # gated by can_l_flg
+@inline _title_bmp(ch) = _get_tab_bmp(ct.can_u_flg, ct.t_tab, ch)  # same mask as _upper_bmp, mapped via t_tab
+@inline _upper_slp(ch) = _get_tab_slp(ct.can_su_flg, ct.u_tab, ch)  # shares u_tab with _upper_bmp
+@inline _lower_slp(ch) = _get_tab_slp(ct.can_sl_flg, ct.l_tab, ch)  # shares l_tab with _lower_bmp
+
+@inline _can_lower_bmp(ch) = _check_tab(ct.can_l_flg, ct.can_l_tab, ch)
+@inline _can_upper_bmp(ch) = _check_tab(ct.can_u_flg, ct.can_u_tab, ch)
+@inline _can_lower_slp(ch) = _check_tab(ct.can_sl_flg, ct.can_l_tab, ch)  # separate mask, shared can_l_tab
+@inline _can_upper_slp(ch) = _check_tab(ct.can_su_flg, ct.can_u_tab, ch)  # separate mask, shared can_u_tab
+@inline _is_lower_bmp(ch)  = _check_tab(ct.is_l_flg, ct.is_l_tab, ch)
+@inline _is_upper_bmp(ch)  = _check_tab(ct.is_u_flg, ct.is_u_tab, ch)
+@inline _is_lower_slp(ch)  = _check_tab(ct.is_sl_flg, ct.is_sl_tab, ch)  # NOTE(review): own is_sl_tab, unlike _can_lower_slp - confirm
+@inline _is_upper_slp(ch)  = _check_tab(ct.is_su_flg, ct.is_su_tab, ch)  # NOTE(review): own is_su_tab, unlike _can_upper_slp - confirm
+
+const _can_title_bmp = _can_upper_bmp  # alias: the title-case check reuses the upper-case check
+
+# Lower-case test selected by the range `ch` falls in; values above 0x1ffff give `false`
+@inline function _is_lower_ch(ch)
+    ch <= 0x7f && return _islower_a(ch)
+    ch <= 0xff && return _islower_l(ch)
+    ch <= 0xffff && return _is_lower_bmp(ch)
+    ch <= 0x1ffff && return _is_lower_slp(ch)
+    return false
+end
+
+# Upper-case test selected by the range `ch` falls in; values above 0x1ffff give `false`
+@inline function _is_upper_ch(ch)
+    ch <= 0x7f && return _isupper_a(ch)
+    ch <= 0xff && return _isupper_l(ch)
+    ch <= 0xffff && return _is_upper_bmp(ch)
+    ch <= 0x1ffff && return _is_upper_slp(ch)
+    return false
+end
+
+# "Can be lowercased" test selected by the range `ch` falls in;
+# values above 0x1ffff give `false`
+@inline function _can_lower_ch(ch)
+    ch <= 0x7f && return _isupper_a(ch)
+    ch <= 0xff && return _isupper_l(ch)
+    ch <= 0xffff && return _can_lower_bmp(ch)
+    ch <= 0x1ffff && return _can_lower_slp(ch)
+    return false
+end
+
+# "Can be uppercased" test selected by the range `ch` falls in;
+# values above 0x1ffff give `false`
+@inline function _can_upper_ch(ch)
+    ch <= 0x7f && return _islower_a(ch)
+    ch <= 0xff && return _can_upper_lat(ch)
+    ch <= 0xffff && return _can_upper_bmp(ch)
+    ch <= 0x1ffff && return _can_upper_slp(ch)
+    return false
+end
+
+# Lowercase mapping selected by the range `ch` falls in. Below 0x100 the result is
+# `ch` plus 0x20 when the matching upper-case test holds; values above 0x1ffff are
+# returned unchanged.
+@inline function _lower_ch(ch)
+    ch <= 0x7f && return (ch + (_isupper_a(ch)<<5))
+    ch <= 0xff && return (ch + (_isupper_l(ch)<<5))
+    ch <= 0xffff && return _lower_bmp(ch)
+    ch <= 0x1ffff && return _lower_slp(ch)
+    return ch
+end
+
+# Uppercase mapping selected by the range `ch` falls in; values above 0x1ffff are
+# returned unchanged
+@inline function _upper_ch(ch)
+    if ch <= 0x7f
+        return _islower_a(ch) ? (ch - 0x20) : ch
+    end
+    ch <= 0xff && return _upper_lat(ch)
+    ch <= 0xffff && return _upper_bmp(ch)
+    ch <= 0x1ffff && return _upper_slp(ch)
+    return ch
+end
+
+# Titlecase mapping selected by the range `ch` falls in. Only the 0x100-0xffff range
+# uses a title-specific table (`_title_bmp`); the other ranges use the same mappings as
+# `_upper_ch`. Values above 0x1ffff are returned unchanged.
+@inline function _title_ch(ch)
+    if ch <= 0x7f
+        return _islower_a(ch) ? (ch - 0x20) : ch
+    end
+    ch <= 0xff && return _upper_lat(ch)
+    ch <= 0xffff && return _title_bmp(ch)
+    ch <= 0x1ffff && return _upper_slp(ch)
+    return ch
+end
@@ -15,6 +15,7 @@ codeunit(::Type{<:MaybeSub{S}}) where {S<:Str} = codeunit(S)
 
 eltype(::Type{<:Str{BinaryCSE}}) = UInt8
 
+eltype(::Type{UniStr})                = UTF32Chr  # UniStr reports UTF32Chr as its element type
 eltype(::Type{<:Str{Text1CSE}})       = Text1Chr
 eltype(::Type{<:Str{Text2CSE}})       = Text2Chr
 eltype(::Type{<:Str{Text4CSE}})       = Text4Chr