Skip to content

Commit 11ba0a6

Browse files
committed
Update to use tables for case
1 parent bc78697 commit 11ba0a6

File tree

9 files changed

+690
-164
lines changed

9 files changed

+690
-164
lines changed

src/StrBase.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ include("types.jl")
4848
@static V6_COMPAT && include("compat.jl")
4949
@static NEW_ITERATE && include("fixparse.jl")
5050
include("chars.jl")
51+
include("charcase.jl")
5152
include("access.jl")
5253
include("traits.jl")
5354
include("utf8proc.jl")

src/casefold.jl

Lines changed: 92 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -5,31 +5,16 @@ Copyright 2017-2018 Gandalf Software, Inc., Scott P. Jones
55
Licensed under MIT License, see LICENSE.md
66
=#
77

8-
_wide_lower_l(c) = ifelse(c > (V6_COMPAT ? 0xdf : 0xde), c != 0xf7, c == 0xb5)
9-
10-
@inline _wide_lower_ch(ch) =
11-
ch <= 0x7f ? _islower_a(ch) : (ch > 0xff ? _islower_u(ch) : _wide_lower_l(ch))
12-
13-
@inline _isupper_ch(ch) =
14-
ch <= 0x7f ? _isupper_a(ch) : (ch > 0xff ? _isupper_u(ch) : _isupper_l(ch))
15-
16-
_wide_lower_latin(ch) = (ch == 0xb5) | (ch == 0xff) | (!V6_COMPAT && (ch == 0xdf))
17-
18-
_wide_out_upper(ch) =
19-
ifelse(ch == 0xb5, 0x39c,
20-
ifelse(ch == 0xff, 0x178, ifelse(!V6_COMPAT && ch == 0xdf, 0x1e9e, ch%UInt16)))
21-
22-
238
function uppercase_first(str::MaybeSub{S}) where {C<:ASCIICSE,S<:Str{C}}
249
(len = ncodeunits(str)) == 0 && return str
2510
@preserve str begin
2611
pnt = pointer(str)
2712
ch = get_codeunit(pnt)
2813
_islower_a(ch) || return str
29-
out = _allocate(len)
14+
buf, out = _allocate(UInt8, len)
3015
unsafe_copyto!(out, pnt, len)
3116
set_codeunit!(out, ch - 0x20)
32-
Str(C, out)
17+
Str(C, buf)
3318
end
3419
end
3520

@@ -39,10 +24,10 @@ function lowercase_first(str::MaybeSub{S}) where {C<:ASCIICSE,S<:Str{C}}
3924
pnt = pointer(str)
4025
ch = get_codeunit(pnt)
4126
_isupper_a(ch) || return str
42-
out = _allocate(len)
27+
buf, out = _allocate(UInt8, len)
4328
unsafe_copyto!(out, pnt, len)
4429
set_codeunit!(out, ch + 0x20)
45-
Str(C, out)
30+
Str(C, buf)
4631
end
4732
end
4833

@@ -119,7 +104,7 @@ function uppercase_first(str::MaybeSub{S}) where {C<:LatinCSE,S<:Str{C}}
119104
_can_upper(ch) || return str
120105
buf, out = _allocate(UInt8, len)
121106
set_codeunit!(out, ch - 0x20)
122-
len > 1 && unsafe_copyto!(out, pnt+1, len-1)
107+
len > 1 && unsafe_copyto!(out + 1, pnt+1, len-1)
123108
Str(C, buf)
124109
end
125110
end
@@ -130,19 +115,16 @@ function uppercase_first(str::MaybeSub{S}) where {C<:_LatinCSE,S<:Str{C}}
130115
@preserve str begin
131116
pnt = pointer(str)
132117
ch = get_codeunit(pnt)
133-
if _can_upper(ch)
134-
buf, out8 = _allocate(UInt8, len)
135-
set_codeunit!(out8, ch - 0x20)
136-
len > 1 && unsafe_copyto!(out8, pnt+1, len-1)
137-
Str(C, buf)
138-
elseif _wide_lower_latin(ch)
118+
if _wide_lower_latin(ch)
139119
buf, out = _allocate(UInt16, len)
120+
_widen!(out, pnt, pnt + len)
140121
set_codeunit!(out, _wide_out_upper(ch))
141-
# Perform the widen operation on the rest (should be done via SIMD)
142-
@inbounds for i = 2:len
143-
set_codeunit!(out += 2, get_codeunit(pnt += 2)%UInt16)
144-
end
145122
Str(_UCS2CSE, buf)
123+
elseif _can_upper(ch)
124+
buf8, out8 = _allocate(UInt8, len)
125+
len > 1 && unsafe_copyto!(out8, pnt, len)
126+
set_codeunit!(out8, ch - 0x20)
127+
Str(_LatinCSE, buf8)
146128
else
147129
str
148130
end
@@ -154,10 +136,10 @@ function lowercase_first(str::MaybeSub{S}) where {C<:Latin_CSEs,S<:Str{C}}
154136
@preserve str begin
155137
pnt = pointer(str)
156138
ch = get_codeunit(pnt)
157-
_isupper(ch) || return str
139+
_isupper_al(ch) || return str
158140
buf, out = _allocate(UInt8, len)
159141
set_codeunit!(out, ch + 0x20)
160-
len > 1 && unsafe_copyto!(out, pnt+1, len-1)
142+
len > 1 && unsafe_copyto!(out+1, pnt+1, len-1)
161143
Str(C, buf)
162144
end
163145
end
@@ -261,14 +243,17 @@ function lowercase(str::MaybeSub{S}) where {C<:Latin_CSEs,S<:Str{C}}
261243
str
262244
end
263245

246+
# Apply the 16-bit Latin mask check to the buffer at `pnt`.
# Note the argument order: length first, pointer second. The length is
# forwarded untouched to `_check_mask_ul`.
function _is_latin_ucs2(len, pnt)
    return _check_mask_ul(pnt, len, _latin_mask(UInt16))
end
247+
264248
# result must have at least one character > 0xff, so if the only character(s)
265249
# > 0xff became <= 0xff, then the result may need to be narrowed and returned as _LatinStr
266250

267251
function _lower(::Type{C}, beg, off, len) where {C<:_UCS2CSE}
268252
CU = codeunit(C)
269253
buf, out = _allocate(CU, len)
270254
unsafe_copyto!(out, beg, len)
271-
fin = out + (len*sizeof(CU))
255+
lenw = len*sizeof(CU)
256+
fin = out + lenw
272257
out += off
273258
flg = false
274259
while out < fin
@@ -277,18 +262,19 @@ function _lower(::Type{C}, beg, off, len) where {C<:_UCS2CSE}
277262
_isupper_a(ch) && set_codeunit!(out, ch += 0x20)
278263
elseif ch <= 0xff
279264
_isupper_l(ch) && set_codeunit!(out, ch += 0x20)
280-
elseif _isupper_u(ch)
281-
ch = _lowercase_u(ch)
282-
flg = ch <= 0xff
283-
set_codeunit!(out, ch)
265+
elseif ch <= 0xffff
266+
if _can_lower_bmp(ch)
267+
ch = _lower_bmp(ch)
268+
flg = ch <= 0xff
269+
set_codeunit!(out, ch)
270+
end
284271
end
285272
out += sizeof(CU)
286273
end
287-
if flg && is_latin(buf)
288-
out = pointer(buf)
289-
buf = _allocate(len)
290-
_narrow!(pointer(buf), out, out + len)
291-
Str(_LatinCSE, buf)
274+
if flg && (src = reinterpret(Ptr{UInt16}, pointer(buf)); _is_latin_ucs2(lenw, src))
275+
buf8 = _allocate(len)
276+
_narrow!(pointer(buf8), src, src + lenw)
277+
Str(_LatinCSE, buf8)
292278
else
293279
Str(C, buf)
294280
end
@@ -302,25 +288,75 @@ function _lower(::Type{C}, beg, off, len) where {C<:Union{UCS2CSE,UTF32_CSEs}}
302288
out += off
303289
while out < fin
304290
ch = get_codeunit(out)
305-
if ch <= 0x7f
306-
_isupper_a(ch) && set_codeunit!(out, ch += 0x20)
307-
elseif ch <= 0xff
308-
_isupper_l(ch) && set_codeunit!(out, ch += 0x20)
309-
elseif _isupper_u(ch)
310-
set_codeunit!(out, _lowercase_u(ch))
291+
if ch <= 0xff
292+
_isupper_al(ch) && set_codeunit!(out, ch += 0x20)
293+
elseif ch <= 0xffff
294+
_can_lower_bmp(ch) && set_codeunit!(out, _lower_bmp(ch))
295+
elseif ch <= 0x1ffff
296+
_can_lower_slp(ch) && set_codeunit!(out, _lower_slp(ch))
311297
end
312298
out += sizeof(CU)
313299
end
314300
Str(C, buf)
315301
end
316302

303+
# Lowercase the first character of a `_UCS2CSE` string.
#
# Returns `str` itself when it is empty or when its first code unit has no
# lowercase mapping. Otherwise a new string is built. If the first code unit was
# above 0xff, its lowercase form is <= 0xff, and the rest of the string passes the
# Latin mask check, the result is narrowed to 8-bit code units and returned as
# `_LatinCSE`; in every other case the result keeps the encoding `C`.
function lowercase_first(str::MaybeSub{S}) where {C<:_UCS2CSE,S<:Str{C}}
    (len = ncodeunits(str)) == 0 && return str
    @preserve str begin
        CU = codeunit(C)
        pnt = pointer(str)
        ch = get_codeunit(pnt)
        (ch <= 0xff ? _isupper_al(ch) : ch <= 0xffff ? _can_lower_bmp(ch) :
         ch <= 0x1ffff && _can_lower_slp(ch)) ||
            return str
        cl = _lower_ch(ch)
        # Adding an integer to a Ptr moves it by bytes, not by code units, so every
        # offset and extent over the 16-bit source has to be scaled by sizeof(CU).
        lenw = len * sizeof(CU)
        if ch > 0xff && cl <= 0xff &&
           _check_mask_ul(pnt + sizeof(CU), lenw - sizeof(CU), _latin_mask(UInt16))
            buf8, out8 = _allocate(UInt8, len)
            # Narrow the whole source; the first unit is overwritten just below
            len > 1 && _narrow!(out8, pnt, pnt + lenw)
            set_codeunit!(out8, cl)
            Str(_LatinCSE, buf8)
        else
            buf, out = _allocate(CU, len)
            # unsafe_copyto! counts elements, so `len` is correct unscaled here
            len > 1 && unsafe_copyto!(out, pnt, len)
            set_codeunit!(out, cl)
            Str(C, buf)
        end
    end
end
325+
326+
# Title-case the first code unit of a UCS2/UTF32 string.
# Returns `str` unchanged when it is empty or when `_title_ch` maps the first
# code unit to itself; otherwise returns a copy with the first unit replaced.
function uppercase_first(str::MaybeSub{S}) where {C<:Union{UCS2_CSEs,UTF32_CSEs},S<:Str{C}}
    len = ncodeunits(str)
    len == 0 && return str
    @preserve str begin
        src = pointer(str)
        first_cu = get_codeunit(src)
        titled = _title_ch(first_cu)
        if first_cu == titled
            return str
        end
        buf, dst = _allocate(codeunit(C), len)
        if len > 1
            unsafe_copyto!(dst, src, len)
        end
        # Overwrite the first unit after the bulk copy
        set_codeunit!(dst, titled)
        Str(C, buf)
    end
end
339+
340+
# Lowercase the first code unit of a UCS2CSE/UTF32 string.
# Returns `str` unchanged when it is empty or when `_can_lower_ch` reports that
# the first code unit has no lowercase form; otherwise returns a modified copy.
function lowercase_first(str::MaybeSub{S}) where {C<:Union{UCS2CSE,UTF32_CSEs},S<:Str{C}}
    len = ncodeunits(str)
    len == 0 && return str
    @preserve str begin
        src = pointer(str)
        first_cu = get_codeunit(src)
        if !_can_lower_ch(first_cu)
            return str
        end
        buf, dst = _allocate(codeunit(C), len)
        if len > 1
            unsafe_copyto!(dst, src, len)
        end
        # Overwrite the first unit after the bulk copy
        set_codeunit!(dst, _lower_ch(first_cu))
        Str(C, buf)
    end
end
352+
317353
function lowercase(str::MaybeSub{S}) where {C<:Union{UCS2_CSEs,UTF32_CSEs},S<:Str{C}}
318354
@preserve str begin
319355
CU = codeunit(C)
320356
pnt = beg = pointer(str)
321357
fin = beg + sizeof(str)
322358
while pnt < fin
323-
_isupper_ch(get_codeunit(pnt)) && return _lower(C, beg, pnt-beg, ncodeunits(str))
359+
_can_lower_ch(get_codeunit(pnt)) && return _lower(C, beg, pnt-beg, ncodeunits(str))
324360
pnt += sizeof(CU)
325361
end
326362
end
@@ -337,16 +373,12 @@ function _upper(::Type{C}, beg, off, len) where {C<:Union{UCS2_CSEs,UTF32_CSEs}}
337373
ch = get_codeunit(out)
338374
if ch <= 0x7f
339375
_islower_a(ch) && set_codeunit!(out, ch -= 0x20)
340-
elseif ch > 0xff
341-
_islower_u(ch) && set_codeunit!(out, _uppercase_u(ch))
342-
elseif _can_upper(ch)
343-
set_codeunit!(out, ch -= 0x20)
344-
elseif ch == 0xb5
345-
set_codeunit!(out, 0x39c)
346-
elseif ch == 0xff
347-
set_codeunit!(out, 0x178)
348-
elseif !V6_COMPAT && ch == 0xdf
349-
set_codeunit!(out, 0x1e9e)
376+
elseif ch <= 0xff
377+
set_codeunit!(out, _uppercase_l(ch))
378+
elseif ch <= 0xffff
379+
_can_upper_bmp(ch) && set_codeunit!(out, _upper_bmp(ch))
380+
elseif ch <= 0x1ffff
381+
_can_upper_slp(ch) && set_codeunit!(out, _upper_slp(ch))
350382
end
351383
out += sizeof(CU)
352384
end
@@ -359,7 +391,7 @@ function uppercase(str::MaybeSub{S}) where {C<:Union{UCS2_CSEs,UTF32_CSEs},S<:St
359391
pnt = beg = pointer(str)
360392
fin = beg + sizeof(str)
361393
while pnt < fin
362-
_wide_lower_ch(get_codeunit(pnt)) && return _upper(C, beg, pnt-beg, ncodeunits(str))
394+
_can_upper_ch(get_codeunit(pnt)) && return _upper(C, beg, pnt-beg, ncodeunits(str))
363395
pnt += sizeof(CU)
364396
end
365397
str

src/charcase.jl

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
#=
2+
Case folding for Unicode characters
3+
4+
Copyright 2018 Gandalf Software, Inc., Scott P. Jones
5+
Licensed under MIT License, see LICENSE.md
6+
=#
7+
8+
# Submodule holding the case-mapping tables; they are built when this file is included.
module CaseTables
9+
# Expected to provide `case_tables()`, which is called just below.
include("maketables.jl")
10+
11+
# Unpack the six values returned by `case_tables()` into module-level constants.
const ct, tupvec, offvec, bitvec, sizvecl, sizvecu = case_tables()
12+
end # module CaseTables
13+
14+
using .CaseTables
15+
16+
const ct = CaseTables.ct
17+
18+
using ModuleInterfaceTools
19+
@api extend ChrBase
20+
21+
# Predicate on a code unit `c`: above the threshold (0xdf when V6_COMPAT,
# otherwise 0xde) everything except 0xf7 qualifies; at or below it only 0xb5 does.
_can_upper_lat(c) = c > (V6_COMPAT ? 0xdf : 0xde) ? c != 0xf7 : c == 0xb5

# True for 0xb5 and 0xff, and also for 0xdf when not in V6_COMPAT mode.
_wide_lower_latin(ch) = ch == 0xb5 || ch == 0xff || (!V6_COMPAT && ch == 0xdf)

# Map 0xb5 -> 0x39c, 0xff -> 0x178, and (outside V6_COMPAT) 0xdf -> 0x1e9e;
# any other value is returned converted to UInt16.
function _wide_out_upper(ch)
    ch == 0xb5 && return 0x39c
    ch == 0xff && return 0x178
    !V6_COMPAT && ch == 0xdf && return 0x1e9e
    return ch % UInt16
end
28+
29+
# Test a single bit for `ch` in the bit tables.
#   mask - one flag bit per 512-value block (indexed by the low 7 bits of ch >>> 9)
#   tab  - per-block index into CaseTables.bitvec; 0 means "no entries"
# Returns `false` as soon as either the mask bit or the table entry is zero.
@inline function _check_tab(mask, tab, ch)
    blk = ch >>> 9
    ((mask >>> (blk & 0x7f)) & 1) == 0 && return false
    idx = tab[blk + 1]
    idx == 0 && return false
    # 16 words of 32 bits each cover one 512-value block
    word = CaseTables.bitvec[idx][((ch >>> 5) & 0xf) + 1]
    return (word & (UInt32(1) << (ch & 0x1f))) != 0
end
34+
35+
# Two-level lookup of the mapped value for `ch`.
# `off` selects a row of CaseTables.offvec; the entry found there selects a row of
# CaseTables.tupvec. A zero at either level means "no mapping" and `ch` is returned
# as is; otherwise the table value is added to `base`.
@inline function _get_tab(off, ch, base)
    off == 0 && return ch
    sub = CaseTables.offvec[off][((ch >>> 5) & 0x1f) + 1]
    sub == 0 && return ch
    return base + CaseTables.tupvec[sub][(ch & 0x1f) + 1]
end
38+
39+
# Mapped value for `ch` using base 0x0000: returns `ch` unchanged when the mask
# bit for its 512-value block is clear, otherwise defers to `_get_tab`.
@inline function _get_tab_bmp(mask, tab, ch)
    blk = ch >>> 9
    return ((mask >>> blk) & 1) == 0 ? ch : _get_tab(tab[(blk >> 1) + 1], ch, 0x0000)
end

# Same lookup using base 0x10000; only the low 7 bits of the block number
# select the mask bit.
@inline function _get_tab_slp(mask, tab, ch)
    blk = ch >>> 9
    return ((mask >>> (blk & 0x7f)) & 1) == 0 ? ch :
           _get_tab(tab[(blk >> 1) + 1], ch, 0x10000)
end
43+
44+
# Mapping via the first entry of ct.u_tab, with base 0.
@inline _upper_lat(ch) = _get_tab(ct.u_tab[1], ch, 0x0000)
45+
46+
# Mapping wrappers: each pairs a flag mask from `ct` with a mapping table.
# Note that _title_bmp reuses the can_u_flg mask together with the t_tab table.
@inline _upper_bmp(ch) = _get_tab_bmp(ct.can_u_flg, ct.u_tab, ch)
47+
@inline _lower_bmp(ch) = _get_tab_bmp(ct.can_l_flg, ct.l_tab, ch)
48+
@inline _title_bmp(ch) = _get_tab_bmp(ct.can_u_flg, ct.t_tab, ch)
49+
# The _slp variants use the same u_tab/l_tab tables but their own flag masks.
@inline _upper_slp(ch) = _get_tab_slp(ct.can_su_flg, ct.u_tab, ch)
50+
@inline _lower_slp(ch) = _get_tab_slp(ct.can_sl_flg, ct.l_tab, ch)
51+
52+
# Bit-table predicates built on _check_tab.
# The can_* _slp variants share can_l_tab/can_u_tab with the _bmp ones and differ
# only in the flag mask; the is_* _slp variants use separate is_sl_tab/is_su_tab.
@inline _can_lower_bmp(ch) = _check_tab(ct.can_l_flg, ct.can_l_tab, ch)
53+
@inline _can_upper_bmp(ch) = _check_tab(ct.can_u_flg, ct.can_u_tab, ch)
54+
@inline _can_lower_slp(ch) = _check_tab(ct.can_sl_flg, ct.can_l_tab, ch)
55+
@inline _can_upper_slp(ch) = _check_tab(ct.can_su_flg, ct.can_u_tab, ch)
56+
@inline _is_lower_bmp(ch) = _check_tab(ct.is_l_flg, ct.is_l_tab, ch)
57+
@inline _is_upper_bmp(ch) = _check_tab(ct.is_u_flg, ct.is_u_tab, ch)
58+
@inline _is_lower_slp(ch) = _check_tab(ct.is_sl_flg, ct.is_sl_tab, ch)
59+
@inline _is_upper_slp(ch) = _check_tab(ct.is_su_flg, ct.is_su_tab, ch)
60+
61+
# The title-case predicate is an alias of the uppercase one.
const _can_title_bmp = _can_upper_bmp
62+
63+
# Range dispatch for the "is lowercase" test: one helper per range
# (<= 0x7f, <= 0xff, <= 0xffff, <= 0x1ffff); anything above is `false`.
@inline function _is_lower_ch(ch)
    if ch <= 0x7f
        _islower_a(ch)
    elseif ch <= 0xff
        _islower_l(ch)
    elseif ch <= 0xffff
        _is_lower_bmp(ch)
    elseif ch <= 0x1ffff
        _is_lower_slp(ch)
    else
        false
    end
end
68+
69+
# Range dispatch for the "is uppercase" test: one helper per range
# (<= 0x7f, <= 0xff, <= 0xffff, <= 0x1ffff); anything above is `false`.
@inline function _is_upper_ch(ch)
    if ch <= 0x7f
        _isupper_a(ch)
    elseif ch <= 0xff
        _isupper_l(ch)
    elseif ch <= 0xffff
        _is_upper_bmp(ch)
    elseif ch <= 0x1ffff
        _is_upper_slp(ch)
    else
        false
    end
end
74+
75+
# Whether `ch` can be lowercased. The two lowest ranges reuse the
# "is uppercase" helpers; the higher ranges consult the can-lower tables.
# Values above 0x1ffff yield `false`.
@inline function _can_lower_ch(ch)
    ch <= 0x7f && return _isupper_a(ch)
    ch <= 0xff && return _isupper_l(ch)
    ch <= 0xffff && return _can_lower_bmp(ch)
    return ch <= 0x1ffff ? _can_lower_slp(ch) : false
end
80+
81+
# Whether `ch` can be uppercased, dispatched by range:
# <= 0x7f uses `_islower_a`, <= 0xff uses `_can_upper_lat`, and the higher
# ranges consult the can-upper tables. Values above 0x1ffff yield `false`.
@inline function _can_upper_ch(ch)
    ch <= 0x7f && return _islower_a(ch)
    ch <= 0xff && return _can_upper_lat(ch)
    ch <= 0xffff && return _can_upper_bmp(ch)
    return ch <= 0x1ffff ? _can_upper_slp(ch) : false
end
86+
87+
# Lowercase mapping of `ch`.
# For values <= 0xff the predicate result shifted left by 5 (i.e. 0 or 0x20) is
# added to `ch`; higher ranges use the lookup tables; values above 0x1ffff are
# returned unchanged.
@inline function _lower_ch(ch)
    ch <= 0x7f && return ch + (_isupper_a(ch) << 5)
    ch <= 0xff && return ch + (_isupper_l(ch) << 5)
    ch <= 0xffff && return _lower_bmp(ch)
    return ch <= 0x1ffff ? _lower_slp(ch) : ch
end
92+
93+
# Uppercase mapping of `ch`, dispatched by range; values above 0x1ffff are
# returned unchanged.
@inline function _upper_ch(ch)
    if ch <= 0x7f
        _islower_a(ch) ? (ch - 0x20) : ch
    elseif ch <= 0xff
        _upper_lat(ch)
    elseif ch <= 0xffff
        _upper_bmp(ch)
    elseif ch <= 0x1ffff
        _upper_slp(ch)
    else
        ch
    end
end
98+
99+
# Title-case mapping of `ch`. Identical to the uppercase mapping except that the
# 0x100-0xffff range goes through `_title_bmp`; values above 0x1ffff are returned
# unchanged.
@inline function _title_ch(ch)
    if ch <= 0x7f
        _islower_a(ch) ? (ch - 0x20) : ch
    elseif ch <= 0xff
        _upper_lat(ch)
    elseif ch <= 0xffff
        _title_bmp(ch)
    elseif ch <= 0x1ffff
        _upper_slp(ch)
    else
        ch
    end
end

src/chars.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ codeunit(::Type{<:MaybeSub{S}}) where {S<:Str} = codeunit(S)
1515

1616
eltype(::Type{<:Str{BinaryCSE}}) = UInt8
1717

18+
eltype(::Type{UniStr}) = UTF32Chr
1819
eltype(::Type{<:Str{Text1CSE}}) = Text1Chr
1920
eltype(::Type{<:Str{Text2CSE}}) = Text2Chr
2021
eltype(::Type{<:Str{Text4CSE}}) = Text4Chr

0 commit comments

Comments
 (0)