@@ -5,31 +5,16 @@ Copyright 2017-2018 Gandalf Software, Inc., Scott P. Jones
5
5
Licensed under MIT License, see LICENSE.md
6
6
=#
7
7
8
- _wide_lower_l (c) = ifelse (c > (V6_COMPAT ? 0xdf : 0xde ), c != 0xf7 , c == 0xb5 )
9
-
10
- @inline _wide_lower_ch (ch) =
11
- ch <= 0x7f ? _islower_a (ch) : (ch > 0xff ? _islower_u (ch) : _wide_lower_l (ch))
12
-
13
- @inline _isupper_ch (ch) =
14
- ch <= 0x7f ? _isupper_a (ch) : (ch > 0xff ? _isupper_u (ch) : _isupper_l (ch))
15
-
16
- _wide_lower_latin (ch) = (ch == 0xb5 ) | (ch == 0xff ) | (! V6_COMPAT && (ch == 0xdf ))
17
-
18
- _wide_out_upper (ch) =
19
- ifelse (ch == 0xb5 , 0x39c ,
20
- ifelse (ch == 0xff , 0x178 , ifelse (! V6_COMPAT && ch == 0xdf , 0x1e9e , ch% UInt16)))
21
-
22
-
23
8
# Return `str` with its first code unit upper-cased (ASCII method).
# `str` itself is returned, without allocating, when it is empty or when its
# first code unit does not satisfy `_islower_a`.
function uppercase_first(str::MaybeSub{S}) where {C<:ASCIICSE,S<:Str{C}}
    (len = ncodeunits(str)) == 0 && return str
    @preserve str begin
        pnt = pointer(str)
        ch = get_codeunit(pnt)
        _islower_a(ch) || return str
        # `_allocate(UInt8, len)` yields both the buffer and a pointer into it:
        # the pointer is used for the writes, the buffer is what gets wrapped.
        buf, out = _allocate(UInt8, len)
        unsafe_copyto!(out, pnt, len)
        # Overwrite the first code unit of the copy with `ch - 0x20`.
        set_codeunit!(out, ch - 0x20)
        Str(C, buf)
    end
end
35
20
@@ -39,10 +24,10 @@ function lowercase_first(str::MaybeSub{S}) where {C<:ASCIICSE,S<:Str{C}}
39
24
pnt = pointer (str)
40
25
ch = get_codeunit (pnt)
41
26
_isupper_a (ch) || return str
42
- out = _allocate (len)
27
+ buf, out = _allocate (UInt8, len)
43
28
unsafe_copyto! (out, pnt, len)
44
29
set_codeunit! (out, ch + 0x20 )
45
- Str (C, out )
30
+ Str (C, buf )
46
31
end
47
32
end
48
33
@@ -119,7 +104,7 @@ function uppercase_first(str::MaybeSub{S}) where {C<:LatinCSE,S<:Str{C}}
119
104
_can_upper (ch) || return str
120
105
buf, out = _allocate (UInt8, len)
121
106
set_codeunit! (out, ch - 0x20 )
122
- len > 1 && unsafe_copyto! (out, pnt+ 1 , len- 1 )
107
+ len > 1 && unsafe_copyto! (out + 1 , pnt+ 1 , len- 1 )
123
108
Str (C, buf)
124
109
end
125
110
end
@@ -130,19 +115,16 @@ function uppercase_first(str::MaybeSub{S}) where {C<:_LatinCSE,S<:Str{C}}
130
115
@preserve str begin
131
116
pnt = pointer (str)
132
117
ch = get_codeunit (pnt)
133
- if _can_upper (ch)
134
- buf, out8 = _allocate (UInt8, len)
135
- set_codeunit! (out8, ch - 0x20 )
136
- len > 1 && unsafe_copyto! (out8, pnt+ 1 , len- 1 )
137
- Str (C, buf)
138
- elseif _wide_lower_latin (ch)
118
+ if _wide_lower_latin (ch)
139
119
buf, out = _allocate (UInt16, len)
120
+ _widen! (out, pnt, pnt + len)
140
121
set_codeunit! (out, _wide_out_upper (ch))
141
- # Perform the widen operation on the rest (should be done via SIMD)
142
- @inbounds for i = 2 : len
143
- set_codeunit! (out += 2 , get_codeunit (pnt += 2 )% UInt16)
144
- end
145
122
Str (_UCS2CSE, buf)
123
+ elseif _can_upper (ch)
124
+ buf8, out8 = _allocate (UInt8, len)
125
+ len > 1 && unsafe_copyto! (out8, pnt, len)
126
+ set_codeunit! (out8, ch - 0x20 )
127
+ Str (_LatinCSE, buf8)
146
128
else
147
129
str
148
130
end
@@ -154,10 +136,10 @@ function lowercase_first(str::MaybeSub{S}) where {C<:Latin_CSEs,S<:Str{C}}
154
136
@preserve str begin
155
137
pnt = pointer (str)
156
138
ch = get_codeunit (pnt)
157
- _isupper (ch) || return str
139
+ _isupper_al (ch) || return str
158
140
buf, out = _allocate (UInt8, len)
159
141
set_codeunit! (out, ch + 0x20 )
160
- len > 1 && unsafe_copyto! (out, pnt+ 1 , len- 1 )
142
+ len > 1 && unsafe_copyto! (out+ 1 , pnt+ 1 , len- 1 )
161
143
Str (C, buf)
162
144
end
163
145
end
@@ -261,14 +243,17 @@ function lowercase(str::MaybeSub{S}) where {C<:Latin_CSEs,S<:Str{C}}
261
243
str
262
244
end
263
245
246
# Forward `pnt` and `len` to `_check_mask_ul` together with `_latin_mask(UInt16)`.
# NOTE(review): the visible caller passes `len` as a byte length
# (`len * sizeof(CU)`), not a code unit count - confirm against `_check_mask_ul`.
function _is_latin_ucs2(len, pnt)
    return _check_mask_ul(pnt, len, _latin_mask(UInt16))
end
247
+
264
248
# The result must keep at least one character > 0xff to stay in this encoding; if the only
# character(s) > 0xff became <= 0xff, the result may need to be narrowed and returned as _LatinStr
266
250
267
251
function _lower (:: Type{C} , beg, off, len) where {C<: _UCS2CSE }
268
252
CU = codeunit (C)
269
253
buf, out = _allocate (CU, len)
270
254
unsafe_copyto! (out, beg, len)
271
- fin = out + (len* sizeof (CU))
255
+ lenw = len* sizeof (CU)
256
+ fin = out + lenw
272
257
out += off
273
258
flg = false
274
259
while out < fin
@@ -277,18 +262,19 @@ function _lower(::Type{C}, beg, off, len) where {C<:_UCS2CSE}
277
262
_isupper_a (ch) && set_codeunit! (out, ch += 0x20 )
278
263
elseif ch <= 0xff
279
264
_isupper_l (ch) && set_codeunit! (out, ch += 0x20 )
280
- elseif _isupper_u (ch)
281
- ch = _lowercase_u (ch)
282
- flg = ch <= 0xff
283
- set_codeunit! (out, ch)
265
+ elseif ch <= 0xffff
266
+ if _can_lower_bmp (ch)
267
+ ch = _lower_bmp (ch)
268
+ flg = ch <= 0xff
269
+ set_codeunit! (out, ch)
270
+ end
284
271
end
285
272
out += sizeof (CU)
286
273
end
287
- if flg && is_latin (buf)
288
- out = pointer (buf)
289
- buf = _allocate (len)
290
- _narrow! (pointer (buf), out, out + len)
291
- Str (_LatinCSE, buf)
274
+ if flg && (src = reinterpret (Ptr{UInt16}, pointer (buf)); _is_latin_ucs2 (lenw, src))
275
+ buf8 = _allocate (len)
276
+ _narrow! (pointer (buf8), src, src + lenw)
277
+ Str (_LatinCSE, buf8)
292
278
else
293
279
Str (C, buf)
294
280
end
@@ -302,25 +288,75 @@ function _lower(::Type{C}, beg, off, len) where {C<:Union{UCS2CSE,UTF32_CSEs}}
302
288
out += off
303
289
while out < fin
304
290
ch = get_codeunit (out)
305
- if ch <= 0x7f
306
- _isupper_a (ch) && set_codeunit! (out, ch += 0x20 )
307
- elseif ch <= 0xff
308
- _isupper_l (ch) && set_codeunit! (out, ch += 0x20 )
309
- elseif _isupper_u (ch)
310
- set_codeunit! (out, _lowercase_u (ch))
291
+ if ch <= 0xff
292
+ _isupper_al (ch) && set_codeunit! (out, ch += 0x20 )
293
+ elseif ch <= 0xffff
294
+ _can_lower_bmp (ch) && set_codeunit! (out, _lower_bmp (ch) )
295
+ elseif ch <= 0x1ffff
296
+ _can_lower_slp (ch) && set_codeunit! (out, _lower_slp (ch))
311
297
end
312
298
out += sizeof (CU)
313
299
end
314
300
Str (C, buf)
315
301
end
316
302
303
# Return `str` with its first code unit lower-cased (`_UCS2CSE` method).
# `str` itself is returned when it is empty or when the first code unit has no
# lower-case mapping.  If lower-casing moves the first code unit from > 0xff to
# <= 0xff and the remaining code units pass the Latin mask check, the result is
# narrowed to 8-bit code units and returned as a `_LatinCSE` string; otherwise
# the result keeps the encoding `C`.
function lowercase_first(str::MaybeSub{S}) where {C<:_UCS2CSE,S<:Str{C}}
    (len = ncodeunits(str)) == 0 && return str
    @preserve str begin
        pnt = pointer(str)
        ch = get_codeunit(pnt)
        (ch <= 0xff ? _isupper_al(ch) : ch <= 0xffff ? _can_lower_bmp(ch) :
         ch <= 0x1ffff && _can_lower_slp(ch)) ||
             return str
        cl = _lower_ch(ch)
        CU = codeunit(C)
        # Pointer arithmetic is done in bytes, so every offset and length handed
        # to the pointer-based helpers is scaled by the code unit size.
        cusz = sizeof(CU)
        # With a single code unit there is no tail to inspect, so the mask check
        # (which would start past the end of the data) is skipped entirely.
        if ch > 0xff && cl <= 0xff &&
            (len == 1 || _check_mask_ul(pnt + cusz, (len - 1) * cusz, _latin_mask(UInt16)))
            buf8, out8 = _allocate(UInt8, len)
            # Narrow the whole string (end pointer is a byte offset), then
            # overwrite the first code unit with its lower-cased value.
            len > 1 && _narrow!(out8, pnt, pnt + len * cusz)
            set_codeunit!(out8, cl)
            Str(_LatinCSE, buf8)
        else
            buf, out = _allocate(CU, len)
            # `unsafe_copyto!` counts elements, not bytes, so `len` is correct here.
            len > 1 && unsafe_copyto!(out, pnt, len)
            set_codeunit!(out, cl)
            Str(C, buf)
        end
    end
end
325
+
326
# Return `str` with its first code unit replaced by the result of `_title_ch`.
# `str` itself is returned when it is empty or when `_title_ch` leaves the first
# code unit unchanged; otherwise a fresh buffer in the same encoding is built.
function uppercase_first(str::MaybeSub{S}) where {C<:Union{UCS2_CSEs,UTF32_CSEs},S<:Str{C}}
    cnt = ncodeunits(str)
    cnt == 0 && return str
    @preserve str begin
        src = pointer(str)
        first_cu = get_codeunit(src)
        titled = _title_ch(first_cu)
        if first_cu == titled
            return str
        end
        buf, dst = _allocate(codeunit(C), cnt)
        # A one-unit string needs no copy: its only code unit is written below.
        if cnt > 1
            unsafe_copyto!(dst, src, cnt)
        end
        set_codeunit!(dst, titled)
        Str(C, buf)
    end
end
339
+
340
# Return `str` with its first code unit replaced by the result of `_lower_ch`.
# `str` itself is returned when it is empty or when `_can_lower_ch` rejects the
# first code unit; otherwise a fresh buffer in the same encoding is built.
function lowercase_first(str::MaybeSub{S}) where {C<:Union{UCS2CSE,UTF32_CSEs},S<:Str{C}}
    cnt = ncodeunits(str)
    cnt == 0 && return str
    @preserve str begin
        src = pointer(str)
        first_cu = get_codeunit(src)
        if !_can_lower_ch(first_cu)
            return str
        end
        buf, dst = _allocate(codeunit(C), cnt)
        # A one-unit string needs no copy: its only code unit is written below.
        if cnt > 1
            unsafe_copyto!(dst, src, cnt)
        end
        set_codeunit!(dst, _lower_ch(first_cu))
        Str(C, buf)
    end
end
352
+
317
353
function lowercase (str:: MaybeSub{S} ) where {C<: Union{UCS2_CSEs,UTF32_CSEs} ,S<: Str{C} }
318
354
@preserve str begin
319
355
CU = codeunit (C)
320
356
pnt = beg = pointer (str)
321
357
fin = beg + sizeof (str)
322
358
while pnt < fin
323
- _isupper_ch (get_codeunit (pnt)) && return _lower (C, beg, pnt- beg, ncodeunits (str))
359
+ _can_lower_ch (get_codeunit (pnt)) && return _lower (C, beg, pnt- beg, ncodeunits (str))
324
360
pnt += sizeof (CU)
325
361
end
326
362
end
@@ -337,16 +373,12 @@ function _upper(::Type{C}, beg, off, len) where {C<:Union{UCS2_CSEs,UTF32_CSEs}}
337
373
ch = get_codeunit (out)
338
374
if ch <= 0x7f
339
375
_islower_a (ch) && set_codeunit! (out, ch -= 0x20 )
340
- elseif ch > 0xff
341
- _islower_u (ch) && set_codeunit! (out, _uppercase_u (ch))
342
- elseif _can_upper (ch)
343
- set_codeunit! (out, ch -= 0x20 )
344
- elseif ch == 0xb5
345
- set_codeunit! (out, 0x39c )
346
- elseif ch == 0xff
347
- set_codeunit! (out, 0x178 )
348
- elseif ! V6_COMPAT && ch == 0xdf
349
- set_codeunit! (out, 0x1e9e )
376
+ elseif ch <= 0xff
377
+ set_codeunit! (out, _uppercase_l (ch))
378
+ elseif ch <= 0xffff
379
+ _can_upper_bmp (ch) && set_codeunit! (out, _upper_bmp (ch))
380
+ elseif ch <= 0x1ffff
381
+ _can_upper_slp (ch) && set_codeunit! (out, _upper_slp (ch))
350
382
end
351
383
out += sizeof (CU)
352
384
end
@@ -359,7 +391,7 @@ function uppercase(str::MaybeSub{S}) where {C<:Union{UCS2_CSEs,UTF32_CSEs},S<:St
359
391
pnt = beg = pointer (str)
360
392
fin = beg + sizeof (str)
361
393
while pnt < fin
362
- _wide_lower_ch (get_codeunit (pnt)) && return _upper (C, beg, pnt- beg, ncodeunits (str))
394
+ _can_upper_ch (get_codeunit (pnt)) && return _upper (C, beg, pnt- beg, ncodeunits (str))
363
395
pnt += sizeof (CU)
364
396
end
365
397
str
0 commit comments