Skip to content

Commit d192302

Browse files
Merge pull request #24999 from JuliaLang/sk/strings
string overhaul
2 parents ed1ae9e + 8de25f5 commit d192302

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+1276
-1240
lines changed

base/char.jl

Lines changed: 76 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,58 @@
11
# This file is a part of Julia. License is MIT: https://julialang.org/license
22

3-
convert(::Type{Char}, x::UInt32) = reinterpret(Char, x)
3+
struct MalformedCharError <: Exception
4+
char::Char
5+
end
6+
struct CodePointError <: Exception
7+
code::Integer
8+
end
9+
@noinline malformed_char(c::Char) = throw(MalformedCharError(c))
10+
@noinline code_point_err(u::UInt32) = throw(CodePointError(u))
11+
12+
function ismalformed(c::Char)
13+
u = reinterpret(UInt32, c)
14+
l1 = leading_ones(u) << 3
15+
t0 = trailing_zeros(u) & 56
16+
(l1 == 8) | (l1 + t0 > 32) |
17+
(((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0)
18+
end
19+
20+
function convert(::Type{UInt32}, c::Char)
21+
# TODO: use optimized inline LLVM
22+
u = reinterpret(UInt32, c)
23+
u < 0x80000000 && return reinterpret(UInt32, u >> 24)
24+
l1 = leading_ones(u)
25+
t0 = trailing_zeros(u) & 56
26+
(l1 == 1) | (8l1 + t0 > 32) |
27+
(((u & 0x00c0c0c0) ⊻ 0x00808080) >> t0 != 0) &&
28+
malformed_char(c)::Union{}
29+
u &= 0xffffffff >> l1
30+
u >>= t0
31+
(u & 0x0000007f >> 0) | (u & 0x00007f00 >> 2) |
32+
(u & 0x007f0000 >> 4) | (u & 0x7f000000 >> 6)
33+
end
34+
35+
function convert(::Type{Char}, u::UInt32)
36+
u < 0x80 && return reinterpret(Char, u << 24)
37+
u < 0x00200000 || code_point_err(u)::Union{}
38+
c = ((u << 0) & 0x0000003f) | ((u << 2) & 0x00003f00) |
39+
((u << 4) & 0x003f0000) | ((u << 6) & 0x3f000000)
40+
c = u < 0x00000800 ? (c << 16) | 0xc0800000 :
41+
u < 0x00010000 ? (c << 08) | 0xe0808000 :
42+
(c << 00) | 0xf0808080
43+
reinterpret(Char, c)
44+
end
45+
46+
function convert(::Type{T}, c::Char) where T <: Union{Int8,UInt8}
47+
i = reinterpret(Int32, c)
48+
i ≥ 0 ? ((i >>> 24) % T) : T(UInt32(c))
49+
end
50+
51+
function convert(::Type{Char}, b::Union{Int8,UInt8})
52+
0 ≤ b ≤ 0x7f ? reinterpret(Char, (b % UInt32) << 24) : Char(UInt32(b))
53+
end
54+
455
convert(::Type{Char}, x::Number) = Char(UInt32(x))
5-
convert(::Type{UInt32}, x::Char) = reinterpret(UInt32, x)
656
convert(::Type{T}, x::Char) where {T<:Number} = convert(T, UInt32(x))
757

858
rem(x::Char, ::Type{T}) where {T<:Number} = rem(UInt32(x), T)
@@ -29,19 +79,16 @@ done(c::Char, state) = state
2979
isempty(c::Char) = false
3080
in(x::Char, y::Char) = x == y
3181

32-
==(x::Char, y::Char) = UInt32(x) == UInt32(y)
33-
isless(x::Char, y::Char) = UInt32(x) < UInt32(y)
34-
35-
const hashchar_seed = 0xd4d64234
36-
hash(x::Char, h::UInt) = hash_uint64(((UInt64(x)+hashchar_seed)<<32) ⊻ UInt64(h))
82+
==(x::Char, y::Char) = reinterpret(UInt32, x) == reinterpret(UInt32, y)
83+
isless(x::Char, y::Char) = reinterpret(UInt32, x) < reinterpret(UInt32, y)
84+
hash(x::Char, h::UInt) =
85+
hash_uint64(((reinterpret(UInt32, x) + UInt64(0xd4d64234)) << 32) ⊻ UInt64(h))
3786

3887
-(x::Char, y::Char) = Int(x) - Int(y)
3988
-(x::Char, y::Integer) = Char(Int32(x) - Int32(y))
4089
+(x::Char, y::Integer) = Char(Int32(x) + Int32(y))
4190
+(x::Integer, y::Char) = y + x
4291

43-
bswap(x::Char) = Char(bswap(UInt32(x)))
44-
4592
print(io::IO, c::Char) = (write(io, c); nothing)
4693

4794
const hex_chars = UInt8['0':'9';'a':'z']
@@ -66,21 +113,37 @@ function show(io::IO, c::Char)
66113
end
67114
if Unicode.isprint(c)
68115
write(io, 0x27, c, 0x27)
69-
else
116+
elseif !ismalformed(c)
70117
u = UInt32(c)
71118
write(io, 0x27, 0x5c, c <= '\x7f' ? 0x78 : c <= '\uffff' ? 0x75 : 0x55)
72119
d = max(2, 8 - (leading_zeros(u) >> 2))
73120
while 0 < d
74121
write(io, hex_chars[((u >> ((d -= 1) << 2)) & 0xf) + 1])
75122
end
76123
write(io, 0x27)
124+
else # malformed
125+
write(io, 0x27)
126+
u = reinterpret(UInt32, c)
127+
while true
128+
a = hex_chars[((u >> 28) & 0xf) + 1]
129+
b = hex_chars[((u >> 24) & 0xf) + 1]
130+
write(io, 0x5c, 'x', a, b)
131+
(u <<= 8) == 0 && break
132+
end
133+
write(io, 0x27)
77134
end
78135
return
79136
end
80137

81138
function show(io::IO, ::MIME"text/plain", c::Char)
82139
show(io, c)
83-
u = UInt32(c)
84-
print(io, ": ", Unicode.isascii(c) ? "ASCII/" : "", "Unicode U+", hex(u, u > 0xffff ? 6 : 4))
85-
print(io, " (category ", Unicode.category_abbrev(c), ": ", Unicode.category_string(c), ")")
140+
if !ismalformed(c)
141+
u = UInt32(c)
142+
print(io, ": ", Unicode.isascii(c) ? "ASCII/" : "", "Unicode U+", hex(u, u > 0xffff ? 6 : 4))
143+
else
144+
print(io, ": Malformed UTF-8")
145+
end
146+
abr = Unicode.category_abbrev(c)
147+
str = Unicode.category_string(c)
148+
print(io, " (category ", abr, ": ", str, ")")
86149
end

base/deprecated.jl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2992,6 +2992,10 @@ end
29922992
@deprecate_binding Complex64 ComplexF32
29932993
@deprecate_binding Complex128 ComplexF64
29942994

2995+
# PR #24999
2996+
@deprecate ind2chr(s::AbstractString, i::Integer) length(s, 1, i)
2997+
@deprecate chr2ind(s::AbstractString, n::Integer) nextind(s, 0, n)
2998+
29952999
# END 0.7 deprecations
29963000

29973001
# BEGIN 1.0 deprecations

base/exports.jl

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ export
154154
NullException,
155155
ParseError,
156156
SystemError,
157-
UnicodeError,
157+
StringIndexError,
158158

159159
# Global constants and variables
160160
ARGS,
@@ -716,7 +716,6 @@ export
716716
bytes2hex,
717717
chomp,
718718
chop,
719-
chr2ind,
720719
codeunit,
721720
dec,
722721
digits,
@@ -728,7 +727,6 @@ export
728727
hex,
729728
hex2bytes,
730729
hex2bytes!,
731-
ind2chr,
732730
info,
733731
ismatch,
734732
isvalid,

base/filesystem.jl

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,26 @@ function read(f::File, ::Type{UInt8})
149149
return ret % UInt8
150150
end
151151

152+
function read(f::File, ::Type{Char})
153+
b0 = read(f, UInt8)
154+
l = 8(4-leading_ones(b0))
155+
c = UInt32(b0) << 24
156+
if l < 24
157+
s = 16
158+
while s ≥ l && !eof(f)
159+
p = position(f)
160+
b = read(f, UInt8)
161+
if b & 0xc0 != 0x80
162+
seek(f, p)
163+
break
164+
end
165+
c |= UInt32(b) << s
166+
s -= 8
167+
end
168+
end
169+
return reinterpret(Char, c)
170+
end
171+
152172
function unsafe_read(f::File, p::Ptr{UInt8}, nel::UInt)
153173
check_open(f)
154174
ret = ccall(:jl_fs_read, Int32, (Int32, Ptr{Void}, Csize_t),

base/intfuncs.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -654,8 +654,8 @@ for sym in (:bin, :oct, :dec, :hex)
654654
@eval begin
655655
($sym)(x::Unsigned, p::Int) = ($sym)(x,p,false)
656656
($sym)(x::Unsigned) = ($sym)(x,1,false)
657-
($sym)(x::Char, p::Int) = ($sym)(unsigned(x),p,false)
658-
($sym)(x::Char) = ($sym)(unsigned(x),1,false)
657+
($sym)(x::Char, p::Int) = ($sym)(UInt32(x),p,false)
658+
($sym)(x::Char) = ($sym)(UInt32(x),1,false)
659659
($sym)(x::Integer, p::Int) = ($sym)(unsigned(abs(x)),p,x<0)
660660
($sym)(x::Integer) = ($sym)(unsigned(abs(x)),1,x<0)
661661
end

base/io.jl

Lines changed: 25 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -535,25 +535,13 @@ function write(s::IO, a::SubArray{T,N,<:Array}) where {T,N}
535535
end
536536
end
537537

538-
539-
function write(s::IO, ch::Char)
540-
c = reinterpret(UInt32, ch)
541-
if c < 0x80
542-
return write(s, c%UInt8)
543-
elseif c < 0x800
544-
return (write(s, (( c >> 6 ) | 0xC0)%UInt8)) +
545-
(write(s, (( c & 0x3F ) | 0x80)%UInt8))
546-
elseif c < 0x10000
547-
return (write(s, (( c >> 12 ) | 0xE0)%UInt8)) +
548-
(write(s, (((c >> 6) & 0x3F ) | 0x80)%UInt8)) +
549-
(write(s, (( c & 0x3F ) | 0x80)%UInt8))
550-
elseif c < 0x110000
551-
return (write(s, (( c >> 18 ) | 0xF0)%UInt8)) +
552-
(write(s, (((c >> 12) & 0x3F ) | 0x80)%UInt8)) +
553-
(write(s, (((c >> 6) & 0x3F ) | 0x80)%UInt8)) +
554-
(write(s, (( c & 0x3F ) | 0x80)%UInt8))
555-
else
556-
return write(s, '\ufffd')
538+
function write(io::IO, c::Char)
539+
u = bswap(reinterpret(UInt32, c))
540+
n = 1
541+
while true
542+
write(io, u % UInt8)
543+
(u >>= 8) == 0 && return n
544+
n += 1
557545
end
558546
end
559547

@@ -596,31 +584,28 @@ function read!(s::IO, a::Array{T}) where T
596584
return a
597585
end
598586

599-
function read(s::IO, ::Type{Char})
600-
ch = read(s, UInt8)
601-
if ch < 0x80
602-
return Char(ch)
603-
end
604-
605-
# mimic utf8.next function
606-
trailing = Base.utf8_trailing[ch+1]
607-
c::UInt32 = 0
608-
for j = 1:trailing
609-
c += ch
610-
c <<= 6
611-
ch = read(s, UInt8)
587+
function read(io::IO, ::Type{Char})
588+
b0 = read(io, UInt8)
589+
l = 8(4-leading_ones(b0))
590+
c = UInt32(b0) << 24
591+
if l < 24
592+
s = 16
593+
while s ≥ l && !eof(io)
594+
peek(io) & 0xc0 == 0x80 || break
595+
b = read(io, UInt8)
596+
c |= UInt32(b) << s
597+
s -= 8
598+
end
612599
end
613-
c += ch
614-
c -= Base.utf8_offset[trailing+1]
615-
return Char(c)
600+
return reinterpret(Char, c)
616601
end
617602

618603
# readuntil_string is useful below since it has
619604
# an optimized method for s::IOStream
620605
readuntil_string(s::IO, delim::UInt8) = String(readuntil(s, delim))
621606

622607
function readuntil(s::IO, delim::Char)
623-
if delim < Char(0x80)
608+
if delim ≤ '\x7f'
624609
return readuntil_string(s, delim % UInt8)
625610
end
626611
out = IOBuffer()
@@ -701,7 +686,7 @@ function readuntil(io::IO, target::AbstractString)
701686
i = start(target)
702687
done(target, i) && return ""
703688
c, i = next(target, start(target))
704-
if done(target, i) && c < Char(0x80)
689+
if done(target, i) && c <= '\x7f'
705690
return readuntil_string(io, c % UInt8)
706691
end
707692
# decide how we can index target
@@ -728,12 +713,11 @@ function readuntil(io::IO, target::AbstractVector{T}) where T
728713
return out
729714
end
730715

731-
732716
"""
733717
readchomp(x)
734718
735-
Read the entirety of `x` as a string and remove a single trailing newline.
736-
Equivalent to `chomp!(read(x, String))`.
719+
Read the entirety of `x` as a string and remove a single trailing newline
720+
if there is one. Equivalent to `chomp(read(x, String))`.
737721
738722
# Examples
739723
```jldoctest
@@ -747,7 +731,7 @@ julia> readchomp("my_file.txt")
747731
julia> rm("my_file.txt");
748732
```
749733
"""
750-
readchomp(x) = chomp!(read(x, String))
734+
readchomp(x) = chomp(read(x, String))
751735

752736
# read up to nb bytes into nb, returning # bytes read
753737

base/iostream.jl

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -315,12 +315,13 @@ end
315315

316316
## low-level calls ##
317317

318-
write(s::IOStream, b::UInt8) = Int(ccall(:ios_putc, Cint, (Cint, Ptr{Void}), b, s.ios))
318+
function write(s::IOStream, b::UInt8)
319+
iswritable(s) || throw(ArgumentError("write failed, IOStream is not writeable"))
320+
Int(ccall(:ios_putc, Cint, (Cint, Ptr{Void}), b, s.ios))
321+
end
319322

320323
function unsafe_write(s::IOStream, p::Ptr{UInt8}, nb::UInt)
321-
if !iswritable(s)
322-
throw(ArgumentError("write failed, IOStream is not writeable"))
323-
end
324+
iswritable(s) || throw(ArgumentError("write failed, IOStream is not writeable"))
324325
return Int(ccall(:ios_write, Csize_t, (Ptr{Void}, Ptr{Void}, Csize_t), s.ios, p, nb))
325326
end
326327

@@ -353,14 +354,6 @@ end
353354

354355
## text I/O ##
355356

356-
function write(s::IOStream, c::Char)
357-
if !iswritable(s)
358-
throw(ArgumentError("write failed, IOStream is not writeable"))
359-
end
360-
Int(ccall(:ios_pututf8, Cint, (Ptr{Void}, UInt32), s.ios, c))
361-
end
362-
read(s::IOStream, ::Type{Char}) = Char(ccall(:jl_getutf8, UInt32, (Ptr{Void},), s.ios))
363-
364357
take!(s::IOStream) =
365358
ccall(:jl_take_buffer, Vector{UInt8}, (Ptr{Void},), s.ios)
366359

@@ -452,14 +445,23 @@ function read(s::IOStream, nb::Integer; all::Bool=true)
452445
end
453446

454447
## Character streams ##
455-
const _chtmp = Ref{Char}()
448+
456449
function peekchar(s::IOStream)
457-
if ccall(:ios_peekutf8, Cint, (Ptr{Void}, Ptr{Char}), s, _chtmp) < 0
450+
chref = Ref{UInt32}()
451+
if ccall(:ios_peekutf8, Cint, (Ptr{Void}, Ptr{UInt32}), s, chref) < 0
458452
return typemax(Char)
459453
end
460-
return _chtmp[]
454+
return Char(chref[])
461455
end
462456

463457
function peek(s::IOStream)
464458
ccall(:ios_peekc, Cint, (Ptr{Void},), s)
465459
end
460+
461+
function peek(s::IO)
462+
mark(s)
463+
try read(s, UInt8)
464+
finally
465+
reset(s)
466+
end
467+
end

base/parse.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -224,12 +224,12 @@ end
224224
## string to float functions ##
225225

226226
tryparse(::Type{Float64}, s::String) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s, 0, sizeof(s))
227-
tryparse(::Type{Float64}, s::SubString{String}) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.endof)
227+
tryparse(::Type{Float64}, s::SubString{String}) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.ncodeunits)
228228
tryparse_internal(::Type{Float64}, s::String, startpos::Int, endpos::Int) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s, startpos-1, endpos-startpos+1)
229229
tryparse_internal(::Type{Float64}, s::SubString{String}, startpos::Int, endpos::Int) = ccall(:jl_try_substrtod, Nullable{Float64}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset+startpos-1, endpos-startpos+1)
230230

231231
tryparse(::Type{Float32}, s::String) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s, 0, sizeof(s))
232-
tryparse(::Type{Float32}, s::SubString{String}) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.endof)
232+
tryparse(::Type{Float32}, s::SubString{String}) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset, s.ncodeunits)
233233
tryparse_internal(::Type{Float32}, s::String, startpos::Int, endpos::Int) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s, startpos-1, endpos-startpos+1)
234234
tryparse_internal(::Type{Float32}, s::SubString{String}, startpos::Int, endpos::Int) = ccall(:jl_try_substrtof, Nullable{Float32}, (Ptr{UInt8},Csize_t,Csize_t), s.string, s.offset+startpos-1, endpos-startpos+1)
235235

0 commit comments

Comments
 (0)