Skip to content

Commit a89ca42

Browse files
committed
Introduce Encoding parametric singleton type
First step towards efficient encoders for common encodings, as well as towards providing information about encodings. This also allows adding convenience methods to base I/O functions taking an additional encoding parameter without risking ambiguities.
1 parent 5f601c2 commit a89ca42

File tree

3 files changed

+83
-35
lines changed

3 files changed

+83
-35
lines changed

src/StringEncodings.jl

Lines changed: 44 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# This file is a part of Julia. License is MIT: http://julialang.org/license
1+
# This file is a part of StringEncodings.jl. License is MIT: http://julialang.org/license
22

33
module StringEncodings
44
import Base: close, eof, flush, read, readall, write, show
@@ -8,6 +8,7 @@ export StringEncoder, StringDecoder, encode, decode, encodings
88
export StringEncodingError, OutputBufferError, IConvError
99
export InvalidEncodingError, InvalidSequenceError, IncompleteSequenceError
1010

11+
include("encodings.jl")
1112

1213
abstract StringEncodingError
1314

@@ -62,7 +63,7 @@ function iconv_close(cd::Ptr{Void})
6263
end
6364
end
6465

65-
function iconv_open(tocode, fromcode)
66+
function iconv_open(tocode::ASCIIString, fromcode::ASCIIString)
6667
p = ccall((:iconv_open, libiconv), Ptr{Void}, (Cstring, Cstring), tocode, fromcode)
6768
if p != Ptr{Void}(-1)
6869
return p
@@ -173,14 +174,16 @@ end
173174
## StringEncoder
174175

175176
"""
176-
StringEncoder(istream, to, from="UTF-8")
177+
StringEncoder(ostream, to, from=enc"UTF-8")
177178
178179
Returns a new write-only I/O stream, which converts any text in the encoding `from`
179180
written to it into text in the encoding `to` written to `ostream`. Calling `close` on the
180181
stream is necessary to complete the encoding (but does not close `ostream`).
182+
183+
`to` and `from` can be specified either as a string or as an `Encoding` object.
181184
"""
182-
function StringEncoder(ostream::IO, to::ASCIIString, from::ASCIIString="UTF-8")
183-
cd = iconv_open(to, from)
185+
function StringEncoder(ostream::IO, to::Encoding, from::Encoding=enc"UTF-8")
186+
cd = iconv_open(ASCIIString(to), ASCIIString(from))
184187
inbuf = Vector{UInt8}(BUFSIZE)
185188
outbuf = Vector{UInt8}(BUFSIZE)
186189
s = StringEncoder(ostream, cd, inbuf, outbuf,
@@ -190,6 +193,11 @@ function StringEncoder(ostream::IO, to::ASCIIString, from::ASCIIString="UTF-8")
190193
s
191194
end
192195

196+
StringEncoder(ostream::IO, to::AbstractString, from::Encoding=enc"UTF-8") =
197+
StringEncoder(ostream, Encoding(to), from)
198+
StringEncoder(ostream::IO, to::AbstractString, from::AbstractString) =
199+
StringEncoder(ostream, Encoding(to), Encoding(from))
200+
193201
# Flush input buffer and convert it into output buffer
194202
# Returns the number of bytes written to output buffer
195203
function flush(s::StringEncoder)
@@ -226,16 +234,18 @@ end
226234
## StringDecoder
227235

228236
"""
229-
StringDecoder(istream, from, to="UTF-8")
237+
StringDecoder(istream, from, to=enc"UTF-8")
230238
231239
Returns a new read-only I/O stream, which converts text in the encoding `from`
232240
read from `istream` into text in the encoding `to`.
233241
242+
`to` and `from` can be specified either as a string or as an `Encoding` object.
243+
234244
Note that some implementations (notably the Windows one) may accept invalid sequences
235245
in the input data without raising an error.
236246
"""
237-
function StringDecoder(istream::IO, from::ASCIIString, to::ASCIIString="UTF-8")
238-
cd = iconv_open(to, from)
247+
function StringDecoder(istream::IO, from::Encoding, to::Encoding=enc"UTF-8")
248+
cd = iconv_open(ASCIIString(to), ASCIIString(from))
239249
inbuf = Vector{UInt8}(BUFSIZE)
240250
outbuf = Vector{UInt8}(BUFSIZE)
241251
s = StringDecoder(istream, cd, inbuf, outbuf,
@@ -245,6 +255,11 @@ function StringDecoder(istream::IO, from::ASCIIString, to::ASCIIString="UTF-8")
245255
s
246256
end
247257

258+
StringDecoder(istream::IO, from::AbstractString, to::Encoding=enc"UTF-8") =
259+
StringDecoder(istream, Encoding(from), to)
260+
StringDecoder(istream::IO, from::AbstractString, to::AbstractString) =
261+
StringDecoder(istream, Encoding(from), Encoding(to))
262+
248263
# Fill input buffer and convert it into output buffer
249264
# Returns the number of bytes written to output buffer
250265
function fill_buffer!(s::StringDecoder)
@@ -289,68 +304,67 @@ end
289304
## Convenience I/O functions
290305
if isdefined(Base, :readstring)
291306
@doc """
292-
readstring(stream or filename, enc::ASCIIString)
307+
readstring(stream or filename, enc::Encoding)
293308
294309
Read the entire contents of an I/O stream or a file in encoding `enc` as a string.
295310
""" ->
296-
Base.readstring(s::IO, enc::ASCIIString) = readstring(StringDecoder(s, enc))
297-
Base.readstring(filename::AbstractString, enc::ASCIIString) = open(io->readstring(io, enc), filename)
311+
Base.readstring(s::IO, enc::Encoding) = readstring(StringDecoder(s, enc))
312+
Base.readstring(filename::AbstractString, enc::Encoding) = open(io->readstring(io, enc), filename)
298313
else # Compatibility with Julia 0.4
299314
@doc """
300-
readall(stream or filename, enc::ASCIIString)
315+
readall(stream or filename, enc::Encoding)
301316
302317
Read the entire contents of an I/O stream or a file in encoding `enc` as a string.
303318
""" ->
304-
Base.readall(s::IO, enc::ASCIIString) = readall(StringDecoder(s, enc))
305-
Base.readall(filename::AbstractString, enc::ASCIIString) = open(io->readall(io, enc), filename)
319+
Base.readall(s::IO, enc::Encoding) = readall(StringDecoder(s, enc))
320+
Base.readall(filename::AbstractString, enc::Encoding) = open(io->readall(io, enc), filename)
306321
end
307322

308323

309324
## Functions to encode/decode strings
310325

311-
encoding_string(::Type{ASCIIString}) = "ASCII"
312-
encoding_string(::Type{UTF8String}) = "UTF-8"
313-
encoding_string(::Type{UTF16String}) = (ENDIAN_BOM == 0x04030201) ? "UTF-16LE" : "UTF-16BE"
314-
encoding_string(::Type{UTF32String}) = (ENDIAN_BOM == 0x04030201) ? "UTF-32LE" : "UTF-32BE"
315-
316326
"""
317-
decode([T,] a::Vector{UInt8}, enc::ASCIIString)
327+
decode([T,] a::Vector{UInt8}, enc)
318328
319329
Convert an array of bytes `a` representing text in encoding `enc` to a string of type `T`.
320330
By default, a `UTF8String` is returned.
321331
332+
`enc` can be specified either as a string or as an `Encoding` object.
333+
322334
Note that some implementations (notably the Windows one) may accept invalid sequences
323335
in the input data without raising an error.
324336
"""
325-
function decode{T<:AbstractString}(::Type{T}, a::Vector{UInt8}, enc::ASCIIString)
337+
function decode{T<:AbstractString}(::Type{T}, a::Vector{UInt8}, enc::Encoding)
326338
b = IOBuffer(a)
327339
try
328-
T(readbytes(StringDecoder(b, enc, encoding_string(T))))
340+
T(readbytes(StringDecoder(b, enc, encoding(T))))
329341
finally
330342
close(b)
331343
end
332344
end
333345

334-
decode(a::Vector{UInt8}, enc::ASCIIString) = decode(UTF8String, a, enc)
346+
decode{T<:AbstractString}(::Type{T}, a::Vector{UInt8}, enc::AbstractString) = decode(T, a, Encoding(enc))
347+
348+
decode(a::Vector{UInt8}, enc::AbstractString) = decode(UTF8String, a, Encoding(enc))
349+
decode(a::Vector{UInt8}, enc::Union{AbstractString, Encoding}) = decode(UTF8String, a, enc)
335350

336351
"""
337-
encode(s::AbstractString, enc::ASCIIString)
352+
encode(s::AbstractString, enc)
338353
339354
Convert string `s` to an array of bytes representing text in encoding `enc`.
355+
`enc` can be specified either as a string or as an `Encoding` object.
340356
"""
341-
function encode(s::AbstractString, enc::ASCIIString)
357+
function encode(s::AbstractString, enc::Encoding)
342358
b = IOBuffer()
343-
p = StringEncoder(b, enc, encoding_string(typeof(s)))
359+
p = StringEncoder(b, enc, encoding(typeof(s)))
344360
write(p, s)
345361
close(p)
346362
takebuf_array(b)
347363
end
348364

365+
encode(s::AbstractString, enc::AbstractString) = encode(s, Encoding(enc))
349366

350-
## Function to list supported encodings
351-
include("encodings.jl")
352-
353-
function test_encoding(enc)
367+
function test_encoding(enc::ASCIIString)
354368
# We assume that an encoding is supported if it's possible to convert from UTF-8 to it:
355369
cd = ccall((:iconv_open, libiconv), Ptr{Void}, (Cstring, Cstring), enc, "UTF-8")
356370
if cd == Ptr{Void}(-1)

src/encodings.jl

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,35 @@
1-
import StringEncodings: libiconv, iconv_close
1+
# This file is a part of StringEncodings.jl. License is MIT: http://julialang.org/license
2+
3+
# Parametric singleton type representing a given string encoding via its symbol parameter
4+
5+
import Base: show, print, convert
6+
export Encoding, @enc_str
7+
8+
immutable Encoding{enc} end
9+
10+
Encoding(s) = Encoding{symbol(s)}()
11+
macro enc_str(s)
12+
:(Encoding{$(Expr(:quote, symbol(s)))}())
13+
end
14+
15+
# Convert an encoding to a string of type `T` holding the encoding's name.
# `string(enc)` alone yields whatever string type `string` chooses for the symbol,
# so convert the result to honor the requested target type `T`.
convert{T<:AbstractString, enc}(::Type{T}, ::Encoding{enc}) = convert(T, string(enc))
16+
17+
show{enc}(io::IO, ::Encoding{enc}) = print(io, string(enc), " string encoding type")
18+
print{enc}(io::IO, ::Encoding{enc}) = print(io, enc)
19+
20+
21+
## Get the encoding used by a string type
22+
encoding(::Type{ASCIIString}) = enc"ASCII"
23+
encoding(::Type{UTF8String}) = enc"UTF-8"
24+
25+
if ENDIAN_BOM == 0x04030201
26+
encoding(::Type{UTF16String}) = enc"UTF-16LE"
27+
encoding(::Type{UTF32String}) = enc"UTF-32LE"
28+
else
29+
encoding(::Type{UTF16String}) = enc"UTF-16BE"
30+
encoding(::Type{UTF32String}) = enc"UTF-32BE"
31+
end
32+
233
encodings_list = ["1026", "1046", "1047", "10646-1:1993", "10646-1:1993/UCS4",
334
"437", "500", "500V1", "850", "851", "852", "855", "856", "857",
435
"860", "861", "862", "863", "864", "865", "866", "866NAV", "869",

test/runtests.jl

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ for s in ("", "\0", "a", "café crème",
77
"a string \0€ チャネルパ\0\0トナーの選択 with embedded and trailing nuls\0")
88
# Test round-trip to Unicode formats, checking against pure-Julia implementation
99
for (T, nullen) in ((UTF8String, 0), (UTF16String, 2), (UTF32String, 4))
10-
enc = StringEncodings.encoding_string(T)
10+
enc = StringEncodings.encoding(T)
1111
a = reinterpret(UInt8, T(s).data)
1212
# Adjust for explicit \0 only for .data on UTF16String/UTF32String
1313
a = a[1:end - nullen]
@@ -102,7 +102,7 @@ mktemp() do p, io
102102
s = "café crème"
103103
write(io, encode(s, "CP1252"))
104104
close(io)
105-
@test readall(p, "CP1252") == s
105+
@test readall(p, enc"CP1252") == s
106106
end
107107

108108
@test_throws InvalidEncodingError p = StringEncoder(IOBuffer(), "nonexistent_encoding")
@@ -136,10 +136,13 @@ mktemp() do path, io
136136
write(io, encode(s, "ISO-2022-JP"))
137137
close(io)
138138

139-
@test readstring(path, "ISO-2022-JP") == s
140-
@test open(io->readstring(io, "ISO-2022-JP"), path) == s
139+
@test readstring(path, enc"ISO-2022-JP") == s
140+
@test open(io->readstring(io, enc"ISO-2022-JP"), path) == s
141141
end
142142

143+
144+
## Test encodings support
145+
143146
encodings_list = encodings()
144147
@test "ASCII" in encodings_list
145148
@test "UTF-8" in encodings_list

0 commit comments

Comments
 (0)