Skip to content

Commit f2f12d8

Browse files
committed
Initial commit
1 parent f1a6d1b commit f2f12d8

File tree

2 files changed

+348
-4
lines changed

2 files changed

+348
-4
lines changed

src/iconv.jl

Lines changed: 281 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,285 @@
1+
# This file is a part of Julia. License is MIT: http://julialang.org/license
2+
13
module iconv
4+
import Base: close, eof, flush, read, readall, write
5+
import Base.Libc: errno, strerror
6+
export StringEncoder, StringDecoder, encode, decode
7+
8+
## iconv wrappers
9+
10+
# errno codes that the iconv wrappers below compare against after a failed call.
# NOTE(review): these are hard-coded numbers; errno numbering is platform-specific,
# so confirm they match every platform this module is expected to run on.
const E2BIG = 7    # treated below as "output buffer is full"
const EINVAL = 22  # treated below as "incomplete sequence" / "unsupported conversion"
const EILSEQ = 84  # treated below as "invalid byte sequence"
13+
14+
# Mutable wrapper around the pointer returned by the C `iconv_open` function.
# `iconv_close` sets `p` to C_NULL once the descriptor has been released, and the
# other functions in this file treat a C_NULL `p` as "already closed".
type IConv
    p::Ptr{Void}
end
17+
18+
# Release the conversion descriptor held by `cd`.
# Does nothing when the descriptor has already been released (`cd.p` is C_NULL).
# Throws an error if the C `iconv_close` call reports a failure; otherwise marks the
# descriptor as released and returns `nothing`.
function iconv_close(cd::IConv)
    if cd.p == C_NULL
        return
    end

    status = ccall((:iconv_close, :libc), Cint, (Ptr{Void},), cd.p)
    if status != 0
        error("failed to call iconv_close: error $(errno()) ($(strerror(errno())))")
    end

    # Mark as released so that later calls (e.g. from the finalizer) are no-ops
    cd.p = C_NULL
    return nothing
end
25+
26+
# Open a conversion descriptor converting text from encoding `fromcode` to `tocode`.
# Returns an `IConv` object whose finalizer releases the descriptor.
# Throws an error when the C `iconv_open` call fails, with a dedicated message when
# errno is EINVAL (conversion not supported).
function iconv_open(tocode, fromcode)
    handle = ccall((:iconv_open, :libc), Ptr{Void}, (Cstring, Cstring), tocode, fromcode)

    # iconv_open signals failure by returning (iconv_t)-1
    if handle == Ptr{Void}(-1)
        if errno() == EINVAL
            error("conversion from $fromcode to $tocode not supported by iconv implementation, check that specified encodings are correct")
        end
        error("iconv_open error $(errno()): $(strerror(errno()))")
    end

    obj = IConv(handle)
    finalizer(obj, iconv_close)
    return obj
end
38+
39+
40+
## StringEncoder and StringDecoder common functions
41+
42+
# Size in bytes of the input and output buffers allocated by the StringEncoder and
# StringDecoder constructors. iconv! and iconv_reset! also reset `outbytesleft` to this
# value, i.e. they assume the output buffer holds exactly BUFSIZE bytes.
const BUFSIZE = 100
43+
44+
# Write-only stream state: bytes written to the encoder are accumulated in `inbuf`,
# converted with iconv into `outbuf`, and forwarded to `ostream` by `flush`.
type StringEncoder{S<:IO} <: IO
    ostream::S                  # destination of the converted bytes
    cd::IConv                   # iconv conversion descriptor
    inbuf::Vector{UInt8}        # bytes waiting to be converted
    outbuf::Vector{UInt8}       # scratch buffer receiving converted bytes
    inbytesleft::Ref{Csize_t}   # number of pending bytes at the start of `inbuf`
    outbytesleft::Ref{Csize_t}  # free space left in `outbuf` after the last iconv call
end
52+
53+
# Read-only stream state: bytes are read from `istream` into `inbuf`, converted with
# iconv into `outbuf`, and handed out one at a time by `read`.
type StringDecoder{S<:IO} <: IO
    istream::S                  # source of the bytes to convert
    cd::IConv                   # iconv conversion descriptor
    inbuf::Vector{UInt8}        # bytes read from `istream` and not yet converted
    outbuf::Vector{UInt8}       # converted bytes waiting to be read
    inbytesleft::Ref{Csize_t}   # number of pending bytes at the start of `inbuf`
    outbytesleft::Ref{Csize_t}  # free space left in `outbuf` after the last iconv call
    skip::Int                   # number of bytes of `outbuf` already returned by `read`
end
62+
63+
# Convert the first `inbytesleft[]` bytes of `inbuf` with a single call to the C
# `iconv` function, writing the result at the start of `outbuf`.
# `outbytesleft[]` is reset to BUFSIZE before the call; both counters are updated by it.
# Input left unconverted because the output buffer filled up (E2BIG) or because it ends
# with an incomplete sequence (EINVAL) is moved to the start of `inbuf` for next time.
# Throws an error on an invalid sequence (EILSEQ) and on any other failure.
# Returns the number of bytes written into `outbuf`.
function iconv!(cd::IConv, inbuf::Vector{UInt8}, outbuf::Vector{UInt8},
                inbytesleft::Ref{Csize_t}, outbytesleft::Ref{Csize_t})
    pending = inbytesleft[]
    outbytesleft[] = BUFSIZE

    # iconv takes pointers to the buffer pointers, so wrap each in a one-element array
    inptr = Ptr{UInt8}[pointer(inbuf, 1)]
    outptr = Ptr{UInt8}[pointer(outbuf, 1)]

    ret = ccall((:iconv, :libc), Csize_t,
                (Ptr{Void}, Ptr{Ptr{UInt8}}, Ref{Csize_t}, Ptr{Ptr{UInt8}}, Ref{Csize_t}),
                cd.p, pointer(inptr, 1), inbytesleft, pointer(outptr, 1), outbytesleft)

    # iconv signals failure by returning (size_t)-1
    if ret == reinterpret(Csize_t, -1)
        err = errno()
        consumed = pending - inbytesleft[]

        if err == E2BIG && outbytesleft[] == BUFSIZE
            # Nothing at all could be written: should never happen unless a very
            # small buffer is used
            error("iconv error: ran out of space in the output buffer")
        elseif err == E2BIG || err == EINVAL
            # Output buffer is full, or sequence is incomplete:
            # keep the unconverted tail at the start of the input buffer
            copy!(inbuf, 1, inbuf, consumed+1, inbytesleft[])
        elseif err == EILSEQ
            b = inbuf[(consumed+1):pending]
            error("iconv error: byte sequence 0x$(bytes2hex(b)) is invalid in source encoding or cannot be represented in target encoding")
        else
            error("iconv error $(errno()): $(strerror(errno()))")
        end
    end

    return BUFSIZE - outbytesleft[]
end
98+
99+
# Reset iconv to initial state by calling the C `iconv` function with a NULL input.
# For a StringDecoder, the read position `skip` is also reset, since the output buffer
# is overwritten from its start.
# Returns the number of bytes written into the output buffer, if any
# (0 when the stream has already been closed).
# Throws an error if the C call fails.
function iconv_reset!(s::Union{StringEncoder, StringDecoder})
    s.cd.p != C_NULL || return 0

    # `isa` checks the type of `s`; the identity test `is(s, StringDecoder)` compared
    # the instance with the type itself and was therefore never true
    if isa(s, StringDecoder)
        s.skip = 0
    end

    # iconv takes a pointer to the buffer pointer, so wrap it in a one-element array
    outbuf2 = Ptr{UInt8}[pointer(s.outbuf, 1)]
    s.outbytesleft[] = BUFSIZE
    ret = ccall((:iconv, :libc), Csize_t,
                (Ptr{Void}, Ptr{Ptr{UInt8}}, Ref{Csize_t}, Ptr{Ptr{UInt8}}, Ref{Csize_t}),
                s.cd.p, C_NULL, C_NULL, pointer(outbuf2, 1), s.outbytesleft)

    # iconv signals failure by returning (size_t)-1
    if ret == reinterpret(Csize_t, -1)
        err = errno()
        if err == EINVAL
            error("iconv error: incomplete byte sequence at end of input")
        elseif err == E2BIG
            error("iconv error: ran out of space in the output buffer")
        elseif err == EILSEQ
            error("iconv error: invalid byte sequence in input")
        else
            # Report the errno captured right after the call
            error("iconv error $(err): $(strerror(err))")
        end
    end

    BUFSIZE - s.outbytesleft[]
end
129+
130+
131+
## StringEncoder
132+
"""
    StringEncoder(ostream, to, from="UTF-8")

Returns a new write-only I/O stream, which converts any text in the encoding `from`
written to it into text in the encoding `to` written to `ostream`. Calling `close` on the
stream is necessary to complete the encoding (but does not close `ostream`).
"""
function StringEncoder(ostream::IO, to::ASCIIString, from::ASCIIString="UTF-8")
    cd = iconv_open(to, from)
    inbuf = Vector{UInt8}(BUFSIZE)
    outbuf = Vector{UInt8}(BUFSIZE)
    # Start with an empty input buffer and a completely free output buffer
    s = StringEncoder(ostream, cd, inbuf, outbuf, Ref{Csize_t}(0), Ref{Csize_t}(BUFSIZE))
    # Make sure pending input is converted even if the caller never calls close()
    finalizer(s, close)
    s
end
148+
149+
# Convert the pending input buffer and write the result to the output stream.
# Returns `s` (immediately if the encoder has already been closed).
function flush(s::StringEncoder)
    s.cd.p == C_NULL && return s

    # One conversion may not be enough when the output buffer is too small to hold
    # all of the converted input, so keep converting until a pass produces no output.
    # Even so, some incomplete sequences may remain in the input until more data is
    # written, which will only trigger an error on close().
    while true
        nbytes = iconv!(s.cd, s.inbuf, s.outbuf, s.inbytesleft, s.outbytesleft)
        write(s.ostream, sub(s.outbuf, 1:nbytes))
        nbytes == 0 && break
    end

    return s
end
165+
166+
# Finish the encoding: convert pending input, reset iconv to its initial state,
# write any bytes produced by that reset to `ostream`, and release the descriptor.
# Does not close `ostream`. Returns `s` if the encoder was already closed.
# Throws an error if unconverted input remains (incomplete sequence at end of input).
function close(s::StringEncoder)
    s.cd.p != C_NULL || return s
    flush(s)
    # Resetting may store bytes in the output buffer (e.g. the final shift sequence of
    # a stateful encoding): they are part of the encoded text and must not be dropped
    nbytes = iconv_reset!(s)
    nbytes > 0 && write(s.ostream, sub(s.outbuf, 1:nbytes))
    finalize(s.cd)
    # flush() wasn't able to empty input buffer, which cannot happen with correct data
    s.inbytesleft[] == 0 || error("iconv error: incomplete byte sequence at end of input")
end
174+
175+
# Append one byte to the encoder's input buffer, converting the buffered input first
# when the buffer is full. Returns the number of bytes accepted (always 1).
function write(s::StringEncoder, x::UInt8)
    if s.inbytesleft[] >= length(s.inbuf)
        flush(s)
    end
    s.inbytesleft[] += 1
    s.inbuf[s.inbytesleft[]] = x
    return 1
end
180+
181+
182+
## StringDecoder
183+
"""
    StringDecoder(istream, from, to="UTF-8")

Returns a new read-only I/O stream, which converts text in the encoding `from`
read from `istream` into text in the encoding `to`.
"""
function StringDecoder(istream::IO, from::ASCIIString, to::ASCIIString="UTF-8")
    # Empty input buffer, completely free output buffer, nothing consumed yet
    decoder = StringDecoder(istream, iconv_open(to, from),
                            Vector{UInt8}(BUFSIZE), Vector{UInt8}(BUFSIZE),
                            Ref{Csize_t}(0), Ref{Csize_t}(BUFSIZE), 0)
    finalizer(decoder, close)
    return decoder
end
198+
199+
# Read more bytes from the input stream and convert them into the output buffer,
# restarting the read position of the output buffer.
# Returns the number of bytes written to the output buffer
# (0 if the decoder has already been closed).
function fill_buffer!(s::StringDecoder)
    s.cd.p == C_NULL && return 0

    s.skip = 0

    # Nothing pending and nothing more to read: only the final reset can produce data
    if s.inbytesleft[] == 0 && eof(s.istream)
        return iconv_reset!(s)
    end

    # Fill the part of the input buffer located after the pending bytes
    free = sub(s.inbuf, (s.inbytesleft[]+1):BUFSIZE)
    s.inbytesleft[] += readbytes!(s.istream, free)
    return iconv!(s.cd, s.inbuf, s.outbuf, s.inbytesleft, s.outbytesleft)
end
215+
216+
# Finding out whether more data is available takes up to three steps:
# 1) the output buffer still holds unread data: not at the end
# 2) otherwise, actually try to fill it (this is the only way to find out whether input
#    data contains only state control sequences which may be converted to nothing)
# 3) otherwise, reset iconv to initial state, which may itself generate data
function eof(s::StringDecoder)
    length(s.outbuf) - s.outbytesleft[] == s.skip || return false
    fill_buffer!(s) == 0 || return false
    return iconv_reset!(s) == 0
end
226+
227+
# Release the conversion descriptor of the decoder (the input stream is left open).
# Returns `s` if the decoder was already closed.
# Throws an error if unconverted input remains (incomplete sequence at end of input).
function close(s::StringDecoder)
    if s.cd.p == C_NULL
        return s
    end
    finalize(s.cd)
    # fill_buffer!() wasn't able to empty input buffer, which cannot happen with
    # correct data
    if s.inbytesleft[] != 0
        error("iconv error: incomplete byte sequence at end of input")
    end
    return true
end
233+
234+
# Return the next converted byte, throwing EOFError when no more data is available.
function read(s::StringDecoder, ::Type{UInt8})
    if eof(s)
        throw(EOFError())
    end
    # Advance the read position within the output buffer
    s.skip += 1
    return s.outbuf[s.skip]
end
237+
238+
# Read the whole file `filename`, converting its contents from `encoding`
# through a StringDecoder.
function readall(filename::AbstractString, encoding::ASCIIString)
    open(filename) do io
        readall(StringDecoder(io, encoding))
    end
end
241+
242+
243+
## Functions to encode/decode strings
244+
245+
# Encoding name associated with each supported string type; `encode` passes it to
# StringEncoder as the source encoding of the string being converted.
encoding_string(::Type{ASCIIString}) = "ASCII"
encoding_string(::Type{UTF8String})  = "UTF-8"
encoding_string(::Type{UTF16String}) = "UTF-16"
encoding_string(::Type{UTF32String}) = "UTF-32"
249+
"""
    decode(a::Vector{UInt8}, enc::ASCIIString)

Convert an array of bytes `a` representing text in encoding `enc` to a string.
"""
function decode(a::Vector{UInt8}, enc::ASCIIString)
    buf = IOBuffer(a)
    try
        bytes = readbytes(StringDecoder(buf, enc, "UTF-8"))
        # Skip final null bytes if needed
        # FIXME: find a better solution?
        stop = length(bytes)
        while stop >= 1 && bytes[stop] == 0
            stop -= 1
        end
        return UTF8String(bytes[1:stop])
    finally
        close(buf)
    end
end
271+
"""
    encode(s::AbstractString, enc::ASCIIString)

Convert string `s` to an array of bytes representing text in encoding `enc`.
"""
function encode(s::AbstractString, enc::ASCIIString)
    sink = IOBuffer()
    # The source encoding is determined by the concrete type of `s`
    encoder = StringEncoder(sink, enc, encoding_string(typeof(s)))
    write(encoder, s)
    # Closing the encoder completes the conversion
    close(encoder)
    return takebuf_array(sink)
end
4284

5285
end # module

test/runtests.jl

Lines changed: 67 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,69 @@
1-
using iconv
21
using Base.Test
32

4-
# write your own tests here
5-
@test 1 == 1
3+
# Test round-trip to Unicode formats, checking against pure-Julia implementation
let samples = ("", "a", "café crème",
               "a"^(iconv.BUFSIZE-1) * "€ with an incomplete codepoint between two input buffer fills",
               "a string € チャネルパートナーの選択"),
    targets = ((UTF8String, "UTF-8"), (UTF16String, "UTF-16"), (UTF32String, "UTF-32"))

    for str in samples
        for (StrType, name) in targets
            # Bytes produced by the pure-Julia conversion
            raw = reinterpret(UInt8, StrType(str).data)
            @test decode(raw, name) == str
            @test decode(encode(str, name), name) == str
        end
    end
end
13+
14+
# Test a few non-Unicode encodings
# Each entry is a (string, encoding) pair: a longer tuple would be destructured into
# its first two elements only, silently skipping the extra encoding
for (s, enc) in (("noël", "ISO-8859-1"), ("noël €", "ISO-8859-15"), ("noël €", "CP1252"),
                 ("Код Обмена Информацией, 8 бит", "KOI8"), ("国家标准", "GB18030"))
    @test decode(encode(s, enc), enc) == s
end
19+
20+
# Closing a stream in the middle of an incomplete sequence must throw
# TODO: use more specific errors
let s = "a string € チャネルパートナーの選択"
    # Encoder: the first 10 bytes end in the middle of the euro sign
    encoder = StringEncoder(IOBuffer(), "UTF-16")
    write(encoder, s.data[1:10])
    @test_throws ErrorException close(encoder)

    # Decoder: truncated UTF-16 input
    truncated = encode(s, "UTF-16")[1:21]
    decoder = StringDecoder(IOBuffer(truncated), "UTF-16")
    @test readall(decoder) == s[1:9]
    @test_throws ErrorException close(decoder)

    # Test stateful encoding, which output some bytes on final reset
    # with strings containing different scripts
    x = encode(s, "ISO-2022-JP-3")
    @test decode(x, "ISO-2022-JP-3") == s

    # Test that closed pipe behaves correctly
    closed = StringDecoder(IOBuffer(x), "ISO-2022-JP-3", "UTF-8")
    close(closed)
    @test eof(closed)
    @test_throws EOFError read(closed, UInt8)
    close(closed)
end
43+
44+
# Converting characters that ASCII cannot represent must fail in both directions,
# and the error message must report the offending bytes
for conversion in (() -> encode("qwertyé€", "ASCII"),
                   () -> decode("qwertyé€".data, "ASCII"))
    @test_throws ErrorException conversion()
    try
        conversion()
    catch err
        @test sprint(showerror, err) ==
            "iconv error: byte sequence 0xc3a9e282ac is invalid in source encoding or cannot be represented in target encoding"
    end
end
63+
64+
# Reading a whole file in a given encoding
mktemp() do path, stream
    text = "café crème"
    write(stream, encode(text, "CP1252"))
    close(stream)
    @test readall(path, "CP1252") == text
end

0 commit comments

Comments
 (0)