|
| 1 | +# This file is a part of Julia. License is MIT: http://julialang.org/license |
| 2 | + |
1 | 3 | module iconv
|
| 4 | +import Base: close, eof, flush, read, readall, write |
| 5 | +import Base.Libc: errno, strerror |
| 6 | +export StringEncoder, StringDecoder, encode, decode |
| 7 | + |
| 8 | +## iconv wrappers |
| 9 | + |
# errno values compared against `errno()` after a failed iconv call.
# NOTE(review): these look like the Linux numeric values; other platforms may
# use different numbers (EILSEQ in particular) — confirm before relying on them.
const E2BIG = 7
const EINVAL = 22
const EILSEQ = 84
| 13 | + |
# Mutable holder for the raw conversion descriptor obtained from `iconv_open`.
# `iconv_close` sets `p` to C_NULL once the descriptor has been released.
type IConv
    p::Ptr{Void}
end
| 17 | + |
# Release the conversion descriptor held by `cd`.
# Does nothing when the descriptor is already C_NULL; otherwise calls the C
# `iconv_close`, raises an error if it reports failure, and clears the pointer.
function iconv_close(cd::IConv)
    if cd.p == C_NULL
        return
    end

    status = ccall((:iconv_close, :libc), Cint, (Ptr{Void},), cd.p)
    if status != 0
        error("failed to call iconv_close: error $(errno()) ($(strerror(errno())))")
    end

    cd.p = C_NULL
    return nothing
end
| 25 | + |
# Open a conversion descriptor converting text from `fromcode` to `tocode`.
# Returns an `IConv` whose finalizer releases the descriptor; raises an error
# when the C call fails (a pointer equal to -1).
function iconv_open(tocode, fromcode)
    handle = ccall((:iconv_open, :libc), Ptr{Void}, (Cstring, Cstring), tocode, fromcode)

    if handle == Ptr{Void}(-1)
        # EINVAL gets a dedicated, more helpful message
        if errno() == EINVAL
            error("conversion from $fromcode to $tocode not supported by iconv implementation, check that specified encodings are correct")
        end
        error("iconv_open error $(errno()): $(strerror(errno()))")
    end

    cd = IConv(handle)
    finalizer(cd, iconv_close)
    return cd
end
| 38 | + |
| 39 | + |
| 40 | +## StringEncoder and StringDecoder common functions |
| 41 | + |
# Length, in bytes, of the input and output buffers allocated for each
# StringEncoder / StringDecoder.
const BUFSIZE = 100
| 43 | + |
# Write-only stream state: bytes written by the caller accumulate in `inbuf`
# and are converted through `cd` into `outbuf`, which is then written to `ostream`.
type StringEncoder{S<:IO} <: IO
    ostream::S                   # destination of the converted bytes
    cd::IConv                    # conversion descriptor
    inbuf::Vector{UInt8}         # bytes waiting to be converted
    outbuf::Vector{UInt8}        # scratch space receiving converted bytes
    inbytesleft::Ref{Csize_t}    # number of pending bytes at the start of `inbuf`
    outbytesleft::Ref{Csize_t}   # free space left in `outbuf` after the last conversion
end
| 52 | + |
# Read-only stream state: bytes read from `istream` accumulate in `inbuf` and
# are converted through `cd` into `outbuf`, from which `read` serves one byte at a time.
type StringDecoder{S<:IO} <: IO
    istream::S                   # source of the bytes to convert
    cd::IConv                    # conversion descriptor
    inbuf::Vector{UInt8}         # bytes read from `istream` and not yet converted
    outbuf::Vector{UInt8}        # converted bytes waiting to be read
    inbytesleft::Ref{Csize_t}    # number of pending bytes at the start of `inbuf`
    outbytesleft::Ref{Csize_t}   # free space left in `outbuf` after the last conversion
    skip::Int                    # number of bytes of `outbuf` already handed out by `read`
end
| 62 | + |
# Convert the first `inbytesleft[]` bytes of `inbuf` into `outbuf` using `cd`.
#
# `outbytesleft` is reset to BUFSIZE before the call. On return `inbytesleft[]`
# holds the number of input bytes left unconverted; when conversion stopped
# because the output was full (E2BIG) or the input ended in an incomplete
# sequence (EINVAL), those bytes are moved to the start of `inbuf`.
# Returns the number of bytes written to `outbuf`. Raises an error for invalid
# input (EILSEQ), for an output buffer too small to hold anything, and for any
# other failure.
function iconv!(cd::IConv, inbuf::Vector{UInt8}, outbuf::Vector{UInt8},
                inbytesleft::Ref{Csize_t}, outbytesleft::Ref{Csize_t})
    navail = inbytesleft[]
    outbytesleft[] = BUFSIZE

    # iconv() takes pointers to the buffer pointers, hence the one-element arrays
    inptr = Ptr{UInt8}[pointer(inbuf, 1)]
    outptr = Ptr{UInt8}[pointer(outbuf, 1)]

    status = ccall((:iconv, :libc), Csize_t,
                   (Ptr{Void}, Ptr{Ptr{UInt8}}, Ref{Csize_t}, Ptr{Ptr{UInt8}}, Ref{Csize_t}),
                   cd.p, pointer(inptr, 1), inbytesleft, pointer(outptr, 1), outbytesleft)

    if status == reinterpret(Csize_t, -1)
        err = errno()
        nconsumed = navail - inbytesleft[]

        if err == E2BIG || err == EINVAL
            # Nothing at all was written: should never happen unless a very
            # small buffer is used
            if err == E2BIG && outbytesleft[] == BUFSIZE
                error("iconv error: ran out of space in the output buffer")
            end
            # Output buffer is full, or sequence is incomplete:
            # keep the unconverted tail at the start of the input buffer for next time
            copy!(inbuf, 1, inbuf, nconsumed+1, inbytesleft[])
        elseif err == EILSEQ
            bad = inbuf[(nconsumed+1):navail]
            error("iconv error: byte sequence 0x$(bytes2hex(bad)) is invalid in source encoding or cannot be represented in target encoding")
        else
            error("iconv error $(errno()): $(strerror(errno()))")
        end
    end

    return BUFSIZE - outbytesleft[]
end
| 98 | + |
# Reset the conversion descriptor of `s` to its initial state.
#
# The reset is requested by calling iconv() with a NULL input buffer; any bytes
# this produces are stored at the start of `s.outbuf`. For a StringDecoder the
# read position `skip` is rewound so that those bytes are served from the start.
# Returns the number of bytes written into the output buffer (0 when the
# descriptor is already closed). Raises an error if iconv() reports a failure.
function iconv_reset!(s::Union{StringEncoder, StringDecoder})
    s.cd.p != C_NULL || return 0

    # `isa`, not `is`: the latter tests identity against the type object itself
    # and is therefore never true for an instance.
    if isa(s, StringDecoder)
        s.skip = 0
    end

    outbuf2 = Ptr{UInt8}[pointer(s.outbuf, 1)]
    s.outbytesleft[] = BUFSIZE
    ret = ccall((:iconv, :libc), Csize_t,
                (Ptr{Void}, Ptr{Ptr{UInt8}}, Ref{Csize_t}, Ptr{Ptr{UInt8}}, Ref{Csize_t}),
                s.cd.p, C_NULL, C_NULL, pointer(outbuf2, 1), s.outbytesleft)

    if ret == reinterpret(Csize_t, -1)
        # Capture errno once, right after the failing call
        err = errno()
        if err == EINVAL
            error("iconv error: incomplete byte sequence at end of input")
        elseif err == E2BIG
            error("iconv error: ran out of space in the output buffer")
        elseif err == EILSEQ
            error("iconv error: invalid byte sequence in input")
        else
            error("iconv error $(err): $(strerror(err))")
        end
    end

    BUFSIZE - s.outbytesleft[]
end
| 129 | + |
| 130 | + |
| 131 | +## StringEncoder |
| 132 | + |
"""
    StringEncoder(ostream, to, from="UTF-8")

Returns a new write-only I/O stream, which converts any text in the encoding `from`
written to it into text in the encoding `to` written to `ostream`. Calling `close` on the
stream is necessary to complete the encoding (but does not close `ostream`).
"""
function StringEncoder(ostream::IO, to::ASCIIString, from::ASCIIString="UTF-8")
    cd = iconv_open(to, from)
    inbuf = Vector{UInt8}(BUFSIZE)
    outbuf = Vector{UInt8}(BUFSIZE)
    # Input buffer starts empty, output buffer starts entirely free
    s = StringEncoder(ostream, cd, inbuf, outbuf, Ref{Csize_t}(0), Ref{Csize_t}(BUFSIZE))
    finalizer(s, close)
    s
end
| 148 | + |
# Convert the pending input buffer and write the converted bytes to `s.ostream`.
# Returns `s`. Does nothing once the conversion descriptor has been closed.
function flush(s::StringEncoder)
    s.cd.p == C_NULL && return s

    # One pass may not be enough when the output buffer is too small to hold
    # the conversion of all of the input, so keep converting until a pass
    # produces no output. Even so, an incomplete sequence may stay in the input
    # until more data is written, which will only trigger an error on close().
    while true
        nout = iconv!(s.cd, s.inbuf, s.outbuf, s.inbytesleft, s.outbytesleft)
        write(s.ostream, sub(s.outbuf, 1:nout))
        nout == 0 && break
    end

    return s
end
| 165 | + |
# Finish the encoding: convert whatever input is still buffered, reset the
# converter and release its descriptor. `s.ostream` is not closed.
# Does nothing (and returns `s`) when the stream was already closed.
# Raises an error if unconverted input bytes remain.
function close(s::StringEncoder)
    s.cd.p != C_NULL || return s
    flush(s)
    # Resetting the converter can itself produce output (e.g. the shift sequence
    # returning a stateful encoding to its initial state); it belongs to the
    # encoded text and must reach the output stream.
    nreset = iconv_reset!(s)
    nreset > 0 && write(s.ostream, sub(s.outbuf, 1:nreset))
    finalize(s.cd)
    # flush() wasn't able to empty input buffer, which cannot happen with correct data
    s.inbytesleft[] == 0 || error("iconv error: incomplete byte sequence at end of input")
end
| 174 | + |
# Append one byte to the input buffer, converting the buffered bytes first
# when the buffer is full. Returns 1, the number of bytes accepted.
function write(s::StringEncoder, x::UInt8)
    if s.inbytesleft[] >= length(s.inbuf)
        flush(s)
    end

    pos = s.inbytesleft[] + 1
    s.inbytesleft[] = pos
    s.inbuf[pos] = x
    return 1
end
| 180 | + |
| 181 | + |
| 182 | +## StringDecoder |
| 183 | + |
"""
    StringDecoder(istream, from, to="UTF-8")

Returns a new read-only I/O stream, which converts text in the encoding `from`
read from `istream` into text in the encoding `to`.
"""
function StringDecoder(istream::IO, from::ASCIIString, to::ASCIIString="UTF-8")
    # Empty input buffer, entirely free output buffer, nothing read yet
    decoder = StringDecoder(istream, iconv_open(to, from),
                            Vector{UInt8}(BUFSIZE), Vector{UInt8}(BUFSIZE),
                            Ref{Csize_t}(0), Ref{Csize_t}(BUFSIZE), 0)
    finalizer(decoder, close)
    return decoder
end
| 198 | + |
# Refill the input buffer from `s.istream` and convert it into the output buffer.
# The read position `skip` is rewound first. When no input is buffered and the
# input stream is exhausted, the converter is reset instead.
# Returns the number of bytes written to the output buffer (0 once the
# descriptor has been closed).
function fill_buffer!(s::StringDecoder)
    s.cd.p == C_NULL && return 0

    s.skip = 0

    # Input buffer and input stream empty
    if s.inbytesleft[] == 0 && eof(s.istream)
        return iconv_reset!(s)
    end

    # Read into the part of the input buffer located after the pending bytes
    free = sub(s.inbuf, (s.inbytesleft[]+1):BUFSIZE)
    nread = readbytes!(s.istream, free)
    s.inbytesleft[] += nread

    return iconv!(s.cd, s.inbuf, s.outbuf, s.inbytesleft, s.outbytesleft)
end
| 215 | + |
| 216 | +# In order to know whether more data is available, we need to: |
| 217 | +# 1) check whether the output buffer contains data |
| 218 | +# 2) if not, actually try to fill it (this is the only way to find out whether input |
| 219 | +# data contains only state control sequences which may be converted to nothing) |
| 220 | +# 3) if not, reset iconv to initial state, which may generate data |
function eof(s::StringDecoder)
    # Converted bytes are still waiting to be read
    length(s.outbuf) - s.outbytesleft[] == s.skip || return false
    # Converting more input produced data
    fill_buffer!(s) == 0 || return false
    # Last resort: resetting the converter may itself produce data
    return iconv_reset!(s) == 0
end
| 226 | + |
# Release the conversion descriptor of `s`; the input stream is left untouched.
# Returns `s` right away when the descriptor was already released.
# Raises an error if unconverted input bytes remain in the buffer.
function close(s::StringDecoder)
    if s.cd.p == C_NULL
        return s
    end

    finalize(s.cd)

    # fill_buffer!() wasn't able to empty input buffer, which cannot happen with correct data
    return s.inbytesleft[] == 0 || error("iconv error: incomplete byte sequence at end of input")
end
| 233 | + |
# Return the next converted byte, throwing EOFError when none is available.
function read(s::StringDecoder, ::Type{UInt8})
    if eof(s)
        throw(EOFError())
    end

    s.skip += 1
    return s.outbuf[s.skip]
end
| 237 | + |
# Read the whole file `filename`, converting its contents from `encoding`
# through a StringDecoder.
function readall(filename::AbstractString, encoding::ASCIIString)
    open(filename) do io
        readall(StringDecoder(io, encoding))
    end
end
| 241 | + |
| 242 | + |
| 243 | +## Functions to encode/decode strings |
| 244 | + |
# Encoding name passed to iconv for each supported string type
for (T, name) in ((ASCIIString, "ASCII"),
                  (UTF8String, "UTF-8"),
                  (UTF16String, "UTF-16"),
                  (UTF32String, "UTF-32"))
    @eval encoding_string(::Type{$T}) = $name
end
| 249 | + |
"""
    decode(a::Vector{UInt8}, enc::ASCIIString)

Convert an array of bytes `a` representing text in encoding `enc` to a string.
"""
function decode(a::Vector{UInt8}, enc::ASCIIString)
    source = IOBuffer(a)
    try
        bytes = readbytes(StringDecoder(source, enc, "UTF-8"))
        # Skip final null bytes if needed
        # FIXME: find a better solution?
        last = length(bytes)
        while last >= 1 && bytes[last] == 0
            last -= 1
        end
        UTF8String(bytes[1:last])
    finally
        close(source)
    end
end
| 271 | + |
"""
    encode(s::AbstractString, enc::ASCIIString)

Convert string `s` to an array of bytes representing text in encoding `enc`.
"""
function encode(s::AbstractString, enc::ASCIIString)
    sink = IOBuffer()
    # The source encoding is derived from the concrete string type of `s`
    encoder = StringEncoder(sink, enc, encoding_string(typeof(s)))
    write(encoder, s)
    # Closing the encoder completes the conversion
    close(encoder)
    return takebuf_array(sink)
end
4 | 284 |
|
5 | 285 | end # module
|
0 commit comments