|
16 | 16 | // under the License. |
17 | 17 |
|
18 | 18 | use std::fmt::Debug; |
| 19 | +use std::io::{BufRead, Read}; |
19 | 20 |
|
20 | 21 | use arrow::array::RecordBatch; |
21 | 22 | use arrow::error::ArrowError; |
22 | | -use datafusion_common::Result; |
| 23 | +use datafusion_common::{DataFusionError, Result}; |
23 | 24 | use datafusion_datasource::decoder::Decoder; |
24 | | -use encoding_rs::{CoderResult, Encoding}; |
| 25 | +use encoding_rs::{CoderResult, Encoding, UTF_8}; |
25 | 26 |
|
26 | 27 | use self::buffer::Buffer; |
27 | 28 |
|
/// Default capacity of the buffer used to decode non-UTF-8 charset streams
/// (8 KiB, assuming the buffer capacity is measured in bytes — TODO confirm
/// against `Buffer::with_capacity`).
static DECODE_BUFFER_CAP: usize = 8 * 1024;
30 | 31 |
|
| 32 | +pub fn lookup_charset(enc: Option<&str>) -> Result<Option<&'static Encoding>> { |
| 33 | + match enc { |
| 34 | + Some(enc) => match Encoding::for_label(enc.as_bytes()) { |
| 35 | + Some(enc) => Ok(Some(enc).filter(|enc| *enc != UTF_8)), |
| 36 | + None => Err(DataFusionError::Configuration(format!( |
| 37 | + "Unknown character set '{enc}'" |
| 38 | + )))?, |
| 39 | + }, |
| 40 | + None => Ok(None), |
| 41 | + } |
| 42 | +} |
| 43 | + |
31 | 44 | /// A `Decoder` that decodes input bytes from the specified character encoding |
32 | 45 | /// to UTF-8 before passing them onto the inner `Decoder`. |
33 | 46 | pub struct CharsetDecoder<T> { |
@@ -100,6 +113,54 @@ impl<T: Debug> Debug for CharsetDecoder<T> { |
100 | 113 | } |
101 | 114 | } |
102 | 115 |
|
| 116 | +pub struct CharsetReader<R> { |
| 117 | + inner: R, |
| 118 | + charset_decoder: encoding_rs::Decoder, |
| 119 | + buffer: Buffer, |
| 120 | +} |
| 121 | + |
| 122 | +impl<R: BufRead> CharsetReader<R> { |
| 123 | + pub fn new(inner: R, encoding: &'static Encoding) -> Self { |
| 124 | + Self { |
| 125 | + inner, |
| 126 | + charset_decoder: encoding.new_decoder(), |
| 127 | + buffer: Buffer::with_capacity(DECODE_BUFFER_CAP), |
| 128 | + } |
| 129 | + } |
| 130 | +} |
| 131 | + |
| 132 | +impl<R: BufRead> Read for CharsetReader<R> { |
| 133 | + fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> { |
| 134 | + let src = self.fill_buf()?; |
| 135 | + let len = src.len().min(buf.len()); |
| 136 | + buf[..len].copy_from_slice(&src[..len]); |
| 137 | + Ok(len) |
| 138 | + } |
| 139 | +} |
| 140 | + |
| 141 | +impl<R: BufRead> BufRead for CharsetReader<R> { |
| 142 | + fn fill_buf(&mut self) -> std::io::Result<&[u8]> { |
| 143 | + if self.buffer.is_empty() { |
| 144 | + self.buffer.backshift(); |
| 145 | + |
| 146 | + let buf = self.inner.fill_buf()?; |
| 147 | + let (_, read, written, _) = self.charset_decoder.decode_to_utf8( |
| 148 | + buf, |
| 149 | + self.buffer.write_buf(), |
| 150 | + buf.is_empty(), |
| 151 | + ); |
| 152 | + self.inner.consume(read); |
| 153 | + self.buffer.advance(written); |
| 154 | + } |
| 155 | + |
| 156 | + Ok(self.buffer.read_buf()) |
| 157 | + } |
| 158 | + |
| 159 | + fn consume(&mut self, amount: usize) { |
| 160 | + self.buffer.consume(amount); |
| 161 | + } |
| 162 | +} |
| 163 | + |
103 | 164 | mod buffer { |
104 | 165 | /// A fixed-sized buffer that maintains both |
105 | 166 | /// a read position and a write position |
|
0 commit comments