Skip to content

Commit c6fc0ba

Browse files
committed
Provide some utilities for decoding entire buffers
1 parent bee8ff6 commit c6fc0ba

File tree

2 files changed

+60
-41
lines changed

2 files changed

+60
-41
lines changed

Changelog.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@
3737
| |`resolve`
3838
|`event_namespace` |`resolve_element`
3939
|`attribute_namespace` |`resolve_attribute`
40+
- [#439]: Added utilities `detect_encoding()`, `decode()`, and `decode_with_bom_removal()`
41+
under the `quick_xml::encoding` module.
4042

4143

4244
### Bug Fixes
@@ -209,6 +211,8 @@
209211
[#431]: https://github.com/tafia/quick-xml/pull/431
210212
[#434]: https://github.com/tafia/quick-xml/pull/434
211213
[#437]: https://github.com/tafia/quick-xml/pull/437
214+
[#439]: https://github.com/tafia/quick-xml/pull/439
215+
212216

213217
## 0.23.0 -- 2022-05-08
214218

src/encoding.rs

Lines changed: 56 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8};
77

88
use crate::{Error, Result};
99

10-
/// Decoder of byte slices to the strings. This is lightweight object that can be copied.
10+
/// Decoder of byte slices into strings.
1111
///
1212
/// If feature `encoding` is enabled, this encoding taken from the `"encoding"`
1313
/// XML declaration or assumes UTF-8, if XML has no <?xml ?> declaration, encoding
@@ -60,7 +60,7 @@ impl Decoder {
6060
///
6161
/// If you instead want to use XML declared encoding, use the `encoding` feature
6262
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
63-
let bytes = if bytes.starts_with(b"\xEF\xBB\xBF") {
63+
let bytes = if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
6464
&bytes[3..]
6565
} else {
6666
bytes
@@ -86,13 +86,7 @@ impl Decoder {
8686
///
8787
/// Returns an error in case of malformed sequences in the `bytes`.
8888
pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
89-
match self
90-
.encoding
91-
.decode_without_bom_handling_and_without_replacement(bytes)
92-
{
93-
None => Err(Error::NonDecodable(None)),
94-
Some(s) => Ok(s),
95-
}
89+
decode(bytes, self.encoding)
9690
}
9791

9892
/// Decodes a slice with BOM removal if it is present in the `bytes` using
@@ -105,57 +99,76 @@ impl Decoder {
10599
///
106100
/// Returns an error in case of malformed sequences in the `bytes`.
107101
pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>> {
108-
self.decode(self.remove_bom(bytes))
102+
self.decode(remove_bom(bytes, self.encoding))
109103
}
110-
/// Copied from [`Encoding::decode_with_bom_removal`]
111-
#[inline]
112-
fn remove_bom<'b>(&self, bytes: &'b [u8]) -> &'b [u8] {
113-
if self.encoding == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") {
114-
return &bytes[3..];
115-
}
116-
if self.encoding == UTF_16LE && bytes.starts_with(b"\xFF\xFE") {
117-
return &bytes[2..];
118-
}
119-
if self.encoding == UTF_16BE && bytes.starts_with(b"\xFE\xFF") {
120-
return &bytes[2..];
121-
}
104+
}
105+
106+
/// Decodes the provided bytes using the specified encoding without BOM handling:
107+
/// a BOM, if it is present in the `bytes`, is neither sniffed nor stripped before decoding.
108+
///
109+
/// Returns an error in case of malformed sequences in the `bytes`.
110+
#[cfg(feature = "encoding")]
111+
pub fn decode<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> Result<Cow<'b, str>> {
112+
encoding
113+
.decode_without_bom_handling_and_without_replacement(bytes)
114+
.ok_or(Error::NonDecodable(None))
115+
}
116+
117+
/// Decodes a slice with an unknown encoding, removing the BOM if it is present
118+
/// in the bytes. Falls back to UTF-8 if the encoding cannot be detected.
119+
///
120+
/// Returns an error in case of malformed sequences in the `bytes`.
121+
#[cfg(feature = "encoding")]
122+
pub fn decode_with_bom_removal<'b>(bytes: &'b [u8]) -> Result<Cow<'b, str>> {
123+
if let Some(encoding) = detect_encoding(bytes) {
124+
let bytes = remove_bom(bytes, encoding);
125+
decode(bytes, encoding)
126+
} else {
127+
decode(bytes, UTF_8)
128+
}
129+
}
122130

123-
bytes
131+
#[cfg(feature = "encoding")]
132+
fn split_at_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> (&'b [u8], &'b [u8]) {
133+
if encoding == UTF_8 && bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
134+
bytes.split_at(3)
135+
} else if encoding == UTF_16LE && bytes.starts_with(&[0xFF, 0xFE]) {
136+
bytes.split_at(2)
137+
} else if encoding == UTF_16BE && bytes.starts_with(&[0xFE, 0xFF]) {
138+
bytes.split_at(2)
139+
} else {
140+
(&[], bytes)
124141
}
125142
}
126143

127-
/// Automatic encoding detection of XML files based using the [recommended algorithm]
128-
/// (https://www.w3.org/TR/xml11/#sec-guessing)
144+
#[cfg(feature = "encoding")]
145+
fn remove_bom<'b>(bytes: &'b [u8], encoding: &'static Encoding) -> &'b [u8] {
146+
let (_, bytes) = split_at_bom(bytes, encoding);
147+
bytes
148+
}
149+
150+
/// Automatic encoding detection of XML files using the
151+
/// [recommended algorithm](https://www.w3.org/TR/xml11/#sec-guessing).
152+
///
153+
/// If encoding is detected, `Some` is returned, otherwise `None` is returned.
154+
///
155+
/// Because the [`encoding_rs`] crate supports only a subset of those encodings, only
156+
/// that supported subset is detected: UTF-8, UTF-16 BE and UTF-16 LE.
129157
///
130158
/// The algorithm suggests examining up to the first 4 bytes to determine the encoding
131159
/// according to the following table:
132160
///
133161
/// | Bytes |Detected encoding
134162
/// |-------------|------------------------------------------
135-
/// |`00 00 FE FF`|UCS-4, big-endian machine (1234 order)
136-
/// |`FF FE 00 00`|UCS-4, little-endian machine (4321 order)
137-
/// |`00 00 FF FE`|UCS-4, unusual octet order (2143)
138-
/// |`FE FF 00 00`|UCS-4, unusual octet order (3412)
139163
/// |`FE FF ## ##`|UTF-16, big-endian
140164
/// |`FF FE ## ##`|UTF-16, little-endian
141165
/// |`EF BB BF` |UTF-8
142166
/// |-------------|------------------------------------------
143-
/// |`00 00 00 3C`|UCS-4 or similar (use declared encoding to find the exact one), in big-endian (1234)
144-
/// |`3C 00 00 00`|UCS-4 or similar (use declared encoding to find the exact one), in little-endian (4321)
145-
/// |`00 00 3C 00`|UCS-4 or similar (use declared encoding to find the exact one), in unusual byte orders (2143)
146-
/// |`00 3C 00 00`|UCS-4 or similar (use declared encoding to find the exact one), in unusual byte orders (3412)
147167
/// |`00 3C 00 3F`|UTF-16 BE or ISO-10646-UCS-2 BE or similar 16-bit BE (use declared encoding to find the exact one)
148168
/// |`3C 00 3F 00`|UTF-16 LE or ISO-10646-UCS-2 LE or similar 16-bit LE (use declared encoding to find the exact one)
149169
/// |`3C 3F 78 6D`|UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, or any other 7-bit, 8-bit, or mixed-width encoding which ensures that the characters of ASCII have their normal positions, width, and values; the actual encoding declaration must be read to detect which of these applies, but since all of these encodings use the same bit patterns for the relevant ASCII characters, the encoding declaration itself may be read reliably
150-
/// |`4C 6F A7 94`|EBCDIC (in some flavor; the full encoding declaration must be read to tell which code page is in use)
151-
/// |_Other_ |UTF-8 without an encoding declaration, or else the data stream is mislabeled (lacking a required encoding declaration), corrupt, fragmentary, or enclosed in a wrapper of some kind
152-
///
153-
/// Because [`encoding_rs`] crate supported only subset of those encodings, only
154-
/// supported subset are detected, which is UTF-8, UTF-16 BE and UTF-16 LE.
155-
///
156-
/// If encoding is detected, `Some` is returned, otherwise `None` is returned.
157170
#[cfg(feature = "encoding")]
158-
pub(crate) fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
171+
pub fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
159172
match bytes {
160173
// with BOM
161174
_ if bytes.starts_with(&[0xFE, 0xFF]) => Some(UTF_16BE),
@@ -170,3 +183,5 @@ pub(crate) fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
170183
_ => None,
171184
}
172185
}
186+
187+
// TODO: add some tests for functions

0 commit comments

Comments
 (0)