Skip to content

Commit 3edb78b

Browse files
authored
Merge pull request #739 from phdavis1027/escape-unescape
Allow override of default escape/unescape behavior in more situations
2 parents 42a91c9 + b6d989a commit 3edb78b

File tree

10 files changed

+125
-40
lines changed

10 files changed

+125
-40
lines changed

Cargo.toml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -111,13 +111,12 @@ async-tokio = ["tokio"]
111111
## [#158]: https://github.com/tafia/quick-xml/issues/158
112112
encoding = ["encoding_rs"]
113113

114-
## Enables support for recognizing all [HTML 5 entities] in [`unescape`] and
115-
## [`unescape_with`] functions. The full list of entities also can be found in
114+
## Enables support for recognizing all [HTML 5 entities] in [`unescape`]
115+
## function. The full list of entities also can be found in
116116
## <https://html.spec.whatwg.org/entities.json>.
117117
##
118118
## [HTML 5 entities]: https://dev.w3.org/html5/html-author/charref
119119
## [`unescape`]: crate::escape::unescape
120-
## [`unescape_with`]: crate::escape::unescape_with
121120
escape-html = []
122121

123122
## This feature is for the Serde deserializer that enables support for deserializing

Changelog.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ The method of reporting positions of errors has changed - use `error_position()`
1818
to get an offset of the error position. For `SyntaxError`s the range
1919
`error_position()..buffer_position()` also will represent a span of error.
2020

21+
The way entities are resolved with `unescape_with` has changed. Those methods no longer
22+
resolve predefined entities.
23+
2124
### New Features
2225

2326
- [#513]: Allow to continue parsing after getting new `Error::IllFormed`.
@@ -35,6 +38,10 @@ to get an offset of the error position. For `SyntaxError`s the range
3538
- [#722]: Allow to pass owned strings to `Writer::create_element`. This is breaking change!
3639
- [#275]: Added `ElementWriter::new_line()` which enables pretty printing elements with multiple attributes.
3740
- [#743]: Add `Deserializer::get_ref()` to get XML Reader from serde Deserializer
41+
- [#734]: Add helper functions to resolve predefined XML and HTML5 entities:
42+
- `quick_xml::escape::resolve_predefined_entity`
43+
- `quick_xml::escape::resolve_xml_entity`
44+
- `quick_xml::escape::resolve_html5_entity`
3845

3946
### Bug Fixes
4047

@@ -69,6 +76,10 @@ to get an offset of the error position. For `SyntaxError`s the range
6976
- [#738]: Add an example of how to deserialize XML elements into Rust enums using an
7077
intermediate custom deserializer.
7178
- [#748]: Implement `Clone` for [`DeEvent`], [`PayloadEvent`] and [`Text`].
79+
- [#734]: Rename `NoEntityResolver` to `PredefinedEntityResolver`.
80+
- [#734]: No longer resolve predefined entities (`lt`, `gt`, `apos`, `quot`, `amp`)
81+
in the `unescape_with` family of methods. You should do that yourself using the functions
82+
listed above.
7283

7384
[#275]: https://github.com/tafia/quick-xml/issues/275
7485
[#362]: https://github.com/tafia/quick-xml/issues/362
@@ -83,6 +94,7 @@ to get an offset of the error position. For `SyntaxError`s the range
8394
[#704]: https://github.com/tafia/quick-xml/pull/704
8495
[#705]: https://github.com/tafia/quick-xml/pull/705
8596
[#722]: https://github.com/tafia/quick-xml/pull/722
97+
[#734]: https://github.com/tafia/quick-xml/pull/734
8698
[#738]: https://github.com/tafia/quick-xml/pull/738
8799
[#743]: https://github.com/tafia/quick-xml/pull/743
88100
[#748]: https://github.com/tafia/quick-xml/pull/748

examples/custom_entities.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
1010
use std::collections::HashMap;
1111

12+
use quick_xml::escape::resolve_predefined_entity;
1213
use quick_xml::events::Event;
1314
use quick_xml::reader::Reader;
1415
use regex::bytes::Regex;
@@ -59,8 +60,11 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
5960
Ok(Event::Text(ref e)) => {
6061
println!(
6162
"text value: {}",
62-
e.unescape_with(|ent| custom_entities.get(ent).map(|s| s.as_str()))
63-
.unwrap()
63+
e.unescape_with(|ent| match custom_entities.get(ent) {
64+
Some(s) => Some(s.as_str()),
65+
None => resolve_predefined_entity(ent),
66+
})
67+
.unwrap()
6468
);
6569
}
6670
Ok(Event::Eof) => break,

src/de/mod.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1997,7 +1997,7 @@ mod text;
19971997
mod var;
19981998

19991999
pub use crate::errors::serialize::DeError;
2000-
pub use resolver::{EntityResolver, NoEntityResolver};
2000+
pub use resolver::{EntityResolver, PredefinedEntityResolver};
20012001

20022002
use crate::{
20032003
de::map::ElementMapAccess,
@@ -2125,7 +2125,7 @@ impl<'a> PayloadEvent<'a> {
21252125
/// An intermediate reader that consumes [`PayloadEvent`]s and produces final [`DeEvent`]s.
21262126
/// [`PayloadEvent::Text`] events, that followed by any event except
21272127
/// [`PayloadEvent::Text`] or [`PayloadEvent::CData`], are trimmed from the end.
2128-
struct XmlReader<'i, R: XmlRead<'i>, E: EntityResolver = NoEntityResolver> {
2128+
struct XmlReader<'i, R: XmlRead<'i>, E: EntityResolver = PredefinedEntityResolver> {
21292129
/// A source of low-level XML events
21302130
reader: R,
21312131
/// Intermediate event, that could be returned by the next call to `next()`.
@@ -2356,7 +2356,7 @@ where
23562356
////////////////////////////////////////////////////////////////////////////////////////////////////
23572357

23582358
/// A structure that deserializes XML into Rust values.
2359-
pub struct Deserializer<'de, R, E: EntityResolver = NoEntityResolver>
2359+
pub struct Deserializer<'de, R, E: EntityResolver = PredefinedEntityResolver>
23602360
where
23612361
R: XmlRead<'de>,
23622362
{
@@ -2799,7 +2799,7 @@ impl<'de> Deserializer<'de, SliceReader<'de>> {
27992799
/// Deserializer created with this method will not resolve custom entities.
28002800
#[allow(clippy::should_implement_trait)]
28012801
pub fn from_str(source: &'de str) -> Self {
2802-
Self::from_str_with_resolver(source, NoEntityResolver)
2802+
Self::from_str_with_resolver(source, PredefinedEntityResolver)
28032803
}
28042804
}
28052805

@@ -2837,7 +2837,7 @@ where
28372837
///
28382838
/// Deserializer created with this method will not resolve custom entities.
28392839
pub fn from_reader(reader: R) -> Self {
2840-
Self::with_resolver(reader, NoEntityResolver)
2840+
Self::with_resolver(reader, PredefinedEntityResolver)
28412841
}
28422842
}
28432843

src/de/resolver.rs

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
use std::convert::Infallible;
44
use std::error::Error;
55

6+
use crate::escape::resolve_predefined_entity;
67
use crate::events::BytesText;
78

89
/// Used to resolve unknown entities while parsing
@@ -87,18 +88,28 @@ pub trait EntityResolver {
8788
fn resolve(&self, entity: &str) -> Option<&str>;
8889
}
8990

90-
/// An `EntityResolver` that does nothing and always returns `None`.
91+
/// An [`EntityResolver`] that resolves only predefined entities:
92+
///
93+
/// | Entity | Resolution
94+
/// |--------|------------
95+
/// |`&lt;` | `<`
96+
/// |`&gt;` | `>`
97+
/// |`&amp;` | `&`
98+
/// |`&apos;`| `'`
99+
/// |`&quot;`| `"`
91100
#[derive(Default, Copy, Clone)]
92-
pub struct NoEntityResolver;
101+
pub struct PredefinedEntityResolver;
93102

94-
impl EntityResolver for NoEntityResolver {
103+
impl EntityResolver for PredefinedEntityResolver {
95104
type Error = Infallible;
96105

106+
#[inline]
97107
fn capture(&mut self, _doctype: BytesText) -> Result<(), Self::Error> {
98108
Ok(())
99109
}
100110

101-
fn resolve(&self, _entity: &str) -> Option<&str> {
102-
None
111+
#[inline]
112+
fn resolve(&self, entity: &str) -> Option<&str> {
113+
resolve_predefined_entity(entity)
103114
}
104115
}

src/escapei.rs renamed to src/escape.rs

Lines changed: 76 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -184,16 +184,48 @@ pub(crate) fn _escape<F: Fn(u8) -> bool>(raw: &str, escape_chars: F) -> Cow<str>
184184
/// [`escape-html`]: ../index.html#escape-html
185185
/// [HTML5 escapes]: https://dev.w3.org/html5/html-author/charref
186186
pub fn unescape(raw: &str) -> Result<Cow<str>, EscapeError> {
187-
unescape_with(raw, |_| None)
187+
unescape_with(raw, resolve_predefined_entity)
188188
}
189189

190190
/// Unescape an `&str` and replaces all xml escaped characters (`&...;`) into
191191
/// their corresponding value, using a resolver function for custom entities.
192192
///
193193
/// If feature [`escape-html`] is enabled, then recognizes all [HTML5 escapes].
194194
///
195+
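/// Predefined entities are _not_ resolved by this function itself: named entities are
196+
/// looked up only through `resolve_entity`, which allows you to override the default behavior, as required in some XML dialects. To keep the defaults, delegate unknown names to [`resolve_predefined_entity`].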
197+
///
198+
/// Character references (`&#hh;`) cannot be overridden, they are resolved before
199+
/// calling `resolve_entity`.
200+
///
201+
/// Note that entities will not be resolved recursively. In order to satisfy the
202+
/// XML [requirements] you should unescape nested entities by yourself.
203+
///
204+
/// # Example
205+
///
206+
/// ```
207+
/// use quick_xml::escape::resolve_xml_entity;
208+
/// # use quick_xml::escape::unescape_with;
209+
/// # use pretty_assertions::assert_eq;
210+
/// let override_named_entities = |entity: &str| match entity {
211+
/// // Override standard entities
212+
/// "lt" => Some("FOO"),
213+
/// "gt" => Some("BAR"),
214+
/// // Resolve custom entities
215+
/// "baz" => Some("&lt;"),
216+
/// // Delegate other entities to the default implementation
217+
/// _ => resolve_xml_entity(entity),
218+
/// };
219+
///
220+
/// assert_eq!(
221+
/// unescape_with("&amp;&lt;test&gt;&baz;", override_named_entities).unwrap(),
222+
/// "&FOOtestBAR&lt;"
223+
/// );
224+
/// ```
225+
///
195226
/// [`escape-html`]: ../index.html#escape-html
196227
/// [HTML5 escapes]: https://dev.w3.org/html5/html-author/charref
228+
/// [requirements]: https://www.w3.org/TR/xml11/#intern-replacement
197229
pub fn unescape_with<'input, 'entity, F>(
198230
raw: &'input str,
199231
mut resolve_entity: F,
@@ -221,8 +253,6 @@ where
221253
if let Some(entity) = pat.strip_prefix('#') {
222254
let codepoint = parse_number(entity, start..end)?;
223255
unescaped.push_str(codepoint.encode_utf8(&mut [0u8; 4]));
224-
} else if let Some(value) = named_entity(pat) {
225-
unescaped.push_str(value);
226256
} else if let Some(value) = resolve_entity(pat) {
227257
unescaped.push_str(value);
228258
} else {
@@ -248,10 +278,45 @@ where
248278
}
249279
}
250280

251-
#[cfg(not(feature = "escape-html"))]
252-
fn named_entity(name: &str) -> Option<&str> {
281+
/// Resolves predefined XML entities or all HTML5 entities depending on the feature
282+
/// [`escape-html`](https://docs.rs/quick-xml/latest/quick_xml/#escape-html).
283+
///
284+
/// Behaves like [`resolve_xml_entity`] if the feature is not enabled and like
285+
/// [`resolve_html5_entity`] if enabled.
286+
#[inline]
287+
pub fn resolve_predefined_entity(entity: &str) -> Option<&'static str> {
288+
#[cfg(not(feature = "escape-html"))]
289+
{
290+
resolve_xml_entity(entity)
291+
}
292+
293+
#[cfg(feature = "escape-html")]
294+
{
295+
resolve_html5_entity(entity)
296+
}
297+
}
298+
299+
/// Resolves predefined XML entities. If specified entity is not a predefined XML
300+
/// entity, `None` is returned.
301+
///
302+
/// The complete list of predefined entities is defined in the [specification].
303+
///
304+
/// ```
305+
/// # use quick_xml::escape::resolve_xml_entity;
306+
/// # use pretty_assertions::assert_eq;
307+
/// assert_eq!(resolve_xml_entity("lt"), Some("<"));
308+
/// assert_eq!(resolve_xml_entity("gt"), Some(">"));
309+
/// assert_eq!(resolve_xml_entity("amp"), Some("&"));
310+
/// assert_eq!(resolve_xml_entity("apos"), Some("'"));
311+
/// assert_eq!(resolve_xml_entity("quot"), Some("\""));
312+
///
313+
/// assert_eq!(resolve_xml_entity("foo"), None);
314+
/// ```
315+
///
316+
/// [specification]: https://www.w3.org/TR/xml11/#sec-predefined-ent
317+
pub fn resolve_xml_entity(entity: &str) -> Option<&'static str> {
253318
// match over strings are not allowed in const functions
254-
let s = match name.as_bytes() {
319+
let s = match entity.as_bytes() {
255320
b"lt" => "<",
256321
b"gt" => ">",
257322
b"amp" => "&",
@@ -261,12 +326,13 @@ fn named_entity(name: &str) -> Option<&str> {
261326
};
262327
Some(s)
263328
}
264-
#[cfg(feature = "escape-html")]
265-
fn named_entity(name: &str) -> Option<&str> {
329+
330+
/// Resolves all HTML5 entities. For complete list see <https://dev.w3.org/html5/html-author/charref>.
331+
pub fn resolve_html5_entity(entity: &str) -> Option<&'static str> {
266332
// imported from https://dev.w3.org/html5/html-author/charref
267333
// match over strings are not allowed in const functions
268334
//TODO: automate up-to-dating using https://html.spec.whatwg.org/entities.json
269-
let s = match name.as_bytes() {
335+
let s = match entity.as_bytes() {
270336
b"Tab" => "\u{09}",
271337
b"NewLine" => "\u{0A}",
272338
b"excl" => "\u{21}",
@@ -1804,10 +1870,7 @@ fn test_unescape_with() {
18041870
assert_eq!(unchanged, Cow::Borrowed("test"));
18051871
assert!(matches!(unchanged, Cow::Borrowed(_)));
18061872

1807-
assert_eq!(
1808-
unescape_with("&lt;test&gt;", custom_entities).unwrap(),
1809-
"<test>"
1810-
);
1873+
assert!(unescape_with("&lt;", custom_entities).is_err());
18111874
assert_eq!(unescape_with("&#x30;", custom_entities).unwrap(), "0");
18121875
assert_eq!(unescape_with("&#48;", custom_entities).unwrap(), "0");
18131876
assert_eq!(unescape_with("&foo;", custom_entities).unwrap(), "BAR");

src/events/attributes.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
//! Provides an iterator over attributes key/value pairs
44
55
use crate::errors::Result as XmlResult;
6-
use crate::escape::{escape, unescape_with};
6+
use crate::escape::{escape, resolve_predefined_entity, unescape_with};
77
use crate::name::QName;
88
use crate::reader::{is_whitespace, Reader};
99
use crate::utils::{write_byte_string, write_cow_string, Bytes};
@@ -85,7 +85,7 @@ impl<'a> Attribute<'a> {
8585
/// This will allocate if the value contains any escape sequences or in
8686
/// non-UTF-8 encoding.
8787
pub fn decode_and_unescape_value<B>(&self, reader: &Reader<B>) -> XmlResult<Cow<'a, str>> {
88-
self.decode_and_unescape_value_with(reader, |_| None)
88+
self.decode_and_unescape_value_with(reader, resolve_predefined_entity)
8989
}
9090

9191
/// Decodes then unescapes the value with custom entities.

src/events/mod.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,9 @@ use std::str::from_utf8;
4646

4747
use crate::encoding::Decoder;
4848
use crate::errors::{Error, IllFormedError, Result};
49-
use crate::escape::{escape, minimal_escape, partial_escape, unescape_with};
49+
use crate::escape::{
50+
escape, minimal_escape, partial_escape, resolve_predefined_entity, unescape_with,
51+
};
5052
use crate::name::{LocalName, QName};
5153
use crate::reader::is_whitespace;
5254
use crate::utils::write_cow_string;
@@ -748,7 +750,7 @@ impl<'a> BytesText<'a> {
748750
/// This will allocate if the value contains any escape sequences or in
749751
/// non-UTF-8 encoding.
750752
pub fn unescape(&self) -> Result<Cow<'a, str>> {
751-
self.unescape_with(|_| None)
753+
self.unescape_with(resolve_predefined_entity)
752754
}
753755

754756
/// Decodes then unescapes the content of the event with custom entities.

src/lib.rs

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -56,13 +56,7 @@
5656
pub mod de;
5757
pub mod encoding;
5858
pub mod errors;
59-
mod escapei;
60-
pub mod escape {
61-
//! Manage xml character escapes
62-
pub use crate::escapei::{
63-
escape, minimal_escape, partial_escape, unescape, unescape_with, EscapeError,
64-
};
65-
}
59+
pub mod escape;
6660
pub mod events;
6761
pub mod name;
6862
pub mod reader;

src/se/simple_type.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
//! [as defined]: https://www.w3.org/TR/xmlschema11-1/#Simple_Type_Definition
55
66
use crate::errors::serialize::DeError;
7-
use crate::escapei::_escape;
7+
use crate::escape::_escape;
88
use crate::se::{Indent, QuoteLevel};
99
use serde::ser::{
1010
Impossible, Serialize, SerializeSeq, SerializeTuple, SerializeTupleStruct,

0 commit comments

Comments
 (0)