Skip to content

Commit 59a5c76

Browse files
Mingundralley
andcommitted
seq: Allow to limit number of skipped events to prevent huge memory consumption
Co-authored-by: Daniel Alley <[email protected]>
1 parent 7a356ec commit 59a5c76

File tree

3 files changed

+137
-4
lines changed

3 files changed

+137
-4
lines changed

Cargo.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,14 @@ encoding = ["encoding_rs"]
7878
## to quadratic parsing time, because the deserializer must check the list of
7979
## events as many times as the number of sequence fields present in the schema.
8080
##
81+
## To reduce negative consequences, always [limit] the maximum number of events
82+
## that [`Deserializer`] will buffer.
83+
##
8184
## This feature works only with `serialize` feature and has no effect if `serialize`
8285
## is not enabled.
86+
##
87+
## [limit]: crate::de::Deserializer::event_buffer_size
88+
## [`Deserializer`]: crate::de::Deserializer
8389
overlapped-lists = []
8490

8591
## Enables support for [`serde`] serialization and deserialization

src/de/mod.rs

Lines changed: 123 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,8 @@ use std::borrow::Cow;
229229
#[cfg(feature = "overlapped-lists")]
230230
use std::collections::VecDeque;
231231
use std::io::BufRead;
232+
#[cfg(feature = "overlapped-lists")]
233+
use std::num::NonZeroUsize;
232234

233235
pub(crate) const INNER_VALUE: &str = "$value";
234236
pub(crate) const UNFLATTEN_PREFIX: &str = "$unflatten=";
@@ -277,6 +279,12 @@ where
277279
/// moved from this to [`Self::read`].
278280
#[cfg(feature = "overlapped-lists")]
279281
write: VecDeque<DeEvent<'de>>,
282+
/// Maximum number of events that can be skipped when processing sequences
283+
/// that occur out-of-order. This field is used to prevent potential
284+
/// denial-of-service (DoS) attacks which could cause infinite memory
285+
/// consumption when parsing a very large amount of XML into a sequence field.
286+
#[cfg(feature = "overlapped-lists")]
287+
limit: Option<NonZeroUsize>,
280288

281289
#[cfg(not(feature = "overlapped-lists"))]
282290
peek: Option<DeEvent<'de>>,
@@ -375,6 +383,8 @@ where
375383
read: VecDeque::new(),
376384
#[cfg(feature = "overlapped-lists")]
377385
write: VecDeque::new(),
386+
#[cfg(feature = "overlapped-lists")]
387+
limit: None,
378388

379389
#[cfg(not(feature = "overlapped-lists"))]
380390
peek: None,
@@ -387,6 +397,71 @@ where
387397
Self::new(reader)
388398
}
389399

400+
/// Set the maximum number of events that could be skipped during deserialization
401+
/// of sequences.
402+
///
403+
/// If `<element>` contains more than specified nested elements, `#text` or
404+
/// CDATA nodes, then [`DeError::TooManyEvents`] will be returned during
405+
/// deserialization of sequence field (any type that uses [`deserialize_seq`]
406+
/// for the deserialization, for example, `Vec<T>`).
407+
///
408+
/// This method can be used to prevent a [DoS] attack and infinite memory
409+
/// consumption when parsing a very large XML to a sequence field.
410+
///
411+
/// It is strongly recommended to set limit to some value when you parse data
412+
/// from untrusted sources. You should choose a value that your typical XMLs
413+
/// can have _between_ different elements that corresponds to the same sequence.
414+
///
415+
/// # Examples
416+
///
417+
/// Let's imagine, that we deserialize such structure:
418+
/// ```
419+
/// struct List {
420+
/// item: Vec<()>,
421+
/// }
422+
/// ```
423+
///
424+
/// The XML that we try to parse look like this:
425+
/// ```xml
426+
/// <any-name>
427+
/// <item/>
428+
/// <!-- Bufferization starts at this point -->
429+
/// <another-item>
430+
/// <some-element>with text</some-element>
431+
/// <yet-another-element/>
432+
/// </another-item>
433+
/// <!-- Buffer will be emptied at this point; 7 events were buffered -->
434+
/// <item/>
435+
/// <!-- There is nothing to buffer, because elements follows each other -->
436+
/// <item/>
437+
/// </any-name>
438+
/// ```
439+
///
440+
/// There, when we deserialize the `item` field, we need to buffer 7 events,
441+
/// before we can deserialize the second `<item/>`:
442+
///
443+
/// - `<another-item>`
444+
/// - `<some-element>`
445+
/// - `#text(with text)`
446+
/// - `</some-element>`
447+
/// - `<yet-another-element/>` (virtual start event)
448+
/// - `<yet-another-element/>` (vitrual end event)
449+
/// - `</another-item>`
450+
///
451+
/// Note, that `<yet-another-element/>` internally represented as 2 events:
452+
/// one for the start tag and one for the end tag. In the future this can be
453+
/// eliminated, but for now we use [auto-expanding feature] of a reader,
454+
/// because this simplifies deserializer code.
455+
///
456+
/// [`deserialize_seq`]: serde::Deserializer::deserialize_seq
457+
/// [DoS]: https://en.wikipedia.org/wiki/Denial-of-service_attack
458+
/// [auto-expanding feature]: Reader::expand_empty_elements
459+
#[cfg(feature = "overlapped-lists")]
460+
pub fn event_buffer_size(&mut self, limit: Option<NonZeroUsize>) -> &mut Self {
461+
self.limit = limit;
462+
self
463+
}
464+
390465
#[cfg(feature = "overlapped-lists")]
391466
fn peek(&mut self) -> Result<&DeEvent<'de>, DeError> {
392467
if self.read.is_empty() {
@@ -435,7 +510,7 @@ where
435510
#[cfg(feature = "overlapped-lists")]
436511
fn skip(&mut self) -> Result<(), DeError> {
437512
let event = self.next()?;
438-
self.write.push_back(event);
513+
self.skip_event(event)?;
439514
match self.write.back() {
440515
// Skip all subtree, if we skip a start event
441516
Some(DeEvent::Start(e)) => {
@@ -445,24 +520,36 @@ where
445520
let event = self.next()?;
446521
match event {
447522
DeEvent::Start(ref e) if e.name() == end => {
448-
self.write.push_back(event);
523+
self.skip_event(event)?;
449524
depth += 1;
450525
}
451526
DeEvent::End(ref e) if e.name() == end => {
452-
self.write.push_back(event);
527+
self.skip_event(event)?;
453528
if depth == 0 {
454529
return Ok(());
455530
}
456531
depth -= 1;
457532
}
458-
_ => self.write.push_back(event),
533+
_ => self.skip_event(event)?,
459534
}
460535
}
461536
}
462537
_ => Ok(()),
463538
}
464539
}
465540

541+
#[cfg(feature = "overlapped-lists")]
542+
#[inline]
543+
fn skip_event(&mut self, event: DeEvent<'de>) -> Result<(), DeError> {
544+
if let Some(max) = self.limit {
545+
if self.write.len() >= max.get() {
546+
return Err(DeError::TooManyEvents(max));
547+
}
548+
}
549+
self.write.push_back(event);
550+
Ok(())
551+
}
552+
466553
/// Moves all buffered events to the end of [`Self::write`] buffer and swaps
467554
/// read and write buffers.
468555
///
@@ -1169,6 +1256,38 @@ mod tests {
11691256

11701257
assert_eq!(de.next().unwrap(), End(BytesEnd::borrowed(b"root")));
11711258
}
1259+
1260+
/// Checks that limiting buffer size works correctly
1261+
#[test]
1262+
fn limit() {
1263+
use serde::Deserialize;
1264+
1265+
#[derive(Debug, Deserialize)]
1266+
#[allow(unused)]
1267+
struct List {
1268+
item: Vec<()>,
1269+
}
1270+
1271+
let mut de = Deserializer::from_slice(
1272+
br#"
1273+
<any-name>
1274+
<item/>
1275+
<another-item>
1276+
<some-element>with text</some-element>
1277+
<yet-another-element/>
1278+
</another-item>
1279+
<item/>
1280+
<item/>
1281+
</any-name>
1282+
"#,
1283+
);
1284+
de.event_buffer_size(NonZeroUsize::new(3));
1285+
1286+
match List::deserialize(&mut de) {
1287+
Err(DeError::TooManyEvents(count)) => assert_eq!(count.get(), 3),
1288+
e => panic!("Expected `Err(TooManyEvents(3))`, but found {:?}", e),
1289+
}
1290+
}
11721291
}
11731292

11741293
#[test]

src/errors.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,8 @@ pub mod serialize {
116116
use super::*;
117117
use crate::utils::write_byte_string;
118118
use std::fmt;
119+
#[cfg(feature = "overlapped-lists")]
120+
use std::num::NonZeroUsize;
119121
use std::num::{ParseFloatError, ParseIntError};
120122

121123
/// (De)serialization error
@@ -159,6 +161,10 @@ pub mod serialize {
159161
ExpectedStart,
160162
/// Unsupported operation
161163
Unsupported(&'static str),
164+
/// Too many events were skipped while deserializing a sequence, event limit
165+
/// exceeded. The limit was provided as an argument
166+
#[cfg(feature = "overlapped-lists")]
167+
TooManyEvents(NonZeroUsize),
162168
}
163169

164170
impl fmt::Display for DeError {
@@ -183,6 +189,8 @@ pub mod serialize {
183189
DeError::UnexpectedEof => write!(f, "Unexpected `Event::Eof`"),
184190
DeError::ExpectedStart => write!(f, "Expecting `Event::Start`"),
185191
DeError::Unsupported(s) => write!(f, "Unsupported operation {}", s),
192+
#[cfg(feature = "overlapped-lists")]
193+
DeError::TooManyEvents(s) => write!(f, "Deserializer buffers {} events, limit exceeded", s),
186194
}
187195
}
188196
}

0 commit comments

Comments
 (0)