diff --git a/src/base/encoding.rs b/src/base/encoding.rs index 92c517d8..5c525e85 100644 --- a/src/base/encoding.rs +++ b/src/base/encoding.rs @@ -80,7 +80,8 @@ impl SharedEncoding { #[must_use] pub fn get(&self) -> &'static Encoding { let encoding = self.encoding.load(Ordering::Relaxed); - ALL_ENCODINGS[encoding] + // it will never be out of range, but get() avoids a panic branch + ALL_ENCODINGS.get(encoding).unwrap_or(&ALL_ENCODINGS[0]) } pub fn set(&self, encoding: AsciiCompatibleEncoding) { diff --git a/src/rewritable_units/mod.rs b/src/rewritable_units/mod.rs index 069a3fda..f7687a3d 100644 --- a/src/rewritable_units/mod.rs +++ b/src/rewritable_units/mod.rs @@ -1,10 +1,14 @@ use std::any::Any; +pub(crate) use self::mutations::{Mutations, StringChunk}; +pub(crate) use self::text_decoder::TextDecoder; +pub(crate) use self::text_encoder::{IncompleteUtf8Resync, TextEncoder}; + pub use self::document_end::*; pub use self::element::*; pub use self::mutations::{ContentType, StreamingHandler}; -pub(crate) use self::mutations::{Mutations, StringChunk}; -pub use self::text_encoder::{StreamingHandlerSink, Utf8Error}; +pub use self::streaming_sink::StreamingHandlerSink; +pub use self::text_encoder::Utf8Error; pub use self::tokens::*; /// Data that can be attached to a rewritable unit by a user and shared between content handler @@ -85,6 +89,8 @@ mod mutations; mod document_end; mod element; +mod streaming_sink; +mod text_decoder; mod text_encoder; mod tokens; diff --git a/src/rewritable_units/streaming_sink.rs b/src/rewritable_units/streaming_sink.rs new file mode 100644 index 00000000..0652eb54 --- /dev/null +++ b/src/rewritable_units/streaming_sink.rs @@ -0,0 +1,233 @@ +use super::{ContentType, IncompleteUtf8Resync, TextEncoder, Utf8Error}; +use crate::html::escape_body_text; +use encoding_rs::{Encoding, UTF_8}; + +/// Used to write chunks of text or markup in streaming mutation handlers. 
+/// +/// Argument to [`StreamingHandler::write_all()`](crate::html_content::StreamingHandler::write_all). +pub struct StreamingHandlerSink<'output_handler> { + incomplete_utf8: IncompleteUtf8Resync, + inner: StreamingHandlerSinkInner<'output_handler>, +} + +struct StreamingHandlerSinkInner<'output_handler> { + non_utf8_encoder: Option, + + /// ```compile_fail + /// use lol_html::html_content::StreamingHandlerSink; + /// struct IsSend(T); + /// let x: IsSend>; + /// ``` + /// + /// ```compile_fail + /// use lol_html::html_content::StreamingHandlerSink; + /// struct IsSync(T); + /// let x: IsSync>; + /// ``` + output_handler: &'output_handler mut dyn FnMut(&[u8]), +} + +impl<'output_handler> StreamingHandlerSink<'output_handler> { + #[inline(always)] + pub(crate) fn new( + encoding: &'static Encoding, + output_handler: &'output_handler mut dyn FnMut(&[u8]), + ) -> Self { + Self { + incomplete_utf8: IncompleteUtf8Resync::new(), + inner: StreamingHandlerSinkInner { + non_utf8_encoder: (encoding != UTF_8).then(|| TextEncoder::new(encoding)), + output_handler, + }, + } + } + + /// Writes the given UTF-8 string to the output, converting the encoding and [escaping](ContentType) if necessary. + /// + /// It may be called multiple times. The strings will be concatenated together. + #[inline] + pub fn write_str(&mut self, content: &str, content_type: ContentType) { + if self.incomplete_utf8.discard_incomplete() { + // too late to report the error to the caller of write_utf8_chunk + self.inner.write_html("\u{FFFD}"); + } + self.inner.write_str(content, content_type); + } + + #[inline] + pub(crate) fn output_handler(&mut self) -> &mut dyn FnMut(&[u8]) { + &mut self.inner.output_handler + } + + /// Writes as much of the given UTF-8 fragment as possible, converting the encoding and [escaping](ContentType) if necessary. + /// + /// The `content` doesn't need to be a complete UTF-8 string, as long as consecutive calls to `write_utf8_chunk` create a valid UTF-8 string. 
+ /// Any incomplete UTF-8 sequence at the end of the content is buffered and flushed as soon as it's completed. + /// + /// Other methods like `write_str` should not be called after a `write_utf8_chunk` call with an incomplete UTF-8 sequence. + #[inline] + pub fn write_utf8_chunk( + &mut self, + mut content: &[u8], + content_type: ContentType, + ) -> Result<(), Utf8Error> { + while !content.is_empty() { + let (valid_chunk, rest) = self.incomplete_utf8.utf8_bytes_to_slice(content)?; + content = rest; + if !valid_chunk.is_empty() { + self.inner.write_str(valid_chunk, content_type); + } + } + Ok(()) + } +} + +impl StreamingHandlerSinkInner<'_> { + #[inline] + pub(crate) fn write_str(&mut self, content: &str, content_type: ContentType) { + match content_type { + ContentType::Html => self.write_html(content), + ContentType::Text => self.write_body_text(content), + } + } + + pub(crate) fn write_html(&mut self, html: &str) { + if let Some(encoder) = &mut self.non_utf8_encoder { + encoder.encode(html, self.output_handler); + } else if !html.is_empty() { + (self.output_handler)(html.as_bytes()); + } + } + + /// For text content, not attributes + pub(crate) fn write_body_text(&mut self, plaintext: &str) { + if let Some(encoder) = &mut self.non_utf8_encoder { + escape_body_text(plaintext, &mut |chunk| { + debug_assert!(!chunk.is_empty()); + encoder.encode(chunk, self.output_handler); + }); + } else { + escape_body_text(plaintext, &mut |chunk| { + debug_assert!(!chunk.is_empty()); + (self.output_handler)(chunk.as_bytes()); + }); + } + } +} + +#[test] +fn utf8_fragments() { + let text = "🐈°文字化けしない ▀▄ ɯopuɐɹ ⓤⓝⓘⓒⓞⓓⓔ and ascii 🐳 sʇuıodǝpoɔ ✴"; + for with_zero_writes in [false, true] { + for len in 1..9 { + let mut out = Vec::new(); + let mut handler = |ch: &[u8]| out.extend_from_slice(ch); + let mut t = StreamingHandlerSink::new(UTF_8, &mut handler); + for (nth, chunk) in text.as_bytes().chunks(len).enumerate() { + let msg = + format!("{len} at {nth} '{chunk:?}'; 
with_zero_writes={with_zero_writes}"); + if with_zero_writes { + t.write_utf8_chunk(b"", ContentType::Text).expect(&msg); + } + t.write_utf8_chunk(chunk, ContentType::Html).expect(&msg); + } + drop(t); + assert_eq!(String::from_utf8_lossy(&out), text, "{len}"); + } + } +} + +#[test] +fn long_text() { + let mut written = 0; + let mut expected = 0; + let mut handler = |ch: &[u8]| { + assert!( + ch.iter().all(|&c| { + written += 1; + c == if 0 != written & 1 { + 177 + } else { + b'0' + ((written / 2 - 1) % 10) as u8 + } + }), + "@{written} {ch:?}" + ); + }; + let mut t = StreamingHandlerSink::new(encoding_rs::ISO_8859_2, &mut handler); + + let mut s = "ą0ą1ą2ą3ą4ą5ą6ą7ą8ą9".repeat(128); + let mut split_point = 1; + while s.len() <= 1 << 17 { + s.push_str(&s.clone()); + expected += s.chars().count(); + let (a, b) = s.as_bytes().split_at(split_point); + split_point += 13; + t.write_utf8_chunk(a, ContentType::Text).unwrap(); + t.write_utf8_chunk(b, ContentType::Html).unwrap(); + } + assert_eq!(expected, written); +} + +#[test] +fn invalid_utf8_fragments() { + #[rustfmt::skip] + let broken_utf8 = &[ + &b"\x31\x32\x33\xED\xA0\x80\x31"[..], b"\x31\x32\x33\xEF\x80", b"\x31\x32\x33\xEF\x80\xF0\x3c", + b"\x37\x38\x39\xFE", b"\x37\x38\xFE", b"\x37\xFF", b"\x3c\x23\x24\xFE\x3C", b"\x3C\x23\xFE\x3C\x3C", + b"\x3C\x3D\xE0\x80\x3C", b"\x3C\x3D\xE0\x80\xAF\x3C", b"\x3C\x3D\xE0\x80\xE0\x80\x3C", + b"\x3C\x3D\xED\xA0\x80\x3C", b"\x3C\x3D\xF0\x80\x80\x3C", b"\x3C\x3D\xF0\x80\x80\x80\x3C", + b"\x3C\x3D\xF7\xBF\xBF\xBF\x3C", b"\x3C\x3D\xFF\x3C", b"\x7F", b"\x80", b"\x80\x3C", + b"\x80\x81\x82\x83\x84\x85\x86\x87", b"\x80\xBF", b"\x80\xBF\x80", b"\x80\xBF\x80\xBF", + b"\x80\xBF\x80\xBF\x80", b"\x80\xBF\x80\xBF\x80\xBF", b"\x81", b"\x81\x3C", + b"\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F", b"\x90\x91\x92\x93\x94\x95\x96\x97", b"\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F", + b"\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7", b"\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF", b"\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7", + 
b"\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF", b"\xBF", b"\xC0", b"\xC0\x3C\xC1\x3C\xC2\x3C\xC3\x3C", b"\xC0\x80", + b"\xC0\xAF", b"\xC0\xAF\xE0\x80\xBF\xF0\x81\x82\x41", b"\xC1\x3C", b"\xC1\xBF", b"\xC1\xBF", b"\xC2\x00", + b"\xC2\x41\x42", b"\xC2\x7F", b"\xC2\xC0", b"\xC2\xFF", b"\xC4\x3C\xC5\x3C\xC6\x3C\xC7\x3C", + b"\xC8\x3C\xC9\x3C\xCA\x3C\xCB\x3C", b"\xCC\x3C\xCD\x3C\xCE\x3C\xCF\x3C", b"\xD0\x3C\xD1\x3C\xD2\x3C\xD3\x3C", + b"\xD4\x3C\xD5\x3C\xD6\x3C\xD7\x3C", b"\xD8\x3C\xD9\x3C\xDA\x3C\xDB\x3C", b"\xDC\x3C\xDD\x3C\xDE\x3C\xDF\x3C", + b"\xDF", b"\xDF\x00", b"\xDF\x7F", b"\xDF\xC0", b"\xDF\xFF", b"\xE0\x3C\xE1\x3C\xE2\x3C\xE3\x3C", b"\xE0\x80", + b"\xE0\x80\x00", b"\xE0\x80\x7F", b"\xE0\x80\x80", b"\xE0\x80\xAF", b"\xE0\x80\xC0", b"\xE0\x80\xFF", + b"\xE0\x81\xBF", b"\xE0\x9F\xBF", b"\xE1\x80\xE2\xF0\x91\x92\xF1\xBF\x41", + b"\xE4\x3C\xE5\x3C\xE6\x3C\xE7\x3C", b"\xE8\x3C\xE9\x3C\xEA\x3C\xEB\x3C", b"\xEC\x3C\xED\x3C\xEE\x3C\xEF\x3C", + b"\xED\x80\x00", b"\xED\x80\x7F", b"\xED\x80\xC0", b"\xED\x80\xFF", b"\xED\xA0\x80", b"\xED\xA0\x80\x35", + b"\xED\xA0\x80\xED\xB0\x80", b"\xED\xA0\x80\xED\xBF\xBF", b"\xED\xA0\x80\xED\xBF\xBF\xED\xAF\x41", + b"\xED\xAD\xBF", b"\xED\xAD\xBF\xED\xB0\x80", b"\xED\xAD\xBF\xED\xBF\xBF", b"\xED\xAE\x80", + b"\xED\xAE\x80\xED\xB0\x80", b"\xED\xAE\x80\xED\xBF\xBF", b"\xED\xAF\xBF", b"\xED\xAF\xBF\xED\xB0\x80", + b"\xED\xAF\xBF\xED\xBF\xBF", b"\xED\xB0\x80", b"\xED\xBE\x80", b"\xED\xBF\xBF", b"\xEF\xBF", + b"\xF0\x3C\xF1\x3C", b"\xF0\x80\x80", b"\xF0\x80\x80\x80", b"\xF0\x80\x80\xAF", b"\xF0\x80\x81\xBF", + b"\xF0\x8F\xBF\xBF", b"\xF0\x90\x80\x00", b"\xF0\x90\x80\x7F", b"\xF0\x90\x80\xC0", b"\xF0\x90\x80\xFF", + b"\xF1\x80\x80\x00", b"\xF1\x80\x80\x7F", b"\xF1\x80\x80\xC0", b"\xF1\x80\x80\xFF", b"\xF2\x3C\xF3\x3C", + b"\xF4\x3C\xF5\x3C", b"\xF4\x80\x80\x00", b"\xF4\x80\x80\x7F", b"\xF4\x80\x80\xC0", b"\xF4\x80\x80\xFF", + b"\xF4\x90\x80\x80", b"\xF4\x91\x92\x93\xFF\x41\x80\xBF\x42", b"\xF5\x3C", b"\xF6\x3C\xF7\x3C", + b"\xF7\xBF\xBF", 
b"\xF7\xBF\xBF\xBF", b"\xF7\xBF\xBF\xBF\xBF", b"\xF7\xBF\xBF\xBF\xBF\xBF", + b"\xF7\xBF\xBF\xBF\xBF\xBF\xBF", b"\xF8\x3C", b"\xF8\x80\x80\x80", b"\xF8\x80\x80\x80\xAF", + b"\xF8\x87\xBF\xBF\xBF", b"\xF8\x88\x80\x80\x80", b"\xF9\x3C", b"\xFA\x3C", b"\xFB\x3C", b"\xFB\xBF\xBF\xBF", + b"\xFC\x3C", b"\xFC\x80\x80\x80\x80", b"\xFC\x80\x80\x80\x80\xAF", b"\xFC\x84\x80\x80\x80\x80", b"\xFD\x3C", + b"\xFD\xBF\xBF\xBF\xBF", b"\xFE", b"\xFF", b"\xFF\x3C" + ]; + + for bad in broken_utf8 { + 'next: for len in 1..bad.len() { + let mut handler = |ch: &[u8]| { + assert!( + !std::str::from_utf8(ch).unwrap().contains('<'), + "{ch:x?} of {bad:x?}" + ); + }; + let mut t = StreamingHandlerSink::new(UTF_8, &mut handler); + for chunk in bad.chunks(len) { + if t.write_utf8_chunk(chunk, ContentType::Text).is_err() { + continue 'next; + } + } + // An ASCII write forces flush of an incomplete sequence + assert!( + t.write_utf8_chunk(b"<", ContentType::Text).is_err(), + "Shouldn't have allowed {bad:?} {}", + String::from_utf8_lossy(bad) + ); + } + } +} diff --git a/src/rewritable_units/text_decoder.rs b/src/rewritable_units/text_decoder.rs new file mode 100644 index 00000000..9fe325f4 --- /dev/null +++ b/src/rewritable_units/text_decoder.rs @@ -0,0 +1,125 @@ +use crate::base::SharedEncoding; +use crate::rewriter::RewritingError; +use encoding_rs::{CoderResult, Decoder, Encoding, UTF_8}; + +pub(crate) struct TextDecoder { + encoding: SharedEncoding, + pending_text_streaming_decoder: Option, + text_buffer: String, +} + +impl TextDecoder { + #[inline] + #[must_use] + pub fn new(encoding: SharedEncoding) -> Self { + Self { + encoding, + pending_text_streaming_decoder: None, + // TODO make adjustable + text_buffer: String::from_utf8(vec![0u8; 1024]).unwrap(), + } + } + + #[inline] + pub fn flush_pending( + &mut self, + output_handler: &mut dyn FnMut(&str, bool, &'static Encoding) -> Result<(), RewritingError>, + ) -> Result<(), RewritingError> { + if self.pending_text_streaming_decoder.is_some() 
{ + self.feed_text(&[], true, output_handler)?; + } + Ok(()) + } + + #[inline(never)] + pub fn feed_text( + &mut self, + mut raw_input: &[u8], + last_in_text_node: bool, + output_handler: &mut dyn FnMut(&str, bool, &'static Encoding) -> Result<(), RewritingError>, + ) -> Result<(), RewritingError> { + let encoding = self.encoding.get(); + + if let Some((utf8_text, rest)) = self.split_utf8_start(raw_input, encoding) { + raw_input = rest; + let really_last = last_in_text_node && rest.is_empty(); + + (output_handler)(utf8_text, really_last, encoding)?; + + if really_last { + debug_assert!(self.pending_text_streaming_decoder.is_none()); + return Ok(()); + } + }; + + let decoder = self + .pending_text_streaming_decoder + .get_or_insert_with(|| encoding.new_decoder_without_bom_handling()); + + loop { + let buffer = self.text_buffer.as_mut_str(); + let (status, read, written, ..) = + decoder.decode_to_str(raw_input, buffer, last_in_text_node); + + let finished_decoding = status == CoderResult::InputEmpty; + + if written > 0 || last_in_text_node { + // the last call to feed_text() may make multiple calls to output_handler, + // but only one call to output_handler can be *the* last one. 
+ let really_last = last_in_text_node && finished_decoding; + + (output_handler)( + // this will always be in bounds, but unwrap_or_default optimizes better + buffer.get(..written).unwrap_or_default(), + really_last, + encoding, + )?; + } + + if finished_decoding { + if last_in_text_node { + self.pending_text_streaming_decoder = None; + } + return Ok(()); + } + raw_input = raw_input.get(read..).unwrap_or_default(); + } + } + + /// Fast path for UTF-8 or ASCII prefix + /// + /// Returns UTF-8 text to emit + remaining bytes, or `None` if the fast path is not available + #[inline] + fn split_utf8_start<'i>( + &self, + raw_input: &'i [u8], + encoding: &'static Encoding, + ) -> Option<(&'i str, &'i [u8])> { + // Can't use the fast path if the decoder may have buffered some bytes + if self.pending_text_streaming_decoder.is_some() { + return None; + } + + let text_or_len = if encoding == UTF_8 { + std::str::from_utf8(raw_input).map_err(|err| err.valid_up_to()) + } else { + debug_assert!(encoding.is_ascii_compatible()); + Err(Encoding::ascii_valid_up_to(raw_input)) + }; + + match text_or_len { + Ok(utf8_text) => Some((utf8_text, &[][..])), + Err(valid_up_to) => { + // The slow path buffers 1KB, and even though this shouldn't matter, + // it is an observable behavior, and it makes bugs worse for text handlers + // that assume they'll get only a single chunk. 
+ if valid_up_to != raw_input.len() && valid_up_to < self.text_buffer.len() { + return None; + } + + let (text, rest) = raw_input.split_at_checked(valid_up_to)?; + Some((std::str::from_utf8(text).ok()?, rest)) + } + } + } +} diff --git a/src/rewritable_units/text_encoder.rs b/src/rewritable_units/text_encoder.rs index 93daf086..de35c74a 100644 --- a/src/rewritable_units/text_encoder.rs +++ b/src/rewritable_units/text_encoder.rs @@ -1,5 +1,3 @@ -use super::ContentType; -use crate::html::escape_body_text; use encoding_rs::{CoderResult, Encoder, Encoding, UTF_8}; use thiserror::Error; @@ -12,119 +10,6 @@ use thiserror::Error; #[error("Invalid UTF-8")] pub struct Utf8Error; -/// Used to write chunks of text or markup in streaming mutation handlers. -/// -/// Argument to [`StreamingHandler::write_all()`](crate::html_content::StreamingHandler::write_all). -pub struct StreamingHandlerSink<'output_handler> { - incomplete_utf8: IncompleteUtf8Resync, - inner: StreamingHandlerSinkInner<'output_handler>, -} - -struct StreamingHandlerSinkInner<'output_handler> { - non_utf8_encoder: Option, - - /// ```compile_fail - /// use lol_html::html_content::StreamingHandlerSink; - /// struct IsSend(T); - /// let x: IsSend>; - /// ``` - /// - /// ```compile_fail - /// use lol_html::html_content::StreamingHandlerSink; - /// struct IsSync(T); - /// let x: IsSync>; - /// ``` - output_handler: &'output_handler mut dyn FnMut(&[u8]), -} - -impl<'output_handler> StreamingHandlerSink<'output_handler> { - #[inline(always)] - pub(crate) fn new( - encoding: &'static Encoding, - output_handler: &'output_handler mut dyn FnMut(&[u8]), - ) -> Self { - Self { - incomplete_utf8: IncompleteUtf8Resync::new(), - inner: StreamingHandlerSinkInner { - non_utf8_encoder: (encoding != UTF_8).then(|| TextEncoder::new(encoding)), - output_handler, - }, - } - } - - /// Writes the given UTF-8 string to the output, converting the encoding and [escaping](ContentType) if necessary. 
- /// - /// It may be called multiple times. The strings will be concatenated together. - #[inline] - pub fn write_str(&mut self, content: &str, content_type: ContentType) { - if self.incomplete_utf8.discard_incomplete() { - // too late to report the error to the caller of write_utf8_chunk - self.inner.write_html("\u{FFFD}"); - } - self.inner.write_str(content, content_type); - } - - #[inline] - pub(crate) fn output_handler(&mut self) -> &mut dyn FnMut(&[u8]) { - &mut self.inner.output_handler - } - - /// Writes as much of the given UTF-8 fragment as possible, converting the encoding and [escaping](ContentType) if necessary. - /// - /// The `content` doesn't need to be a complete UTF-8 string, as long as consecutive calls to `write_utf8_bytes` create a valid UTF-8 string. - /// Any incomplete UTF-8 sequence at the end of the content is buffered and flushed as soon as it's completed. - /// - /// Other methods like `write_str_chunk` should not be called after a `write_utf8_bytes` call with an incomplete UTF-8 sequence. 
- #[inline] - pub fn write_utf8_chunk( - &mut self, - mut content: &[u8], - content_type: ContentType, - ) -> Result<(), Utf8Error> { - while !content.is_empty() { - let (valid_chunk, rest) = self.incomplete_utf8.utf8_bytes_to_slice(content)?; - content = rest; - if !valid_chunk.is_empty() { - self.inner.write_str(valid_chunk, content_type); - } - } - Ok(()) - } -} - -impl StreamingHandlerSinkInner<'_> { - #[inline] - pub(crate) fn write_str(&mut self, content: &str, content_type: ContentType) { - match content_type { - ContentType::Html => self.write_html(content), - ContentType::Text => self.write_body_text(content), - } - } - - pub(crate) fn write_html(&mut self, html: &str) { - if let Some(encoder) = &mut self.non_utf8_encoder { - encoder.encode(html, self.output_handler); - } else if !html.is_empty() { - (self.output_handler)(html.as_bytes()); - } - } - - /// For text content, not attributes - pub(crate) fn write_body_text(&mut self, plaintext: &str) { - if let Some(encoder) = &mut self.non_utf8_encoder { - escape_body_text(plaintext, &mut |chunk| { - debug_assert!(!chunk.is_empty()); - encoder.encode(chunk, self.output_handler); - }); - } else { - escape_body_text(plaintext, &mut |chunk| { - debug_assert!(!chunk.is_empty()); - (self.output_handler)(chunk.as_bytes()); - }); - } - } -} - /// Temporary buffer used for encoding_rs output enum Buffer { /// Stack buffer avoids heap allocation, and lets go back quickly to the ASCII fast path. @@ -157,7 +42,7 @@ impl Buffer { } } -struct TextEncoder { +pub(crate) struct TextEncoder { encoder: Encoder, buffer: Buffer, } @@ -177,7 +62,7 @@ impl TextEncoder { /// without heap allocations. 
/// It also avoids methods that have UB: #[inline(never)] - fn encode(&mut self, mut content: &str, output_handler: &mut dyn FnMut(&[u8])) { + pub fn encode(&mut self, mut content: &str, output_handler: &mut dyn FnMut(&[u8])) { loop { // First, fast path for ASCII-only prefix debug_assert!(!self.encoder.has_pending_state()); // ASCII-compatible encodings are not supposed to have it @@ -235,7 +120,7 @@ const fn utf8_width(b: u8) -> u8 { } /// Stitches together UTF-8 from byte writes that may split UTF-8 sequences into multiple fragments -struct IncompleteUtf8Resync { +pub(crate) struct IncompleteUtf8Resync { /// Buffers an incomplete UTF-8 sequence char_bytes: [u8; 4], /// Number of bytes in `bytes` @@ -325,28 +210,6 @@ impl IncompleteUtf8Resync { } } -#[test] -fn utf8_fragments() { - let text = "🐈°文字化けしない ▀▄ ɯopuɐɹ ⓤⓝⓘⓒⓞⓓⓔ and ascii 🐳 sʇuıodǝpoɔ ✴"; - for with_zero_writes in [false, true] { - for len in 1..9 { - let mut out = Vec::new(); - let mut handler = |ch: &[u8]| out.extend_from_slice(ch); - let mut t = StreamingHandlerSink::new(UTF_8, &mut handler); - for (nth, chunk) in text.as_bytes().chunks(len).enumerate() { - let msg = - format!("{len} at {nth} '{chunk:?}'; with_zero_writes={with_zero_writes}"); - if with_zero_writes { - t.write_utf8_chunk(b"", ContentType::Text).expect(&msg); - } - t.write_utf8_chunk(chunk, ContentType::Html).expect(&msg); - } - drop(t); - assert_eq!(String::from_utf8_lossy(&out), text, "{len}"); - } - } -} - #[test] fn chars() { let boundaries = "🐈°文字化けしない" @@ -362,98 +225,3 @@ fn chars() { .collect::(); assert_eq!("4...2.3..3..3..3..3..3..3..", boundaries); } - -#[test] -fn long_text() { - let mut written = 0; - let mut expected = 0; - let mut handler = |ch: &[u8]| { - assert!( - ch.iter().all(|&c| { - written += 1; - c == if 0 != written & 1 { - 177 - } else { - b'0' + ((written / 2 - 1) % 10) as u8 - } - }), - "@{written} {ch:?}" - ); - }; - let mut t = StreamingHandlerSink::new(encoding_rs::ISO_8859_2, &mut handler); - - let mut s 
= "ą0ą1ą2ą3ą4ą5ą6ą7ą8ą9".repeat(128); - let mut split_point = 1; - while s.len() <= 1 << 17 { - s.push_str(&s.clone()); - expected += s.chars().count(); - let (a, b) = s.as_bytes().split_at(split_point); - split_point += 13; - t.write_utf8_chunk(a, ContentType::Text).unwrap(); - t.write_utf8_chunk(b, ContentType::Html).unwrap(); - } - assert_eq!(expected, written); -} - -#[test] -fn invalid_utf8_fragments() { - #[rustfmt::skip] - let broken_utf8 = &[ - &b"\x31\x32\x33\xED\xA0\x80\x31"[..], b"\x31\x32\x33\xEF\x80", b"\x31\x32\x33\xEF\x80\xF0\x3c", - b"\x37\x38\x39\xFE", b"\x37\x38\xFE", b"\x37\xFF", b"\x3c\x23\x24\xFE\x3C", b"\x3C\x23\xFE\x3C\x3C", - b"\x3C\x3D\xE0\x80\x3C", b"\x3C\x3D\xE0\x80\xAF\x3C", b"\x3C\x3D\xE0\x80\xE0\x80\x3C", - b"\x3C\x3D\xED\xA0\x80\x3C", b"\x3C\x3D\xF0\x80\x80\x3C", b"\x3C\x3D\xF0\x80\x80\x80\x3C", - b"\x3C\x3D\xF7\xBF\xBF\xBF\x3C", b"\x3C\x3D\xFF\x3C", b"\x7F", b"\x80", b"\x80\x3C", - b"\x80\x81\x82\x83\x84\x85\x86\x87", b"\x80\xBF", b"\x80\xBF\x80", b"\x80\xBF\x80\xBF", - b"\x80\xBF\x80\xBF\x80", b"\x80\xBF\x80\xBF\x80\xBF", b"\x81", b"\x81\x3C", - b"\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F", b"\x90\x91\x92\x93\x94\x95\x96\x97", b"\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F", - b"\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7", b"\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF", b"\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7", - b"\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF", b"\xBF", b"\xC0", b"\xC0\x3C\xC1\x3C\xC2\x3C\xC3\x3C", b"\xC0\x80", - b"\xC0\xAF", b"\xC0\xAF\xE0\x80\xBF\xF0\x81\x82\x41", b"\xC1\x3C", b"\xC1\xBF", b"\xC1\xBF", b"\xC2\x00", - b"\xC2\x41\x42", b"\xC2\x7F", b"\xC2\xC0", b"\xC2\xFF", b"\xC4\x3C\xC5\x3C\xC6\x3C\xC7\x3C", - b"\xC8\x3C\xC9\x3C\xCA\x3C\xCB\x3C", b"\xCC\x3C\xCD\x3C\xCE\x3C\xCF\x3C", b"\xD0\x3C\xD1\x3C\xD2\x3C\xD3\x3C", - b"\xD4\x3C\xD5\x3C\xD6\x3C\xD7\x3C", b"\xD8\x3C\xD9\x3C\xDA\x3C\xDB\x3C", b"\xDC\x3C\xDD\x3C\xDE\x3C\xDF\x3C", - b"\xDF", b"\xDF\x00", b"\xDF\x7F", b"\xDF\xC0", b"\xDF\xFF", b"\xE0\x3C\xE1\x3C\xE2\x3C\xE3\x3C", b"\xE0\x80", - b"\xE0\x80\x00", 
b"\xE0\x80\x7F", b"\xE0\x80\x80", b"\xE0\x80\xAF", b"\xE0\x80\xC0", b"\xE0\x80\xFF", - b"\xE0\x81\xBF", b"\xE0\x9F\xBF", b"\xE1\x80\xE2\xF0\x91\x92\xF1\xBF\x41", - b"\xE4\x3C\xE5\x3C\xE6\x3C\xE7\x3C", b"\xE8\x3C\xE9\x3C\xEA\x3C\xEB\x3C", b"\xEC\x3C\xED\x3C\xEE\x3C\xEF\x3C", - b"\xED\x80\x00", b"\xED\x80\x7F", b"\xED\x80\xC0", b"\xED\x80\xFF", b"\xED\xA0\x80", b"\xED\xA0\x80\x35", - b"\xED\xA0\x80\xED\xB0\x80", b"\xED\xA0\x80\xED\xBF\xBF", b"\xED\xA0\x80\xED\xBF\xBF\xED\xAF\x41", - b"\xED\xAD\xBF", b"\xED\xAD\xBF\xED\xB0\x80", b"\xED\xAD\xBF\xED\xBF\xBF", b"\xED\xAE\x80", - b"\xED\xAE\x80\xED\xB0\x80", b"\xED\xAE\x80\xED\xBF\xBF", b"\xED\xAF\xBF", b"\xED\xAF\xBF\xED\xB0\x80", - b"\xED\xAF\xBF\xED\xBF\xBF", b"\xED\xB0\x80", b"\xED\xBE\x80", b"\xED\xBF\xBF", b"\xEF\xBF", - b"\xF0\x3C\xF1\x3C", b"\xF0\x80\x80", b"\xF0\x80\x80\x80", b"\xF0\x80\x80\xAF", b"\xF0\x80\x81\xBF", - b"\xF0\x8F\xBF\xBF", b"\xF0\x90\x80\x00", b"\xF0\x90\x80\x7F", b"\xF0\x90\x80\xC0", b"\xF0\x90\x80\xFF", - b"\xF1\x80\x80\x00", b"\xF1\x80\x80\x7F", b"\xF1\x80\x80\xC0", b"\xF1\x80\x80\xFF", b"\xF2\x3C\xF3\x3C", - b"\xF4\x3C\xF5\x3C", b"\xF4\x80\x80\x00", b"\xF4\x80\x80\x7F", b"\xF4\x80\x80\xC0", b"\xF4\x80\x80\xFF", - b"\xF4\x90\x80\x80", b"\xF4\x91\x92\x93\xFF\x41\x80\xBF\x42", b"\xF5\x3C", b"\xF6\x3C\xF7\x3C", - b"\xF7\xBF\xBF", b"\xF7\xBF\xBF\xBF", b"\xF7\xBF\xBF\xBF\xBF", b"\xF7\xBF\xBF\xBF\xBF\xBF", - b"\xF7\xBF\xBF\xBF\xBF\xBF\xBF", b"\xF8\x3C", b"\xF8\x80\x80\x80", b"\xF8\x80\x80\x80\xAF", - b"\xF8\x87\xBF\xBF\xBF", b"\xF8\x88\x80\x80\x80", b"\xF9\x3C", b"\xFA\x3C", b"\xFB\x3C", b"\xFB\xBF\xBF\xBF", - b"\xFC\x3C", b"\xFC\x80\x80\x80\x80", b"\xFC\x80\x80\x80\x80\xAF", b"\xFC\x84\x80\x80\x80\x80", b"\xFD\x3C", - b"\xFD\xBF\xBF\xBF\xBF", b"\xFE", b"\xFF", b"\xFF\x3C" - ]; - - for bad in broken_utf8 { - 'next: for len in 1..bad.len() { - let mut handler = |ch: &[u8]| { - assert!( - !std::str::from_utf8(ch).unwrap().contains('<'), - "{ch:x?} of {bad:x?}" - ); - }; - let mut t = 
StreamingHandlerSink::new(UTF_8, &mut handler); - for chunk in bad.chunks(len) { - if t.write_utf8_chunk(chunk, ContentType::Text).is_err() { - continue 'next; - } - } - // An ASCII write forces flush of an incomplete sequence - assert!( - t.write_utf8_chunk(b"<", ContentType::Text).is_err(), - "Shouldn't have allowed {bad:?} {}", - String::from_utf8_lossy(bad) - ); - } - } -} diff --git a/src/rewritable_units/tokens/capturer/mod.rs b/src/rewritable_units/tokens/capturer/mod.rs index 4a9957ad..22c691a2 100644 --- a/src/rewritable_units/tokens/capturer/mod.rs +++ b/src/rewritable_units/tokens/capturer/mod.rs @@ -1,15 +1,8 @@ -mod text_decoder; mod to_token; -use self::text_decoder::TextDecoder; -use super::*; -use crate::base::SharedEncoding; -use crate::parser::Lexeme; -use crate::rewriter::RewritingError; -use bitflags::bitflags; - pub(crate) use self::to_token::{ToToken, ToTokenResult}; +use bitflags::bitflags; bitflags! { #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct TokenCaptureFlags: u8 { @@ -20,77 +13,3 @@ bitflags! 
{ const DOCTYPES = 0b0001_0000; } } - -#[derive(Debug)] -pub(crate) enum TokenCapturerEvent<'i> { - LexemeConsumed, - TokenProduced(Box>), -} - -type CapturerEventHandler<'h> = - &'h mut dyn FnMut(TokenCapturerEvent<'_>) -> Result<(), RewritingError>; - -pub(crate) struct TokenCapturer { - encoding: SharedEncoding, - text_decoder: TextDecoder, - capture_flags: TokenCaptureFlags, -} - -impl TokenCapturer { - #[inline] - #[must_use] - pub fn new(capture_flags: TokenCaptureFlags, encoding: SharedEncoding) -> Self { - Self { - encoding: SharedEncoding::clone(&encoding), - text_decoder: TextDecoder::new(encoding), - capture_flags, - } - } - - #[inline] - #[must_use] - pub const fn has_captures(&self) -> bool { - !self.capture_flags.is_empty() - } - - #[inline] - pub fn set_capture_flags(&mut self, flags: TokenCaptureFlags) { - self.capture_flags = flags; - } - - #[inline] - pub fn flush_pending_text( - &mut self, - event_handler: CapturerEventHandler<'_>, - ) -> Result<(), RewritingError> { - self.text_decoder.flush_pending(event_handler) - } - - pub fn feed<'i, T>( - &mut self, - lexeme: &Lexeme<'i, T>, - mut event_handler: impl FnMut(TokenCapturerEvent<'_>) -> Result<(), RewritingError>, - ) -> Result<(), RewritingError> - where - Lexeme<'i, T>: ToToken, - { - match lexeme.to_token(&mut self.capture_flags, self.encoding.get()) { - ToTokenResult::Token(token) => { - self.flush_pending_text(&mut event_handler)?; - event_handler(TokenCapturerEvent::LexemeConsumed)?; - event_handler(TokenCapturerEvent::TokenProduced(token)) - } - ToTokenResult::Text(text_type) => { - if self.capture_flags.contains(TokenCaptureFlags::TEXT) { - event_handler(TokenCapturerEvent::LexemeConsumed)?; - - self.text_decoder - .feed_text(&lexeme.raw(), text_type, &mut event_handler)?; - } - - Ok(()) - } - ToTokenResult::None => self.flush_pending_text(&mut event_handler), - } - } -} diff --git a/src/rewritable_units/tokens/capturer/text_decoder.rs 
b/src/rewritable_units/tokens/capturer/text_decoder.rs deleted file mode 100644 index 1a81e3e3..00000000 --- a/src/rewritable_units/tokens/capturer/text_decoder.rs +++ /dev/null @@ -1,91 +0,0 @@ -use super::*; -use crate::base::SharedEncoding; -use crate::html::TextType; -use crate::rewriter::RewritingError; -use encoding_rs::{CoderResult, Decoder}; - -// NOTE: this can't be refactored into method, because we hold a mutable reference for `self` -// during the decoding loop in `feed_text`. -macro_rules! emit { - ($self:tt, $text:expr, $last:ident, $event_handler:ident) => {{ - let token = TextChunk::new_token($text, $self.last_text_type, $last, $self.encoding.get()); - - $event_handler(TokenCapturerEvent::TokenProduced(Box::new(token))) - }}; -} - -pub(crate) struct TextDecoder { - encoding: SharedEncoding, - pending_text_streaming_decoder: Option, - text_buffer: String, - last_text_type: TextType, -} - -impl TextDecoder { - #[inline] - #[must_use] - pub fn new(encoding: SharedEncoding) -> Self { - Self { - encoding, - pending_text_streaming_decoder: None, - // TODO make adjustable - text_buffer: String::from_utf8(vec![0u8; 1024]).unwrap(), - last_text_type: TextType::Data, - } - } - - #[inline] - pub fn flush_pending( - &mut self, - event_handler: CapturerEventHandler<'_>, - ) -> Result<(), RewritingError> { - if self.pending_text_streaming_decoder.is_some() { - self.decode_with_streaming_decoder(&[], true, event_handler)?; - self.pending_text_streaming_decoder = None; - } - Ok(()) - } - - fn decode_with_streaming_decoder( - &mut self, - raw: &[u8], - last: bool, - event_handler: CapturerEventHandler<'_>, - ) -> Result<(), RewritingError> { - let encoding = self.encoding.get(); - let buffer = self.text_buffer.as_mut_str(); - - let decoder = self - .pending_text_streaming_decoder - .get_or_insert_with(|| encoding.new_decoder_without_bom_handling()); - - let mut consumed = 0; - - loop { - let (status, read, written, ..) 
= decoder.decode_to_str(&raw[consumed..], buffer, last); - - if written > 0 || last { - emit!(self, &buffer[..written], last, event_handler)?; - } - - if status == CoderResult::InputEmpty { - break; - } - - consumed += read; - } - - Ok(()) - } - - #[inline] - pub fn feed_text( - &mut self, - raw: &[u8], - text_type: TextType, - event_handler: CapturerEventHandler<'_>, - ) -> Result<(), RewritingError> { - self.last_text_type = text_type; - self.decode_with_streaming_decoder(raw, false, event_handler) - } -} diff --git a/src/rewritable_units/tokens/capturer/to_token.rs b/src/rewritable_units/tokens/capturer/to_token.rs index 2d042847..b79e6827 100644 --- a/src/rewritable_units/tokens/capturer/to_token.rs +++ b/src/rewritable_units/tokens/capturer/to_token.rs @@ -1,21 +1,15 @@ -use super::*; +use super::TokenCaptureFlags; use crate::html::TextType; use crate::parser::{NonTagContentLexeme, NonTagContentTokenOutline, TagLexeme, TagTokenOutline}; +use crate::rewritable_units::{Attributes, Comment, Doctype, EndTag, StartTag, Token}; use encoding_rs::Encoding; pub(crate) enum ToTokenResult<'i> { - Token(Box>), + Token(Token<'i>), Text(TextType), None, } -impl<'i> From> for ToTokenResult<'i> { - #[inline] - fn from(token: Token<'i>) -> Self { - ToTokenResult::Token(Box::new(token)) - } -} - pub(crate) trait ToToken { fn to_token( &self, @@ -25,6 +19,7 @@ pub(crate) trait ToToken { } impl ToToken for TagLexeme<'_> { + #[inline] fn to_token( &self, capture_flags: &mut TokenCaptureFlags, @@ -40,16 +35,14 @@ impl ToToken for TagLexeme<'_> { } if capture_flags.contains(TokenCaptureFlags::NEXT_START_TAG) => { // NOTE: clear the flag once we've seen required start tag. capture_flags.remove(TokenCaptureFlags::NEXT_START_TAG); - - StartTag::new_token( + ToTokenResult::Token(StartTag::new_token( self.part(name), Attributes::new(self.input(), attributes, encoding), ns, self_closing, self.raw(), encoding, - ) - .into() + )) } TagTokenOutline::EndTag { name, .. 
} @@ -57,8 +50,7 @@ impl ToToken for TagLexeme<'_> { { // NOTE: clear the flag once we've seen required end tag. capture_flags.remove(TokenCaptureFlags::NEXT_END_TAG); - - EndTag::new_token(self.part(name), self.raw(), encoding).into() + ToTokenResult::Token(EndTag::new_token(self.part(name), self.raw(), encoding)) } _ => ToTokenResult::None, } @@ -66,17 +58,23 @@ impl ToToken for TagLexeme<'_> { } impl ToToken for NonTagContentLexeme<'_> { + #[inline] fn to_token( &self, capture_flags: &mut TokenCaptureFlags, encoding: &'static Encoding, ) -> ToTokenResult<'_> { match *self.token_outline() { - Some(NonTagContentTokenOutline::Text(text_type)) => ToTokenResult::Text(text_type), + Some(NonTagContentTokenOutline::Text(text_type)) + if capture_flags.contains(TokenCaptureFlags::TEXT) => + { + ToTokenResult::Text(text_type) + } + Some(NonTagContentTokenOutline::Comment(text)) if capture_flags.contains(TokenCaptureFlags::COMMENTS) => { - Comment::new_token(self.part(text), self.raw(), encoding).into() + ToTokenResult::Token(Comment::new_token(self.part(text), self.raw(), encoding)) } Some(NonTagContentTokenOutline::Doctype { @@ -84,16 +82,17 @@ impl ToToken for NonTagContentLexeme<'_> { public_id, system_id, force_quirks, - }) if capture_flags.contains(TokenCaptureFlags::DOCTYPES) => Doctype::new_token( - self.opt_part(name), - self.opt_part(public_id), - self.opt_part(system_id), - force_quirks, - false, // removed - self.raw(), - encoding, - ) - .into(), + }) if capture_flags.contains(TokenCaptureFlags::DOCTYPES) => { + ToTokenResult::Token(Doctype::new_token( + self.opt_part(name), + self.opt_part(public_id), + self.opt_part(system_id), + force_quirks, + false, // removed + self.raw(), + encoding, + )) + } _ => ToTokenResult::None, } } diff --git a/src/rewritable_units/tokens/mod.rs b/src/rewritable_units/tokens/mod.rs index 37f5372c..4aa6ed33 100644 --- a/src/rewritable_units/tokens/mod.rs +++ b/src/rewritable_units/tokens/mod.rs @@ -21,7 +21,7 @@ macro_rules! 
impl_serialize { mut self, output_handler: &mut dyn FnMut(&[u8]), ) -> Result<(), crate::errors::RewritingError> { - let mut encoder = crate::rewritable_units::text_encoder::StreamingHandlerSink::new( + let mut encoder = crate::rewritable_units::StreamingHandlerSink::new( self.encoding, output_handler, ); diff --git a/src/rewritable_units/tokens/text_chunk.rs b/src/rewritable_units/tokens/text_chunk.rs index d160f3f9..c39c5251 100644 --- a/src/rewritable_units/tokens/text_chunk.rs +++ b/src/rewritable_units/tokens/text_chunk.rs @@ -1,4 +1,4 @@ -use super::{Mutations, Token}; +use super::Mutations; use crate::base::Bytes; use crate::errors::RewritingError; use crate::html::TextType; @@ -72,20 +72,20 @@ pub struct TextChunk<'i> { impl<'i> TextChunk<'i> { #[inline] #[must_use] - pub(super) fn new_token( + pub(crate) fn new( text: &'i str, text_type: TextType, last_in_text_node: bool, encoding: &'static Encoding, - ) -> Token<'i> { - Token::TextChunk(TextChunk { + ) -> Self { + TextChunk { text: text.into(), text_type, last_in_text_node, encoding, mutations: Mutations::new(), user_data: Box::new(()), - }) + } } /// Returns the textual content of the chunk. @@ -381,14 +381,8 @@ mod tests { #[test] fn in_place_text_modifications() { - use super::super::Token; - let encoding = Encoding::for_label_no_replacement(b"utf-8").unwrap(); - let Token::TextChunk(mut chunk) = - TextChunk::new_token("original text", TextType::PlainText, true, encoding) - else { - unreachable!() - }; + let mut chunk = TextChunk::new("original text", TextType::PlainText, true, encoding); assert_eq!(chunk.as_str(), "original text"); chunk.set_str("hello".to_owned()); @@ -416,6 +410,7 @@ mod tests { macro_rules! 
skip_eof_chunk { ($c:ident) => { if $c.last_in_text_node() { + // This is not always true — a replacement char for an incomplete UTF-8 sequence could be flushed last assert!($c.as_str().is_empty()); return; } @@ -507,5 +502,15 @@ mod tests { "" ); } + + #[test] + fn last_flush_text_decoder() { + let rewritten = rewrite_text_chunk(b"

\xF0\xF0\x9F\xF0\x9F\x98

", UTF_8, |c| { + if c.last_in_text_node() { + c.after(" last", ContentType::Text); + } + }); + assert_eq!("

\u{fffd}\u{fffd}\u{fffd} last

", rewritten); + } } } diff --git a/src/transform_stream/dispatcher.rs b/src/transform_stream/dispatcher.rs index 98c882b0..8b4a6741 100644 --- a/src/transform_stream/dispatcher.rs +++ b/src/transform_stream/dispatcher.rs @@ -1,15 +1,15 @@ use crate::base::{Bytes, Range, SharedEncoding}; use crate::html::{LocalName, Namespace}; +use crate::html_content::{TextChunk, TextType}; use crate::parser::{ AttributeBuffer, Lexeme, LexemeSink, NonTagContentLexeme, ParserDirective, ParserOutputSink, TagHintSink, TagLexeme, TagTokenOutline, }; -use crate::rewritable_units::{ - DocumentEnd, Serialize, ToToken, Token, TokenCaptureFlags, TokenCapturer, TokenCapturerEvent, -}; +use crate::rewritable_units::TextDecoder; +use crate::rewritable_units::ToTokenResult; +use crate::rewritable_units::{DocumentEnd, Serialize, ToToken, Token, TokenCaptureFlags}; use crate::rewriter::RewritingError; - -use TagTokenOutline::{EndTag, StartTag}; +use encoding_rs::Encoding; pub(crate) struct AuxStartTagInfo<'i> { pub input: &'i Bytes<'i>, @@ -67,45 +67,30 @@ impl OutputSink for F { } // Pub only for integration tests -pub struct Dispatcher -where - C: TransformController, - O: OutputSink, -{ +pub struct Dispatcher { + delegate: DispatcherDelegate, + text_decoder: TextDecoder, + last_text_type: TextType, + got_flags_from_hint: bool, + pending_element_aux_info_req: Option>, + encoding: SharedEncoding, +} + +/// Fields split out of `Dispatcher` for borrow checking of event handlers +struct DispatcherDelegate { transform_controller: C, output_sink: O, remaining_content_start: usize, - token_capturer: TokenCapturer, - got_flags_from_hint: bool, - pending_element_aux_info_req: Option>, + capture_flags: TokenCaptureFlags, emission_enabled: bool, - encoding: SharedEncoding, } -impl Dispatcher +impl DispatcherDelegate where C: TransformController, O: OutputSink, { - pub fn new(transform_controller: C, output_sink: O, encoding: SharedEncoding) -> Self { - let initial_capture_flags = 
transform_controller.initial_capture_flags(); - - Self { - transform_controller, - output_sink, - remaining_content_start: 0, - token_capturer: TokenCapturer::new( - initial_capture_flags, - SharedEncoding::clone(&encoding), - ), - got_flags_from_hint: false, - pending_element_aux_info_req: None, - emission_enabled: true, - encoding, - } - } - - pub fn flush_remaining_input(&mut self, input: &[u8], consumed_byte_count: usize) { + fn flush_remaining_input(&mut self, input: &[u8], consumed_byte_count: usize) { let output = &input[self.remaining_content_start..consumed_byte_count]; if self.emission_enabled && !output.is_empty() { @@ -115,10 +100,10 @@ where self.remaining_content_start = 0; } - pub fn finish(&mut self, input: &[u8]) -> Result<(), RewritingError> { + fn finish(&mut self, encoding: &'static Encoding, input: &[u8]) -> Result<(), RewritingError> { self.flush_remaining_input(input, input.len()); - let mut document_end = DocumentEnd::new(&mut self.output_sink, self.encoding.get()); + let mut document_end = DocumentEnd::new(&mut self.output_sink, encoding); self.transform_controller.handle_end(&mut document_end)?; @@ -128,6 +113,88 @@ where Ok(()) } + /// Returns offset to the end of the consumed range + #[inline(never)] + fn lexeme_consumed(&mut self, lexeme: &Lexeme<'_, T>) -> usize { + let lexeme_range = lexeme.raw_range(); + + let chunk_range = Range { + start: self.remaining_content_start, + end: lexeme_range.start, + }; + + let chunk = lexeme.input().slice(chunk_range); + + if self.emission_enabled && chunk.len() > 0 { + self.output_sink.handle_chunk(&chunk); + } + + lexeme_range.end + } + + #[inline] + fn token_produced(&mut self, mut token: Token<'_>) -> Result<(), RewritingError> { + trace!(@output token); + + self.transform_controller.handle_token(&mut token)?; + + if self.emission_enabled { + token.into_bytes(&mut |c| self.output_sink.handle_chunk(c))?; + } + Ok(()) + } + + fn text_token_produced( + &mut self, + text: &str, + encoding: &'static 
Encoding, + text_type: TextType, + is_last_in_node: bool, + ) -> Result<(), RewritingError> { + let mut token = + Token::TextChunk(TextChunk::new(text, text_type, is_last_in_node, encoding)); + + trace!(@output token); + + self.transform_controller.handle_token(&mut token)?; + + if self.emission_enabled { + token.into_bytes(&mut |c| self.output_sink.handle_chunk(c))?; + } + Ok(()) + } + + #[inline] + fn should_stop_removing_element_content(&self) -> bool { + !self.emission_enabled && self.transform_controller.should_emit_content() + } +} + +impl Dispatcher +where + C: TransformController, + O: OutputSink, +{ + pub fn new(transform_controller: C, output_sink: O, encoding: SharedEncoding) -> Self { + let capture_flags = transform_controller.initial_capture_flags(); + + Self { + delegate: DispatcherDelegate { + transform_controller, + output_sink, + capture_flags, + remaining_content_start: 0, + emission_enabled: true, + }, + text_decoder: TextDecoder::new(SharedEncoding::clone(&encoding)), + last_text_type: TextType::Data, + encoding, + got_flags_from_hint: false, + pending_element_aux_info_req: None, + } + } + + #[inline(never)] fn try_produce_token_from_lexeme<'i, T>( &mut self, lexeme: &Lexeme<'i, T>, @@ -135,50 +202,61 @@ where where Lexeme<'i, T>: ToToken, { - let transform_controller = &mut self.transform_controller; - let output_sink = &mut self.output_sink; - let emission_enabled = self.emission_enabled; - let lexeme_range = lexeme.raw_range(); - let remaining_content_start = self.remaining_content_start; - let mut lexeme_consumed = false; - - self.token_capturer.feed(lexeme, |event| { - match event { - TokenCapturerEvent::LexemeConsumed => { - let chunk = lexeme.input().slice(Range { - start: remaining_content_start, - end: lexeme_range.start, + let lexeme_consumed_end; + + match lexeme.to_token(&mut self.delegate.capture_flags, self.encoding.get()) { + ToTokenResult::Token(token) => { + self.text_decoder + .flush_pending(&mut |text, is_last, encoding| { + 
self.delegate.text_token_produced( + text, + encoding, + self.last_text_type, + is_last, + ) + })?; + lexeme_consumed_end = self.delegate.lexeme_consumed(lexeme); + self.delegate.token_produced(token)?; + } + ToTokenResult::Text(text_type) => { + lexeme_consumed_end = self.delegate.lexeme_consumed(lexeme); + + self.last_text_type = text_type; + self.text_decoder.feed_text( + &lexeme.raw(), + false, + &mut |text, is_last, encoding| { + self.delegate.text_token_produced( + text, + encoding, + self.last_text_type, + is_last, + ) + }, + )?; + } + ToTokenResult::None => { + return self + .text_decoder + .flush_pending(&mut |text, is_last, encoding| { + self.delegate.text_token_produced( + text, + encoding, + self.last_text_type, + is_last, + ) }); - - lexeme_consumed = true; - - if emission_enabled && chunk.len() > 0 { - output_sink.handle_chunk(&chunk); - } - } - TokenCapturerEvent::TokenProduced(mut token) => { - trace!(@output token); - - transform_controller.handle_token(&mut token)?; - - if emission_enabled { - token.into_bytes(&mut |c| output_sink.handle_chunk(c))?; - } - } } - Ok(()) - })?; + }; - if lexeme_consumed { - self.remaining_content_start = lexeme_range.end; - } + self.delegate.remaining_content_start = lexeme_consumed_end; Ok(()) } #[inline] const fn get_next_parser_directive(&self) -> ParserDirective { - if self.token_capturer.has_captures() { + if !self.delegate.capture_flags.is_empty() { ParserDirective::Lex } else { ParserDirective::WherePossibleScanForTagsOnly @@ -194,7 +272,7 @@ where macro_rules! get_flags_from_aux_info_res { ($handler:expr, $attributes:expr, $self_closing:expr) => { $handler( - &mut self.transform_controller, + &mut self.delegate.transform_controller, AuxStartTagInfo { input, attr_buffer: $attributes, @@ -208,7 +286,7 @@ where // NOTE: tag hint was produced for the tag, but // attributes and self closing flag were requested. 
Some(aux_info_req) => match *lexeme.token_outline() { - StartTag { + TagTokenOutline::StartTag { ref attributes, self_closing, .. @@ -221,7 +299,7 @@ where // NOTE: tag hint hasn't been produced for the tag, because // parser is not in the tag scan mode. None => match *lexeme.token_outline() { - StartTag { + TagTokenOutline::StartTag { name, name_hash, ns, @@ -230,7 +308,11 @@ where } => { let name = LocalName::new(input, name, name_hash); - match self.transform_controller.handle_start_tag(name, ns) { + match self + .delegate + .transform_controller + .handle_start_tag(name, ns) + { Ok(flags) => Ok(flags), Err(DispatcherError::InfoRequest(aux_info_req)) => { get_flags_from_aux_info_res!(aux_info_req, &attributes, self_closing) @@ -239,16 +321,16 @@ where } } - EndTag { name, name_hash } => { + TagTokenOutline::EndTag { name, name_hash } => { let name = LocalName::new(input, name, name_hash); - Ok(self.transform_controller.handle_end_tag(name)) + Ok(self.delegate.transform_controller.handle_end_tag(name)) } }, }; match capture_flags { Ok(flags) => { - self.token_capturer.set_capture_flags(flags); + self.delegate.capture_flags = flags; Ok(()) } Err(e) => Err(e), @@ -260,37 +342,27 @@ where &mut self, flags: TokenCaptureFlags, ) -> ParserDirective { - self.token_capturer.set_capture_flags(flags); + self.delegate.capture_flags = flags; self.got_flags_from_hint = true; self.get_next_parser_directive() } #[inline] fn flush_pending_captured_text(&mut self) -> Result<(), RewritingError> { - let transform_controller = &mut self.transform_controller; - let output_sink = &mut self.output_sink; - let emission_enabled = self.emission_enabled; - - self.token_capturer.flush_pending_text(&mut |event| { - if let TokenCapturerEvent::TokenProduced(mut token) = event { - trace!(@output token); - - transform_controller.handle_token(&mut token)?; - - if emission_enabled { - token.into_bytes(&mut |c| output_sink.handle_chunk(c))?; - } - } - - Ok(()) - })?; + self.text_decoder + 
.flush_pending(&mut |text, is_last, encoding| { + self.delegate + .text_token_produced(text, encoding, self.last_text_type, is_last) + }) + } - Ok(()) + pub fn flush_remaining_input(&mut self, input: &[u8], consumed_byte_count: usize) { + self.delegate + .flush_remaining_input(input, consumed_byte_count); } - #[inline] - fn should_stop_removing_element_content(&self) -> bool { - !self.emission_enabled && self.transform_controller.should_emit_content() + pub fn finish(&mut self, input: &[u8]) -> Result<(), RewritingError> { + self.delegate.finish(self.encoding.get(), input) } } @@ -313,15 +385,15 @@ where self.adjust_capture_flags_for_tag_lexeme(lexeme)?; } - if let EndTag { .. } = lexeme.token_outline() { - if self.should_stop_removing_element_content() { - self.emission_enabled = true; - self.remaining_content_start = lexeme.raw_range().start; + if let TagTokenOutline::EndTag { .. } = lexeme.token_outline() { + if self.delegate.should_stop_removing_element_content() { + self.delegate.emission_enabled = true; + self.delegate.remaining_content_start = lexeme.raw_range().start; } } self.try_produce_token_from_lexeme(lexeme)?; - self.emission_enabled = self.transform_controller.should_emit_content(); + self.delegate.emission_enabled = self.delegate.transform_controller.should_emit_content(); Ok(self.get_next_parser_directive()) } @@ -345,7 +417,11 @@ where name: LocalName<'_>, ns: Namespace, ) -> Result { - match self.transform_controller.handle_start_tag(name, ns) { + match self + .delegate + .transform_controller + .handle_start_tag(name, ns) + { Ok(flags) => { Ok(self.apply_capture_flags_from_hint_and_get_next_parser_directive(flags)) } @@ -365,13 +441,13 @@ where ) -> Result { self.flush_pending_captured_text()?; - let mut flags = self.transform_controller.handle_end_tag(name); + let mut flags = self.delegate.transform_controller.handle_end_tag(name); // NOTE: if emission was disabled (i.e. 
we've been removing element content) // we need to request the end tag lexeme, to ensure that we have it. // Otherwise, if we have unfinished end tag in the end of input we'll emit // it where we shouldn't. - if self.should_stop_removing_element_content() { + if self.delegate.should_stop_removing_element_content() { flags |= TokenCaptureFlags::NEXT_END_TAG; }