Commit adf535f

feat(libredirectionio): improve perf of html tokenizer by avoiding a double buffer
1 parent b44028b · commit adf535f

4 files changed: +32 additions, −86 deletions
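
At a glance: the old tokenizer borrowed a `&mut dyn Read` and copied everything it read into a private `buffer: Vec<u8>`, so the document ended up resident in memory twice. The new tokenizer takes ownership of a single `Vec<u8>` and indexes into it in place. A minimal, self-contained sketch of the two shapes (stand-in types, not the crate's API):

    use std::io::Read;

    // Old shape: the input ends up held twice, once by the caller and
    // once in the tokenizer's private `buffer`.
    struct OldTokenizer<'t> {
        reader: &'t mut dyn Read,
        buffer: Vec<u8>,
    }

    impl<'t> OldTokenizer<'t> {
        fn fill(&mut self) -> std::io::Result<usize> {
            self.reader.read_to_end(&mut self.buffer) // copy #2 happens here
        }
    }

    // New shape: one owned buffer, indexed directly.
    struct NewTokenizer {
        reader: Vec<u8>,
    }

    fn main() -> std::io::Result<()> {
        let html = String::from("<html></html>");

        // Before: `html` stays alive in the caller while `buffer` duplicates it.
        let mut bytes = html.as_bytes();
        let mut old = OldTokenizer { reader: &mut bytes, buffer: Vec::new() };
        old.fill()?;
        assert_eq!(old.buffer.len(), html.len()); // the bytes now exist twice

        // After: a single allocation is moved into the tokenizer.
        let new = NewTokenizer { reader: html.into_bytes() };
        assert_eq!(new.reader.len(), 13);
        Ok(())
    }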

src/filter/html_body_action/body_append.rs

Lines changed: 1 addition & 1 deletion
@@ -112,7 +112,7 @@ impl BodyAppend {
     }
 
     fn append_child(content: String, child: String) -> Result<String> {
-        let buffer = &mut content.as_bytes() as &mut dyn std::io::Read;
+        let buffer = content.as_bytes().to_vec();
         let mut tokenizer = html::Tokenizer::new(buffer);
         let mut output = "".to_string();
         let mut level = 0;

src/filter/html_body_action/body_prepend.rs

Lines changed: 1 addition & 1 deletion
@@ -107,7 +107,7 @@ impl BodyPrepend {
     }
 
     fn prepend_child(content: String, child: String) -> Result<String> {
-        let buffer = &mut content.as_bytes() as &mut dyn std::io::Read;
+        let buffer = content.as_bytes().to_vec();
         let mut tokenizer = html::Tokenizer::new(buffer);
         let mut output = "".to_string();
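
Both hunks above make the same one-line swap: instead of coercing `content.as_bytes()` into a `&mut dyn std::io::Read` trait object, they copy the bytes once with `to_vec()` and move the resulting `Vec<u8>` into the tokenizer. A sketch of the pattern with a stub `Tokenizer` (not the crate's); as a design note, if `content` were not needed after tokenizing, `String::into_bytes()` would hand over the existing allocation without any copy:

    // Stub standing in for html::Tokenizer's new by-value constructor.
    struct Tokenizer {
        reader: Vec<u8>,
    }

    impl Tokenizer {
        fn new(reader: Vec<u8>) -> Tokenizer {
            Tokenizer { reader }
        }
    }

    fn tokenize_copy(content: &str) -> Tokenizer {
        // One explicit copy; the caller keeps `content` for later use,
        // as append_child and prepend_child do with their argument.
        Tokenizer::new(content.as_bytes().to_vec())
    }

    fn tokenize_move(content: String) -> Tokenizer {
        // Zero-copy alternative when the String is no longer needed.
        Tokenizer::new(content.into_bytes())
    }

    fn main() {
        let t1 = tokenize_copy("<body></body>");
        let t2 = tokenize_move("<body></body>".to_string());
        assert_eq!(t1.reader, t2.reader);
    }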

src/filter/html_filter_body.rs

Lines changed: 1 addition & 3 deletions
@@ -3,7 +3,6 @@ use crate::filter::error::Result;
 use crate::filter::html_body_action::HtmlBodyVisitor;
 use crate::html;
 use std::collections::HashSet;
-use std::io::Cursor;
 
 #[derive(Debug)]
 struct BufferLink {
@@ -59,8 +58,7 @@ impl HtmlFilterBodyAction {
         let mut data = self.last_buffer.clone();
         data.extend(input);
 
-        let mut cursor = Cursor::new(data);
-        let mut tokenizer = html::Tokenizer::new(&mut cursor);
+        let mut tokenizer = html::Tokenizer::new(data);
         let mut to_return = "".to_string();
 
         loop {
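
With the tokenizer taking `Vec<u8>` by value, the `Cursor` adapter (whose only job was to make a `Vec<u8>` look like a `Read`) disappears along with its import. The chunked-processing pattern is otherwise unchanged: leftovers from the previous chunk are prepended to the new input, and whatever cannot be tokenized yet presumably becomes the next call's `last_buffer` via the `buffered()` tail shown in src/html/mod.rs below. A self-contained sketch of that loop with stub types; the placeholder `run` stands in for the real HTML state machine:

    // Stub tokenizer: owns its input and tracks how far it has consumed,
    // like `raw.end` in the real one.
    struct Tokenizer {
        reader: Vec<u8>,
        consumed: usize,
    }

    impl Tokenizer {
        fn new(reader: Vec<u8>) -> Tokenizer {
            Tokenizer { reader, consumed: 0 }
        }

        // Placeholder "tokenize": consume input up to the last complete '>'.
        fn run(&mut self) {
            if let Some(i) = self.reader.iter().rposition(|&b| b == b'>') {
                self.consumed = i + 1;
            }
        }

        // Unconsumed tail, mirroring the diff: self.reader[self.raw.end..]
        fn buffered(&self) -> Vec<u8> {
            self.reader[self.consumed..].to_vec()
        }
    }

    fn main() {
        let mut last_buffer: Vec<u8> = Vec::new();

        for chunk in [&b"<p>one</p><di"[..], &b"v>two</div>"[..]] {
            let mut data = last_buffer.clone(); // prepend last pass's leftovers
            data.extend(chunk);

            let mut tokenizer = Tokenizer::new(data); // moved in, no Cursor
            tokenizer.run();

            last_buffer = tokenizer.buffered(); // "<di" after the first chunk
        }

        assert!(last_buffer.is_empty());
    }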

src/html/mod.rs

Lines changed: 29 additions & 81 deletions
@@ -3,7 +3,6 @@ mod error;
 use crate::html::TokenType::{CommentToken, DoctypeToken, EndTagToken, ErrorToken, SelfClosingTagToken, StartTagToken, TextToken};
 pub use error::HtmlParseError;
 use error::Result;
-use std::io::Read;
 use std::string::ToString;
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
@@ -49,13 +48,11 @@ struct Span {
     end: usize,
 }
 
-pub struct Tokenizer<'t> {
-    reader: &'t mut dyn Read,
+pub struct Tokenizer {
+    reader: Vec<u8>,
     token: TokenType,
     err: Option<Error>,
     raw: Span,
-    buffer: Vec<u8>,
-    max_buffer: usize,
     data: Span,
     pending_attribute: [Span; 2],
     attribute: Vec<[Span; 2]>,
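
Every `Span { start, end }` above is a pair of byte offsets into that single owned buffer, which is why most of the remaining diff is a mechanical rename of `self.buffer[..]` to `self.reader[..]`: scanning only advances offsets, and text is materialized on demand. An illustrative sketch of the idea (not the crate's code):

    struct Span {
        start: usize,
        end: usize,
    }

    fn main() {
        let reader: Vec<u8> = b"<p class=\"x\">hello</p>".to_vec();
        let data = Span { start: 13, end: 18 }; // byte range of "hello"

        // Scanning never copies token bytes; text is materialized on demand,
        // just like text() and tag_name() in the hunks below:
        let s = String::from_utf8(reader[data.start..data.end].to_vec()).unwrap();
        assert_eq!(s, "hello");
    }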
@@ -105,19 +102,17 @@ impl ToString for Token {
     }
 }
 
-impl<'t> Tokenizer<'t> {
-    pub fn new(reader: &'t mut dyn Read) -> Tokenizer {
+impl Tokenizer {
+    pub fn new(reader: Vec<u8>) -> Tokenizer {
         Tokenizer::new_fragment(reader, "".to_string())
     }
 
-    pub fn new_fragment(reader: &'t mut dyn Read, mut context_tag: String) -> Tokenizer {
+    pub fn new_fragment(reader: Vec<u8>, mut context_tag: String) -> Tokenizer {
         let mut tokenizer = Tokenizer {
             reader,
             token: TokenType::NoneToken,
             err: None,
             raw: Span { start: 0, end: 0 },
-            buffer: Vec::new(),
-            max_buffer: 0,
             data: Span { start: 0, end: 0 },
             pending_attribute: [Span { start: 0, end: 0 }, Span { start: 0, end: 0 }],
             attribute: Vec::new(),
@@ -150,14 +145,6 @@ impl<'t> Tokenizer<'t> {
         self.allow_cdata = allow_cdata;
     }
 
-    pub fn buffered(&self) -> Vec<u8> {
-        self.buffer[self.raw.end..].to_vec()
-    }
-
-    pub fn buffered_as_string(&self) -> Result<String> {
-        Ok(String::from_utf8(self.buffered())?)
-    }
-
     #[allow(clippy::should_implement_trait)]
     pub fn next(&mut self) -> Result<TokenType> {
         self.raw.start = self.raw.end;
@@ -300,8 +287,16 @@ impl<'t> Tokenizer<'t> {
         Ok(self.token)
     }
 
+    pub fn buffered(&self) -> Vec<u8> {
+        self.reader[self.raw.end..].to_vec()
+    }
+
+    pub fn buffered_as_string(&self) -> Result<String> {
+        Ok(String::from_utf8(self.buffered())?)
+    }
+
     pub fn raw(&self) -> Vec<u8> {
-        self.buffer[self.raw.start..self.raw.end].to_vec()
+        self.reader[self.raw.start..self.raw.end].to_vec()
     }
 
     pub fn raw_as_string(&self) -> Result<String> {
@@ -311,7 +306,7 @@ impl<'t> Tokenizer<'t> {
     pub fn text(&mut self) -> Result<Option<String>> {
         match self.token {
             TextToken | CommentToken | DoctypeToken => {
-                let mut s = String::from_utf8(self.buffer[self.data.start..self.data.end].to_vec())?;
+                let mut s = String::from_utf8(self.reader[self.data.start..self.data.end].to_vec())?;
 
                 self.data.start = self.raw.end;
                 self.data.end = self.raw.end;
@@ -330,7 +325,7 @@ impl<'t> Tokenizer<'t> {
         if self.data.start < self.data.end {
             match self.token {
                 StartTagToken | EndTagToken | SelfClosingTagToken => {
-                    let s = String::from_utf8(self.buffer[self.data.start..self.data.end].to_vec())?;
+                    let s = String::from_utf8(self.reader[self.data.start..self.data.end].to_vec())?;
 
                     self.data.start = self.raw.end;
                     self.data.end = self.raw.end;
@@ -351,8 +346,8 @@ impl<'t> Tokenizer<'t> {
         let attr = &self.attribute[self.number_attribute_returned];
         self.number_attribute_returned += 1;
 
-        let key = String::from_utf8(self.buffer[attr[0].start..attr[0].end].to_vec())?;
-        let val = String::from_utf8(self.buffer[attr[1].start..attr[1].end].to_vec())?;
+        let key = String::from_utf8(self.reader[attr[0].start..attr[0].end].to_vec())?;
+        let val = String::from_utf8(self.reader[attr[1].start..attr[1].end].to_vec())?;
 
         return Ok((
             Some(key.to_lowercase()),
@@ -400,69 +395,22 @@ impl<'t> Tokenizer<'t> {
         Ok(token)
     }
 
-    pub fn set_max_buffer(&mut self, max_buffer: usize) {
-        self.max_buffer = max_buffer;
-    }
-
     fn read_byte(&mut self) -> u8 {
-        if self.raw.end >= self.buffer.len() {
-            // let new_buffer= self.buffer[self.raw.start..self.raw.end].to_vec().clone();
-            // let start = self.raw.start;
-            //
-            // if start != 0 {
-            //     self.data.start -= start;
-            //     self.data.end -= start;
-            //     self.pending_attribute[0].start -= start;
-            //     self.pending_attribute[0].end -= start;
-            //     self.pending_attribute[1].start -= start;
-            //     self.pending_attribute[1].end -= start;
-            //
-            //     for attribute in &mut self.attribute {
-            //         attribute[0].start -= start;
-            //         attribute[0].end -= start;
-            //         attribute[1].start -= start;
-            //         attribute[1].end -= start;
-            //     }
-            // }
-
-            let mut new_byte_buffer = Vec::new();
-            let error = self.reader.read_to_end(new_byte_buffer.as_mut());
-
-            if error.is_err() {
-                self.err = Some(Error {
-                    kind: ErrorKind::ReadError,
-                    read_error: error.err(),
-                });
+        match self.reader.get(self.raw.end) {
+            Some(byte) => {
+                self.raw.end += 1;
 
-                return 0;
+                *byte
             }
-
-            if new_byte_buffer.is_empty() {
+            None => {
                 self.err = Some(Error {
                     kind: ErrorKind::EOFError,
                     read_error: None,
                 });
 
-                return 0;
+                0
             }
-
-            self.buffer.append(&mut new_byte_buffer);
         }
-
-        let byte = self.buffer[self.raw.end];
-
-        self.raw.end += 1;
-
-        if self.max_buffer > 0 && self.raw.end - self.raw.start >= self.max_buffer {
-            self.err = Some(Error {
-                kind: ErrorKind::MaxBufferError,
-                read_error: None,
-            });
-
-            return 0;
-        }
-
-        byte
     }
 
     fn skip_white_space(&mut self) {
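
This is the heart of the change: `read_byte` shrinks from lazy `read_to_end` refills, a long-dead commented-out compaction scheme, and `max_buffer` policing down to a single bounds-checked lookup. `Vec::get` returns `None` past the end, which maps directly onto the `EOFError` case; `ReadError` and `MaxBufferError` are no longer raised here because there is no reader left to fail. A standalone sketch of the idiom:

    // slice::get is a bounds-checked Option, so EOF detection needs no
    // separate length check and no refill path.
    fn read_byte(bytes: &[u8], pos: &mut usize) -> Option<u8> {
        let byte = bytes.get(*pos).copied()?; // None once past the end
        *pos += 1;
        Some(byte)
    }

    fn main() {
        let html = b"<b>";
        let mut pos = 0;

        assert_eq!(read_byte(html, &mut pos), Some(b'<'));
        assert_eq!(read_byte(html, &mut pos), Some(b'b'));
        assert_eq!(read_byte(html, &mut pos), Some(b'>'));
        assert_eq!(read_byte(html, &mut pos), None); // EOF, maps to EOFError
    }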
@@ -1081,7 +1029,7 @@ impl<'t> Tokenizer<'t> {
         }
 
         for i in 0..s.len() {
-            let mut c = self.buffer[self.data.start + i];
+            let mut c = self.reader[self.data.start + i];
 
             if c.is_ascii_uppercase() {
                 c += b'a' - b'A';
@@ -1106,7 +1054,7 @@ impl<'t> Tokenizer<'t> {
         }
 
         let mut raw = false;
-        let mut byte = self.buffer[self.data.start];
+        let mut byte = self.reader[self.data.start];
 
         if byte.is_ascii_uppercase() {
             byte += b'a' - b'A';
@@ -1137,10 +1085,10 @@ impl<'t> Tokenizer<'t> {
         }
 
         if raw {
-            self.raw_tag = String::from_utf8(self.buffer[self.data.start..self.data.end].to_vec())?.to_lowercase();
+            self.raw_tag = String::from_utf8(self.reader[self.data.start..self.data.end].to_vec())?.to_lowercase();
        }
 
-        if self.err.is_none() && self.buffer[self.raw.end - 2] == b'/' {
+        if self.err.is_none() && self.reader[self.raw.end - 2] == b'/' {
             return Ok(SelfClosingTagToken);
         }
 
@@ -1334,7 +1282,7 @@ mod tests {
     #[test]
     fn $name() {
         let (html, golden) = $value;
-        let reader = &mut html.as_bytes() as &mut dyn std::io::Read;
+        let reader = html.as_bytes().to_vec();
        let mut tokenizer = Tokenizer::new(reader);
 
        if !golden.is_empty() {
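
For orientation, driving the changed API end to end would look roughly like this. Hedged: `Tokenizer::new`, `next()`, and `raw_as_string()` appear in the diff above, but the import path, the loop, and the assumption that EOF surfaces as `ErrorToken` are illustrative, not code from the repository:

    use crate::html::{self, TokenType}; // assumed in-crate import path

    fn passthrough(input: &str) -> String {
        let mut tokenizer = html::Tokenizer::new(input.as_bytes().to_vec());
        let mut output = String::new();

        loop {
            match tokenizer.next() {
                // read_byte sets ErrorKind::EOFError once the Vec is
                // exhausted; we assume that surfaces here as ErrorToken.
                Ok(TokenType::ErrorToken) | Err(_) => break,
                Ok(_) => output.push_str(&tokenizer.raw_as_string().unwrap_or_default()),
            }
        }

        output
    }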
