@@ -3,7 +3,6 @@ mod error;
 use crate::html::TokenType::{CommentToken, DoctypeToken, EndTagToken, ErrorToken, SelfClosingTagToken, StartTagToken, TextToken};
 pub use error::HtmlParseError;
 use error::Result;
-use std::io::Read;
 use std::string::ToString;
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
@@ -49,13 +48,11 @@ struct Span {
     end: usize,
 }
 
-pub struct Tokenizer<'t> {
-    reader: &'t mut dyn Read,
+pub struct Tokenizer {
+    reader: Vec<u8>,
     token: TokenType,
     err: Option<Error>,
     raw: Span,
-    buffer: Vec<u8>,
-    max_buffer: usize,
     data: Span,
     pending_attribute: [Span; 2],
     attribute: Vec<[Span; 2]>,
@@ -105,19 +102,17 @@ impl ToString for Token {
     }
 }
 
-impl<'t> Tokenizer<'t> {
-    pub fn new(reader: &'t mut dyn Read) -> Tokenizer {
+impl Tokenizer {
+    pub fn new(reader: Vec<u8>) -> Tokenizer {
         Tokenizer::new_fragment(reader, "".to_string())
     }
 
-    pub fn new_fragment(reader: &'t mut dyn Read, mut context_tag: String) -> Tokenizer {
+    pub fn new_fragment(reader: Vec<u8>, mut context_tag: String) -> Tokenizer {
         let mut tokenizer = Tokenizer {
             reader,
             token: TokenType::NoneToken,
             err: None,
             raw: Span { start: 0, end: 0 },
-            buffer: Vec::new(),
-            max_buffer: 0,
             data: Span { start: 0, end: 0 },
             pending_attribute: [Span { start: 0, end: 0 }, Span { start: 0, end: 0 }],
             attribute: Vec::new(),
@@ -150,14 +145,6 @@ impl<'t> Tokenizer<'t> {
         self.allow_cdata = allow_cdata;
     }
 
-    pub fn buffered(&self) -> Vec<u8> {
-        self.buffer[self.raw.end..].to_vec()
-    }
-
-    pub fn buffered_as_string(&self) -> Result<String> {
-        Ok(String::from_utf8(self.buffered())?)
-    }
-
     #[allow(clippy::should_implement_trait)]
     pub fn next(&mut self) -> Result<TokenType> {
         self.raw.start = self.raw.end;
@@ -300,8 +287,16 @@ impl<'t> Tokenizer<'t> {
         Ok(self.token)
     }
 
+    pub fn buffered(&self) -> Vec<u8> {
+        self.reader[self.raw.end..].to_vec()
+    }
+
+    pub fn buffered_as_string(&self) -> Result<String> {
+        Ok(String::from_utf8(self.buffered())?)
+    }
+
     pub fn raw(&self) -> Vec<u8> {
-        self.buffer[self.raw.start..self.raw.end].to_vec()
+        self.reader[self.raw.start..self.raw.end].to_vec()
     }
 
     pub fn raw_as_string(&self) -> Result<String> {
@@ -311,7 +306,7 @@ impl<'t> Tokenizer<'t> {
     pub fn text(&mut self) -> Result<Option<String>> {
         match self.token {
             TextToken | CommentToken | DoctypeToken => {
-                let mut s = String::from_utf8(self.buffer[self.data.start..self.data.end].to_vec())?;
+                let mut s = String::from_utf8(self.reader[self.data.start..self.data.end].to_vec())?;
 
                 self.data.start = self.raw.end;
                 self.data.end = self.raw.end;
@@ -330,7 +325,7 @@ impl<'t> Tokenizer<'t> {
         if self.data.start < self.data.end {
             match self.token {
                 StartTagToken | EndTagToken | SelfClosingTagToken => {
-                    let s = String::from_utf8(self.buffer[self.data.start..self.data.end].to_vec())?;
+                    let s = String::from_utf8(self.reader[self.data.start..self.data.end].to_vec())?;
 
                     self.data.start = self.raw.end;
                     self.data.end = self.raw.end;
@@ -351,8 +346,8 @@ impl<'t> Tokenizer<'t> {
         let attr = &self.attribute[self.number_attribute_returned];
         self.number_attribute_returned += 1;
 
-        let key = String::from_utf8(self.buffer[attr[0].start..attr[0].end].to_vec())?;
-        let val = String::from_utf8(self.buffer[attr[1].start..attr[1].end].to_vec())?;
+        let key = String::from_utf8(self.reader[attr[0].start..attr[0].end].to_vec())?;
+        let val = String::from_utf8(self.reader[attr[1].start..attr[1].end].to_vec())?;
 
         return Ok((
             Some(key.to_lowercase()),
@@ -400,69 +395,22 @@ impl<'t> Tokenizer<'t> {
         Ok(token)
     }
 
-    pub fn set_max_buffer(&mut self, max_buffer: usize) {
-        self.max_buffer = max_buffer;
-    }
-
     fn read_byte(&mut self) -> u8 {
-        if self.raw.end >= self.buffer.len() {
-            // let new_buffer = self.buffer[self.raw.start..self.raw.end].to_vec().clone();
-            // let start = self.raw.start;
-            //
-            // if start != 0 {
-            //     self.data.start -= start;
-            //     self.data.end -= start;
-            //     self.pending_attribute[0].start -= start;
-            //     self.pending_attribute[0].end -= start;
-            //     self.pending_attribute[1].start -= start;
-            //     self.pending_attribute[1].end -= start;
-            //
-            //     for attribute in &mut self.attribute {
-            //         attribute[0].start -= start;
-            //         attribute[0].end -= start;
-            //         attribute[1].start -= start;
-            //         attribute[1].end -= start;
-            //     }
-            // }
-
-            let mut new_byte_buffer = Vec::new();
-            let error = self.reader.read_to_end(new_byte_buffer.as_mut());
-
-            if error.is_err() {
-                self.err = Some(Error {
-                    kind: ErrorKind::ReadError,
-                    read_error: error.err(),
-                });
+        match self.reader.get(self.raw.end) {
+            Some(byte) => {
+                self.raw.end += 1;
 
-                return 0;
+                *byte
             }
-
-            if new_byte_buffer.is_empty() {
+            None => {
                 self.err = Some(Error {
                     kind: ErrorKind::EOFError,
                     read_error: None,
                 });
 
-                return 0;
+                0
             }
-
-            self.buffer.append(&mut new_byte_buffer);
         }
-
-        let byte = self.buffer[self.raw.end];
-
-        self.raw.end += 1;
-
-        if self.max_buffer > 0 && self.raw.end - self.raw.start >= self.max_buffer {
-            self.err = Some(Error {
-                kind: ErrorKind::MaxBufferError,
-                read_error: None,
-            });
-
-            return 0;
-        }
-
-        byte
     }
 
     fn skip_white_space(&mut self) {
@@ -1081,7 +1029,7 @@ impl<'t> Tokenizer<'t> {
         }
 
         for i in 0..s.len() {
-            let mut c = self.buffer[self.data.start + i];
+            let mut c = self.reader[self.data.start + i];
 
             if c.is_ascii_uppercase() {
                 c += b'a' - b'A';
@@ -1106,7 +1054,7 @@ impl<'t> Tokenizer<'t> {
         }
 
         let mut raw = false;
-        let mut byte = self.buffer[self.data.start];
+        let mut byte = self.reader[self.data.start];
 
         if byte.is_ascii_uppercase() {
             byte += b'a' - b'A';
@@ -1137,10 +1085,10 @@ impl<'t> Tokenizer<'t> {
         }
 
         if raw {
-            self.raw_tag = String::from_utf8(self.buffer[self.data.start..self.data.end].to_vec())?.to_lowercase();
+            self.raw_tag = String::from_utf8(self.reader[self.data.start..self.data.end].to_vec())?.to_lowercase();
         }
 
-        if self.err.is_none() && self.buffer[self.raw.end - 2] == b'/' {
+        if self.err.is_none() && self.reader[self.raw.end - 2] == b'/' {
             return Ok(SelfClosingTagToken);
         }
 
@@ -1334,7 +1282,7 @@ mod tests {
         #[test]
         fn $name() {
             let (html, golden) = $value;
-            let reader = &mut html.as_bytes() as &mut dyn std::io::Read;
+            let reader = html.as_bytes().to_vec();
             let mut tokenizer = Tokenizer::new(reader);
 
             if !golden.is_empty() {
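For context, a minimal usage sketch of the tokenizer after this commit, assuming next() surfaces the end-of-input condition recorded by read_byte as an Err (the html literal and loop below are illustrative, not part of the diff): the caller now hands the tokenizer an owned Vec<u8> instead of a &'t mut dyn Read, so the lifetime parameter and the buffer/max_buffer bookkeeping disappear.

    // Illustrative only: assumes next() returns Err once read_byte hits EOFError.
    let html = "<p class=\"intro\">Hello</p>";
    let mut tokenizer = Tokenizer::new(html.as_bytes().to_vec());

    while let Ok(token_type) = tokenizer.next() {
        // raw()/raw_as_string() slice the owned input via the raw span.
        println!("{:?}: {:?}", token_type, tokenizer.raw_as_string());
    }

The trade-off is that the whole document must be in memory up front; in exchange, read_byte becomes a bounds-checked Vec lookup instead of incremental read_to_end calls, and the span-relocation logic that was left commented out in the old read_byte is no longer needed.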