@@ -45,7 +45,7 @@ impl Lexer {
                     self.consume_n(2)?; // {{
                     let content = self.consume_until("}}")?;
                     self.consume_n(2)?; // }}
-                    TokenType::DjangoVariable(content.trim().to_string())
+                    TokenType::DjangoVariable(content)
                 }
                 '#' => {
                     self.consume_n(2)?; // {#
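With the `.trim()` call removed, `DjangoVariable` now carries the raw inner text of the tag, surrounding whitespace included. A quick illustration of the behavioral change, assuming the `Lexer`/`TokenType` API used throughout this diff (the snippet itself is not part of the change):

```rust
// Hypothetical check, not a test from this diff.
let mut lexer = Lexer::new("{{ user.name }}");
let tokens = lexer.tokenize().unwrap();
// Before this change: TokenType::DjangoVariable("user.name")
// After this change:  TokenType::DjangoVariable(" user.name ")
// Consumers that relied on the lexer trimming now have to trim themselves.
```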
@@ -54,42 +54,106 @@ impl Lexer {
                     TokenType::Comment(content, "{#".to_string(), Some("#}".to_string()))
                 }
                 _ => {
-                    self.consume()?;
-                    TokenType::Text("{".to_string())
+                    self.consume()?; // {
+                    TokenType::Text(String::from("{"))
                 }
             },
-            '\n' => {
-                self.consume()?;
-                let token = TokenType::Newline;
-                self.line += 1;
-                token
-            }
-            ' ' | '\t' | '\r' => {
-                let mut count = 1;
-                self.consume()?;
-                while let Ok(c) = self.peek() {
-                    if c != ' ' && c != '\t' && c != '\r' {
-                        break;
+
+            '<' => match self.peek_next()? {
+                '/' => {
+                    self.consume_n(2)?; // </
+                    let tag = self.consume_until(">")?;
+                    self.consume()?; // >
+                    TokenType::HtmlTagClose(tag)
+                }
+                '!' if self.matches("<!--")? => {
+                    self.consume_n(4)?; // <!--
+                    let content = self.consume_until("-->")?;
+                    self.consume_n(3)?; // -->
+                    TokenType::Comment(content, "<!--".to_string(), Some("-->".to_string()))
+                }
+                _ => {
+                    self.consume()?; // consume <
+                    let tag = self.consume_until(">")?;
+                    self.consume()?; // consume >
+                    if tag.starts_with("script") {
+                        TokenType::ScriptTagOpen(tag)
+                    } else if tag.starts_with("style") {
+                        TokenType::StyleTagOpen(tag)
+                    } else if tag.ends_with("/") {
+                        TokenType::HtmlTagVoid(tag.trim_end_matches("/").to_string())
+                    } else {
+                        TokenType::HtmlTagOpen(tag)
                     }
+                }
+            },
+
+            '/' => match self.peek_next()? {
+                '/' => {
+                    self.consume_n(2)?; // //
+                    let content = self.consume_until("\n")?;
+                    TokenType::Comment(content, "//".to_string(), None)
+                }
+                '*' => {
+                    self.consume_n(2)?; // /*
+                    let content = self.consume_until("*/")?;
+                    self.consume_n(2)?; // */
+                    TokenType::Comment(content, "/*".to_string(), Some("*/".to_string()))
+                }
+                _ => {
                     self.consume()?;
-                    count += 1;
+                    TokenType::Text("/".to_string())
+                }
+            },
+
+            c if c.is_whitespace() => {
+                if c == '\n' || c == '\r' {
+                    self.consume()?; // \r or \n
+                    if c == '\r' && self.peek()? == '\n' {
+                        self.consume()?; // \n of \r\n
+                    }
+                    TokenType::Newline
+                } else {
+                    self.consume()?; // Consume the first whitespace
+                    while !self.is_at_end() && self.peek()?.is_whitespace() {
+                        if self.peek()? == '\n' || self.peek()? == '\r' {
+                            break;
+                        }
+                        self.consume()?;
+                    }
+                    let whitespace_count = self.current - self.start;
+                    TokenType::Whitespace(whitespace_count)
                 }
-                TokenType::Whitespace(count)
             }
+
             _ => {
                 let mut text = String::new();
                 while !self.is_at_end() {
                     let c = self.peek()?;
-                    if c == '{' || c == '\n' || c == ' ' || c == '\t' || c == '\r' {
+                    if c == '{' || c == '<' || c == '\n' {
                         break;
                     }
-                    text.push(self.consume()?);
+                    text.push(c);
+                    self.consume()?;
                 }
                 TokenType::Text(text)
             }
         };

-        Ok(Token::new(token_type, self.line, Some(self.start)))
+        let token = Token::new(token_type, self.line, Some(self.start));
+
+        match self.peek_previous()? {
+            '\n' => self.line += 1,
+            '\r' => {
+                self.line += 1;
+                if self.peek()? == '\n' {
+                    self.current += 1;
+                }
+            }
+            _ => {}
+        }
+
+        Ok(token)
     }

     fn peek(&self) -> Result<char, LexerError> {
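The new `'<'` arm means plain HTML now lexes into structured tokens instead of falling through to `Text`. A rough walkthrough, again assuming the API shown in this diff:

```rust
// Hypothetical walkthrough, not a test from this diff.
let mut lexer = Lexer::new("<div class=\"card\">hi</div><br/>");
let tokens = lexer.tokenize().unwrap();
// Expected token shape, roughly:
//   HtmlTagOpen("div class=\"card\"")  (tag name and attributes, unparsed)
//   Text("hi")
//   HtmlTagClose("div")
//   HtmlTagVoid("br")                  (trailing "/" stripped)
```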
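These arms lean on helpers (`peek`, `consume`, `consume_until`, `matches`, `peek_next`, `peek_previous`) defined elsewhere in the file. A minimal sketch of the contract they appear to follow, with the struct fields and error type assumed from the call sites rather than taken from the crate:

```rust
// Sketch only: names and fields here are assumptions inferred from usage.
struct Lexer {
    source: Vec<char>,
    start: usize,   // offset where the current token began
    current: usize, // offset of the next unconsumed char
    line: usize,
}

struct LexerError; // stand-in for the crate's real error type

impl Lexer {
    // Returns the char at `current` without advancing; errors at EOF.
    fn peek(&self) -> Result<char, LexerError> {
        self.source.get(self.current).copied().ok_or(LexerError)
    }

    // Advances past the current char and returns it.
    fn consume(&mut self) -> Result<char, LexerError> {
        let c = self.peek()?;
        self.current += 1;
        Ok(c)
    }

    // True if the unconsumed input starts with `s`.
    fn matches(&self, s: &str) -> Result<bool, LexerError> {
        let rest: String = self.source[self.current..].iter().collect();
        Ok(rest.starts_with(s))
    }

    // Collects chars up to (but not including) `end`. Because `consume`
    // errors at EOF, an unterminated "{{ user.name" or "<div" bubbles up
    // as a LexerError, which is what the updated tests below assert.
    fn consume_until(&mut self, end: &str) -> Result<String, LexerError> {
        let mut out = String::new();
        while !self.matches(end)? {
            out.push(self.consume()?);
        }
        Ok(out)
    }
}
```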
@@ -246,7 +310,15 @@ mod tests {
     #[test]
     fn test_tokenize_comments() {
         let source = r#"<!-- HTML comment -->
-{# Django comment #}"#;
+{# Django comment #}
+<script>
+// JS single line comment
+/* JS multi-line
+comment */
+</script>
+<style>
+/* CSS comment */
+</style>"#;
         let mut lexer = Lexer::new(source);
         let tokens = lexer.tokenize().unwrap();
         insta::assert_yaml_snapshot!(tokens);
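Since the fixture grew, the stored YAML snapshot has to be re-recorded (for example with `cargo insta review`). A companion case exercising the `starts_with("script")` / `starts_with("style")` branches when attributes are present could look like this (hypothetical, not part of this diff):

```rust
#[test]
fn test_tokenize_script_style_with_attrs() {
    // Hypothetical: tags with attributes should still be recognized,
    // since the lexer matches starts_with("script") / starts_with("style").
    let source = r#"<script type="module">var x = 1;</script>
<style media="print">p { color: black; }</style>"#;
    let mut lexer = Lexer::new(source);
    let tokens = lexer.tokenize().unwrap();
    insta::assert_yaml_snapshot!(tokens);
}
```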
@@ -285,7 +357,7 @@ mod tests {
         assert!(Lexer::new("{{ user.name").tokenize().is_err()); // No closing }}
         assert!(Lexer::new("{% if").tokenize().is_err()); // No closing %}
         assert!(Lexer::new("{#").tokenize().is_err()); // No closing #}
-        assert!(Lexer::new("<div").tokenize().is_ok()); // No closing >, but HTML is treated as text
+        assert!(Lexer::new("<div").tokenize().is_err()); // No closing >

         // Invalid characters or syntax within tokens
         assert!(Lexer::new("{{}}").tokenize().is_ok()); // Empty but valid
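One more case worth pinning down is the `\r\n` handling, which now lives in two places (the `Newline` arm and the `peek_previous` fixup after token construction). A test along these lines (hypothetical) would catch a line counter that double-increments on CRLF input:

```rust
#[test]
fn test_tokenize_crlf_newline() {
    // Hypothetical: "\r\n" should yield a single Newline token and bump
    // `line` exactly once, matching the behavior of "\n"-only input.
    let mut lexer = Lexer::new("a\r\nb");
    let tokens = lexer.tokenize().unwrap();
    insta::assert_yaml_snapshot!(tokens);
}
```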