@@ -41,6 +41,9 @@ pub fn html_to_text(input: &str) -> String {
4141
4242 let mut in_tag = false ;
4343 let mut in_head = false ;
44+ let mut in_style = false ;
45+ let mut in_script = false ;
46+ let mut in_template = false ;
4447 let mut in_comment = false ;
4548
4649 let mut is_token_start = true ;
@@ -58,7 +61,13 @@ pub fn html_to_text(input: &str) -> String {
5861 if !in_comment {
5962 match ch {
6063 b'<' => {
61- if !in_tag && !in_head && !is_token_start {
64+ if !in_tag
65+ && !in_head
66+ && !in_style
67+ && !in_script
68+ && !in_template
69+ && !is_token_start
70+ {
6271 add_html_token (
6372 & mut result,
6473 & input[ token_start..token_end + 1 ] ,
@@ -87,6 +96,15 @@ pub fn html_to_text(input: &str) -> String {
8796 Some ( tag) if tag. eq_ignore_ascii_case ( b"head" ) => {
8897 in_head = !is_tag_close;
8998 }
99+ Some ( tag) if tag. eq_ignore_ascii_case ( b"style" ) => {
100+ in_style = !is_tag_close;
101+ }
102+ Some ( tag) if tag. eq_ignore_ascii_case ( b"script" ) => {
103+ in_script = !is_tag_close;
104+ }
105+ Some ( tag) if tag. eq_ignore_ascii_case ( b"template" ) => {
106+ in_template = !is_tag_close;
107+ }
90108 _ => ( ) ,
91109 }
92110 }
@@ -109,7 +127,7 @@ pub fn html_to_text(input: &str) -> String {
109127 }
110128 }
111129 b' ' | b'\t' | b'\r' | b'\n' => {
112- if !in_tag && !in_head {
130+ if !in_tag && !in_head && !in_style && !in_script && !in_template {
113131 if !is_token_start {
114132 add_html_token (
115133 & mut result,
@@ -126,6 +144,9 @@ pub fn html_to_text(input: &str) -> String {
126144 continue ;
127145 }
128146 b'&' if !in_tag && !is_token_start && !in_head => {
147+ if in_style || in_script || in_template {
148+ continue ;
149+ }
129150 add_html_token (
130151 & mut result,
131152 & input[ token_start..token_end + 1 ] ,
@@ -136,6 +157,9 @@ pub fn html_to_text(input: &str) -> String {
136157 is_after_space = false ;
137158 }
138159 b';' if !in_tag && !is_token_start && !in_head => {
160+ if in_style || in_script || in_template {
161+ continue ;
162+ }
139163 add_html_token (
140164 & mut result,
141165 & input[ token_start..pos + 1 ] ,
@@ -170,7 +194,7 @@ pub fn html_to_text(input: &str) -> String {
170194 }
171195 }
172196
173- if !in_tag && !is_token_start && !in_head {
197+ if !in_tag && !is_token_start && !in_head && !in_style && !in_script && !in_template {
174198 add_html_token (
175199 & mut result,
176200 & input[ token_start..token_end + 1 ] ,
@@ -2417,4 +2441,41 @@ mod tests {
24172441 assert_eq ! ( result, input. 1 , "Failed for '{:?}" , input. 0 ) ;
24182442 }
24192443 }
2444+
/// The body of a `<style>` element must not appear in the extracted text,
/// while the text of a `<div>` that follows it must still be present.
#[test]
fn html_to_text_removes_style_content() {
    let text = html_to_text("<style>body{color:red}</style><div>Hello</div>");

    // Visible text is kept.
    assert!(text.contains("Hello"));
    // The stylesheet source is dropped.
    assert!(!text.contains("body{color:red}"));
}
2452+
/// The bodies of `<style>` and `<script>` elements must not appear in the
/// extracted text.
///
/// Hidden elements are interleaved with visible ones, so the test also
/// checks that text following a closing `</style>` or `</script>` is still
/// emitted.
#[test]
fn html_to_text_removes_script_content() {
    let html = [
        "<style>body{color:red}</style>",
        "<div>Hello</div>",
        "<script>alert('x')</script>",
        "<div>World</div>",
    ]
    .concat();
    let text = html_to_text(&html);

    // Visible text on either side of the script is kept.
    assert!(text.contains("Hello"));
    assert!(text.contains("World"));

    // Neither the stylesheet nor the script source is present.
    assert!(!text.contains("body{color:red}"));
    assert!(!text.contains("alert('x')"));
}
2467+
/// Everything inside a `<template>` element must be absent from the
/// extracted text, including a nested `<div>` and a nested `<style>`.
/// Text before and after the template must still be present.
#[test]
fn html_to_text_removes_template_content() {
    let html = [
        "<div>Hello</div>",
        "<template><div>Hidden</div><style>.x{}</style></template>",
        "<div>World</div>",
    ]
    .concat();
    let text = html_to_text(&html);

    // Text surrounding the template is kept.
    assert!(text.contains("Hello"));
    assert!(text.contains("World"));

    // Nothing nested inside the template is present.
    assert!(!text.contains("Hidden"));
    assert!(!text.contains(".x{}"));
}
24202481}
0 commit comments