Skip to content

Commit ebd2b17

Browse files
authored
fix(html): skip style/script/template blocks in html_to_text (#134)
1 parent af44d48 commit ebd2b17

File tree

1 file changed

+64
-3
lines changed

1 file changed

+64
-3
lines changed

src/decoders/html.rs

Lines changed: 64 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@ pub fn html_to_text(input: &str) -> String {
4141

4242
let mut in_tag = false;
4343
let mut in_head = false;
44+
let mut in_style = false;
45+
let mut in_script = false;
46+
let mut in_template = false;
4447
let mut in_comment = false;
4548

4649
let mut is_token_start = true;
@@ -58,7 +61,13 @@ pub fn html_to_text(input: &str) -> String {
5861
if !in_comment {
5962
match ch {
6063
b'<' => {
61-
if !in_tag && !in_head && !is_token_start {
64+
if !in_tag
65+
&& !in_head
66+
&& !in_style
67+
&& !in_script
68+
&& !in_template
69+
&& !is_token_start
70+
{
6271
add_html_token(
6372
&mut result,
6473
&input[token_start..token_end + 1],
@@ -87,6 +96,15 @@ pub fn html_to_text(input: &str) -> String {
8796
Some(tag) if tag.eq_ignore_ascii_case(b"head") => {
8897
in_head = !is_tag_close;
8998
}
99+
Some(tag) if tag.eq_ignore_ascii_case(b"style") => {
100+
in_style = !is_tag_close;
101+
}
102+
Some(tag) if tag.eq_ignore_ascii_case(b"script") => {
103+
in_script = !is_tag_close;
104+
}
105+
Some(tag) if tag.eq_ignore_ascii_case(b"template") => {
106+
in_template = !is_tag_close;
107+
}
90108
_ => (),
91109
}
92110
}
@@ -109,7 +127,7 @@ pub fn html_to_text(input: &str) -> String {
109127
}
110128
}
111129
b' ' | b'\t' | b'\r' | b'\n' => {
112-
if !in_tag && !in_head {
130+
if !in_tag && !in_head && !in_style && !in_script && !in_template {
113131
if !is_token_start {
114132
add_html_token(
115133
&mut result,
@@ -126,6 +144,9 @@ pub fn html_to_text(input: &str) -> String {
126144
continue;
127145
}
128146
b'&' if !in_tag && !is_token_start && !in_head => {
147+
if in_style || in_script || in_template {
148+
continue;
149+
}
129150
add_html_token(
130151
&mut result,
131152
&input[token_start..token_end + 1],
@@ -136,6 +157,9 @@ pub fn html_to_text(input: &str) -> String {
136157
is_after_space = false;
137158
}
138159
b';' if !in_tag && !is_token_start && !in_head => {
160+
if in_style || in_script || in_template {
161+
continue;
162+
}
139163
add_html_token(
140164
&mut result,
141165
&input[token_start..pos + 1],
@@ -170,7 +194,7 @@ pub fn html_to_text(input: &str) -> String {
170194
}
171195
}
172196

173-
if !in_tag && !is_token_start && !in_head {
197+
if !in_tag && !is_token_start && !in_head && !in_style && !in_script && !in_template {
174198
add_html_token(
175199
&mut result,
176200
&input[token_start..token_end + 1],
@@ -2417,4 +2441,41 @@ mod tests {
24172441
assert_eq!(result, input.1, "Failed for '{:?}", input.0);
24182442
}
24192443
}
2444+
2445+
// Regression test: the CSS text inside a <style> element must not show up in
// the output of html_to_text, while the text of the following <div> must.
#[test]
2446+
fn html_to_text_removes_style_content() {
2447+
let input = "<style>body{color:red}</style><div>Hello</div>";
2448+
let output = html_to_text(input);
2449+
// The stylesheet body is checked as a substring, not as an exact output match.
assert!(!output.contains("body{color:red}"));
2450+
assert!(output.contains("Hello"));
2451+
}
2452+
2453+
// Regression test: with a <style> block before the first <div> and a <script>
// block between the two <div>s, neither the CSS nor the JavaScript text may
// appear in the output, and the text of both <div>s must be present.
#[test]
2454+
fn html_to_text_removes_script_content() {
2455+
let input = concat!(
2456+
"<style>body{color:red}</style>",
2457+
"<div>Hello</div>",
2458+
"<script>alert('x')</script>",
2459+
"<div>World</div>"
2460+
);
2461+
let output = html_to_text(input);
2462+
// Skipped content: checked by substring only.
assert!(!output.contains("body{color:red}"));
2463+
assert!(!output.contains("alert('x')"));
2464+
// Visible content on either side of the <script> block must survive.
assert!(output.contains("Hello"));
2465+
assert!(output.contains("World"));
2466+
}
2467+
2468+
// Regression test: everything inside a <template> element, here a nested <div>
// and a nested <style>, must be absent from the output, while the text of the
// <div>s before and after the template must be present.
#[test]
2469+
fn html_to_text_removes_template_content() {
2470+
let input = concat!(
2471+
"<div>Hello</div>",
2472+
"<template><div>Hidden</div><style>.x{}</style></template>",
2473+
"<div>World</div>"
2474+
);
2475+
let output = html_to_text(input);
2476+
// The nested </style> closes before </template>; the template's own text
// ("Hidden") and the nested stylesheet must both stay out of the output.
assert!(!output.contains("Hidden"));
2477+
assert!(!output.contains(".x{}"));
2478+
assert!(output.contains("Hello"));
2479+
assert!(output.contains("World"));
2480+
}
24202481
}

0 commit comments

Comments
 (0)