From 32aff62a821d4ac04ae851912a81c8efd9890c54 Mon Sep 17 00:00:00 2001 From: D-Sketon <2055272094@qq.com> Date: Fri, 3 Oct 2025 00:43:50 +0800 Subject: [PATCH 1/2] perf: faster strip_html --- lib/strip_html.ts | 153 ++++++++++++++++++++-------------------------- 1 file changed, 66 insertions(+), 87 deletions(-) diff --git a/lib/strip_html.ts b/lib/strip_html.ts index 492974ce..b279743b 100644 --- a/lib/strip_html.ts +++ b/lib/strip_html.ts @@ -2,6 +2,15 @@ const STATE_PLAINTEXT = Symbol('plaintext'); const STATE_HTML = Symbol('html'); const STATE_COMMENT = Symbol('comment'); +const CHAR_LT = 60; // '<' +const CHAR_GT = 62; // '>' +const CHAR_QUOTE = 34; // '"' +const CHAR_APOS = 39; // "'" +const CHAR_DASH = 45; // '-' +const CHAR_SPACE = 32; // ' ' +const CHAR_NEWLINE = 10; // '\n' +const CHAR_EXCLAIM = 33; // '!' + // eslint-disable-next-line @typescript-eslint/ban-types function striptags(html: string | String = '') { // if not string, then safely return an empty string @@ -10,114 +19,84 @@ function striptags(html: string | String = '') { } let state = STATE_PLAINTEXT; - let tag_buffer = ''; let depth = 0; - let in_quote_char = ''; + let in_quote_char = 0; let output = ''; + let tag_start = -1; + let plain_text_start = 0; const { length } = html; for (let idx = 0; idx < length; idx++) { - const char = html[idx]; + const charCode = html.charCodeAt(idx); if (state === STATE_PLAINTEXT) { - switch (char) { - case '<': - state = STATE_HTML; - tag_buffer = tag_buffer + char; - break; - - default: - output += char; - break; + if (charCode === CHAR_LT) { + output += html.slice(plain_text_start, idx); + state = STATE_HTML; + tag_start = idx; } } else if (state === STATE_HTML) { - switch (char) { - case '<': - // ignore '<' if inside a quote - if (in_quote_char) break; - - // we're seeing a nested '<' - depth++; - break; - - case '>': - // ignore '>' if inside a quote - if (in_quote_char) { - break; - } - - // something like this is happening: '<<>>' + if (charCode === CHAR_LT) { + // ignore '<' if inside a quote + if (!in_quote_char) depth++; + } else if (charCode === CHAR_GT) { + // ignore '>' if inside a quote + if (!in_quote_char) { if (depth) { depth--; - - break; - } - - // this is closing the tag in tag_buffer - in_quote_char = ''; - state = STATE_PLAINTEXT; - // tag_buffer += '>'; - - tag_buffer = ''; - break; - - case '"': - case '\'': - // catch both single and double quotes - - if (char === in_quote_char) { - in_quote_char = ''; } else { - in_quote_char = in_quote_char || char; - } - - tag_buffer = tag_buffer + char; - break; - - case '-': - if (tag_buffer === '': - if (tag_buffer.slice(-2) === '--') { - // close the comment - state = STATE_PLAINTEXT; - } - - tag_buffer = ''; - break; - - default: - tag_buffer = tag_buffer + char; - break; + if (charCode === CHAR_GT) { + // same as if (html.slice(idx - 2, idx) === '--') { + if (idx >= 2 + && html.charCodeAt(idx - 1) === CHAR_DASH + && html.charCodeAt(idx - 2) === CHAR_DASH) { + // close the comment + state = STATE_PLAINTEXT; + plain_text_start = idx + 1; + } + tag_start = -1; } } } + if (state === STATE_PLAINTEXT && plain_text_start < length) { + output += html.slice(plain_text_start); + } + return output; } From 62b496ffa745ed40ca0a510182ab5e5ea2a482f6 Mon Sep 17 00:00:00 2001 From: D-Sketon <2055272094@qq.com> Date: Fri, 3 Oct 2025 01:10:48 +0800 Subject: [PATCH 2/2] perf: use number as state --- lib/strip_html.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/strip_html.ts b/lib/strip_html.ts index b279743b..24a5abd6 100644 --- a/lib/strip_html.ts +++ b/lib/strip_html.ts @@ -1,6 +1,6 @@ -const STATE_PLAINTEXT = Symbol('plaintext'); -const STATE_HTML = Symbol('html'); -const STATE_COMMENT = Symbol('comment'); +const STATE_PLAINTEXT = 0; +const STATE_HTML = 1; +const STATE_COMMENT = 2; const CHAR_LT = 60; // '<' const CHAR_GT = 62; // '>'