|
1 | | -const STATE_PLAINTEXT = Symbol('plaintext'); |
2 | | -const STATE_HTML = Symbol('html'); |
3 | | -const STATE_COMMENT = Symbol('comment'); |
4 | | - |
5 | | -// eslint-disable-next-line @typescript-eslint/ban-types |
| 1 | +const STATE_PLAINTEXT = 0; |
| 2 | +const STATE_HTML = 1; |
| 3 | +const STATE_COMMENT = 2; |
| 4 | + |
| 5 | +const CHAR_LT = 60; // '<' |
| 6 | +const CHAR_GT = 62; // '>' |
| 7 | +const CHAR_QUOTE = 34; // '"' |
| 8 | +const CHAR_APOS = 39; // "'" |
| 9 | +const CHAR_DASH = 45; // '-' |
| 10 | +const CHAR_SPACE = 32; // ' ' |
| 11 | +const CHAR_NEWLINE = 10; // '\n' |
| 12 | +const CHAR_EXCLAIM = 33; // '!' |
| 13 | + |
| 14 | +// eslint-disable-next-line @typescript-eslint/no-wrapper-object-types |
6 | 15 | function striptags(html: string | String = '') { |
7 | 16 | // if not string, then safely return an empty string |
8 | 17 | if (typeof html !== 'string' && !(html instanceof String)) { |
9 | 18 | return ''; |
10 | 19 | } |
11 | 20 |
|
12 | 21 | let state = STATE_PLAINTEXT; |
13 | | - let tag_buffer = ''; |
14 | 22 | let depth = 0; |
15 | | - let in_quote_char = ''; |
| 23 | + let in_quote_char = 0; |
16 | 24 | let output = ''; |
| 25 | + let tag_start = -1; |
| 26 | + let plain_text_start = 0; |
17 | 27 |
|
18 | 28 | const { length } = html; |
19 | 29 |
|
20 | 30 | for (let idx = 0; idx < length; idx++) { |
21 | | - const char = html[idx]; |
| 31 | + const charCode = html.charCodeAt(idx); |
22 | 32 |
|
23 | 33 | if (state === STATE_PLAINTEXT) { |
24 | | - switch (char) { |
25 | | - case '<': |
26 | | - state = STATE_HTML; |
27 | | - tag_buffer = tag_buffer + char; |
28 | | - break; |
29 | | - |
30 | | - default: |
31 | | - output += char; |
32 | | - break; |
| 34 | + if (charCode === CHAR_LT) { |
| 35 | + output += html.slice(plain_text_start, idx); |
| 36 | + state = STATE_HTML; |
| 37 | + tag_start = idx; |
33 | 38 | } |
34 | 39 | } else if (state === STATE_HTML) { |
35 | | - switch (char) { |
36 | | - case '<': |
37 | | - // ignore '<' if inside a quote |
38 | | - if (in_quote_char) break; |
39 | | - |
40 | | - // we're seeing a nested '<' |
41 | | - depth++; |
42 | | - break; |
43 | | - |
44 | | - case '>': |
45 | | - // ignore '>' if inside a quote |
46 | | - if (in_quote_char) { |
47 | | - break; |
48 | | - } |
49 | | - |
50 | | - // something like this is happening: '<<>>' |
| 40 | + if (charCode === CHAR_LT) { |
| 41 | + // ignore '<' if inside a quote |
| 42 | + if (!in_quote_char) depth++; |
| 43 | + } else if (charCode === CHAR_GT) { |
| 44 | + // ignore '>' if inside a quote |
| 45 | + if (!in_quote_char) { |
51 | 46 | if (depth) { |
52 | 47 | depth--; |
53 | | - |
54 | | - break; |
55 | | - } |
56 | | - |
57 | | - // this is closing the tag in tag_buffer |
58 | | - in_quote_char = ''; |
59 | | - state = STATE_PLAINTEXT; |
60 | | - // tag_buffer += '>'; |
61 | | - |
62 | | - tag_buffer = ''; |
63 | | - break; |
64 | | - |
65 | | - case '"': |
66 | | - case '\'': |
67 | | - // catch both single and double quotes |
68 | | - |
69 | | - if (char === in_quote_char) { |
70 | | - in_quote_char = ''; |
71 | 48 | } else { |
72 | | - in_quote_char = in_quote_char || char; |
73 | | - } |
74 | | - |
75 | | - tag_buffer = tag_buffer + char; |
76 | | - break; |
77 | | - |
78 | | - case '-': |
79 | | - if (tag_buffer === '<!-') { |
80 | | - state = STATE_COMMENT; |
81 | | - } |
82 | | - |
83 | | - tag_buffer = tag_buffer + char; |
84 | | - break; |
85 | | - |
86 | | - case ' ': |
87 | | - case '\n': |
88 | | - if (tag_buffer === '<') { |
| 49 | + // this '>' closes the tag that began at tag_start |
| 50 | + in_quote_char = 0; |
89 | 51 | state = STATE_PLAINTEXT; |
90 | | - output += '< '; |
91 | | - tag_buffer = ''; |
92 | | - |
93 | | - break; |
| 52 | + tag_start = -1; |
| 53 | + plain_text_start = idx + 1; |
94 | 54 | } |
95 | | - |
96 | | - tag_buffer = tag_buffer + char; |
97 | | - break; |
98 | | - |
99 | | - default: |
100 | | - tag_buffer = tag_buffer + char; |
101 | | - break; |
| 55 | + } |
| 56 | + } else if (charCode === CHAR_QUOTE || charCode === CHAR_APOS) { |
| 57 | + // catch both single and double quotes |
| 58 | + |
| 59 | + if (charCode === in_quote_char) { |
| 60 | + in_quote_char = 0; |
| 61 | + } else { |
| 62 | + in_quote_char = in_quote_char || charCode; |
| 63 | + } |
| 64 | + } else if (charCode === CHAR_DASH) { |
| 65 | + // same as if (html.slice(tag_start, idx) === '<!-') { |
| 66 | + if (idx - tag_start === 3 |
| 67 | + && html.charCodeAt(tag_start + 1) === CHAR_EXCLAIM |
| 68 | + && html.charCodeAt(tag_start + 2) === CHAR_DASH |
| 69 | + ) { |
| 70 | + state = STATE_COMMENT; |
| 71 | + } |
| 72 | + } else if (charCode === CHAR_SPACE || charCode === CHAR_NEWLINE) { |
| 73 | + // same as if (html.slice(tag_start, idx) === '<') { |
| 74 | + if (idx - tag_start === 1) { |
| 75 | + state = STATE_PLAINTEXT; |
| 76 | + output += '< '; |
| 77 | + tag_start = -1; |
| 78 | + plain_text_start = idx + 1; |
| 79 | + } |
102 | 80 | } |
103 | 81 | } else if (state === STATE_COMMENT) { |
104 | | - switch (char) { |
105 | | - case '>': |
106 | | - if (tag_buffer.slice(-2) === '--') { |
107 | | - // close the comment |
108 | | - state = STATE_PLAINTEXT; |
109 | | - } |
110 | | - |
111 | | - tag_buffer = ''; |
112 | | - break; |
113 | | - |
114 | | - default: |
115 | | - tag_buffer = tag_buffer + char; |
116 | | - break; |
| 82 | + if (charCode === CHAR_GT) { |
| 83 | + // same as if (html.slice(idx - 2, idx) === '--') { |
| 84 | + if (idx >= 2 |
| 85 | + && html.charCodeAt(idx - 1) === CHAR_DASH |
| 86 | + && html.charCodeAt(idx - 2) === CHAR_DASH) { |
| 87 | + // close the comment |
| 88 | + state = STATE_PLAINTEXT; |
| 89 | + plain_text_start = idx + 1; |
| 90 | + } |
| 91 | + tag_start = -1; |
117 | 92 | } |
118 | 93 | } |
119 | 94 | } |
120 | 95 |
|
| 96 | + if (state === STATE_PLAINTEXT && plain_text_start < length) { |
| 97 | + output += html.slice(plain_text_start); |
| 98 | + } |
| 99 | + |
121 | 100 | return output; |
122 | 101 | } |
123 | 102 |
|
|
0 commit comments