@@ -37,30 +37,71 @@ export function isUrl(input: string): boolean {
3737 * Simple extraction that removes scripts, styles, and HTML tags
3838 */
function extractTextFromHtml(html: string): string {
  // Non-content elements are dropped whole (tag and body) before the generic
  // tag pass, so their inner text never reaches the output.
  const withoutBlocks = ['script', 'style', 'noscript'].reduce(
    (remaining, tag) => stripElementBlocks(remaining, tag),
    html
  );

  // Every remaining tag becomes a space so adjacent text nodes stay separated.
  const withoutTags = withoutBlocks.replace(/<[^>]+>/g, ' ');

  // Only presentation entities are decoded; angle brackets stay encoded.
  const decoded = decodeSafeHtmlEntities(withoutTags);

  // Collapse whitespace runs and trim the ends.
  return decoded.replace(/\s+/g, ' ').trim();
}
6356
/**
 * Remove full HTML element blocks (open tag + content + closing tag) using
 * index-based string scanning rather than a backtracking pattern.
 *
 * Tag names are matched ASCII case-insensitively and must be followed by
 * whitespace, `/`, `>` or end of input, so `<styled-box>` is not mistaken for
 * `<style>`. If an element is opened but never properly closed, everything
 * from its opening tag to the end of the input is discarded.
 *
 * @param input - HTML text to clean.
 * @param tagName - Element name to remove (e.g. `script`).
 * @returns The input with every matching element block removed.
 */
function stripElementBlocks(input: string, tagName: string): string {
  // Only A-Z is folded: a full Unicode toLowerCase() can change the string
  // length (e.g. U+0130 becomes two code units), which would make indices
  // found in the folded copy point at the wrong place in the original text.
  const asciiLower = (value: string): string =>
    value.replace(/[A-Z]/g, (ch) => ch.toLowerCase());

  // A tag name ends at whitespace, '/', '>' or end of input ('' from charAt).
  const endsTagName = (ch: string): boolean =>
    ch === '' ||
    ch === '>' ||
    ch === '/' ||
    ch === ' ' ||
    ch === '\t' ||
    ch === '\n' ||
    ch === '\r' ||
    ch === '\f';

  // First occurrence of `token` at or after `from` that is a complete tag
  // name rather than a prefix of a longer one; -1 when there is none.
  const findTag = (haystack: string, token: string, from: number): number => {
    let at = haystack.indexOf(token, from);
    while (at !== -1 && !endsTagName(haystack.charAt(at + token.length))) {
      at = haystack.indexOf(token, at + 1);
    }
    return at;
  };

  const name = asciiLower(tagName);
  const openToken = `<${name}`;
  const closeToken = `</${name}`;
  let output = input;

  // Rescan from the start after every removal: deleting a block can splice
  // the surrounding text into a new matching tag, which must also be removed.
  for (;;) {
    const lower = asciiLower(output);
    const openStart = findTag(lower, openToken, 0);
    if (openStart === -1) {
      break;
    }

    const openEnd = lower.indexOf('>', openStart + openToken.length);
    if (openEnd === -1) {
      output = output.slice(0, openStart);
      break;
    }

    const closeStart = findTag(lower, closeToken, openEnd + 1);
    if (closeStart === -1) {
      output = output.slice(0, openStart);
      break;
    }

    const closeEnd = lower.indexOf('>', closeStart + closeToken.length);
    if (closeEnd === -1) {
      output = output.slice(0, openStart);
      break;
    }

    output = output.slice(0, openStart) + output.slice(closeEnd + 1);
  }

  return output;
}
96+
/**
 * Decode the non-structural entities `&nbsp;`, `&quot;` and `&#39;` into a
 * space, a double quote and an apostrophe. Every other entity, including
 * `&lt;`, `&gt;` and `&amp;`, is left encoded.
 */
function decodeSafeHtmlEntities(input: string): string {
  const replacements: Record<string, string> = {
    nbsp: ' ',
    quot: '"',
    '#39': "'",
  };

  // One pass over the text; named entities match in any letter case.
  return input.replace(
    /&(nbsp|quot|#39);/gi,
    (_entity: string, name: string) => replacements[name.toLowerCase()]
  );
}
104+
64105/**
65106 * Fetch content from a URL
66107 */
0 commit comments