@@ -113,16 +113,41 @@ function cleanFragment(fragment: DocumentFragment): DocumentFragment {
113113 scripts . forEach ( ( script ) => {
114114 script . remove ( )
115115 } )
116+ // Remove all formatting tags like <b>, <i>, etc., but keep their inner text/content
117+ const formattingTags = [ 'b' , 'i' , 'u' , 'em' , 'strong' , 'mark' , 'small' , 'del' , 'ins' , 'sub' , 'sup' , 'span' , 'font' ]
118+ formattingTags . forEach ( ( tag ) => {
119+ const elements = Array . from ( fragment . querySelectorAll ( tag ) )
120+ elements . forEach ( ( el ) => {
121+ // Move all children before the formatting element (merging with surrounding content)
122+ while ( el . firstChild ) {
123+ el . parentNode ?. insertBefore ( el . firstChild , el )
124+ }
125+ el . parentNode ?. removeChild ( el )
126+ } )
127+ } )
128+ // Remove all empty text nodes and merge adjacent text nodes
129+ fragment . normalize ( )
130+ // Add a line break "\n" to the end of each text node
131+ const walker = document . createTreeWalker ( fragment , NodeFilter . SHOW_TEXT )
132+ let node : Text | null = walker . nextNode ( ) as Text | null
133+ while ( node ) {
134+ node . textContent += '\n'
135+ node = walker . nextNode ( ) as Text | null
136+ }
116137 return fragment
117138}
118139
/**
 * Normalizes whitespace in a block of extracted text.
 *
 * Each line is processed on its own: runs of tabs become a single space,
 * runs of two or more spaces collapse to one, and leading/trailing whitespace
 * is trimmed. The lines are then rejoined, consecutive line breaks are
 * collapsed into one (which also drops lines that were blank or
 * whitespace-only), and the whole result is trimmed.
 *
 * @param text - Raw text, possibly containing tabs, repeated spaces and blank lines.
 * @returns The text with at most one space between words and no empty lines.
 */
function cleanText(text: string): string {
  return text
    .split('\n') // Break into lines
    .map(
      (line) =>
        line
          .replace(/\t+/g, ' ') // Replace tabs with a space
          .replace(/ {2,}/g, ' ') // Collapse multiple spaces
          .trim() // Trim each line (also strips a trailing '\r' from CRLF input)
    )
    .join('\n') // Recombine
    .replace(/\n{2,}/g, '\n') // Collapse multiple line breaks left by emptied lines
    .trim() // Final trim of full result
}
0 commit comments