@@ -86,8 +86,16 @@ function parseBackMatter(text: string): BackMatter {
8686 return { metadata : { } , content : text } ;
8787 }
8888
89- // Remove the back-matter block from content
90- const content = text . replace ( backMatterRegex , "" ) . trim ( ) ;
89+ // Remove the back-matter block from content using the same regex on normalized text
90+ // Then clean up excessive whitespace
91+ let content = normalizedText . replace ( backMatterRegex , "" ) . trim ( ) ;
92+
93+ // Clean up excessive whitespace that might remain
94+ content = content
95+ . replace ( / \r \n / g, "\n" )
96+ . replace ( / \r / g, "\n" )
97+ . replace ( / \n { 3 , } / g, "\n\n" ) // Max 2 consecutive newlines
98+ . trim ( ) ;
9199
92100 return { metadata, content } ;
93101}
@@ -109,23 +117,42 @@ function stripBackMatterFromHtml(html: string): string {
109117 }
110118
111119 // Remove the back-matter block from HTML
112- // Handle various Outlook patterns:
113- // - <p>---</p><p>slug: value</p><p>---</p>
114- // - <div>---</div><div>slug: value</div><div>---</div>
115- // - ---<br>slug: value<br>---
120+ // Handle various Outlook patterns including those with newlines inside tags
116121 let result = html ;
117122
118- // Pattern 1: Block elements (p/div) wrapping each line
119- const blockPattern = / ( \s * < ( p | d i v ) [ ^ > ] * > \s * - - - \s * < \/ ( p | d i v ) > \s * ) ( < ( p | d i v ) [ ^ > ] * > [ ^ < ] * : [ ^ < ] * < \/ ( p | d i v ) > \s * ) * ( \s * < ( p | d i v ) [ ^ > ] * > \s * - - - \s * < \/ ( p | d i v ) > \s * ) $ / i;
120- result = result . replace ( blockPattern , "" ) ;
123+ // Pattern: Find --- followed by key:value pairs followed by --- (with any HTML/whitespace between)
124+ // This is a more aggressive pattern that handles Outlook's various formatting quirks
125+ const generalPattern = / < ( p | d i v | s p a n ) [ ^ > ] * > [ \s \n ] * - - - [ \s \S ] * ?- - - [ \s \n ] * ( < b r \s * \/ ? > ) ? [ \s \n ] * < \/ ( p | d i v | s p a n ) > / gi;
126+ result = result . replace ( generalPattern , "" ) ;
127+
128+ // Pattern for back-matter spread across multiple block elements
129+ // Match from first --- block to last --- block including everything between
130+ const multiBlockPattern = / < ( p | d i v ) [ ^ > ] * > [ \s \n ] * - - - [ \s \n ] * < \/ ( p | d i v ) > [ \s \S ] * ?< ( p | d i v ) [ ^ > ] * > [ \s \n ] * - - - [ \s \n ] * ( < b r \s * \/ ? > ) ? [ \s \n ] * < \/ ( p | d i v ) > / gi;
131+ result = result . replace ( multiBlockPattern , "" ) ;
132+
133+ // Clean up empty paragraphs and trailing whitespace elements
134+ result = result . replace ( / < ( p | d i v | s p a n ) [ ^ > ] * > [ \s \n ] * ( < b r \s * \/ ? > ) ? [ \s \n ] * < \/ ( p | d i v | s p a n ) > \s * $ / gi, "" ) ;
135+ result = result . replace ( / < ( p | d i v | s p a n ) [ ^ > ] * > [ \s \n ] * ( < b r \s * \/ ? > ) ? [ \s \n ] * < \/ ( p | d i v | s p a n ) > \s * ( < s p a n > < \/ s p a n > ) ? \s * $ / gi, "" ) ;
136+
137+ // Remove trailing empty spans
138+ result = result . replace ( / \s * < s p a n > \s * < \/ s p a n > \s * $ / gi, "" ) ;
139+
140+ // Remove leading empty spans
141+ result = result . replace ( / ^ \s * < s p a n > \s * < \/ s p a n > \s * / gi, "" ) ;
142+
143+ // Clean up multiple trailing empty paragraphs
144+ while ( / < ( p | d i v ) [ ^ > ] * > \s * ( < b r \s * \/ ? > ) ? \s * < \/ ( p | d i v ) > \s * $ / i. test ( result ) ) {
145+ result = result . replace ( / < ( p | d i v ) [ ^ > ] * > \s * ( < b r \s * \/ ? > ) ? \s * < \/ ( p | d i v ) > \s * $ / i, "" ) ;
146+ }
147+
148+ // Remove trailing <br> tags
149+ result = result . replace ( / ( < b r \s * \/ ? > [ \s \n ] * ) + $ / gi, "" ) ;
121150
122- // Pattern 2: BR-separated content
123- const brPattern = / ( \s * - - - \s * ( < b r \s * \/ ? > ) \s * ) ( [ ^ < ] * : [ ^ < ] * ( < b r \s * \/ ? > ) \s * ) * ( \s * - - - \s * ( < b r \s * \/ ? > ) ? \s * ) $ / i;
124- result = result . replace ( brPattern , "" ) ;
151+ // Remove a trailing <br> that sits directly before the final </p> (the </p> itself is kept)
152+ result = result . replace ( / < b r \s * \/ ? > \s * \n * < \/ p > \s * $ / gi, "</p>" ) ;
125153
126- // Pattern 3: Mixed - opening/closing dashes in blocks, content with BRs
127- const mixedPattern = / ( \s * < ( p | d i v ) [ ^ > ] * > \s * - - - [ \s \S ] * ?- - - \s * < \/ ( p | d i v ) > \s * ) $ / i;
128- result = result . replace ( mixedPattern , "" ) ;
154+ // Remove empty paragraphs that only contain <br> (these create excessive whitespace)
155+ result = result . replace ( / < p [ ^ > ] * > [ \s \n ] * < b r \s * \/ ? > [ \s \n ] * < \/ p > / gi, "" ) ;
129156
130157 return result . trim ( ) ;
131158}
0 commit comments