Skip to content

Commit 971a191

Browse files
marcselman authored and marcselman committed
Improve filtering of session descriptions to remove extra whitespace and back-matter
Refine `parseBackMatter` and `stripBackMatterFromHtml` to better handle Outlook's unconventional HTML formatting, removing extraneous whitespace and metadata from session descriptions.

Replit-Commit-Author: Agent
Replit-Commit-Session-Id: 72d1ee0d-43f0-4151-941b-cf0c139bc10a
Replit-Commit-Checkpoint-Type: full_checkpoint
Replit-Commit-Event-Id: cd3fc3f2-ff84-4649-8aaa-56988b7d6d1f
Replit-Commit-Screenshot-Url: https://storage.googleapis.com/screenshot-production-us-central1/04133160-d2cd-42c9-82f0-4e08d800b951/72d1ee0d-43f0-4151-941b-cf0c139bc10a/nKdzCgw
Replit-Helium-Checkpoint-Created: true
1 parent fddcedd commit 971a191

File tree

1 file changed

+42
-15
lines changed

1 file changed

+42
-15
lines changed

server/microsoft-graph.ts

Lines changed: 42 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -86,8 +86,16 @@ function parseBackMatter(text: string): BackMatter {
8686
return { metadata: {}, content: text };
8787
}
8888

89-
// Remove the back-matter block from content
90-
const content = text.replace(backMatterRegex, "").trim();
89+
// Remove the back-matter block from content using the same regex on normalized text
90+
// Then clean up excessive whitespace
91+
let content = normalizedText.replace(backMatterRegex, "").trim();
92+
93+
// Clean up excessive whitespace that might remain
94+
content = content
95+
.replace(/\r\n/g, "\n")
96+
.replace(/\r/g, "\n")
97+
.replace(/\n{3,}/g, "\n\n") // Max 2 consecutive newlines
98+
.trim();
9199

92100
return { metadata, content };
93101
}
@@ -109,23 +117,42 @@ function stripBackMatterFromHtml(html: string): string {
109117
}
110118

111119
// Remove the back-matter block from HTML
112-
// Handle various Outlook patterns:
113-
// - <p>---</p><p>slug: value</p><p>---</p>
114-
// - <div>---</div><div>slug: value</div><div>---</div>
115-
// - ---<br>slug: value<br>---
120+
// Handle various Outlook patterns including those with newlines inside tags
116121
let result = html;
117122

118-
// Pattern 1: Block elements (p/div) wrapping each line
119-
const blockPattern = /(\s*<(p|div)[^>]*>\s*---\s*<\/(p|div)>\s*)(<(p|div)[^>]*>[^<]*:[^<]*<\/(p|div)>\s*)*(\s*<(p|div)[^>]*>\s*---\s*<\/(p|div)>\s*)$/i;
120-
result = result.replace(blockPattern, "");
123+
// Pattern: Find --- followed by key:value pairs followed by --- (with any HTML/whitespace between)
124+
// This is a more aggressive pattern that handles Outlook's various formatting quirks
125+
const generalPattern = /<(p|div|span)[^>]*>[\s\n]*---[\s\S]*?---[\s\n]*(<br\s*\/?>)?[\s\n]*<\/(p|div|span)>/gi;
126+
result = result.replace(generalPattern, "");
127+
128+
// Pattern for back-matter spread across multiple block elements
129+
// Match from first --- block to last --- block including everything between
130+
const multiBlockPattern = /<(p|div)[^>]*>[\s\n]*---[\s\n]*<\/(p|div)>[\s\S]*?<(p|div)[^>]*>[\s\n]*---[\s\n]*(<br\s*\/?>)?[\s\n]*<\/(p|div)>/gi;
131+
result = result.replace(multiBlockPattern, "");
132+
133+
// Clean up empty paragraphs and trailing whitespace elements
134+
result = result.replace(/<(p|div|span)[^>]*>[\s\n]*(<br\s*\/?>)?[\s\n]*<\/(p|div|span)>\s*$/gi, "");
135+
result = result.replace(/<(p|div|span)[^>]*>[\s\n]*(<br\s*\/?>)?[\s\n]*<\/(p|div|span)>\s*(<span><\/span>)?\s*$/gi, "");
136+
137+
// Remove trailing empty spans
138+
result = result.replace(/\s*<span>\s*<\/span>\s*$/gi, "");
139+
140+
// Remove leading empty spans
141+
result = result.replace(/^\s*<span>\s*<\/span>\s*/gi, "");
142+
143+
// Clean up multiple trailing empty paragraphs
144+
while (/<(p|div)[^>]*>\s*(<br\s*\/?>)?\s*<\/(p|div)>\s*$/i.test(result)) {
145+
result = result.replace(/<(p|div)[^>]*>\s*(<br\s*\/?>)?\s*<\/(p|div)>\s*$/i, "");
146+
}
147+
148+
// Remove trailing <br> tags
149+
result = result.replace(/(<br\s*\/?>[\s\n]*)+$/gi, "");
121150

122-
// Pattern 2: BR-separated content
123-
const brPattern = /(\s*---\s*(<br\s*\/?>)\s*)([^<]*:[^<]*(<br\s*\/?>)\s*)*(\s*---\s*(<br\s*\/?>)?\s*)$/i;
124-
result = result.replace(brPattern, "");
151+
// Remove trailing </p> with only <br> before it
152+
result = result.replace(/<br\s*\/?>\s*\n*<\/p>\s*$/gi, "</p>");
125153

126-
// Pattern 3: Mixed - opening/closing dashes in blocks, content with BRs
127-
const mixedPattern = /(\s*<(p|div)[^>]*>\s*---[\s\S]*?---\s*<\/(p|div)>\s*)$/i;
128-
result = result.replace(mixedPattern, "");
154+
// Remove empty paragraphs that only contain <br> (these create excessive whitespace)
155+
result = result.replace(/<p[^>]*>[\s\n]*<br\s*\/?>[\s\n]*<\/p>/gi, "");
129156

130157
return result.trim();
131158
}

0 commit comments

Comments (0)