|
164 | 164 | }); |
165 | 165 | } |
166 | 166 |
|
| 167 | + // See also: |
| 168 | + // https://github.com/danny-avila/LibreChat/blob/main/client/src/utils/latex.ts |
| 169 | +
|
// Matches Markdown code that must pass through untouched: fenced blocks
// (```...```, may span lines) and single-line inline code (`...`).
const codeBlockRegex = /(```[\s\S]*?```|`[^`\n]+`)/g;

/**
 * Prepares Markdown text that may contain LaTeX before it is handed to the
 * Markdown processor.
 *
 * The pipeline is:
 *  1. Swap code blocks / inline code for `<<CODE_BLOCK_n>>` placeholders.
 *  2. Swap existing math (`$$...$$`, `\[...\]`, `\(...\)`, inline `$...$`)
 *     for `<<LATEX_n>>` placeholders.
 *  3. Escape the remaining `$` that precede a digit (currency: `$5` → `\$5`).
 *  4. Restore the math placeholders.
 *  5. Rewrite `\(...\)` → `$...$` and `\[...\]` → `$$...$$`, and apply the
 *     mhchem escaping.
 *  6. Restore the code placeholders.
 *
 * Code stays replaced by placeholders until the very end so that none of the
 * rewriting steps can alter text inside code.
 *
 * Limitation: input that literally contains text shaped like one of the
 * placeholders with a valid index will be substituted as if it were one.
 *
 * @param content Raw Markdown text.
 * @returns The text with math delimiters normalized and currency dollars escaped.
 */
export function preprocessLaTeX(content: string): string {
  // Step 1: Protect code blocks.
  const codeBlocks: string[] = [];
  let text = content.replace(codeBlockRegex, (match) => {
    codeBlocks.push(match);
    return `<<CODE_BLOCK_${codeBlocks.length - 1}>>`;
  });

  // Step 2: Protect existing LaTeX expressions: $$...$$, \[...\] and \(...\) first,
  // then inline $...$ that does not look like money (e.g. $10, $3.99).
  const latexExpressions: string[] = [];
  text = text.replace(/(\$\$[\s\S]*?\$\$|\\\[[\s\S]*?\\\]|\\\(.*?\\\))/g, (match) => {
    latexExpressions.push(match);
    return `<<LATEX_${latexExpressions.length - 1}>>`;
  });
  text = protectLaTeXButNotMoney(text, latexExpressions);

  // Step 3: Escape standalone $ before digits (currency like $5 → \$5).
  // Math is protected at this point, so only unprotected dollars are seen.
  // A dollar that already has a backslash in front of it is left alone,
  // otherwise `\$5` would turn into `\\$5`.
  text = text.replace(/(\\?)\$(?=\d)/g, (match, backslash: string) =>
    backslash === '' ? '\\$' : match
  );

  // Step 4: Restore protected LaTeX expressions.
  // An inline `$...$` expression may itself contain placeholders created
  // earlier in step 2, so restoration recurses. Only placeholders with an index
  // lower than the one being expanded are followed, which guarantees
  // termination. Unknown indices are left as written instead of becoming the
  // string "undefined".
  const restoreLatex = (input: string, limit: number): string =>
    input.replace(/<<LATEX_(\d+)>>/g, (placeholder, digits: string) => {
      const slot = Number(digits);
      const expression = slot < limit ? latexExpressions[slot] : undefined;
      return expression === undefined ? placeholder : restoreLatex(expression, slot);
    });
  text = restoreLatex(text, latexExpressions.length);

  // Step 5: Apply additional escaping functions (brackets and mhchem).
  text = escapeBrackets(text);
  if (text.includes('\\ce{') || text.includes('\\pu{')) {
    text = escapeMhchem(text);
  }

  // Final delimiter pass: convert any \(...\) → $...$ and \[...\] → $$...$$
  // that are still present. Function replacers are used because `$` is a
  // special character in replacement strings (`$$` means one literal `$`,
  // `$1` a capture group), which makes string templates easy to get wrong.
  text = text
    .replace(/\\\((.+?)\\\)/g, (_match, body: string) => `$${body}$`) // inline
    .replace(/\\\[(.+?)\\\]/g, (_match, body: string) => `$$${body}$$`); // display

  // Step 6: Restore code blocks last, so nothing above has touched them.
  return text.replace(
    /<<CODE_BLOCK_(\d+)>>/g,
    (placeholder, digits: string) => codeBlocks[Number(digits)] ?? placeholder
  );
}
| 220 | +
|
/**
 * Replaces inline `$...$` math with `<<LATEX_n>>` placeholders while leaving
 * dollar signs that look like currency in the text.
 *
 * Each line is scanned separately, so an expression never spans a newline.
 * For every pair of `$` on a line, the opening `$` is treated as plain text
 * (and scanning resumes right after it) when either
 *  - the character before it is a letter, digit, `_`, `$` or `-`, or
 *  - it is followed by a digit and the closing `$` is followed by a letter,
 *    digit, `_`, `$` or `-` (i.e. it looks like `$5 ... $10`).
 * Otherwise the span from the opening to the closing `$` (inclusive) is
 * appended to `latexExpressions` and replaced by `<<LATEX_n>>`, where `n` is
 * its index in that array.
 *
 * @param content Text to scan.
 * @param latexExpressions Collector for the protected expressions; mutated in place.
 * @returns The text with inline math replaced by placeholders.
 */
function protectLaTeXButNotMoney(content: string, latexExpressions: string[]): string {
  if (!content.includes('$')) {
    return content;
  }

  // Hoisted so they are not rebuilt for every `$` that is inspected.
  const wordLike = /[A-Za-z0-9_$-]/;
  const digit = /[0-9]/;

  return content
    .split('\n')
    .map((line) => {
      if (!line.includes('$')) {
        return line;
      }
      let result = '';
      let index = 0;
      while (index < line.length) {
        const openIndex = line.indexOf('$', index);
        const closeIndex = openIndex === -1 ? -1 : line.indexOf('$', openIndex + 1);
        if (closeIndex === -1) {
          // No complete $...$ pair left on this line.
          break;
        }

        // charAt yields '' when out of range, which matches neither pattern.
        const beforeOpenChar = openIndex > 0 ? line.charAt(openIndex - 1) : '';
        const afterOpenChar = line.charAt(openIndex + 1);
        const afterCloseChar = line.charAt(closeIndex + 1);

        const gluedToWord = wordLike.test(beforeOpenChar);
        const looksLikeAmount = digit.test(afterOpenChar) && wordLike.test(afterCloseChar);
        if (gluedToWord || looksLikeAmount) {
          // Not TeX: keep this '$' as text and look for a pair starting after it.
          result += line.slice(index, openIndex + 1);
          index = openIndex + 1;
          continue;
        }

        // Treat as LaTeX.
        result += line.slice(index, openIndex);
        latexExpressions.push(line.slice(openIndex, closeIndex + 1));
        result += `<<LATEX_${latexExpressions.length - 1}>>`;
        index = closeIndex + 1;
      }
      // Always keep whatever was not consumed by the loop; without this the
      // text following the last expression (or a whole short line) is lost.
      return result + line.slice(index);
    })
    .join('\n');
}
| 274 | +
|
/**
 * Converts backslash-style math delimiters to dollar-style ones:
 * `\[ ... \]` becomes `$$ ... $$` and `\( ... \)` becomes `$ ... $`.
 *
 * Fenced code blocks and inline code are matched by the first alternative of
 * the pattern and returned unchanged, so delimiters inside them are skipped.
 *
 * @param text Text to convert.
 * @returns The text with the delimiters rewritten.
 */
function escapeBrackets(text: string): string {
  const pattern = /(```[\S\s]*?```|`.*?`)|\\\[([\S\s]*?[^\\])\\]|\\\((.*?)\\\)/g;

  // Exactly one of the three capture groups is defined for any match.
  const rewrite = (whole: string, code?: string, display?: string, inline?: string): string => {
    if (code !== undefined) {
      return code;
    }
    if (display !== undefined) {
      return '$$' + display + '$$';
    }
    if (inline !== undefined) {
      return '$' + inline + '$';
    }
    return whole;
  };

  return text.replace(pattern, rewrite);
}
| 296 | +
|
/**
 * Doubles the backslash of mhchem commands that directly follow a `$`:
 * `$\ce{` becomes `$\\ce{` and `$\pu{` becomes `$\\pu{`.
 * Occurrences not immediately preceded by `$` are left untouched.
 * NOTE(review): presumably a later processing step consumes one backslash —
 * confirm against the Markdown pipeline.
 *
 * @param text Text to process.
 * @returns The text with the affected backslashes doubled.
 */
function escapeMhchem(text: string): string {
  // In the replacement string `$$` emits one literal `$` and `$1` the command name.
  return text.replace(/\$\\(ce|pu)\{/g, '$$\\\\$1{');
}
| 301 | +
|
167 | 302 | async function processMarkdown(text: string): Promise<string> { |
168 | 303 | try { |
169 | | - const normalized = normalizeMathDelimiters(text); |
170 | | - const result = await processor().process(normalized); |
| 304 | + // const normalized = normalizeMathDelimiters(text); |
| 305 | + // const result = await processor().process(normalized); |
| 306 | + const processedText = preprocessLaTeX(text); |
| 307 | +
|
| 308 | + const result = await processor().process(processedText); |
171 | 309 | const html = String(result); |
172 | 310 | const enhancedLinks = enhanceLinks(html); |
173 | 311 |
|
|
0 commit comments