Skip to content

Commit b1846f1

Browse files
webui: add rehype plugin to restore HTML in Markdown table cells (ggml-org#17477)
* webui: add rehype plugin to restore HTML in Markdown table cells The remark/rehype pipeline neutralizes inline HTML as literal text (remarkLiteralHtml) so that XML/HTML snippets in LLM responses display as-is instead of being rendered. This causes <br> and <ul> markup in table cells to show as plain text. This plugin traverses the HAST post-conversion, parses whitelisted HTML patterns (<br>, <ul><li>) from text nodes, and replaces them with actual HAST element nodes. For lists, adjacent siblings must be combined first as the AST fragmentation breaks pattern matching. Strict validation rejects malformed markup, keeping it as raw text. * chore: update webui build output
1 parent d414db0 commit b1846f1

File tree

4 files changed

+203
-0
lines changed

4 files changed

+203
-0
lines changed

tools/server/public/index.html.gz

470 Bytes
Binary file not shown.

tools/server/webui/src/lib/components/app/misc/MarkdownContent.svelte

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import rehypeKatex from 'rehype-katex';
99
import rehypeStringify from 'rehype-stringify';
1010
import { copyCodeToClipboard } from '$lib/utils/copy';
11+
import { rehypeRestoreTableHtml } from '$lib/markdown/table-html-restorer';
1112
import { preprocessLaTeX } from '$lib/utils/latex-protection';
1213
import { browser } from '$app/environment';
1314
import '$styles/katex-custom.scss';
@@ -60,6 +61,7 @@
6061
.use(remarkRehype) // Convert Markdown AST to rehype
6162
.use(rehypeKatex) // Render math using KaTeX
6263
.use(rehypeHighlight) // Add syntax highlighting
64+
.use(rehypeRestoreTableHtml) // Restore limited HTML (e.g., <br>, <ul>) inside Markdown tables
6365
.use(rehypeStringify); // Convert to HTML string
6466
});
6567
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
/**
2+
* Matches <br>, <br/>, <br /> tags (case-insensitive).
3+
* Used to detect line breaks in table cell text content.
4+
*/
5+
// Shared instance with the `g` flag, so it carries mutable `lastIndex` state:
// avoid calling `.test()` / `.exec()` on it directly, as those advance `lastIndex`.
// Contains no capture groups.
export const BR_PATTERN = /<br\s*\/?\s*>/gi;
6+
7+
/**
8+
* Matches a complete <ul>...</ul> block.
9+
* Captures the inner content (group 1) for further <li> extraction.
10+
* Case-insensitive, allows multiline content.
11+
*/
12+
// Anchored with `^` and `$`: it only matches when the whole input string is a
// single `<ul>...</ul>` block (no surrounding text, no attributes on `<ul>`).
// The capture is greedy, so it spans from the first `<ul>` to the last `</ul>`.
export const LIST_PATTERN = /^<ul>([\s\S]*)<\/ul>$/i;
13+
14+
/**
15+
* Matches individual <li>...</li> elements within a list.
16+
* Captures the inner content (group 1) of each list item.
17+
* Non-greedy to handle multiple consecutive items.
18+
* Case-insensitive, allows multiline content.
19+
*/
20+
// Shared instance with the `g` flag, so it carries mutable `lastIndex` state:
// avoid calling `.test()` / `.exec()` on it directly, as those advance `lastIndex`.
// The lazy capture stops at the first `</li>`; `<li>` tags with attributes do not match.
export const LI_PATTERN = /<li>([\s\S]*?)<\/li>/gi;
Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
/**
2+
* Rehype plugin to restore limited HTML elements inside Markdown table cells.
3+
*
4+
* ## Problem
5+
* The remark/rehype pipeline neutralizes inline HTML as literal text
6+
* (remarkLiteralHtml) so that XML/HTML snippets in LLM responses display
7+
* as-is instead of being rendered. This causes <br> and <ul> markup in
8+
* table cells to show as plain text.
9+
*
10+
* ## Solution
11+
* This plugin traverses the HAST post-conversion, parses whitelisted HTML
12+
* patterns from text nodes, and replaces them with actual HAST element nodes
13+
* that will be rendered as real HTML.
14+
*
15+
* ## Supported HTML
16+
* - `<br>` / `<br/>` / `<br />` - Line breaks (inline)
17+
* - `<ul><li>...</li></ul>` - Unordered lists (block)
18+
*
19+
* ## Key Implementation Details
20+
*
21+
* ### 1. Sibling Combination (Critical)
22+
* The Markdown pipeline may fragment content across multiple text nodes and `<br>`
23+
* elements. For example, `<ul><li>a</li></ul>` might arrive as:
24+
* - Text: `"<ul>"`
25+
* - Element: `<br>`
26+
* - Text: `"<li>a</li></ul>"`
27+
*
28+
* We must combine consecutive text nodes and `<br>` elements into a single string
29+
* before attempting to parse list markup. Without this, list detection fails.
30+
*
31+
* ### 2. visitParents for Deep Traversal
32+
* Table cell content may be wrapped in intermediate elements (e.g., `<p>` tags).
33+
* Using `visitParents` instead of direct child iteration ensures we find text
34+
* nodes at any depth within the cell.
35+
*
36+
* ### 3. Reference Comparison for No-Op Detection
37+
* When checking if `<br>` expansion changed anything, we compare:
38+
* `expanded.length !== 1 || expanded[0] !== textNode`
39+
*
40+
* This catches both cases:
41+
* - Multiple nodes created (text was split)
42+
* - Single NEW node created (original had only `<br>`, now it's an element)
43+
*
44+
* A simple `length > 1` check would miss the single `<br>` case.
45+
*
46+
* ### 4. Strict List Validation
47+
* `parseList()` rejects malformed markup by checking for garbage text between
48+
* `<li>` elements. This prevents creating broken DOM from partial matches like
49+
* `<ul>garbage<li>a</li></ul>`.
50+
*
51+
* ### 5. Newline Substitution for `<br>` in Combined String
52+
* When combining siblings, existing `<br>` elements become `\n` in the combined
53+
* string. This allows list content to span visual lines while still being parsed
54+
* as a single unit.
55+
*
56+
* @example
57+
* // Input Markdown:
58+
* // | Feature | Notes |
59+
* // |---------|-------|
60+
* // | Multi-line | First<br>Second |
61+
* // | List | <ul><li>A</li><li>B</li></ul> |
62+
* //
63+
* // Without this plugin: <br> and <ul> render as literal text
64+
* // With this plugin: <br> becomes line break, <ul> becomes actual list
65+
*/
66+
67+
import type { Plugin } from 'unified';
68+
import type { Element, ElementContent, Root, Text } from 'hast';
69+
import { visit } from 'unist-util-visit';
70+
import { visitParents } from 'unist-util-visit-parents';
71+
import { BR_PATTERN, LIST_PATTERN, LI_PATTERN } from '$lib/constants/table-html-restorer';
72+
73+
/**
 * Splits a string on literal `<br>` tags and returns the pieces as HAST nodes.
 *
 * Every tag matched by `BR_PATTERN` becomes a `br` element; the non-empty text
 * between, before and after the tags becomes text nodes. When the string
 * contains no tag at all, a single text node holding the unchanged value is
 * returned (even if the value is empty).
 *
 * @param value - Raw text that may contain `<br>`, `<br/>` or `<br />`.
 * @returns Freshly created text and `br` nodes in document order.
 */
function expandBrTags(value: string): ElementContent[] {
	// BR_PATTERN has no capture groups, so split() yields only the text segments:
	// one more segment than there are <br> tags.
	const segments = value.split(BR_PATTERN);
	if (segments.length === 1) {
		const untouched: Text = { type: 'text', value };
		return [untouched];
	}

	const nodes: ElementContent[] = [];

	segments.forEach((segment, position) => {
		// A <br> sits between every pair of neighbouring segments.
		if (position > 0) {
			const lineBreak: Element = { type: 'element', tagName: 'br', properties: {}, children: [] };
			nodes.push(lineBreak);
		}
		// Empty segments (adjacent tags, or a tag at either end) produce no text node.
		if (segment !== '') {
			const text: Text = { type: 'text', value: segment };
			nodes.push(text);
		}
	});

	return nodes;
}
97+
98+
/**
 * Builds a `ul` HAST element from a literal `<ul><li>...</li></ul>` string.
 *
 * The trimmed input must match `LIST_PATTERN` as a whole, and the list body may
 * contain nothing but `<li>...</li>` items separated by optional whitespace.
 * `<br>` tags inside an item are expanded via `expandBrTags`.
 *
 * @param value - Raw text that may contain list markup.
 * @returns The list element, or `null` when the input is not a well-formed
 * list: no match, no items, or non-whitespace text before, between or after
 * the items.
 */
function parseList(value: string): Element | null {
	const inner = value.trim().match(LIST_PATTERN)?.[1];
	if (inner === undefined) return null;

	const listItems: ElementContent[] = [];
	// Offset in `inner` up to which the markup has been accounted for.
	let consumed = 0;

	for (const item of inner.matchAll(LI_PATTERN)) {
		const itemStart = item.index!;

		// Anything other than whitespace in front of this item is malformed markup.
		const gap = inner.slice(consumed, itemStart);
		if (gap.trim() !== '') return null;

		const li: Element = {
			type: 'element',
			tagName: 'li',
			properties: {},
			children: expandBrTags(item[1] ?? '')
		};
		listItems.push(li);

		consumed = itemStart + item[0].length;
	}

	// An empty list or leftover text after the last item is rejected as well.
	if (listItems.length === 0) return null;
	if (inner.slice(consumed).trim() !== '') return null;

	const list: Element = { type: 'element', tagName: 'ul', properties: {}, children: listItems };
	return list;
}
129+
130+
/**
 * Processes a single table cell, restoring whitelisted HTML from its text content.
 *
 * Visits every text node below `cell`, at any depth. For each text node:
 * 1. The node and the consecutive text / `br` siblings that follow it are
 *    joined into one string (`br` elements contribute `\n`). If that string
 *    parses as a list, the whole run of siblings is replaced by the `ul` element.
 * 2. Otherwise, literal `<br>` tags in the current text node alone are expanded
 *    into `br` elements.
 *
 * The tree is mutated in place. Text nodes that contain no restorable markup
 * are left untouched, so they keep their identity and any `position` / `data`
 * attached to them.
 *
 * @param cell - The `td` / `th` element whose subtree should be processed.
 */
function processCell(cell: Element): void {
	visitParents(cell, 'text', (textNode: Text, ancestors) => {
		const parent = ancestors[ancestors.length - 1];
		if (!parent || parent.type !== 'element') return;

		const parentEl = parent as Element;
		const siblings = parentEl.children as ElementContent[];
		const startIndex = siblings.indexOf(textNode as ElementContent);
		if (startIndex === -1) return;

		// Combine consecutive text nodes and <br> elements into one string.
		// List markup may be fragmented across several siblings, so it can only
		// be recognised on the joined text.
		let combined = '';
		let endIndex = startIndex;

		for (let i = startIndex; i < siblings.length; i++) {
			const sib = siblings[i];
			if (sib.type === 'text') {
				combined += (sib as Text).value;
			} else if (sib.type === 'element' && (sib as Element).tagName === 'br') {
				combined += '\n';
			} else {
				break;
			}
			endIndex = i;
		}

		// Try parsing as list first (replaces the entire combined range).
		const list = parseList(combined);
		if (list) {
			siblings.splice(startIndex, endIndex - startIndex + 1, list);
			return;
		}

		// Otherwise, just expand <br> tags in this text node.
		// expandBrTags() always allocates new nodes, so comparing its result to
		// `textNode` by reference can never detect a no-op. Splice only when a
		// `br` element was actually produced; plain text nodes stay as they are.
		const expanded = expandBrTags(textNode.value);
		if (expanded.some((node) => node.type === 'element')) {
			siblings.splice(startIndex, 1, ...expanded);
		}

		// NOTE(review): no index is returned on purpose. This presumably lets the
		// traversal continue with the next sibling, so text that was split off
		// behind a <br> is visited too and can still be recognised as a list —
		// confirm against unist-util-visit-parents' handling of mutated children.
	});
}
174+
175+
/**
 * Rehype plugin that restores limited HTML inside table cells.
 *
 * The returned transformer visits every element of the tree and hands each
 * `td` and `th` element to `processCell`, which mutates the cell in place.
 * The plugin takes no options.
 */
export const rehypeRestoreTableHtml: Plugin<[], Root> = () => {
	return (tree) => {
		visit(tree, 'element', (node: Element) => {
			const isTableCell = node.tagName === 'td' || node.tagName === 'th';
			if (!isTableCell) return;

			processCell(node);
		});
	};
};

0 commit comments

Comments
 (0)