From 53fac5f6017b105332cdc3ae883debea5d499af3 Mon Sep 17 00:00:00 2001 From: huasushis Date: Wed, 25 Feb 2026 16:23:37 +0800 Subject: [PATCH 1/4] fix: handle complex HTML in DataviewJS output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Wrap decodeURI in try/catch to prevent URIError on bare % characters (e.g. '进度:50%' in rendered table output) - Add whitelist-based isMarkdownSafeNode() to detect content that Markdown cannot faithfully represent (progress bars, styled spans, merged cells, inline styles, etc.) - Add convertRenderedContent() that processes each top-level child independently: markdown-safe children are converted normally, complex children are kept as raw HTML with internal links cleaned for Quartz consumption - Internal links in HTML output use <a> tags (not [[wikilinks]]) so Quartz's CrawlLinks plugin can resolve them correctly --- src/compiler/integrations/dataview.ts | 8 +- src/utils/utils.ts | 195 +++++++++++++++++++++++++- 2 files changed, 197 insertions(+), 6 deletions(-) diff --git a/src/compiler/integrations/dataview.ts b/src/compiler/integrations/dataview.ts index 2d006de..226ca3d 100644 --- a/src/compiler/integrations/dataview.ts +++ b/src/compiler/integrations/dataview.ts @@ -1,4 +1,4 @@ -import { Component, Notice, htmlToMarkdown } from "obsidian"; +import { Component, Notice } from "obsidian"; import { DataviewApi, getAPI } from "obsidian-dataview"; import Logger from "js-logger"; import { @@ -10,10 +10,10 @@ import { } from "./types"; import { escapeRegExp, - cleanQueryResult, renderPromise, surroundWithCalloutBlock, sanitizeQuery, + convertRenderedContent, } from "src/utils/utils"; function getDataviewApi(): DataviewApi | undefined { @@ -63,9 +63,7 @@ async function tryExecuteJs( await renderPromise(div, "[data-tag-name]"); - const markdown = htmlToMarkdown(div) || ""; - - return cleanQueryResult(markdown); + return convertRenderedContent(div); } export const DataviewIntegration: 
PluginIntegration = { diff --git a/src/utils/utils.ts b/src/utils/utils.ts index fd87499..c9f8a1f 100644 --- a/src/utils/utils.ts +++ b/src/utils/utils.ts @@ -175,7 +175,13 @@ function isPluginEnabled(pluginId: string): boolean { */ function cleanQueryResult(markdown: string): string { // Replace URI escape characters with their actual characters - markdown = decodeURI(markdown); + try { + markdown = decodeURI(markdown); + } catch { + // decodeURI throws URIError if the string contains bare % not followed + // by two hex digits (e.g. "进度:50%"). In that case, keep the + // original string as-is. + } // Rewrite tag links markdown = markdown.replace( @@ -568,6 +574,190 @@ function svgToData(svgElement: SVGSVGElement): string { return `data:image/svg+xml;base64,${encodedData}`; } +/** + * Tags that have a direct Markdown equivalent (whitelist). + * Any tag NOT in this set makes a node "complex" and forces HTML output. + */ +const MARKDOWN_SAFE_TAGS = new Set([ + // Inline formatting + "strong", + "b", + "em", + "i", + "del", + "s", + "code", + "mark", + "sub", + "sup", + // Links & media + "a", + "img", + // Block elements + "p", + "div", + "span", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "blockquote", + "pre", + "hr", + // Lists + "ul", + "ol", + "li", + // Table structure + "table", + "thead", + "tbody", + "tfoot", + "tr", + "th", + "td", + "caption", + // Misc + "br", +]); + +/** + * Attributes whose mere presence on any element indicates content that + * Markdown cannot faithfully represent. + */ +const COMPLEX_ATTRIBUTES = ["style", "colspan", "rowspan"]; + +/** + * Recursively check whether a DOM node (and all its descendants) can be + * faithfully represented in Markdown. + * + * Uses a **whitelist** of safe tags – any tag not in the set is considered + * complex. Additionally, certain attributes (`style`, `colspan`, `rowspan`) + * and `<span>` elements carrying a `class` are treated as complex because + * Markdown has no way to express them. 
+ */ +function isMarkdownSafeNode(node: Node): boolean { + // Text and comment nodes are always safe + if (node.nodeType === Node.TEXT_NODE) return true; + if (node.nodeType === Node.COMMENT_NODE) return true; + if (node.nodeType !== Node.ELEMENT_NODE) return true; + + const el = node as HTMLElement; + const tag = el.tagName.toLowerCase(); + + // Tag must be in the whitelist + if (!MARKDOWN_SAFE_TAGS.has(tag)) return false; + + // Reject elements with attributes that Markdown cannot represent + for (const attr of COMPLEX_ATTRIBUTES) { + if (el.hasAttribute(attr)) return false; + } + + // <span> with a class carries styling intent that would be lost + if ( + tag === "span" && + el.hasAttribute("class") && + el.getAttribute("class")?.trim() + ) { + return false; + } + + // Recursively check every child node + for (const child of Array.from(node.childNodes)) { + if (!isMarkdownSafeNode(child)) return false; + } + + return true; +} + +/** + * Clean up Obsidian-style internal links (`<a class="internal-link">`) + * for Quartz consumption. + * + * - Removes `target`, `rel` attributes + * - Strips `.md` extension from `href` + * - Ensures the `internal-link` class is present + * + * Quartz's `CrawlLinks` plugin will pick up these `<a>` tags and resolve + * them correctly (slug transformation, SPA navigation etc.). 
+ */ +function cleanInternalLinks(el: HTMLElement): void { + const links = el.querySelectorAll( + "a.internal-link, a[data-href]", + ); + + for (const link of Array.from(links)) { + link.removeAttribute("target"); + link.removeAttribute("rel"); + + // Prefer data-href (Obsidian's canonical path), fall back to href + const rawHref = + link.getAttribute("data-href") || + link.getAttribute("href") || + ""; + const cleanHref = rawHref.replace(/\.md$/, ""); + + link.setAttribute("href", cleanHref); + link.removeAttribute("data-href"); + + if (!link.classList.contains("external-link")) { + link.classList.add("internal-link"); + } + } +} + +/** + * Convert the rendered HTML produced by DataviewJS (or similar) into a + * string suitable for embedding in a Quartz Markdown file. + * + * The function processes each **top-level child** of `div` independently: + * - Children that are fully representable in Markdown (checked via + * {@link isMarkdownSafeNode}) are converted with `htmlToMarkdown` and then + * cleaned with {@link cleanQueryResult}. + * - Children containing complex HTML (e.g. `<progress>`, `<span class="...">`, + * merged cells, inline styles) are kept as raw HTML with internal links + * cleaned for Quartz. + * + * This approach correctly handles mixed content (e.g. an `<h2>` heading + * followed by a complex `<table>`). + */ +function convertRenderedContent(div: HTMLDivElement): string { + // Fast path: if everything is Markdown-safe, use the standard conversion + if (isMarkdownSafeNode(div)) { + const md = htmlToMarkdown(div) || ""; + + return cleanQueryResult(md); + } + + // Mixed / complex content: decide per top-level child + const parts: string[] = []; + + for (const rawChild of Array.from(div.childNodes)) { + if (rawChild.nodeType === Node.TEXT_NODE) { + const text = rawChild.textContent?.trim(); + if (text) parts.push(text); + continue; + } + + if (rawChild.nodeType !== Node.ELEMENT_NODE) continue; + + const child = rawChild as HTMLElement; + + if (isMarkdownSafeNode(child)) { + const md = htmlToMarkdown(child) || ""; + if (md.trim()) parts.push(cleanQueryResult(md)); + } else { + // Keep as HTML; clean up internal links so Quartz can resolve them + cleanInternalLinks(child); + parts.push(child.outerHTML); + } + } + + return parts.join("\n\n"); +} + export { generateUrlPath, generateBlobHash, @@ -585,4 +775,7 @@ export { sanitizeQuery, removeUnwantedElements, svgToData, + isMarkdownSafeNode, + cleanInternalLinks, + convertRenderedContent, }; From 1e4cd3ab4c4d49cc9d599f7f0162d627ea0265bb Mon Sep 17 00:00:00 2001 From: huasushis Date: Wed, 25 Feb 2026 17:38:44 +0800 Subject: [PATCH 2/4] fix: resolve eslint/prettier formatting issues --- src/utils/utils.ts | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/utils/utils.ts b/src/utils/utils.ts index c9f8a1f..e7574bd 100644 --- a/src/utils/utils.ts +++ b/src/utils/utils.ts @@ -641,7 +641,9 @@ const COMPLEX_ATTRIBUTES = ["style", "colspan", "rowspan"]; function isMarkdownSafeNode(node: Node): boolean { // Text and comment nodes are always safe if (node.nodeType === Node.TEXT_NODE) return true; + if (node.nodeType === Node.COMMENT_NODE) return true; + if (node.nodeType !== Node.ELEMENT_NODE) return true; const el = node as HTMLElement; @@ -684,9 +686,7 @@ 
function isMarkdownSafeNode(node: Node): boolean { * them correctly (slug transformation, SPA navigation etc.). */ function cleanInternalLinks(el: HTMLElement): void { - const links = el.querySelectorAll( - "a.internal-link, a[data-href]", - ); + const links = el.querySelectorAll("a.internal-link, a[data-href]"); for (const link of Array.from(links)) { link.removeAttribute("target"); @@ -694,9 +694,7 @@ function cleanInternalLinks(el: HTMLElement): void { // Prefer data-href (Obsidian's canonical path), fall back to href const rawHref = - link.getAttribute("data-href") || - link.getAttribute("href") || - ""; + link.getAttribute("data-href") || link.getAttribute("href") || ""; const cleanHref = rawHref.replace(/\.md$/, ""); link.setAttribute("href", cleanHref); @@ -738,6 +736,7 @@ function convertRenderedContent(div: HTMLDivElement): string { if (rawChild.nodeType === Node.TEXT_NODE) { const text = rawChild.textContent?.trim(); if (text) parts.push(text); + continue; } From c507343674c409623265f26c39fc48a676128765 Mon Sep 17 00:00:00 2001 From: huasushis Date: Wed, 25 Feb 2026 17:54:24 +0800 Subject: [PATCH 3/4] fix: sanitize href to prevent XSS in outerHTML serialization (CodeQL) --- src/utils/utils.ts | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/utils/utils.ts b/src/utils/utils.ts index e7574bd..c58c09b 100644 --- a/src/utils/utils.ts +++ b/src/utils/utils.ts @@ -695,6 +695,22 @@ function cleanInternalLinks(el: HTMLElement): void { // Prefer data-href (Obsidian's canonical path), fall back to href const rawHref = link.getAttribute("data-href") || link.getAttribute("href") || ""; + + // Block dangerous URL protocols to prevent XSS when the element + // is later serialised via outerHTML. 
+ const trimmed = rawHref.trim().toLowerCase(); + + if ( + trimmed.startsWith("javascript:") || + trimmed.startsWith("data:") || + trimmed.startsWith("vbscript:") + ) { + link.setAttribute("href", ""); + link.removeAttribute("data-href"); + + continue; + } + const cleanHref = rawHref.replace(/\.md$/, ""); link.setAttribute("href", cleanHref); @@ -735,6 +751,7 @@ function convertRenderedContent(div: HTMLDivElement): string { for (const rawChild of Array.from(div.childNodes)) { if (rawChild.nodeType === Node.TEXT_NODE) { const text = rawChild.textContent?.trim(); + if (text) parts.push(text); continue; @@ -746,6 +763,7 @@ function convertRenderedContent(div: HTMLDivElement): string { if (isMarkdownSafeNode(child)) { const md = htmlToMarkdown(child) || ""; + if (md.trim()) parts.push(cleanQueryResult(md)); } else { // Keep as HTML; clean up internal links so Quartz can resolve them From 86806afba25560c70acbcf720624b36b4aef95dd Mon Sep 17 00:00:00 2001 From: huasushis Date: Wed, 25 Feb 2026 18:02:12 +0800 Subject: [PATCH 4/4] fix: use encodeURI to sanitize href (CodeQL DOM-text-as-HTML) --- src/utils/utils.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/utils/utils.ts b/src/utils/utils.ts index c58c09b..79900ca 100644 --- a/src/utils/utils.ts +++ b/src/utils/utils.ts @@ -711,7 +711,10 @@ function cleanInternalLinks(el: HTMLElement): void { continue; } - const cleanHref = rawHref.replace(/\.md$/, ""); + // encodeURI escapes HTML meta-characters (<, >, " etc.) while + // preserving path separators – this also satisfies CodeQL's taint + // analysis (recognised sanitiser for "DOM text → HTML" flows). + const cleanHref = encodeURI(rawHref.replace(/\.md$/, "")); link.setAttribute("href", cleanHref); link.removeAttribute("data-href");