test(core): add sanitizer spec; hoist regex patterns and type guard in taskMessages

roomote · roomote · commit cc77aab725fe · 2025-10-17T04:57:49.000Z
diff --git a/src/core/task-persistence/__tests__/taskMessages.sanitize.spec.ts b/src/core/task-persistence/__tests__/taskMessages.sanitize.spec.ts
@@ -0,0 +1,178 @@
+/**
+ * Tests for centralized UI message redaction in taskMessages.ts
+ * Verifies:
+ *  - saveTaskMessages() sanitizes sensitive payloads before persistence
+ *  - readTaskMessages() sanitizes legacy payloads on read as a safety net
+ *  - Sanitization is idempotent, and messages whose text is not a string are left untouched
+ */
+
+import * as path from "path"
+
+// Mocks: safeWriteJson is replaced by a spy that records its arguments below so tests can inspect what would be persisted
+let writtenPath: string | null = null
+let writtenData: any = null
+
+vi.mock("../../../utils/safeWriteJson", () => {
+	return {
+		safeWriteJson: vi.fn(async (p: string, data: any) => {
+			writtenPath = p // captured target path; the mock itself writes nothing to disk
+			writtenData = data // captured payload exactly as handed over by the caller
+		}),
+	}
+})
+
+vi.mock("../../../utils/storage", () => { // pin the task directory so the expected output path is deterministic
+	return {
+		getTaskDirectoryPath: vi.fn(async (_globalStoragePath: string, _taskId: string) => "/tmp/taskdir"), // both arguments are ignored
+	}
+})
+
+let fileExists = true // value returned by the fileExistsAtPath mock below; reset to true before each test
+vi.mock("../../../utils/fs", () => {
+	return {
+		fileExistsAtPath: vi.fn(async (_p: string) => fileExists), // reads the flag at call time, so later reassignments take effect
+	}
+})
+
+// For read sanitization tests: the string the mocked fs/promises readFile returns (simulated raw file contents)
+let mockReadFilePayload: string = "[]"
+vi.mock("fs/promises", async (importOriginal) => {
+	const actual = await importOriginal<any>() // keep the real module's other exports
+	return {
+		...actual,
+		readFile: vi.fn(async (_p: string, _enc: string) => mockReadFilePayload), // path and encoding are ignored
+	}
+})
+
+// SUT
+import { readTaskMessages, saveTaskMessages } from "../taskMessages"
+import { GlobalFileNames } from "../../../shared/globalFileNames"
+
+describe("taskMessages redaction", () => {
+	beforeEach(() => { // restore all shared mock state to its defaults so one test cannot leak into the next
+		writtenPath = null
+		writtenData = null
+		fileExists = true
+		mockReadFilePayload = "[]" // an empty JSON array
+	})
+
+	it("saveTaskMessages() should sanitize sensitive tags and JSON 'request' envelope", async () => {
+		const messages = [
+			// JSON api_req_started envelope: the sensitive tags live inside the serialized "request" string
+			{
+				ts: 1,
+				type: "say",
+				say: "api_req_started",
+				text: JSON.stringify({
+					request:
+						"Header\n" +
+						"<files>s1</files>\n" +
+						"<file_content>topsecret</file_content>\n" +
+						"<content type='text'>inner</content>\n" +
+						"<file x='y'>body</file>",
+					apiProtocol: "anthropic",
+				}),
+			},
+			// Raw UI text with various tags, with and without attributes
+			{
+				ts: 2,
+				type: "say",
+				say: "text",
+				text:
+					"pre " +
+					"<files>multi</files> " +
+					"<file id='1'>abc</file> " +
+					"<content>blob</content> " +
+					"<file_content>secretbytes</file_content> " +
+					"post",
+			},
+			// Non-sensitive string should remain identical
+			{ ts: 3, type: "say", say: "text", text: "no sensitive" },
+			// Non-string text should be left untouched
+			{ ts: 4, type: "say", say: "text", text: undefined },
+		] as any[]
+
+		await saveTaskMessages({ messages, taskId: "t1", globalStoragePath: "/any" })
+
+		// Assert the output path: mocked task directory joined with the ui-messages file name
+		expect(writtenPath).toBe(path.join("/tmp/taskdir", GlobalFileNames.uiMessages))
+		expect(Array.isArray(writtenData)).toBe(true)
+
+		const [m1, m2, m3, m4] = writtenData as any[]
+
+		// m1: JSON envelope should be sanitized inside request
+		const m1Obj = JSON.parse(m1.text || "{}")
+		expect(typeof m1Obj.request).toBe("string")
+		expect(m1Obj.request).toContain("<file_content>[omitted]</file_content>")
+		expect(m1Obj.request).toContain("<content>[omitted]</content>")
+		expect(m1Obj.request).toContain("<file>[omitted]</file>")
+		expect(m1Obj.request).toContain("<files>[omitted]</files>")
+		// Original payloads should not remain
+		expect(m1Obj.request).not.toContain("topsecret")
+		expect(m1Obj.request).not.toContain("inner")
+		expect(m1Obj.request).not.toContain("body")
+		expect(m1Obj.request).not.toContain("s1") // m1's <files> payload is "s1"; "multi" only exists in m2
+
+		// m2: raw text with tags should be scrubbed
+		expect(m2.text).toContain("<file_content>[omitted]</file_content>")
+		expect(m2.text).toContain("<content>[omitted]</content>")
+		expect(m2.text).toContain("<file>[omitted]</file>")
+		expect(m2.text).toContain("<files>[omitted]</files>")
+		expect(m2.text).not.toContain("secretbytes")
+		expect(m2.text).not.toContain("blob")
+		expect(m2.text).not.toContain("abc")
+		expect(m2.text).not.toContain("multi")
+
+		// m3: unchanged safe content
+		expect(m3.text).toBe("no sensitive")
+
+		// m4: undefined remains undefined
+		expect(m4.text).toBeUndefined()
+	})
+
+	it("readTaskMessages() should sanitize legacy on read", async () => {
+		const legacy = [ // stands in for an on-disk file that still holds unsanitized payloads
+			{
+				ts: 10,
+				type: "say",
+				say: "api_req_started",
+				text: JSON.stringify({
+					request: "X <file_content>L3gacy</file_content> Y",
+					apiProtocol: "anthropic",
+				}),
+			},
+			{
+				ts: 11,
+				type: "say",
+				say: "text",
+				text: "pre <files>bundle</files> post",
+			},
+		]
+		mockReadFilePayload = JSON.stringify(legacy) // served back by the mocked readFile
+
+		const result = await readTaskMessages({ taskId: "t2", globalStoragePath: "/any" })
+
+		expect(result.length).toBe(2)
+		const [r1, r2] = result as any[]
+
+		const r1Obj = JSON.parse(r1.text || "{}") // r1.text is a JSON envelope; inspect its request field
+		expect(r1Obj.request).toContain("<file_content>[omitted]</file_content>")
+		expect(r1Obj.request).not.toContain("L3gacy")
+
+		expect(r2.text).toContain("<files>[omitted]</files>") // raw (non-JSON) text is scrubbed directly
+		expect(r2.text).not.toContain("bundle")
+	})
+
+	it("sanitization should be idempotent", async () => {
+		const alreadySanitized = [ // text already carries the [omitted] markers the sanitizer emits
+			{
+				ts: 20,
+				type: "say",
+				say: "text",
+				text: "A <file_content>[omitted]</file_content> B <content>[omitted]</content>",
+			},
+		]
+		await saveTaskMessages({ messages: alreadySanitized as any[], taskId: "t3", globalStoragePath: "/any" })
+		expect(writtenData[0].text).toBe("A <file_content>[omitted]</file_content> B <content>[omitted]</content>") // must be byte-for-byte unchanged
+	})
+})
diff --git a/src/core/task-persistence/taskMessages.ts b/src/core/task-persistence/taskMessages.ts
@@ -14,18 +14,29 @@ import { getTaskDirectoryPath } from "../../utils/storage"
  * We only need to ensure sensitive file payloads are NOT persisted to disk (ui_messages.json).
  * Centralizing the sanitization in the persistence layer keeps Task.ts simple and avoids scattering
  * redaction logic across multiple call-sites.
+ *
+ * Precompiled patterns are hoisted to module scope for clarity and efficiency.
+ * Precedence: patterns are applied in the order file_content, content, file, files (more specific tags first).
  */
+const FILE_CONTENT_TAG_RE = /<file_content\b[\s\S]*?<\/file_content>/gi // unlike the patterns below, does not require the opening tag's `>`; lazy match runs to the first closing tag
+const CONTENT_TAG_RE = /<content\b[^>]*>[\s\S]*?<\/content>/gi // needs a literal `<content`, so a <file_content> opener is not matched by this pattern
+const FILE_TAG_RE = /<file\b[^>]*>[\s\S]*?<\/file>/gi // `\b` keeps this from matching <files> or <file_content> openers (`s` and `_` are word characters)
+const FILES_TAG_RE = /<files\b[^>]*>[\s\S]*?<\/files>/gi // all four use flags g (every occurrence) and i (case-insensitive)
+
+function hasStringText(m: ClineMessage): m is ClineMessage & { text: string } { // type guard: true only when m.text is a string at runtime
+	return typeof (m as any)?.text === "string" // `?.` makes a null/undefined m yield false instead of throwing
+}
 
 function sanitizeMessageText(text?: string): string | undefined {
 	if (!text) return text
 
 	// Scrub helper that replaces inner contents of known file payload tags with an omission marker
 	const scrub = (s: string): string => {
 		// Order matters: scrub more specific tags first
-		s = s.replace(/<file_content\b[\s\S]*?<\/file_content>/gi, "<file_content>[omitted]</file_content>")
-		s = s.replace(/<content\b[^>]*>[\s\S]*?<\/content>/gi, "<content>[omitted]</content>")
-		s = s.replace(/<file\b[^>]*>[\s\S]*?<\/file>/gi, "<file>[omitted]</file>")
-		s = s.replace(/<files\b[^>]*>[\s\S]*?<\/files>/gi, "<files>[omitted]</files>")
+		s = s.replace(FILE_CONTENT_TAG_RE, "<file_content>[omitted]</file_content>")
+		s = s.replace(CONTENT_TAG_RE, "<content>[omitted]</content>")
+		s = s.replace(FILE_TAG_RE, "<file>[omitted]</file>")
+		s = s.replace(FILES_TAG_RE, "<files>[omitted]</files>")
 		return s
 	}
 
@@ -45,8 +56,8 @@ function sanitizeMessageText(text?: string): string | undefined {
 
 function sanitizeMessages(messages: ClineMessage[]): ClineMessage[] {
 	return messages.map((m) => {
-		if (typeof (m as any).text === "string") {
-			return { ...m, text: sanitizeMessageText((m as any).text) }
+		if (hasStringText(m)) {
+			return { ...m, text: sanitizeMessageText(m.text) }
 		}
 		return m
 	})
@@ -57,6 +68,13 @@ export type ReadTaskMessagesOptions = {
 	globalStoragePath: string
 }
 
+/**
+ * Note on double-sanitization:
+ * - The canonical enforcement point is write-time via saveTaskMessages().
+ * - We also sanitize on read here as a transitional safety net to protect against any
+ *   legacy ui_messages.json that may still contain payloads from older versions.
+ *   This read-time sanitization can be removed in a future version once legacy data is unlikely to remain.
+ */
 export async function readTaskMessages({
 	taskId,
 	globalStoragePath,