fix: add UTF-8 BOM handling to file reading operations

roomote · roomote · commit 5063f3190ae2 · 2025-07-19T14:38:09.000Z
- Created bomUtils module with functions to detect and strip UTF-8 BOM
- Updated extractTextFromFile to strip BOM from text files
- Updated readLines to strip BOM from the first chunk when streaming
- Added comprehensive tests for BOM handling
- Updated readFileTool to treat files with only BOM as empty
- Ensures Windows files with BOM are handled correctly

Addresses the concern raised in #5789 about UTF-8 BOM causing issues on Windows
diff --git a/src/core/tools/__tests__/readFileTool.spec.ts b/src/core/tools/__tests__/readFileTool.spec.ts
@@ -481,6 +481,36 @@ describe("read_file tool XML output structure", () => {
 				`<files>\n<file><path>${testFilePath}</path>\n<content/><notice>File is empty</notice>\n</file>\n</files>`,
 			)
 		})
+
+		it("should treat files with only BOM as empty", async () => {
+			// Setup - extractTextFromFile is mocked, so the lone BOM reaches readFileTool unstripped
+			mockedCountFileLines.mockResolvedValue(1) // File has 1 line
+			mockedExtractTextFromFile.mockResolvedValue("\uFEFF") // Only BOM
+			mockProvider.getState.mockResolvedValue({ maxReadFileLine: -1 })
+
+			// Execute
+			const result = await executeReadFileTool({}, { totalLines: 1 })
+
+			// Verify - empty file notice comes from readFileTool's content.trim() === "" check; trim() removes U+FEFF
+			expect(result).toBe(
+				`<files>\n<file><path>${testFilePath}</path>\n<content/><notice>File is empty</notice>\n</file>\n</files>`,
+			)
+		})
+
+		it("should not emit a BOM when extractTextFromFile returns BOM-free content", async () => {
+			// Setup - extractTextFromFile is mocked, so return what it yields after stripping the BOM
+			mockedCountFileLines.mockResolvedValue(1)
+			mockedExtractTextFromFile.mockResolvedValue("1 | Hello World") // Line-numbered content, BOM already removed
+			mockProvider.getState.mockResolvedValue({ maxReadFileLine: -1 })
+
+			// Execute
+			const result = await executeReadFileTool({}, { totalLines: 1 })
+
+			// Verify - the tool output carries the content unchanged and contains no BOM
+			expect(result).toBe(
+				`<files>\n<file><path>${testFilePath}</path>\n<content lines="1-1">\n1 | Hello World</content>\n</file>\n</files>`,
+			)
+		})
 	})
 
 	describe("Error Handling Tests", () => {
diff --git a/src/core/tools/readFileTool.ts b/src/core/tools/readFileTool.ts
@@ -519,9 +519,14 @@ export async function readFileTool(
 				// Handle normal file read
 				const content = await extractTextFromFile(fullPath)
 				const lineRangeAttr = ` lines="1-${totalLines}"`
-				let xmlInfo = totalLines > 0 ? `<content${lineRangeAttr}>\n${content}</content>\n` : `<content/>`
 
-				if (totalLines === 0) {
+				// Check if file is effectively empty (no lines, only whitespace, or only BOM)
+				// Note: BOM is already stripped by extractTextFromFile
+				const isEffectivelyEmpty = totalLines === 0 || content.trim() === ""
+
+				let xmlInfo = !isEffectivelyEmpty ? `<content${lineRangeAttr}>\n${content}</content>\n` : `<content/>`
+
+				if (isEffectivelyEmpty) {
 					xmlInfo += `<notice>File is empty</notice>\n`
 				}
 
diff --git a/src/integrations/misc/extract-text.ts b/src/integrations/misc/extract-text.ts
@@ -5,6 +5,7 @@ import mammoth from "mammoth"
 import fs from "fs/promises"
 import { isBinaryFile } from "isbinaryfile"
 import { extractTextFromXLSX } from "./extract-text-from-xlsx"
+import { stripBOM } from "../../utils/bomUtils"
 
 async function extractTextFromPDF(filePath: string): Promise<string> {
 	const dataBuffer = await fs.readFile(filePath)
@@ -67,7 +68,9 @@ export async function extractTextFromFile(filePath: string): Promise<string> {
 	const isBinary = await isBinaryFile(filePath).catch(() => false)
 
 	if (!isBinary) {
-		return addLineNumbers(await fs.readFile(filePath, "utf8"))
+		const content = await fs.readFile(filePath, "utf8")
+		// Strip BOM if present before adding line numbers
+		return addLineNumbers(stripBOM(content))
 	} else {
 		throw new Error(`Cannot read text for file type: ${fileExtension}`)
 	}
diff --git a/src/integrations/misc/read-lines.ts b/src/integrations/misc/read-lines.ts
@@ -7,6 +7,7 @@
  * Now you can read a range of lines from a file
  */
 import { createReadStream } from "fs"
+import { stripBOM } from "../../utils/bomUtils"
 
 const outOfRangeError = (filepath: string, n: number) => {
 	return new RangeError(`Line with index ${n} does not exist in '${filepath}'. Note that line indexing is zero-based`)
@@ -57,14 +58,24 @@ export function readLines(filepath: string, endLine?: number, startLine?: number
 		let buffer = ""
 		let lineCount = 0
 		let result = ""
+		let isFirstChunk = true
 
 		// Handle errors
 		input.on("error", reject)
 
 		// Process data chunks directly
 		input.on("data", (chunk) => {
+			// Convert chunk to string
+			let chunkStr = chunk.toString()
+
+			// Strip BOM from the first chunk if present
+			if (isFirstChunk) {
+				chunkStr = stripBOM(chunkStr)
+				isFirstChunk = false
+			}
+
 			// Add chunk to buffer
-			buffer += chunk.toString()
+			buffer += chunkStr
 
 			let pos = 0
 			let nextNewline = buffer.indexOf("\n", pos)
diff --git a/src/utils/__tests__/bomUtils.test.ts b/src/utils/__tests__/bomUtils.test.ts
@@ -0,0 +1,81 @@
+import { describe, it, expect } from "vitest"
+import { stripBOM, hasBOM, stripBOMFromBuffer, UTF8_BOM, UTF8_BOM_BYTES } from "../bomUtils"
+
+describe("bomUtils", () => {
+	describe("stripBOM", () => { // String-level helper: input is already-decoded text
+		it("should strip BOM from string with BOM", () => {
+			const contentWithBOM = UTF8_BOM + "Hello World"
+			const result = stripBOM(contentWithBOM)
+			expect(result).toBe("Hello World")
+		})
+
+		it("should return unchanged string without BOM", () => {
+			const contentWithoutBOM = "Hello World"
+			const result = stripBOM(contentWithoutBOM)
+			expect(result).toBe("Hello World")
+		})
+
+		it("should handle empty string", () => {
+			const result = stripBOM("")
+			expect(result).toBe("")
+		})
+
+		it("should handle string with only BOM", () => {
+			const result = stripBOM(UTF8_BOM) // Input is the BOM character and nothing else
+			expect(result).toBe("")
+		})
+
+		it("should only strip BOM from beginning", () => {
+			const content = UTF8_BOM + "Hello" + UTF8_BOM + "World" // BOM at index 0 and again mid-string
+			const result = stripBOM(content)
+			expect(result).toBe("Hello" + UTF8_BOM + "World") // Only the leading BOM is removed
+		})
+	})
+
+	describe("hasBOM", () => { // Byte-level check on a raw Buffer
+		it("should detect BOM in buffer", () => {
+			const bufferWithBOM = Buffer.concat([UTF8_BOM_BYTES, Buffer.from("Hello")])
+			expect(hasBOM(bufferWithBOM)).toBe(true)
+		})
+
+		it("should return false for buffer without BOM", () => {
+			const bufferWithoutBOM = Buffer.from("Hello")
+			expect(hasBOM(bufferWithoutBOM)).toBe(false)
+		})
+
+		it("should return false for empty buffer", () => {
+			const emptyBuffer = Buffer.alloc(0) // Zero-length buffer
+			expect(hasBOM(emptyBuffer)).toBe(false)
+		})
+
+		it("should return false for buffer too short to contain BOM", () => {
+			const shortBuffer = Buffer.from([0xef, 0xbb]) // Only the first 2 of the 3 BOM bytes
+			expect(hasBOM(shortBuffer)).toBe(false)
+		})
+	})
+
+	describe("stripBOMFromBuffer", () => { // Byte-level counterpart of stripBOM
+		it("should strip BOM from buffer with BOM", () => {
+			const bufferWithBOM = Buffer.concat([UTF8_BOM_BYTES, Buffer.from("Hello")])
+			const result = stripBOMFromBuffer(bufferWithBOM)
+			expect(result.toString()).toBe("Hello")
+		})
+
+		it("should return unchanged buffer without BOM", () => {
+			const bufferWithoutBOM = Buffer.from("Hello")
+			const result = stripBOMFromBuffer(bufferWithoutBOM)
+			expect(result.toString()).toBe("Hello")
+		})
+
+		it("should handle empty buffer", () => {
+			const emptyBuffer = Buffer.alloc(0)
+			const result = stripBOMFromBuffer(emptyBuffer)
+			expect(result.length).toBe(0)
+		})
+
+		it("should handle buffer with only BOM", () => {
+			const result = stripBOMFromBuffer(UTF8_BOM_BYTES) // Input is exactly the 3 BOM bytes
+			expect(result.length).toBe(0)
+		})
+	})
+})
diff --git a/src/utils/bomUtils.ts b/src/utils/bomUtils.ts
@@ -0,0 +1,42 @@
+/**
+ * UTF-8 BOM (Byte Order Mark) utilities
+ */
+
+// The BOM code point U+FEFF, i.e. how the BOM appears in an already-decoded string
+export const UTF8_BOM = "\uFEFF"
+
+// The UTF-8 encoding of U+FEFF (EF BB BF), for matching against raw bytes
+export const UTF8_BOM_BYTES = Buffer.from([0xef, 0xbb, 0xbf])
+
+/**
+ * Removes a single leading byte order mark (U+FEFF) from decoded text
+ * @param content The decoded text to inspect
+ * @returns The text without its first character when that character is U+FEFF, otherwise the text unchanged
+ */
+export function stripBOM(content: string): string {
+	// Only index 0 is examined; a U+FEFF occurring later in the text is preserved
+	const startsWithBom = content.startsWith("\uFEFF")
+
+	return startsWithBom ? content.slice(1) : content
+}
+
+/**
+ * Reports whether the first three bytes of a buffer are the UTF-8 BOM (EF BB BF)
+ * @param buffer The raw bytes to inspect
+ * @returns True only when at least three bytes exist and they match the BOM exactly
+ */
+export function hasBOM(buffer: Buffer): boolean {
+	return [0xef, 0xbb, 0xbf].every((expected, index) => buffer[index] === expected)
+}
+
+/**
+ * Strips UTF-8 BOM from the beginning of a buffer if present
+ * @param buffer The buffer to process
+ * @returns A view over the same memory that skips the BOM, or the original buffer when no BOM is present
+ */
+export function stripBOMFromBuffer(buffer: Buffer): Buffer {
+	if (hasBOM(buffer)) {
+		return buffer.subarray(3) // subarray replaces the deprecated Buffer#slice; both return a view, not a copy
+	}
+	return buffer
+}