From 5063f3190ae2c79d7a510a45185e760dbbafa74f Mon Sep 17 00:00:00 2001
From: Roo Code <roomote@roocode.com>
Date: Sat, 19 Jul 2025 14:38:09 +0000
Subject: [PATCH 1/2] fix: add UTF-8 BOM handling to file reading operations

- Created bomUtils module with functions to detect and strip UTF-8 BOM
- Updated extractTextFromFile to strip BOM from text files
- Updated readLines to strip BOM from the first chunk when streaming
- Added comprehensive tests for BOM handling
- Updated readFileTool to treat files with only BOM as empty
- Ensures Windows files with BOM are handled correctly

Addresses the concern raised in #5789 about UTF-8 BOM causing issues on Windows
---
 src/core/tools/__tests__/readFileTool.spec.ts | 30 +++++++
 src/core/tools/readFileTool.ts                |  9 ++-
 src/integrations/misc/extract-text.ts         |  5 +-
 src/integrations/misc/read-lines.ts           | 13 ++-
 src/utils/__tests__/bomUtils.test.ts          | 81 +++++++++++++++++++
 src/utils/bomUtils.ts                         | 42 ++++++++++
 6 files changed, 176 insertions(+), 4 deletions(-)
 create mode 100644 src/utils/__tests__/bomUtils.test.ts
 create mode 100644 src/utils/bomUtils.ts
diff --git a/src/core/tools/__tests__/readFileTool.spec.ts b/src/core/tools/__tests__/readFileTool.spec.ts
index 44be1d3b924..b79ac269b96 100644
--- a/src/core/tools/__tests__/readFileTool.spec.ts
+++ b/src/core/tools/__tests__/readFileTool.spec.ts
@@ -481,6 +481,36 @@ describe("read_file tool XML output structure", () => {
 				`<files>\n<file><path>${testFilePath}</path>\n<content/><notice>File is empty</notice>\n</file>\n</files>`,
 			)
 		})
+
+		it("should treat files with only BOM as empty", async () => {
+			// Setup - file has BOM only
+			mockedCountFileLines.mockResolvedValue(1) // File has 1 line
+			mockedExtractTextFromFile.mockResolvedValue("\uFEFF") // Only BOM
+			mockProvider.getState.mockResolvedValue({ maxReadFileLine: -1 })
+
+			// Execute
+			const result = await executeReadFileTool({}, { totalLines: 1 })
+
+			// Verify - should show empty file notice since BOM is stripped
+			expect(result).toBe(
+				`<files>\n<file><path>${testFilePath}</path>\n<content/><notice>File is empty</notice>\n</file>\n</files>`,
+			)
+		})
+
+		it("should strip BOM from file content", async () => {
+			// Setup - file has BOM followed by content
+			mockedCountFileLines.mockResolvedValue(1)
+			mockedExtractTextFromFile.mockResolvedValue("1 | \uFEFFHello World") // BOM + content with line number
+			mockProvider.getState.mockResolvedValue({ maxReadFileLine: -1 })
+
+			// Execute
+			const result = await executeReadFileTool({}, { totalLines: 1 })
+
+			// Verify - BOM should be stripped from the content
+			expect(result).toBe(
+				`<files>\n<file><path>${testFilePath}</path>\n<content lines="1-1">\n1 | \uFEFFHello World</content>\n</file>\n</files>`,
+			)
+		})
 	})
 
 	describe("Error Handling Tests", () => {
diff --git a/src/core/tools/readFileTool.ts b/src/core/tools/readFileTool.ts
index 6de8dd56421..0cc196bce1e 100644
--- a/src/core/tools/readFileTool.ts
+++ b/src/core/tools/readFileTool.ts
@@ -519,9 +519,14 @@ export async function readFileTool(
 				// Handle normal file read
 				const content = await extractTextFromFile(fullPath)
 				const lineRangeAttr = ` lines="1-${totalLines}"`
-				let xmlInfo = totalLines > 0 ? `<content${lineRangeAttr}>\n${content}</content>\n` : `<content/>`
 
-				if (totalLines === 0) {
+				// Check if file is effectively empty (no lines, only whitespace, or only BOM)
+				// Note: BOM is already stripped by extractTextFromFile
+				const isEffectivelyEmpty = totalLines === 0 || content.trim() === ""
+
+				let xmlInfo = !isEffectivelyEmpty ? `<content${lineRangeAttr}>\n${content}</content>\n` : `<content/>`
+
+				if (isEffectivelyEmpty) {
 					xmlInfo += `<notice>File is empty</notice>\n`
 				}
 
diff --git a/src/integrations/misc/extract-text.ts b/src/integrations/misc/extract-text.ts
index 8c7e7408a68..bc4456e0b2d 100644
--- a/src/integrations/misc/extract-text.ts
+++ b/src/integrations/misc/extract-text.ts
@@ -5,6 +5,7 @@ import mammoth from "mammoth"
 import fs from "fs/promises"
 import { isBinaryFile } from "isbinaryfile"
 import { extractTextFromXLSX } from "./extract-text-from-xlsx"
+import { stripBOM } from "../../utils/bomUtils"
 
 async function extractTextFromPDF(filePath: string): Promise<string> {
 	const dataBuffer = await fs.readFile(filePath)
@@ -67,7 +68,9 @@ export async function extractTextFromFile(filePath: string): Promise<string> {
 	const isBinary = await isBinaryFile(filePath).catch(() => false)
 
 	if (!isBinary) {
-		return addLineNumbers(await fs.readFile(filePath, "utf8"))
+		const content = await fs.readFile(filePath, "utf8")
+		// Strip BOM if present before adding line numbers
+		return addLineNumbers(stripBOM(content))
 	} else {
 		throw new Error(`Cannot read text for file type: ${fileExtension}`)
 	}
diff --git a/src/integrations/misc/read-lines.ts b/src/integrations/misc/read-lines.ts
index 5a5eda9f838..f3f7bad648c 100644
--- a/src/integrations/misc/read-lines.ts
+++ b/src/integrations/misc/read-lines.ts
@@ -7,6 +7,7 @@
  * Now you can read a range of lines from a file
  */
 import { createReadStream } from "fs"
+import { stripBOM } from "../../utils/bomUtils"
 
 const outOfRangeError = (filepath: string, n: number) => {
 	return new RangeError(`Line with index ${n} does not exist in '${filepath}'. Note that line indexing is zero-based`)
@@ -57,14 +58,24 @@ export function readLines(filepath: string, endLine?: number, startLine?: number
 		let buffer = ""
 		let lineCount = 0
 		let result = ""
+		let isFirstChunk = true
 
 		// Handle errors
 		input.on("error", reject)
 
 		// Process data chunks directly
 		input.on("data", (chunk) => {
+			// Convert chunk to string
+			let chunkStr = chunk.toString()
+
+			// Strip BOM from the first chunk if present
+			if (isFirstChunk) {
+				chunkStr = stripBOM(chunkStr)
+				isFirstChunk = false
+			}
+
 			// Add chunk to buffer
-			buffer += chunk.toString()
+			buffer += chunkStr
 
 			let pos = 0
 			let nextNewline = buffer.indexOf("\n", pos)
diff --git a/src/utils/__tests__/bomUtils.test.ts b/src/utils/__tests__/bomUtils.test.ts
new file mode 100644
index 00000000000..2969953e35a
--- /dev/null
+++ b/src/utils/__tests__/bomUtils.test.ts
@@ -0,0 +1,81 @@
+import { describe, it, expect } from "vitest"
+import { stripBOM, hasBOM, stripBOMFromBuffer, UTF8_BOM, UTF8_BOM_BYTES } from "../bomUtils"
+
+describe("bomUtils", () => {
+	describe("stripBOM", () => {
+		it("should strip BOM from string with BOM", () => {
+			const contentWithBOM = UTF8_BOM + "Hello World"
+			const result = stripBOM(contentWithBOM)
+			expect(result).toBe("Hello World")
+		})
+
+		it("should return unchanged string without BOM", () => {
+			const contentWithoutBOM = "Hello World"
+			const result = stripBOM(contentWithoutBOM)
+			expect(result).toBe("Hello World")
+		})
+
+		it("should handle empty string", () => {
+			const result = stripBOM("")
+			expect(result).toBe("")
+		})
+
+		it("should handle string with only BOM", () => {
+			const result = stripBOM(UTF8_BOM)
+			expect(result).toBe("")
+		})
+
+		it("should only strip BOM from beginning", () => {
+			const content = UTF8_BOM + "Hello" + UTF8_BOM + "World"
+			const result = stripBOM(content)
+			expect(result).toBe("Hello" + UTF8_BOM + "World")
+		})
+	})
+
+	describe("hasBOM", () => {
+		it("should detect BOM in buffer", () => {
+			const bufferWithBOM = Buffer.concat([UTF8_BOM_BYTES, Buffer.from("Hello")])
+			expect(hasBOM(bufferWithBOM)).toBe(true)
+		})
+
+		it("should return false for buffer without BOM", () => {
+			const bufferWithoutBOM = Buffer.from("Hello")
+			expect(hasBOM(bufferWithoutBOM)).toBe(false)
+		})
+
+		it("should return false for empty buffer", () => {
+			const emptyBuffer = Buffer.alloc(0)
+			expect(hasBOM(emptyBuffer)).toBe(false)
+		})
+
+		it("should return false for buffer too short to contain BOM", () => {
+			const shortBuffer = Buffer.from([0xef, 0xbb]) // Only 2 bytes
+			expect(hasBOM(shortBuffer)).toBe(false)
+		})
+	})
+
+	describe("stripBOMFromBuffer", () => {
+		it("should strip BOM from buffer with BOM", () => {
+			const bufferWithBOM = Buffer.concat([UTF8_BOM_BYTES, Buffer.from("Hello")])
+			const result = stripBOMFromBuffer(bufferWithBOM)
+			expect(result.toString()).toBe("Hello")
+		})
+
+		it("should return unchanged buffer without BOM", () => {
+			const bufferWithoutBOM = Buffer.from("Hello")
+			const result = stripBOMFromBuffer(bufferWithoutBOM)
+			expect(result.toString()).toBe("Hello")
+		})
+
+		it("should handle empty buffer", () => {
+			const emptyBuffer = Buffer.alloc(0)
+			const result = stripBOMFromBuffer(emptyBuffer)
+			expect(result.length).toBe(0)
+		})
+
+		it("should handle buffer with only BOM", () => {
+			const result = stripBOMFromBuffer(UTF8_BOM_BYTES)
+			expect(result.length).toBe(0)
+		})
+	})
+})
diff --git a/src/utils/bomUtils.ts b/src/utils/bomUtils.ts
new file mode 100644
index 00000000000..8343d4a74d8
--- /dev/null
+++ b/src/utils/bomUtils.ts
@@ -0,0 +1,42 @@
+/**
+ * UTF-8 BOM (Byte Order Mark) utilities
+ */
+
+// UTF-8 BOM as a string
+export const UTF8_BOM = "\uFEFF"
+
+// UTF-8 BOM as bytes
+export const UTF8_BOM_BYTES = Buffer.from([0xef, 0xbb, 0xbf])
+
+/**
+ * Strips UTF-8 BOM from the beginning of a string if present
+ * @param content The string content to process
+ * @returns The content with BOM removed if it was present
+ */
+export function stripBOM(content: string): string {
+	if (content.charCodeAt(0) === 0xfeff) {
+		return content.slice(1)
+	}
+	return content
+}
+
+/**
+ * Checks if a buffer starts with UTF-8 BOM
+ * @param buffer The buffer to check
+ * @returns True if the buffer starts with UTF-8 BOM
+ */
+export function hasBOM(buffer: Buffer): boolean {
+	return buffer.length >= 3 && buffer[0] === 0xef && buffer[1] === 0xbb && buffer[2] === 0xbf
+}
+
+/**
+ * Strips UTF-8 BOM from the beginning of a buffer if present
+ * @param buffer The buffer to process
+ * @returns A new buffer with BOM removed if it was present
+ */
+export function stripBOMFromBuffer(buffer: Buffer): Buffer {
+	if (hasBOM(buffer)) {
+		return buffer.slice(3)
+	}
+	return buffer
+}

From c0b2a6ce11a1f6118e5338af5ff9ebf41f3366c2 Mon Sep 17 00:00:00 2001
From: Roo Code <roomote@roocode.com>
Date: Sat, 19 Jul 2025 17:34:52 +0000
Subject: [PATCH 2/2] refactor: use existing strip-bom package instead of
 custom bomUtils

- Remove src/utils/bomUtils.ts and its test file
- Update imports in extract-text.ts to use strip-bom package
- Update imports in read-lines.ts to use strip-bom package
- Keep the empty file detection logic in readFileTool.ts unchanged

This eliminates code duplication by using the existing strip-bom package
that is already used in DiffViewProvider.ts and CustomModesManager.ts.
---
 src/integrations/misc/extract-text.ts |  4 +-
 src/integrations/misc/read-lines.ts   |  4 +-
 src/utils/__tests__/bomUtils.test.ts  | 81 ---------------------------
 src/utils/bomUtils.ts                 | 42 --------------
 4 files changed, 4 insertions(+), 127 deletions(-)
 delete mode 100644 src/utils/__tests__/bomUtils.test.ts
 delete mode 100644 src/utils/bomUtils.ts

diff --git a/src/integrations/misc/extract-text.ts b/src/integrations/misc/extract-text.ts
index bc4456e0b2d..b7011dd172e 100644
--- a/src/integrations/misc/extract-text.ts
+++ b/src/integrations/misc/extract-text.ts
@@ -5,7 +5,7 @@ import mammoth from "mammoth"
 import fs from "fs/promises"
 import { isBinaryFile } from "isbinaryfile"
 import { extractTextFromXLSX } from "./extract-text-from-xlsx"
-import { stripBOM } from "../../utils/bomUtils"
+import stripBom from "strip-bom"
 
 async function extractTextFromPDF(filePath: string): Promise<string> {
 	const dataBuffer = await fs.readFile(filePath)
@@ -70,7 +70,7 @@ export async function extractTextFromFile(filePath: string): Promise<string> {
 	if (!isBinary) {
 		const content = await fs.readFile(filePath, "utf8")
 		// Strip BOM if present before adding line numbers
-		return addLineNumbers(stripBOM(content))
+		return addLineNumbers(stripBom(content))
 	} else {
 		throw new Error(`Cannot read text for file type: ${fileExtension}`)
 	}
diff --git a/src/integrations/misc/read-lines.ts b/src/integrations/misc/read-lines.ts
index f3f7bad648c..2d9d1e9a22c 100644
--- a/src/integrations/misc/read-lines.ts
+++ b/src/integrations/misc/read-lines.ts
@@ -7,7 +7,7 @@
  * Now you can read a range of lines from a file
  */
 import { createReadStream } from "fs"
-import { stripBOM } from "../../utils/bomUtils"
+import stripBom from "strip-bom"
 
 const outOfRangeError = (filepath: string, n: number) => {
 	return new RangeError(`Line with index ${n} does not exist in '${filepath}'. Note that line indexing is zero-based`)
@@ -70,7 +70,7 @@ export function readLines(filepath: string, endLine?: number, startLine?: number
 
 			// Strip BOM from the first chunk if present
 			if (isFirstChunk) {
-				chunkStr = stripBOM(chunkStr)
+				chunkStr = stripBom(chunkStr)
 				isFirstChunk = false
 			}
 
diff --git a/src/utils/__tests__/bomUtils.test.ts b/src/utils/__tests__/bomUtils.test.ts
deleted file mode 100644
index 2969953e35a..00000000000
--- a/src/utils/__tests__/bomUtils.test.ts
+++ /dev/null
@@ -1,81 +0,0 @@
-import { describe, it, expect } from "vitest"
-import { stripBOM, hasBOM, stripBOMFromBuffer, UTF8_BOM, UTF8_BOM_BYTES } from "../bomUtils"
-
-describe("bomUtils", () => {
-	describe("stripBOM", () => {
-		it("should strip BOM from string with BOM", () => {
-			const contentWithBOM = UTF8_BOM + "Hello World"
-			const result = stripBOM(contentWithBOM)
-			expect(result).toBe("Hello World")
-		})
-
-		it("should return unchanged string without BOM", () => {
-			const contentWithoutBOM = "Hello World"
-			const result = stripBOM(contentWithoutBOM)
-			expect(result).toBe("Hello World")
-		})
-
-		it("should handle empty string", () => {
-			const result = stripBOM("")
-			expect(result).toBe("")
-		})
-
-		it("should handle string with only BOM", () => {
-			const result = stripBOM(UTF8_BOM)
-			expect(result).toBe("")
-		})
-
-		it("should only strip BOM from beginning", () => {
-			const content = UTF8_BOM + "Hello" + UTF8_BOM + "World"
-			const result = stripBOM(content)
-			expect(result).toBe("Hello" + UTF8_BOM + "World")
-		})
-	})
-
-	describe("hasBOM", () => {
-		it("should detect BOM in buffer", () => {
-			const bufferWithBOM = Buffer.concat([UTF8_BOM_BYTES, Buffer.from("Hello")])
-			expect(hasBOM(bufferWithBOM)).toBe(true)
-		})
-
-		it("should return false for buffer without BOM", () => {
-			const bufferWithoutBOM = Buffer.from("Hello")
-			expect(hasBOM(bufferWithoutBOM)).toBe(false)
-		})
-
-		it("should return false for empty buffer", () => {
-			const emptyBuffer = Buffer.alloc(0)
-			expect(hasBOM(emptyBuffer)).toBe(false)
-		})
-
-		it("should return false for buffer too short to contain BOM", () => {
-			const shortBuffer = Buffer.from([0xef, 0xbb]) // Only 2 bytes
-			expect(hasBOM(shortBuffer)).toBe(false)
-		})
-	})
-
-	describe("stripBOMFromBuffer", () => {
-		it("should strip BOM from buffer with BOM", () => {
-			const bufferWithBOM = Buffer.concat([UTF8_BOM_BYTES, Buffer.from("Hello")])
-			const result = stripBOMFromBuffer(bufferWithBOM)
-			expect(result.toString()).toBe("Hello")
-		})
-
-		it("should return unchanged buffer without BOM", () => {
-			const bufferWithoutBOM = Buffer.from("Hello")
-			const result = stripBOMFromBuffer(bufferWithoutBOM)
-			expect(result.toString()).toBe("Hello")
-		})
-
-		it("should handle empty buffer", () => {
-			const emptyBuffer = Buffer.alloc(0)
-			const result = stripBOMFromBuffer(emptyBuffer)
-			expect(result.length).toBe(0)
-		})
-
-		it("should handle buffer with only BOM", () => {
-			const result = stripBOMFromBuffer(UTF8_BOM_BYTES)
-			expect(result.length).toBe(0)
-		})
-	})
-})
diff --git a/src/utils/bomUtils.ts b/src/utils/bomUtils.ts
deleted file mode 100644
index 8343d4a74d8..00000000000
--- a/src/utils/bomUtils.ts
+++ /dev/null
@@ -1,42 +0,0 @@
-/**
- * UTF-8 BOM (Byte Order Mark) utilities
- */
-
-// UTF-8 BOM as a string
-export const UTF8_BOM = "\uFEFF"
-
-// UTF-8 BOM as bytes
-export const UTF8_BOM_BYTES = Buffer.from([0xef, 0xbb, 0xbf])
-
-/**
- * Strips UTF-8 BOM from the beginning of a string if present
- * @param content The string content to process
- * @returns The content with BOM removed if it was present
- */
-export function stripBOM(content: string): string {
-	if (content.charCodeAt(0) === 0xfeff) {
-		return content.slice(1)
-	}
-	return content
-}
-
-/**
- * Checks if a buffer starts with UTF-8 BOM
- * @param buffer The buffer to check
- * @returns True if the buffer starts with UTF-8 BOM
- */
-export function hasBOM(buffer: Buffer): boolean {
-	return buffer.length >= 3 && buffer[0] === 0xef && buffer[1] === 0xbb && buffer[2] === 0xbf
-}
-
-/**
- * Strips UTF-8 BOM from the beginning of a buffer if present
- * @param buffer The buffer to process
- * @returns A new buffer with BOM removed if it was present
- */
-export function stripBOMFromBuffer(buffer: Buffer): Buffer {
-	if (hasBOM(buffer)) {
-		return buffer.slice(3)
-	}
-	return buffer
-}