From 5063f3190ae2c79d7a510a45185e760dbbafa74f Mon Sep 17 00:00:00 2001 From: Roo Code Date: Sat, 19 Jul 2025 14:38:09 +0000 Subject: [PATCH 1/2] fix: add UTF-8 BOM handling to file reading operations - Created bomUtils module with functions to detect and strip UTF-8 BOM - Updated extractTextFromFile to strip BOM from text files - Updated readLines to strip BOM from the first chunk when streaming - Added comprehensive tests for BOM handling - Updated readFileTool to treat files with only BOM as empty - Ensures Windows files with BOM are handled correctly Addresses the concern raised in #5789 about UTF-8 BOM causing issues on Windows --- src/core/tools/__tests__/readFileTool.spec.ts | 30 +++++++ src/core/tools/readFileTool.ts | 9 ++- src/integrations/misc/extract-text.ts | 5 +- src/integrations/misc/read-lines.ts | 13 ++- src/utils/__tests__/bomUtils.test.ts | 81 +++++++++++++++++++ src/utils/bomUtils.ts | 42 ++++++++++ 6 files changed, 176 insertions(+), 4 deletions(-) create mode 100644 src/utils/__tests__/bomUtils.test.ts create mode 100644 src/utils/bomUtils.ts diff --git a/src/core/tools/__tests__/readFileTool.spec.ts b/src/core/tools/__tests__/readFileTool.spec.ts index 44be1d3b924..b79ac269b96 100644 --- a/src/core/tools/__tests__/readFileTool.spec.ts +++ b/src/core/tools/__tests__/readFileTool.spec.ts @@ -481,6 +481,36 @@ describe("read_file tool XML output structure", () => { `\n${testFilePath}\nFile is empty\n\n`, ) }) + + it("should treat files with only BOM as empty", async () => { + // Setup - file has BOM only + mockedCountFileLines.mockResolvedValue(1) // File has 1 line + mockedExtractTextFromFile.mockResolvedValue("\uFEFF") // Only BOM + mockProvider.getState.mockResolvedValue({ maxReadFileLine: -1 }) + + // Execute + const result = await executeReadFileTool({}, { totalLines: 1 }) + + // Verify - should show empty file notice since BOM is stripped + expect(result).toBe( + `\n${testFilePath}\nFile is empty\n\n`, + ) + }) + + it("should strip BOM from file content", async () => { + // Setup - file has BOM followed by content + mockedCountFileLines.mockResolvedValue(1) + mockedExtractTextFromFile.mockResolvedValue("1 | \uFEFFHello World") // BOM + content with line number + mockProvider.getState.mockResolvedValue({ maxReadFileLine: -1 }) + + // Execute + const result = await executeReadFileTool({}, { totalLines: 1 }) + + // Verify - BOM should be stripped from the content + expect(result).toBe( + `\n${testFilePath}\n\n1 | \uFEFFHello World\n\n`, + ) + }) }) describe("Error Handling Tests", () => { diff --git a/src/core/tools/readFileTool.ts b/src/core/tools/readFileTool.ts index 6de8dd56421..0cc196bce1e 100644 --- a/src/core/tools/readFileTool.ts +++ b/src/core/tools/readFileTool.ts @@ -519,9 +519,14 @@ export async function readFileTool( // Handle normal file read const content = await extractTextFromFile(fullPath) const lineRangeAttr = ` lines="1-${totalLines}"` - let xmlInfo = totalLines > 0 ? `\n${content}\n` : `` - if (totalLines === 0) { + // Check if file is effectively empty (no lines, only whitespace, or only BOM) + // Note: BOM is already stripped by extractTextFromFile + const isEffectivelyEmpty = totalLines === 0 || content.trim() === "" + + let xmlInfo = !isEffectivelyEmpty ? `\n${content}\n` : `` + + if (isEffectivelyEmpty) { xmlInfo += `File is empty\n` } diff --git a/src/integrations/misc/extract-text.ts b/src/integrations/misc/extract-text.ts index 8c7e7408a68..bc4456e0b2d 100644 --- a/src/integrations/misc/extract-text.ts +++ b/src/integrations/misc/extract-text.ts @@ -5,6 +5,7 @@ import mammoth from "mammoth" import fs from "fs/promises" import { isBinaryFile } from "isbinaryfile" import { extractTextFromXLSX } from "./extract-text-from-xlsx" +import { stripBOM } from "../../utils/bomUtils" async function extractTextFromPDF(filePath: string): Promise { const dataBuffer = await fs.readFile(filePath) @@ -67,7 +68,9 @@ export async function extractTextFromFile(filePath: string): Promise { const isBinary = await isBinaryFile(filePath).catch(() => false) if (!isBinary) { - return addLineNumbers(await fs.readFile(filePath, "utf8")) + const content = await fs.readFile(filePath, "utf8") + // Strip BOM if present before adding line numbers + return addLineNumbers(stripBOM(content)) } else { throw new Error(`Cannot read text for file type: ${fileExtension}`) } diff --git a/src/integrations/misc/read-lines.ts b/src/integrations/misc/read-lines.ts index 5a5eda9f838..f3f7bad648c 100644 --- a/src/integrations/misc/read-lines.ts +++ b/src/integrations/misc/read-lines.ts @@ -7,6 +7,7 @@ * Now you can read a range of lines from a file */ import { createReadStream } from "fs" +import { stripBOM } from "../../utils/bomUtils" const outOfRangeError = (filepath: string, n: number) => { return new RangeError(`Line with index ${n} does not exist in '${filepath}'. Note that line indexing is zero-based`) @@ -57,14 +58,24 @@ export function readLines(filepath: string, endLine?: number, startLine?: number let buffer = "" let lineCount = 0 let result = "" + let isFirstChunk = true // Handle errors input.on("error", reject) // Process data chunks directly input.on("data", (chunk) => { + // Convert chunk to string + let chunkStr = chunk.toString() + + // Strip BOM from the first chunk if present + if (isFirstChunk) { + chunkStr = stripBOM(chunkStr) + isFirstChunk = false + } + // Add chunk to buffer - buffer += chunk.toString() + buffer += chunkStr let pos = 0 let nextNewline = buffer.indexOf("\n", pos) diff --git a/src/utils/__tests__/bomUtils.test.ts b/src/utils/__tests__/bomUtils.test.ts new file mode 100644 index 00000000000..2969953e35a --- /dev/null +++ b/src/utils/__tests__/bomUtils.test.ts @@ -0,0 +1,81 @@ +import { describe, it, expect } from "vitest" +import { stripBOM, hasBOM, stripBOMFromBuffer, UTF8_BOM, UTF8_BOM_BYTES } from "../bomUtils" + +describe("bomUtils", () => { + describe("stripBOM", () => { + it("should strip BOM from string with BOM", () => { + const contentWithBOM = UTF8_BOM + "Hello World" + const result = stripBOM(contentWithBOM) + expect(result).toBe("Hello World") + }) + + it("should return unchanged string without BOM", () => { + const contentWithoutBOM = "Hello World" + const result = stripBOM(contentWithoutBOM) + expect(result).toBe("Hello World") + }) + + it("should handle empty string", () => { + const result = stripBOM("") + expect(result).toBe("") + }) + + it("should handle string with only BOM", () => { + const result = stripBOM(UTF8_BOM) + expect(result).toBe("") + }) + + it("should only strip BOM from beginning", () => { + const content = UTF8_BOM + "Hello" + UTF8_BOM + "World" + const result = stripBOM(content) + expect(result).toBe("Hello" + UTF8_BOM + "World") + }) + }) + + describe("hasBOM", () => { + it("should detect BOM in buffer", () => { + const bufferWithBOM = Buffer.concat([UTF8_BOM_BYTES, Buffer.from("Hello")]) + expect(hasBOM(bufferWithBOM)).toBe(true) + }) + + it("should return false for buffer without BOM", () => { + const bufferWithoutBOM = Buffer.from("Hello") + expect(hasBOM(bufferWithoutBOM)).toBe(false) + }) + + it("should return false for empty buffer", () => { + const emptyBuffer = Buffer.alloc(0) + expect(hasBOM(emptyBuffer)).toBe(false) + }) + + it("should return false for buffer too short to contain BOM", () => { + const shortBuffer = Buffer.from([0xef, 0xbb]) // Only 2 bytes + expect(hasBOM(shortBuffer)).toBe(false) + }) + }) + + describe("stripBOMFromBuffer", () => { + it("should strip BOM from buffer with BOM", () => { + const bufferWithBOM = Buffer.concat([UTF8_BOM_BYTES, Buffer.from("Hello")]) + const result = stripBOMFromBuffer(bufferWithBOM) + expect(result.toString()).toBe("Hello") + }) + + it("should return unchanged buffer without BOM", () => { + const bufferWithoutBOM = Buffer.from("Hello") + const result = stripBOMFromBuffer(bufferWithoutBOM) + expect(result.toString()).toBe("Hello") + }) + + it("should handle empty buffer", () => { + const emptyBuffer = Buffer.alloc(0) + const result = stripBOMFromBuffer(emptyBuffer) + expect(result.length).toBe(0) + }) + + it("should handle buffer with only BOM", () => { + const result = stripBOMFromBuffer(UTF8_BOM_BYTES) + expect(result.length).toBe(0) + }) + }) +}) diff --git a/src/utils/bomUtils.ts b/src/utils/bomUtils.ts new file mode 100644 index 00000000000..8343d4a74d8 --- /dev/null +++ b/src/utils/bomUtils.ts @@ -0,0 +1,42 @@ +/** + * UTF-8 BOM (Byte Order Mark) utilities + */ + +// UTF-8 BOM as a string +export const UTF8_BOM = "\uFEFF" + +// UTF-8 BOM as bytes +export const UTF8_BOM_BYTES = Buffer.from([0xef, 0xbb, 0xbf]) + +/** + * Strips UTF-8 BOM from the beginning of a string if present + * @param content The string content to process + * @returns The content with BOM removed if it was present + */ +export function stripBOM(content: string): string { + if (content.charCodeAt(0) === 0xfeff) { + return content.slice(1) + } + return content +} + +/** + * Checks if a buffer starts with UTF-8 BOM + * @param buffer The buffer to check + * @returns True if the buffer starts with UTF-8 BOM + */ +export function hasBOM(buffer: Buffer): boolean { + return buffer.length >= 3 && buffer[0] === 0xef && buffer[1] === 0xbb && buffer[2] === 0xbf +} + +/** + * Strips UTF-8 BOM from the beginning of a buffer if present + * @param buffer The buffer to process + * @returns A new buffer with BOM removed if it was present + */ +export function stripBOMFromBuffer(buffer: Buffer): Buffer { + if (hasBOM(buffer)) { + return buffer.slice(3) + } + return buffer +} From c0b2a6ce11a1f6118e5338af5ff9ebf41f3366c2 Mon Sep 17 00:00:00 2001 From: Roo Code Date: Sat, 19 Jul 2025 17:34:52 +0000 Subject: [PATCH 2/2] refactor: use existing strip-bom package instead of custom bomUtils - Remove src/utils/bomUtils.ts and its test file - Update imports in extract-text.ts to use strip-bom package - Update imports in read-lines.ts to use strip-bom package - Keep the empty file detection logic in readFileTool.ts unchanged This eliminates code duplication by using the existing strip-bom package that is already used in DiffViewProvider.ts and CustomModesManager.ts. --- src/integrations/misc/extract-text.ts | 4 +- src/integrations/misc/read-lines.ts | 4 +- src/utils/__tests__/bomUtils.test.ts | 81 --------------------------- src/utils/bomUtils.ts | 42 -------------- 4 files changed, 4 insertions(+), 127 deletions(-) delete mode 100644 src/utils/__tests__/bomUtils.test.ts delete mode 100644 src/utils/bomUtils.ts diff --git a/src/integrations/misc/extract-text.ts b/src/integrations/misc/extract-text.ts index bc4456e0b2d..b7011dd172e 100644 --- a/src/integrations/misc/extract-text.ts +++ b/src/integrations/misc/extract-text.ts @@ -5,7 +5,7 @@ import mammoth from "mammoth" import fs from "fs/promises" import { isBinaryFile } from "isbinaryfile" import { extractTextFromXLSX } from "./extract-text-from-xlsx" -import { stripBOM } from "../../utils/bomUtils" +import stripBom from "strip-bom" async function extractTextFromPDF(filePath: string): Promise { const dataBuffer = await fs.readFile(filePath) @@ -70,7 +70,7 @@ export async function extractTextFromFile(filePath: string): Promise { if (!isBinary) { const content = await fs.readFile(filePath, "utf8") // Strip BOM if present before adding line numbers - return addLineNumbers(stripBOM(content)) + return addLineNumbers(stripBom(content)) } else { throw new Error(`Cannot read text for file type: ${fileExtension}`) } diff --git a/src/integrations/misc/read-lines.ts b/src/integrations/misc/read-lines.ts index f3f7bad648c..2d9d1e9a22c 100644 --- a/src/integrations/misc/read-lines.ts +++ b/src/integrations/misc/read-lines.ts @@ -7,7 +7,7 @@ * Now you can read a range of lines from a file */ import { createReadStream } from "fs" -import { stripBOM } from "../../utils/bomUtils" +import stripBom from "strip-bom" const outOfRangeError = (filepath: string, n: number) => { return new RangeError(`Line with index ${n} does not exist in '${filepath}'. Note that line indexing is zero-based`) @@ -70,7 +70,7 @@ export function readLines(filepath: string, endLine?: number, startLine?: number // Strip BOM from the first chunk if present if (isFirstChunk) { - chunkStr = stripBOM(chunkStr) + chunkStr = stripBom(chunkStr) isFirstChunk = false } diff --git a/src/utils/__tests__/bomUtils.test.ts b/src/utils/__tests__/bomUtils.test.ts deleted file mode 100644 index 2969953e35a..00000000000 --- a/src/utils/__tests__/bomUtils.test.ts +++ /dev/null @@ -1,81 +0,0 @@ -import { describe, it, expect } from "vitest" -import { stripBOM, hasBOM, stripBOMFromBuffer, UTF8_BOM, UTF8_BOM_BYTES } from "../bomUtils" - -describe("bomUtils", () => { - describe("stripBOM", () => { - it("should strip BOM from string with BOM", () => { - const contentWithBOM = UTF8_BOM + "Hello World" - const result = stripBOM(contentWithBOM) - expect(result).toBe("Hello World") - }) - - it("should return unchanged string without BOM", () => { - const contentWithoutBOM = "Hello World" - const result = stripBOM(contentWithoutBOM) - expect(result).toBe("Hello World") - }) - - it("should handle empty string", () => { - const result = stripBOM("") - expect(result).toBe("") - }) - - it("should handle string with only BOM", () => { - const result = stripBOM(UTF8_BOM) - expect(result).toBe("") - }) - - it("should only strip BOM from beginning", () => { - const content = UTF8_BOM + "Hello" + UTF8_BOM + "World" - const result = stripBOM(content) - expect(result).toBe("Hello" + UTF8_BOM + "World") - }) - }) - - describe("hasBOM", () => { - it("should detect BOM in buffer", () => { - const bufferWithBOM = Buffer.concat([UTF8_BOM_BYTES, Buffer.from("Hello")]) - expect(hasBOM(bufferWithBOM)).toBe(true) - }) - - it("should return false for buffer without BOM", () => { - const bufferWithoutBOM = Buffer.from("Hello") - expect(hasBOM(bufferWithoutBOM)).toBe(false) - }) - - it("should return false for empty buffer", () => { - const emptyBuffer = Buffer.alloc(0) - expect(hasBOM(emptyBuffer)).toBe(false) - }) - - it("should return false for buffer too short to contain BOM", () => { - const shortBuffer = Buffer.from([0xef, 0xbb]) // Only 2 bytes - expect(hasBOM(shortBuffer)).toBe(false) - }) - }) - - describe("stripBOMFromBuffer", () => { - it("should strip BOM from buffer with BOM", () => { - const bufferWithBOM = Buffer.concat([UTF8_BOM_BYTES, Buffer.from("Hello")]) - const result = stripBOMFromBuffer(bufferWithBOM) - expect(result.toString()).toBe("Hello") - }) - - it("should return unchanged buffer without BOM", () => { - const bufferWithoutBOM = Buffer.from("Hello") - const result = stripBOMFromBuffer(bufferWithoutBOM) - expect(result.toString()).toBe("Hello") - }) - - it("should handle empty buffer", () => { - const emptyBuffer = Buffer.alloc(0) - const result = stripBOMFromBuffer(emptyBuffer) - expect(result.length).toBe(0) - }) - - it("should handle buffer with only BOM", () => { - const result = stripBOMFromBuffer(UTF8_BOM_BYTES) - expect(result.length).toBe(0) - }) - }) -}) diff --git a/src/utils/bomUtils.ts b/src/utils/bomUtils.ts deleted file mode 100644 index 8343d4a74d8..00000000000 --- a/src/utils/bomUtils.ts +++ /dev/null @@ -1,42 +0,0 @@ -/** - * UTF-8 BOM (Byte Order Mark) utilities - */ - -// UTF-8 BOM as a string -export const UTF8_BOM = "\uFEFF" - -// UTF-8 BOM as bytes -export const UTF8_BOM_BYTES = Buffer.from([0xef, 0xbb, 0xbf]) - -/** - * Strips UTF-8 BOM from the beginning of a string if present - * @param content The string content to process - * @returns The content with BOM removed if it was present - */ -export function stripBOM(content: string): string { - if (content.charCodeAt(0) === 0xfeff) { - return content.slice(1) - } - return content -} - -/** - * Checks if a buffer starts with UTF-8 BOM - * @param buffer The buffer to check - * @returns True if the buffer starts with UTF-8 BOM - */ -export function hasBOM(buffer: Buffer): boolean { - return buffer.length >= 3 && buffer[0] === 0xef && buffer[1] === 0xbb && buffer[2] === 0xbf -} - -/** - * Strips UTF-8 BOM from the beginning of a buffer if present - * @param buffer The buffer to process - * @returns A new buffer with BOM removed if it was present - */ -export function stripBOMFromBuffer(buffer: Buffer): Buffer { - if (hasBOM(buffer)) { - return buffer.slice(3) - } - return buffer -}