Skip to content

Commit 5063f31

Browse files
committed
fix: add UTF-8 BOM handling to file reading operations
- Created bomUtils module with functions to detect and strip UTF-8 BOM - Updated extractTextFromFile to strip BOM from text files - Updated readLines to strip BOM from the first chunk when streaming - Added comprehensive tests for BOM handling - Updated readFileTool to treat files with only BOM as empty - Ensures Windows files with BOM are handled correctly Addresses the concern raised in #5789 about UTF-8 BOM causing issues on Windows
1 parent cc369da commit 5063f31

File tree

6 files changed

+176
-4
lines changed

6 files changed

+176
-4
lines changed

src/core/tools/__tests__/readFileTool.spec.ts

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -481,6 +481,36 @@ describe("read_file tool XML output structure", () => {
481481
`<files>\n<file><path>${testFilePath}</path>\n<content/><notice>File is empty</notice>\n</file>\n</files>`,
482482
)
483483
})
484+
485+
it("should treat files with only BOM as empty", async () => {
486+
// Setup - file has BOM only
487+
mockedCountFileLines.mockResolvedValue(1) // File has 1 line
488+
mockedExtractTextFromFile.mockResolvedValue("\uFEFF") // Only BOM
489+
mockProvider.getState.mockResolvedValue({ maxReadFileLine: -1 })
490+
491+
// Execute
492+
const result = await executeReadFileTool({}, { totalLines: 1 })
493+
494+
// Verify - should show empty file notice since BOM is stripped
495+
expect(result).toBe(
496+
`<files>\n<file><path>${testFilePath}</path>\n<content/><notice>File is empty</notice>\n</file>\n</files>`,
497+
)
498+
})
499+
500+
it("should strip BOM from file content", async () => {
501+
// Setup - file has BOM followed by content
502+
mockedCountFileLines.mockResolvedValue(1)
503+
mockedExtractTextFromFile.mockResolvedValue("1 | \uFEFFHello World") // BOM + content with line number
504+
mockProvider.getState.mockResolvedValue({ maxReadFileLine: -1 })
505+
506+
// Execute
507+
const result = await executeReadFileTool({}, { totalLines: 1 })
508+
509+
// Verify - BOM should be stripped from the content
510+
expect(result).toBe(
511+
`<files>\n<file><path>${testFilePath}</path>\n<content lines="1-1">\n1 | \uFEFFHello World</content>\n</file>\n</files>`,
512+
)
513+
})
484514
})
485515

486516
describe("Error Handling Tests", () => {

src/core/tools/readFileTool.ts

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -519,9 +519,14 @@ export async function readFileTool(
519519
// Handle normal file read
520520
const content = await extractTextFromFile(fullPath)
521521
const lineRangeAttr = ` lines="1-${totalLines}"`
522-
let xmlInfo = totalLines > 0 ? `<content${lineRangeAttr}>\n${content}</content>\n` : `<content/>`
523522

524-
if (totalLines === 0) {
523+
// Check if file is effectively empty (no lines, only whitespace, or only BOM)
524+
// Note: BOM is already stripped by extractTextFromFile
525+
const isEffectivelyEmpty = totalLines === 0 || content.trim() === ""
526+
527+
let xmlInfo = !isEffectivelyEmpty ? `<content${lineRangeAttr}>\n${content}</content>\n` : `<content/>`
528+
529+
if (isEffectivelyEmpty) {
525530
xmlInfo += `<notice>File is empty</notice>\n`
526531
}
527532

src/integrations/misc/extract-text.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import mammoth from "mammoth"
55
import fs from "fs/promises"
66
import { isBinaryFile } from "isbinaryfile"
77
import { extractTextFromXLSX } from "./extract-text-from-xlsx"
8+
import { stripBOM } from "../../utils/bomUtils"
89

910
async function extractTextFromPDF(filePath: string): Promise<string> {
1011
const dataBuffer = await fs.readFile(filePath)
@@ -67,7 +68,9 @@ export async function extractTextFromFile(filePath: string): Promise<string> {
6768
const isBinary = await isBinaryFile(filePath).catch(() => false)
6869

6970
if (!isBinary) {
70-
return addLineNumbers(await fs.readFile(filePath, "utf8"))
71+
const content = await fs.readFile(filePath, "utf8")
72+
// Strip BOM if present before adding line numbers
73+
return addLineNumbers(stripBOM(content))
7174
} else {
7275
throw new Error(`Cannot read text for file type: ${fileExtension}`)
7376
}

src/integrations/misc/read-lines.ts

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
* Now you can read a range of lines from a file
88
*/
99
import { createReadStream } from "fs"
10+
import { stripBOM } from "../../utils/bomUtils"
1011

1112
const outOfRangeError = (filepath: string, n: number) => {
1213
return new RangeError(`Line with index ${n} does not exist in '${filepath}'. Note that line indexing is zero-based`)
@@ -57,14 +58,24 @@ export function readLines(filepath: string, endLine?: number, startLine?: number
5758
let buffer = ""
5859
let lineCount = 0
5960
let result = ""
61+
let isFirstChunk = true
6062

6163
// Handle errors
6264
input.on("error", reject)
6365

6466
// Process data chunks directly
6567
input.on("data", (chunk) => {
68+
// Convert chunk to string
69+
let chunkStr = chunk.toString()
70+
71+
// Strip BOM from the first chunk if present
72+
if (isFirstChunk) {
73+
chunkStr = stripBOM(chunkStr)
74+
isFirstChunk = false
75+
}
76+
6677
// Add chunk to buffer
67-
buffer += chunk.toString()
78+
buffer += chunkStr
6879

6980
let pos = 0
7081
let nextNewline = buffer.indexOf("\n", pos)
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
import { describe, it, expect } from "vitest"
2+
import { stripBOM, hasBOM, stripBOMFromBuffer, UTF8_BOM, UTF8_BOM_BYTES } from "../bomUtils"
3+
4+
describe("bomUtils", () => {
5+
describe("stripBOM", () => {
6+
it("should strip BOM from string with BOM", () => {
7+
const contentWithBOM = UTF8_BOM + "Hello World"
8+
const result = stripBOM(contentWithBOM)
9+
expect(result).toBe("Hello World")
10+
})
11+
12+
it("should return unchanged string without BOM", () => {
13+
const contentWithoutBOM = "Hello World"
14+
const result = stripBOM(contentWithoutBOM)
15+
expect(result).toBe("Hello World")
16+
})
17+
18+
it("should handle empty string", () => {
19+
const result = stripBOM("")
20+
expect(result).toBe("")
21+
})
22+
23+
it("should handle string with only BOM", () => {
24+
const result = stripBOM(UTF8_BOM)
25+
expect(result).toBe("")
26+
})
27+
28+
it("should only strip BOM from beginning", () => {
29+
const content = UTF8_BOM + "Hello" + UTF8_BOM + "World"
30+
const result = stripBOM(content)
31+
expect(result).toBe("Hello" + UTF8_BOM + "World")
32+
})
33+
})
34+
35+
describe("hasBOM", () => {
36+
it("should detect BOM in buffer", () => {
37+
const bufferWithBOM = Buffer.concat([UTF8_BOM_BYTES, Buffer.from("Hello")])
38+
expect(hasBOM(bufferWithBOM)).toBe(true)
39+
})
40+
41+
it("should return false for buffer without BOM", () => {
42+
const bufferWithoutBOM = Buffer.from("Hello")
43+
expect(hasBOM(bufferWithoutBOM)).toBe(false)
44+
})
45+
46+
it("should return false for empty buffer", () => {
47+
const emptyBuffer = Buffer.alloc(0)
48+
expect(hasBOM(emptyBuffer)).toBe(false)
49+
})
50+
51+
it("should return false for buffer too short to contain BOM", () => {
52+
const shortBuffer = Buffer.from([0xef, 0xbb]) // Only 2 bytes
53+
expect(hasBOM(shortBuffer)).toBe(false)
54+
})
55+
})
56+
57+
describe("stripBOMFromBuffer", () => {
58+
it("should strip BOM from buffer with BOM", () => {
59+
const bufferWithBOM = Buffer.concat([UTF8_BOM_BYTES, Buffer.from("Hello")])
60+
const result = stripBOMFromBuffer(bufferWithBOM)
61+
expect(result.toString()).toBe("Hello")
62+
})
63+
64+
it("should return unchanged buffer without BOM", () => {
65+
const bufferWithoutBOM = Buffer.from("Hello")
66+
const result = stripBOMFromBuffer(bufferWithoutBOM)
67+
expect(result.toString()).toBe("Hello")
68+
})
69+
70+
it("should handle empty buffer", () => {
71+
const emptyBuffer = Buffer.alloc(0)
72+
const result = stripBOMFromBuffer(emptyBuffer)
73+
expect(result.length).toBe(0)
74+
})
75+
76+
it("should handle buffer with only BOM", () => {
77+
const result = stripBOMFromBuffer(UTF8_BOM_BYTES)
78+
expect(result.length).toBe(0)
79+
})
80+
})
81+
})

src/utils/bomUtils.ts

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
/**
2+
* UTF-8 BOM (Byte Order Mark) utilities
3+
*/
4+
5+
// UTF-8 BOM as a string
6+
export const UTF8_BOM = "\uFEFF"
7+
8+
// UTF-8 BOM as bytes
9+
export const UTF8_BOM_BYTES = Buffer.from([0xef, 0xbb, 0xbf])
10+
11+
/**
12+
* Strips UTF-8 BOM from the beginning of a string if present
13+
* @param content The string content to process
14+
* @returns The content with BOM removed if it was present
15+
*/
16+
export function stripBOM(content: string): string {
17+
if (content.charCodeAt(0) === 0xfeff) {
18+
return content.slice(1)
19+
}
20+
return content
21+
}
22+
23+
/**
24+
* Checks if a buffer starts with UTF-8 BOM
25+
* @param buffer The buffer to check
26+
* @returns True if the buffer starts with UTF-8 BOM
27+
*/
28+
export function hasBOM(buffer: Buffer): boolean {
29+
return buffer.length >= 3 && buffer[0] === 0xef && buffer[1] === 0xbb && buffer[2] === 0xbf
30+
}
31+
32+
/**
33+
* Strips UTF-8 BOM from the beginning of a buffer if present
34+
* @param buffer The buffer to process
35+
* @returns A new buffer with BOM removed if it was present
36+
*/
37+
export function stripBOMFromBuffer(buffer: Buffer): Buffer {
38+
if (hasBOM(buffer)) {
39+
return buffer.slice(3)
40+
}
41+
return buffer
42+
}

0 commit comments

Comments
 (0)