Skip to content

Commit ef0a5fa

Browse files
committed
feat(encoding): improve file encoding detection and error handling
- Add jschardet dependency for better encoding detection
- Improve error handling in readLines with explicit stream error handling
- Update isBinaryFile calls to include buffer length parameter
- Enhance encoding tests with BOM preservation and error cases
- Fix binary file detection to return false on read errors
1 parent cd41603 commit ef0a5fa

File tree

7 files changed

+465
-44
lines changed

7 files changed

+465
-44
lines changed

pnpm-lock.yaml

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/core/tools/multiApplyDiffTool.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -301,7 +301,7 @@ Original error: ${errorMessage}`
301301

302302
let unified = ""
303303
try {
304-
const original = await fs.readFile(opResult.absolutePath!, "utf-8")
304+
const original = await readFileWithEncodingDetection(opResult.absolutePath!)
305305
const processed = !cline.api.getModel().id.includes("claude")
306306
? (opResult.diffItems || []).map((item) => ({
307307
...item,

src/core/tools/writeToFileTool.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,7 @@ export async function writeToFileTool(
179179
cline.diffViewProvider.editType = fileExists ? "modify" : "create"
180180
if (fileExists) {
181181
const absolutePath = path.resolve(cline.cwd, relPath)
182-
cline.diffViewProvider.originalContent = await fs.readFile(absolutePath, "utf-8")
182+
cline.diffViewProvider.originalContent = await readFileWithEncodingDetection(absolutePath)
183183
} else {
184184
cline.diffViewProvider.originalContent = ""
185185
}

src/integrations/misc/__tests__/read-lines.spec.ts

Lines changed: 252 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,22 @@
1+
import { describe, it, expect, beforeAll, afterAll, vi } from "vitest"
12
import { promises as fs } from "fs"
23
import path from "path"
34
import { readLines } from "../read-lines"
45

56
describe("nthline", () => {
67
const testFile = path.join(__dirname, "test.txt")
78

9+
// Helper function to create a temporary file, run a test, and clean up
10+
async function withTempFile(filename: string, content: string, testFn: (filepath: string) => Promise<void>) {
11+
const filepath = path.join(__dirname, filename)
12+
await fs.writeFile(filepath, content)
13+
try {
14+
await testFn(filepath)
15+
} finally {
16+
await fs.unlink(filepath)
17+
}
18+
}
19+
820
beforeAll(async () => {
921
// Create a test file with numbered lines
1022
const content = Array.from({ length: 10 }, (_, i) => `Line ${i + 1}`).join("\n")
@@ -71,17 +83,6 @@ describe("nthline", () => {
7183
await expect(readLines(testFile, 20, 15)).rejects.toThrow("does not exist")
7284
})
7385

74-
// Helper function to create a temporary file, run a test, and clean up
75-
async function withTempFile(filename: string, content: string, testFn: (filepath: string) => Promise<void>) {
76-
const filepath = path.join(__dirname, filename)
77-
await fs.writeFile(filepath, content)
78-
try {
79-
await testFn(filepath)
80-
} finally {
81-
await fs.unlink(filepath)
82-
}
83-
}
84-
8586
it("should handle empty files", async () => {
8687
await withTempFile("empty.txt", "", async (filepath) => {
8788
await expect(readLines(filepath, 0, 0)).rejects.toThrow("does not exist")
@@ -129,4 +130,244 @@ describe("nthline", () => {
129130
})
130131
})
131132
})
133+
134+
describe("bytesRead sampling for encoding detection", () => {
135+
it("should sample exactly 64KB for encoding detection on large files", async () => {
136+
// Create a large file with line breaks to test proper sampling
137+
const lineContent = "This is a test line for large file sampling\n"
138+
const linesNeeded = Math.ceil(100000 / lineContent.length) // Ensure > 64KB
139+
const largeContent = lineContent.repeat(linesNeeded)
140+
141+
await withTempFile("large-file.txt", largeContent, async (filepath) => {
142+
// For large files, the function should read and process correctly
143+
// We'll verify the function works with large files that exceed 64KB
144+
const lines = await readLines(filepath, 1) // Read first 2 lines (0-1)
145+
146+
// Verify that the content is read correctly
147+
expect(lines).toContain("This is a test line for large file sampling")
148+
// Should only contain 2 lines
149+
const lineArray = lines.split("\n").filter((line) => line.length > 0)
150+
expect(lineArray).toHaveLength(2)
151+
})
152+
})
153+
154+
it("should handle files smaller than 64KB sampling correctly", async () => {
155+
const smallContent = "Line 1\nLine 2\nLine 3\n"
156+
157+
await withTempFile("small-file.txt", smallContent, async (filepath) => {
158+
// For small files, the function should still attempt to read 64KB for encoding detection
159+
// We'll just verify the function works correctly with small files
160+
const lines = await readLines(filepath, 0) // Read first line (0)
161+
162+
// Verify that the content is read correctly
163+
expect(lines).toContain("Line 1")
164+
expect(lines).not.toContain("Line 2") // Should only read first line
165+
})
166+
})
167+
168+
it("should handle UTF-8 BOM in the 64KB sample correctly", async () => {
169+
// Create content with UTF-8 BOM at the beginning
170+
const bomBytes = Buffer.from([0xef, 0xbb, 0xbf])
171+
const textContent = "Line 1 with UTF-8 content\nLine 2\nLine 3\n"
172+
const contentWithBOM = Buffer.concat([bomBytes, Buffer.from(textContent, "utf8")])
173+
174+
await withTempFile("bom-file.txt", contentWithBOM.toString(), async (filepath) => {
175+
// Write the actual binary content with BOM
176+
await fs.writeFile(filepath, contentWithBOM)
177+
178+
const lines = await readLines(filepath, 0) // Read first line (0)
179+
180+
// Should successfully read the content, BOM should be handled by encoding detection
181+
expect(lines).toContain("Line 1 with UTF-8 content")
182+
})
183+
})
184+
185+
it("should handle UTF-16 LE BOM in the 64KB sample correctly", async () => {
186+
// Create content with UTF-16 LE BOM
187+
const bomBytes = Buffer.from([0xff, 0xfe])
188+
const textContent = "Line 1\nLine 2\n"
189+
const utf16Content = Buffer.from(textContent, "utf16le")
190+
const contentWithBOM = Buffer.concat([bomBytes, utf16Content])
191+
192+
await withTempFile("utf16le-bom-file.txt", "", async (filepath) => {
193+
// Write the actual binary content with BOM
194+
await fs.writeFile(filepath, contentWithBOM)
195+
196+
const lines = await readLines(filepath, 1)
197+
198+
// Should successfully read the content, BOM should be handled by encoding detection
199+
expect(lines).toContain("Line 1")
200+
})
201+
})
202+
203+
it("should handle partial multi-byte characters at 64KB boundary", async () => {
204+
// Create content where a multi-byte UTF-8 character might be split at 64KB boundary
205+
const lineContent = "Line with content: 你好世界\n"
206+
const linesNeeded = Math.ceil(65536 / lineContent.length) + 5 // Ensure > 64KB
207+
const fullContent = lineContent.repeat(linesNeeded) + "Final line after boundary\n"
208+
209+
await withTempFile("multibyte-boundary.txt", fullContent, async (filepath) => {
210+
// Read the last few lines to check the content after the boundary
211+
const lines = await readLines(filepath, linesNeeded + 1, linesNeeded - 1) // Read last 3 lines
212+
expect(lines).toContain("Final line after boundary")
213+
// The multi-byte characters should be preserved
214+
expect(lines).toContain("你好世界")
215+
})
216+
})
217+
218+
it("should handle encoding detection failure gracefully with 64KB sampling", async () => {
219+
// Create binary-like content that might confuse encoding detection
220+
const binaryLikeContent = Buffer.alloc(70000) // Larger than 64KB
221+
// Fill with values that might be detected as binary
222+
for (let i = 0; i < binaryLikeContent.length; i++) {
223+
binaryLikeContent[i] = i % 256
224+
}
225+
// Add some text at the end
226+
const textPortion = Buffer.from("\nSome text at the end\n", "utf8")
227+
const mixedContent = Buffer.concat([binaryLikeContent, textPortion])
228+
229+
await withTempFile("mixed-content.txt", "", async (filepath) => {
230+
await fs.writeFile(filepath, mixedContent)
231+
232+
// Should either succeed with fallback encoding or handle gracefully
233+
try {
234+
const lines = await readLines(filepath, 0, 0)
235+
// If it succeeds, it should contain the text portion
236+
expect(typeof lines).toBe("string")
237+
} catch (error) {
238+
// If it fails, it should be a meaningful error about binary content
239+
expect(error).toBeInstanceOf(Error)
240+
}
241+
})
242+
})
243+
})
244+
245+
describe("BOM preservation integration tests", () => {
246+
it("should preserve UTF-8 BOM when reading lines from file", async () => {
247+
// Create content with UTF-8 BOM
248+
const bomBytes = Buffer.from([0xef, 0xbb, 0xbf])
249+
const textContent = "First line with UTF-8 content\nSecond line\nThird line\n"
250+
const contentWithBOM = Buffer.concat([bomBytes, Buffer.from(textContent, "utf8")])
251+
252+
await withTempFile("utf8-bom-integration.txt", "", async (filepath) => {
253+
// Write the actual binary content with BOM
254+
await fs.writeFile(filepath, contentWithBOM)
255+
256+
// Read first line
257+
const firstLine = await readLines(filepath, 1)
258+
expect(firstLine).toContain("First line with UTF-8 content")
259+
260+
// Read multiple lines
261+
const multipleLines = await readLines(filepath, 2)
262+
expect(multipleLines).toContain("First line with UTF-8 content")
263+
expect(multipleLines).toContain("Second line")
264+
265+
// Read from specific line
266+
const fromSecondLine = await readLines(filepath, 1, 1)
267+
expect(fromSecondLine).toContain("Second line")
268+
})
269+
})
270+
271+
it("should preserve UTF-16 LE BOM when reading lines from file", async () => {
272+
// Create content with UTF-16 LE BOM
273+
const bomBytes = Buffer.from([0xff, 0xfe])
274+
const textContent = "UTF-16 LE first line\nUTF-16 LE second line\n"
275+
const utf16Content = Buffer.from(textContent, "utf16le")
276+
const contentWithBOM = Buffer.concat([bomBytes, utf16Content])
277+
278+
await withTempFile("utf16le-bom-integration.txt", "", async (filepath) => {
279+
// Write the actual binary content with BOM
280+
await fs.writeFile(filepath, contentWithBOM)
281+
282+
// Read first line
283+
const firstLine = await readLines(filepath, 0) // Read first line (0)
284+
expect(firstLine).toContain("UTF-16 LE first line")
285+
286+
// Read multiple lines
287+
const multipleLines = await readLines(filepath, 1) // Read first 2 lines (0-1)
288+
expect(multipleLines).toContain("UTF-16 LE first line")
289+
expect(multipleLines).toContain("UTF-16 LE second line")
290+
})
291+
})
292+
293+
it("should preserve UTF-16 BE BOM when reading lines from file", async () => {
294+
// Create content with UTF-16 BE BOM
295+
const bomBytes = Buffer.from([0xfe, 0xff])
296+
const textContent = "UTF-16 BE first line\nUTF-16 BE second line\n"
297+
// Manually create UTF-16 BE content
298+
const utf16beBytes = []
299+
for (let i = 0; i < textContent.length; i++) {
300+
const charCode = textContent.charCodeAt(i)
301+
utf16beBytes.push((charCode >> 8) & 0xff) // High byte first
302+
utf16beBytes.push(charCode & 0xff) // Low byte second
303+
}
304+
const utf16Content = Buffer.from(utf16beBytes)
305+
const contentWithBOM = Buffer.concat([bomBytes, utf16Content])
306+
307+
await withTempFile("utf16be-bom-integration.txt", "", async (filepath) => {
308+
// Write the actual binary content with BOM
309+
await fs.writeFile(filepath, contentWithBOM)
310+
311+
// Read first line
312+
const firstLine = await readLines(filepath, 0) // Read first line (0)
313+
expect(firstLine).toContain("UTF-16 BE first line")
314+
315+
// Read multiple lines
316+
const multipleLines = await readLines(filepath, 1) // Read first 2 lines (0-1)
317+
expect(multipleLines).toContain("UTF-16 BE first line")
318+
expect(multipleLines).toContain("UTF-16 BE second line")
319+
})
320+
})
321+
322+
it("should handle BOM preservation with large files that exceed 64KB sampling", async () => {
323+
// Create a large file with UTF-8 BOM that exceeds 64KB
324+
const bomBytes = Buffer.from([0xef, 0xbb, 0xbf])
325+
const lineContent = "This is a test line with UTF-8 content and BOM: 你好世界 🌍\n"
326+
const linesNeeded = Math.ceil(65536 / lineContent.length) + 100 // Ensure > 64KB
327+
const largeTextContent = lineContent.repeat(linesNeeded)
328+
const contentWithBOM = Buffer.concat([bomBytes, Buffer.from(largeTextContent, "utf8")])
329+
330+
await withTempFile("large-utf8-bom-integration.txt", "", async (filepath) => {
331+
// Write the actual binary content with BOM
332+
await fs.writeFile(filepath, contentWithBOM)
333+
334+
// Read first few lines
335+
const firstLines = await readLines(filepath, 3)
336+
expect(firstLines).toContain("This is a test line with UTF-8 content and BOM: 你好世界 🌍")
337+
338+
// Read from middle of file (read 2 lines starting from line 50)
339+
const middleLines = await readLines(filepath, 51, 49) // Read lines 49-51 (0-based)
340+
expect(middleLines).toContain("This is a test line with UTF-8 content and BOM: 你好世界 🌍")
341+
342+
// Verify the content is properly decoded despite BOM
343+
const lines = firstLines.split("\n")
344+
expect(lines[0]).not.toMatch(/^\uFEFF/) // BOM should not appear in decoded text
345+
})
346+
})
347+
348+
it("should handle mixed BOM and non-BOM files correctly", async () => {
349+
// Test reading from a UTF-8 BOM file and then a regular UTF-8 file
350+
const bomBytes = Buffer.from([0xef, 0xbb, 0xbf])
351+
const bomContent = Buffer.concat([bomBytes, Buffer.from("BOM file content\nSecond line\n", "utf8")])
352+
const regularContent = "Regular UTF-8 content\nAnother line\n"
353+
354+
await withTempFile("bom-file-mixed.txt", "", async (bomFilepath) => {
355+
await fs.writeFile(bomFilepath, bomContent)
356+
357+
await withTempFile("regular-file-mixed.txt", regularContent, async (regularFilepath) => {
358+
// Read from BOM file
359+
const bomLines = await readLines(bomFilepath, 0) // Read first line (0)
360+
expect(bomLines).toContain("BOM file content")
361+
362+
// Read from regular file
363+
const regularLines = await readLines(regularFilepath, 0) // Read first line (0)
364+
expect(regularLines).toContain("Regular UTF-8 content")
365+
366+
// Both should work correctly without interference
367+
expect(bomLines).not.toContain("Regular UTF-8 content")
368+
expect(regularLines).not.toContain("BOM file content")
369+
})
370+
})
371+
})
372+
})
132373
})

src/integrations/misc/read-lines.ts

Lines changed: 33 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -56,35 +56,44 @@ export function readLines(filepath: string, endLine?: number, startLine?: number
5656
}
5757

5858
// Sample the first 64KB for encoding detection
59-
open(filepath, 'r')
60-
.then(fileHandle => {
61-
const sampleBuffer = Buffer.alloc(65536);
62-
return fileHandle.read(sampleBuffer, 0, sampleBuffer.length, 0)
59+
open(filepath, "r")
60+
.then((fileHandle) => {
61+
const sampleBuffer = Buffer.alloc(65536)
62+
return fileHandle
63+
.read(sampleBuffer, 0, sampleBuffer.length, 0)
6364
.then(() => sampleBuffer)
64-
.finally(() => fileHandle.close());
65+
.finally(() => fileHandle.close())
6566
})
66-
.then(sampleBuffer => detectEncoding(sampleBuffer))
67-
.then(encoding => {
67+
.then((sampleBuffer) => detectEncoding(sampleBuffer))
68+
.then((encoding) => {
6869
// Node.js native supported encodings
69-
const nodeEncodings = ['utf8', 'ascii', 'latin1'];
70-
71-
// Choose decoding method based on native support
72-
let input: NodeJS.ReadableStream;
73-
if (nodeEncodings.includes(encoding.toLowerCase())) {
74-
input = createReadStream(filepath, { encoding: encoding as BufferEncoding });
75-
} else {
76-
input = createReadStream(filepath).pipe(iconv.decodeStream(encoding));
77-
}
78-
70+
const nodeEncodings = ["utf8", "ascii", "latin1"]
71+
7972
let buffer = ""
8073
let lineCount = 0
8174
let result = ""
8275

83-
// Handle errors
84-
input.on("error", reject)
76+
// Choose decoding method based on native support
77+
let input: NodeJS.ReadableStream
78+
if (nodeEncodings.includes(encoding.toLowerCase())) {
79+
input = createReadStream(filepath, { encoding: encoding as BufferEncoding })
80+
// Handle errors directly
81+
input.on("error", reject)
82+
} else {
83+
// For non-native encodings, create streams and handle errors explicitly
84+
const sourceStream = createReadStream(filepath)
85+
const decodeStream = iconv.decodeStream(encoding)
86+
87+
// Handle errors from both streams
88+
sourceStream.on("error", reject)
89+
decodeStream.on("error", reject)
90+
91+
// Use pipe but with explicit error handling
92+
input = sourceStream.pipe(decodeStream)
93+
}
8594

8695
// Process data chunks directly
87-
input.on("data", (chunk) => {
96+
input.on("data", (chunk: string) => {
8897
// Add chunk to buffer (chunk is already decoded using the detected encoding)
8998
buffer += chunk
9099

@@ -104,7 +113,7 @@ export function readLines(filepath: string, endLine?: number, startLine?: number
104113

105114
// If we've reached the end line, we can stop
106115
if (endLine !== undefined && lineCount > endLine) {
107-
(input as any).destroy?.()
116+
;(input as any).destroy?.()
108117
resolve(result)
109118
return
110119
}
@@ -135,8 +144,8 @@ export function readLines(filepath: string, endLine?: number, startLine?: number
135144
}
136145
})
137146
})
138-
.catch(error => {
139-
reject(error);
140-
});
147+
.catch((error) => {
148+
reject(error)
149+
})
141150
})
142151
}

0 commit comments

Comments (0)