Skip to content

Commit 552054a

Browse files
authored
Fix Non-UTF-8 File Handling: Improve Encoding Detection to Prevent Garbled Text and Binary Misclassification (RooCodeInc#2347)
* Fix Non-UTF-8 File Handling: Improve Encoding Detection to Prevent Garbled Text and Binary Misclassification * update package-lock.json * update * update * fix * fix * fix
1 parent cff8a23 commit 552054a

File tree

5 files changed

+51
-12
lines changed

5 files changed

+51
-12
lines changed

.changeset/shiny-dingos-float.md

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,5 @@
1+
---
2+
"claude-dev": patch
3+
---
4+
5+
Fix Non-UTF-8 File Handling: Improve Encoding Detection to Prevent Garbled Text and Binary Misclassification

package-lock.json

Lines changed: 10 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -386,8 +386,10 @@
386386
"fzf": "^0.5.2",
387387
"get-folder-size": "^5.0.0",
388388
"globby": "^14.0.2",
389+
"iconv-lite": "^0.6.3",
389390
"ignore": "^7.0.3",
390391
"isbinaryfile": "^5.0.2",
392+
"jschardet": "^3.1.4",
391393
"mammoth": "^1.8.0",
392394
"monaco-vscode-textmate-theme-converter": "^0.1.7",
393395
"ollama": "^0.5.13",

src/integrations/editor/DiffViewProvider.ts

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,8 @@ import { formatResponse } from "../../core/prompts/responses"
77
import { DecorationController } from "./DecorationController"
88
import * as diff from "diff"
99
import { diagnosticsToProblemsString, getNewDiagnostics } from "../diagnostics"
10+
import { detectEncoding } from "../misc/extract-text"
11+
import * as iconv from "iconv-lite"
1012

1113
export const DIFF_VIEW_URI_SCHEME = "cline-diff"
1214

@@ -23,6 +25,7 @@ export class DiffViewProvider {
2325
private activeLineController?: DecorationController
2426
private streamedLines: string[] = []
2527
private preDiagnostics: [vscode.Uri, vscode.Diagnostic[]][] = []
28+
private fileEncoding: string = "utf8"
2629

2730
constructor(private cwd: string) {}
2831

@@ -43,9 +46,12 @@ export class DiffViewProvider {
4346
this.preDiagnostics = vscode.languages.getDiagnostics()
4447

4548
if (fileExists) {
46-
this.originalContent = await fs.readFile(absolutePath, "utf-8")
49+
const fileBuffer = await fs.readFile(absolutePath)
50+
this.fileEncoding = await detectEncoding(fileBuffer)
51+
this.originalContent = iconv.decode(fileBuffer, this.fileEncoding)
4752
} else {
4853
this.originalContent = ""
54+
this.fileEncoding = "utf8"
4955
}
5056
// for new files, create any necessary directories and keep track of new directories to delete if the user denies the operation
5157
this.createdDirs = await createDirectoriesForFile(absolutePath)

src/integrations/misc/extract-text.ts

Lines changed: 27 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,25 @@ import mammoth from "mammoth"
55
import fs from "fs/promises"
66
import { isBinaryFile } from "isbinaryfile"
77
import { getFileSizeInKB } from "../../utils/fs"
8+
import * as chardet from "jschardet"
9+
import * as iconv from "iconv-lite"
10+
11+
/**
 * Chooses the character encoding to use when decoding a file's raw bytes.
 *
 * The buffer is passed to the charset detector. A detected encoding name is
 * returned only when iconv-lite reports that it can handle it, so the result
 * is always safe to hand to `iconv.decode`. When nothing usable is detected,
 * the function falls back to "utf8"; if a file extension was supplied, the
 * buffer is first checked with `isBinaryFile` and binary content is rejected.
 *
 * @param fileBuffer - Raw file contents to inspect.
 * @param fileExtension - Optional extension. When provided it enables the
 *   binary check on the fallback path and is included in the error message.
 * @returns An encoding name that iconv-lite recognizes.
 * @throws Error when no usable encoding was detected, `fileExtension` was
 *   given, and the buffer looks binary.
 */
export async function detectEncoding(fileBuffer: Buffer, fileExtension?: string): Promise<string> {
	// Treat the detector result as unknown and narrow it explicitly. Both a bare
	// string and an object carrying an `encoding` field are accepted, matching
	// the two shapes the previous implementation handled.
	const detected: unknown = chardet.detect(fileBuffer)

	let candidate: string | undefined
	if (typeof detected === "string") {
		candidate = detected
	} else if (typeof detected === "object" && detected !== null && "encoding" in detected) {
		const { encoding } = detected as { encoding?: unknown }
		if (typeof encoding === "string") {
			candidate = encoding
		}
	}

	// Only trust a detected name that iconv-lite can actually decode; an
	// unsupported name would make the caller's iconv.decode() throw.
	if (candidate && iconv.encodingExists(candidate)) {
		return candidate
	}

	// Fallback path: without a usable detection, refuse binary content (only
	// when the caller supplied an extension), otherwise assume UTF-8.
	if (fileExtension) {
		// A failure of the binary probe itself is treated as "not binary".
		const isBinary = await isBinaryFile(fileBuffer).catch(() => false)
		if (isBinary) {
			throw new Error(`Cannot read text for file type: ${fileExtension}`)
		}
	}
	return "utf8"
}
827

928
export async function extractTextFromFile(filePath: string): Promise<string> {
1029
try {
@@ -21,17 +40,12 @@ export async function extractTextFromFile(filePath: string): Promise<string> {
2140
case ".ipynb":
2241
return extractTextFromIPYNB(filePath)
2342
default:
24-
const isBinary = await isBinaryFile(filePath).catch(() => false)
25-
if (!isBinary) {
26-
// If file is over 300KB, throw an error
27-
const fileSizeInKB = await getFileSizeInKB(filePath)
28-
if (fileSizeInKB > 300) {
29-
throw new Error(`File is too large to read into context.`)
30-
}
31-
return await fs.readFile(filePath, "utf8")
32-
} else {
33-
throw new Error(`Cannot read text for file type: ${fileExtension}`)
43+
const fileBuffer = await fs.readFile(filePath)
44+
if (fileBuffer.byteLength > 300 * 1024) {
45+
throw new Error(`File is too large to read into context.`)
3446
}
47+
const encoding = await detectEncoding(fileBuffer, fileExtension)
48+
return iconv.decode(fileBuffer, encoding)
3549
}
3650
}
3751

@@ -47,7 +61,9 @@ async function extractTextFromDOCX(filePath: string): Promise<string> {
4761
}
4862

4963
async function extractTextFromIPYNB(filePath: string): Promise<string> {
50-
const data = await fs.readFile(filePath, "utf8")
64+
const fileBuffer = await fs.readFile(filePath)
65+
const encoding = await detectEncoding(fileBuffer)
66+
const data = iconv.decode(fileBuffer, encoding)
5167
const notebook = JSON.parse(data)
5268
let extractedText = ""
5369

0 commit comments

Comments
 (0)