diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index e803a1a72f..6151ee12e6 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -710,12 +710,18 @@ importers:
i18next:
specifier: ^25.0.0
version: 25.2.1(typescript@5.8.3)
+ iconv-lite:
+ specifier: ^0.6.3
+ version: 0.6.3
ignore:
specifier: ^7.0.3
version: 7.0.4
isbinaryfile:
specifier: ^5.0.2
version: 5.0.4
+ jschardet:
+ specifier: ^3.1.4
+ version: 3.1.4
jwt-decode:
specifier: ^4.0.0
version: 4.0.0
@@ -6944,6 +6950,10 @@ packages:
jsbn@1.1.0:
resolution: {integrity: sha512-4bYVV3aAMtDTTu4+xsDYa6sy9GyJ69/amsu9sYF2zqjiEoZA5xJi3BrfX3uY+/IekIu7MwdObdbDWpoZdBv3/A==}
+ jschardet@3.1.4:
+ resolution: {integrity: sha512-/kmVISmrwVwtyYU40iQUOp3SUPk2dhNCMsZBQX0R1/jZ8maaXJ/oZIzUOiyOqcgtLnETFKYChbJ5iDC/eWmFHg==}
+ engines: {node: '>=0.1.90'}
+
jsdom@26.1.0:
resolution: {integrity: sha512-Cvc9WUhxSMEo4McES3P7oK3QaXldCfNWp7pl2NNeiIFlCoLr3kfq9kb1fxftiwk1FLV7CvpvDfonxtzUDeSOPg==}
engines: {node: '>=18'}
@@ -16988,6 +16998,8 @@ snapshots:
jsbn@1.1.0: {}
+ jschardet@3.1.4: {}
+
jsdom@26.1.0:
dependencies:
cssstyle: 4.4.0
diff --git a/src/core/mentions/index.ts b/src/core/mentions/index.ts
index f038b5b783..e93cd856bd 100644
--- a/src/core/mentions/index.ts
+++ b/src/core/mentions/index.ts
@@ -2,7 +2,7 @@ import fs from "fs/promises"
import * as path from "path"
import * as vscode from "vscode"
-import { isBinaryFile } from "isbinaryfile"
+import { isBinaryFileWithEncodingDetection } from "../../utils/encoding"
import { mentionRegexGlobal, commandRegexGlobal, unescapeSpaces } from "../../shared/context-mentions"
@@ -314,7 +314,7 @@ async function getFileOrFolderContent(
fileContentPromises.push(
(async () => {
try {
- const isBinary = await isBinaryFile(absoluteFilePath).catch(() => false)
+ const isBinary = await isBinaryFileWithEncodingDetection(absoluteFilePath)
if (isBinary) {
return undefined
}
diff --git a/src/core/tools/__tests__/readFileTool.spec.ts b/src/core/tools/__tests__/readFileTool.spec.ts
index d693d6ba44..c3e53aaefd 100644
--- a/src/core/tools/__tests__/readFileTool.spec.ts
+++ b/src/core/tools/__tests__/readFileTool.spec.ts
@@ -6,7 +6,7 @@ import { countFileLines } from "../../../integrations/misc/line-counter"
import { readLines } from "../../../integrations/misc/read-lines"
import { extractTextFromFile } from "../../../integrations/misc/extract-text"
import { parseSourceCodeDefinitionsForFile } from "../../../services/tree-sitter"
-import { isBinaryFile } from "isbinaryfile"
+import { isBinaryFileWithEncodingDetection } from "../../../utils/encoding"
import { ReadFileToolUse, ToolParamName, ToolResponse } from "../../../shared/tools"
import { readFileTool } from "../readFileTool"
import { formatResponse } from "../../prompts/responses"
@@ -23,7 +23,9 @@ vi.mock("path", async () => {
// Already mocked above with hoisted fsPromises
-vi.mock("isbinaryfile")
+vi.mock("../../../utils/encoding", () => ({
+ isBinaryFileWithEncodingDetection: vi.fn(),
+}))
vi.mock("../../../integrations/misc/line-counter")
vi.mock("../../../integrations/misc/read-lines")
@@ -238,7 +240,7 @@ describe("read_file tool with maxReadFileLine setting", () => {
const mockedExtractTextFromFile = vi.mocked(extractTextFromFile)
const mockedParseSourceCodeDefinitionsForFile = vi.mocked(parseSourceCodeDefinitionsForFile)
- const mockedIsBinaryFile = vi.mocked(isBinaryFile)
+ const mockedIsBinaryFileWithEncodingDetection = vi.mocked(isBinaryFileWithEncodingDetection)
const mockedPathResolve = vi.mocked(path.resolve)
let mockCline: any
@@ -249,7 +251,7 @@ describe("read_file tool with maxReadFileLine setting", () => {
// Clear specific mocks (not all mocks to preserve shared state)
mockedCountFileLines.mockClear()
mockedExtractTextFromFile.mockClear()
- mockedIsBinaryFile.mockClear()
+ mockedIsBinaryFileWithEncodingDetection.mockClear()
mockedPathResolve.mockClear()
addLineNumbersMock.mockClear()
extractTextFromFileMock.mockClear()
@@ -264,7 +266,7 @@ describe("read_file tool with maxReadFileLine setting", () => {
setImageSupport(mockCline, false)
mockedPathResolve.mockReturnValue(absoluteFilePath)
- mockedIsBinaryFile.mockResolvedValue(false)
+ mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(false)
mockInputContent = fileContent
@@ -502,7 +504,7 @@ describe("read_file tool with maxReadFileLine setting", () => {
describe("when file is binary", () => {
it("should always use extractTextFromFile regardless of maxReadFileLine", async () => {
// Setup
- mockedIsBinaryFile.mockResolvedValue(true)
+ mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true)
mockedCountFileLines.mockResolvedValue(3)
mockedExtractTextFromFile.mockResolvedValue("")
@@ -544,7 +546,7 @@ describe("read_file tool XML output structure", () => {
const mockedCountFileLines = vi.mocked(countFileLines)
const mockedExtractTextFromFile = vi.mocked(extractTextFromFile)
- const mockedIsBinaryFile = vi.mocked(isBinaryFile)
+ const mockedIsBinaryFileWithEncodingDetection = vi.mocked(isBinaryFileWithEncodingDetection)
const mockedPathResolve = vi.mocked(path.resolve)
const mockedFsReadFile = vi.mocked(fsPromises.readFile)
const imageBuffer = Buffer.from(
@@ -560,7 +562,7 @@ describe("read_file tool XML output structure", () => {
// Clear specific mocks (not all mocks to preserve shared state)
mockedCountFileLines.mockClear()
mockedExtractTextFromFile.mockClear()
- mockedIsBinaryFile.mockClear()
+ mockedIsBinaryFileWithEncodingDetection.mockClear()
mockedPathResolve.mockClear()
addLineNumbersMock.mockClear()
extractTextFromFileMock.mockClear()
@@ -580,7 +582,7 @@ describe("read_file tool XML output structure", () => {
setImageSupport(mockCline, true)
mockedPathResolve.mockReturnValue(absoluteFilePath)
- mockedIsBinaryFile.mockResolvedValue(false)
+ mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(false)
// Set default implementation for extractTextFromFile
mockedExtractTextFromFile.mockImplementation((filePath) => {
@@ -617,7 +619,7 @@ describe("read_file tool XML output structure", () => {
mockProvider.getState.mockResolvedValue({ maxReadFileLine, maxImageFileSize: 20, maxTotalImageSize: 20 })
mockedCountFileLines.mockResolvedValue(totalLines)
- mockedIsBinaryFile.mockResolvedValue(isBinary)
+ mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(isBinary)
mockCline.rooIgnoreController.validateAccess = vi.fn().mockReturnValue(validateAccess)
let argsContent = `${testFilePath}`
@@ -758,7 +760,7 @@ describe("read_file tool XML output structure", () => {
it("should allow multiple images under the total memory limit", async () => {
// Setup required mocks (don't clear all mocks - preserve API setup)
- mockedIsBinaryFile.mockResolvedValue(true)
+ mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true)
mockedCountFileLines.mockResolvedValue(0)
fsPromises.readFile.mockResolvedValue(
Buffer.from(
@@ -831,7 +833,7 @@ describe("read_file tool XML output structure", () => {
it("should skip images that would exceed the total memory limit", async () => {
// Setup required mocks (don't clear all mocks)
- mockedIsBinaryFile.mockResolvedValue(true)
+ mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true)
mockedCountFileLines.mockResolvedValue(0)
fsPromises.readFile.mockResolvedValue(
Buffer.from(
@@ -917,7 +919,7 @@ describe("read_file tool XML output structure", () => {
// Setup mocks (don't clear all mocks)
// Setup required mocks
- mockedIsBinaryFile.mockResolvedValue(true)
+ mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true)
mockedCountFileLines.mockResolvedValue(0)
fsPromises.readFile.mockResolvedValue(
Buffer.from(
@@ -990,7 +992,7 @@ describe("read_file tool XML output structure", () => {
// Setup mocks (don't clear all mocks)
// Setup required mocks
- mockedIsBinaryFile.mockResolvedValue(true)
+ mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true)
mockedCountFileLines.mockResolvedValue(0)
fsPromises.readFile.mockResolvedValue(
Buffer.from(
@@ -1084,7 +1086,7 @@ describe("read_file tool XML output structure", () => {
maxTotalImageSize: 20, // 20MB total
})
- mockedIsBinaryFile.mockResolvedValue(true)
+ mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true)
mockedCountFileLines.mockResolvedValue(0)
mockedFsReadFile.mockResolvedValue(imageBuffer)
@@ -1115,7 +1117,7 @@ describe("read_file tool XML output structure", () => {
// Setup mocks (don't clear all mocks)
// Setup required mocks for first batch
- mockedIsBinaryFile.mockResolvedValue(true)
+ mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true)
mockedCountFileLines.mockResolvedValue(0)
fsPromises.readFile.mockResolvedValue(
Buffer.from(
@@ -1161,7 +1163,7 @@ describe("read_file tool XML output structure", () => {
await executeReadMultipleImagesTool(firstBatch.map((img) => img.path))
// Setup second batch (don't clear all mocks)
- mockedIsBinaryFile.mockResolvedValue(true)
+ mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true)
mockedCountFileLines.mockResolvedValue(0)
fsPromises.readFile.mockResolvedValue(
Buffer.from(
@@ -1203,7 +1205,7 @@ describe("read_file tool XML output structure", () => {
// Clear and reset file system mocks for second batch
fsPromises.stat.mockClear()
fsPromises.readFile.mockClear()
- mockedIsBinaryFile.mockClear()
+ mockedIsBinaryFileWithEncodingDetection.mockClear()
mockedCountFileLines.mockClear()
// Reset mocks for second batch
@@ -1214,7 +1216,7 @@ describe("read_file tool XML output structure", () => {
"base64",
),
)
- mockedIsBinaryFile.mockResolvedValue(true)
+ mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true)
mockedCountFileLines.mockResolvedValue(0)
mockedPathResolve.mockImplementation((cwd, relPath) => `/${relPath}`)
@@ -1241,7 +1243,7 @@ describe("read_file tool XML output structure", () => {
]
// Setup mocks
- mockedIsBinaryFile.mockResolvedValue(true)
+ mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true)
mockedCountFileLines.mockResolvedValue(0)
fsPromises.readFile.mockResolvedValue(imageBuffer)
@@ -1289,7 +1291,7 @@ describe("read_file tool XML output structure", () => {
// starts with fresh memory tracking
// Setup mocks
- mockedIsBinaryFile.mockResolvedValue(true)
+ mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true)
mockedCountFileLines.mockResolvedValue(0)
fsPromises.readFile.mockResolvedValue(imageBuffer)
@@ -1394,7 +1396,7 @@ describe("read_file tool with image support", () => {
const imageBuffer = Buffer.from(base64ImageData, "base64")
const mockedCountFileLines = vi.mocked(countFileLines)
- const mockedIsBinaryFile = vi.mocked(isBinaryFile)
+ const mockedIsBinaryFileWithEncodingDetection = vi.mocked(isBinaryFileWithEncodingDetection)
const mockedPathResolve = vi.mocked(path.resolve)
const mockedFsReadFile = vi.mocked(fsPromises.readFile)
const mockedExtractTextFromFile = vi.mocked(extractTextFromFile)
@@ -1406,7 +1408,7 @@ describe("read_file tool with image support", () => {
beforeEach(() => {
// Clear specific mocks (not all mocks to preserve shared state)
mockedPathResolve.mockClear()
- mockedIsBinaryFile.mockClear()
+ mockedIsBinaryFileWithEncodingDetection.mockClear()
mockedCountFileLines.mockClear()
mockedFsReadFile.mockClear()
mockedExtractTextFromFile.mockClear()
@@ -1425,7 +1427,7 @@ describe("read_file tool with image support", () => {
setImageSupport(localMockCline, true)
mockedPathResolve.mockReturnValue(absoluteImagePath)
- mockedIsBinaryFile.mockResolvedValue(true)
+ mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true)
mockedCountFileLines.mockResolvedValue(0)
mockedFsReadFile.mockResolvedValue(imageBuffer)
diff --git a/src/core/tools/applyDiffTool.ts b/src/core/tools/applyDiffTool.ts
index 1077b7bf39..acc2b01b04 100644
--- a/src/core/tools/applyDiffTool.ts
+++ b/src/core/tools/applyDiffTool.ts
@@ -2,6 +2,7 @@ import path from "path"
import fs from "fs/promises"
import { TelemetryService } from "@roo-code/telemetry"
+import { readFileWithEncodingDetection } from "../../utils/encoding"
import { DEFAULT_WRITE_DELAY_MS } from "@roo-code/types"
import { ClineSayTool } from "../../shared/ExtensionMessage"
@@ -89,7 +90,7 @@ export async function applyDiffToolLegacy(
return
}
- const originalContent: string = await fs.readFile(absolutePath, "utf-8")
+ const originalContent: string = await readFileWithEncodingDetection(absolutePath)
// Apply the diff to the original content
const diffResult = (await cline.diffStrategy?.applyDiff(
diff --git a/src/core/tools/insertContentTool.ts b/src/core/tools/insertContentTool.ts
index 38ca309a3b..767383eb2c 100644
--- a/src/core/tools/insertContentTool.ts
+++ b/src/core/tools/insertContentTool.ts
@@ -3,6 +3,7 @@ import fs from "fs/promises"
import path from "path"
import { getReadablePath } from "../../utils/path"
+import { readFileWithEncodingDetection } from "../../utils/encoding"
import { Task } from "../task/Task"
import { ToolUse, AskApproval, HandleError, PushToolResult, RemoveClosingTag } from "../../shared/tools"
import { formatResponse } from "../prompts/responses"
@@ -93,7 +94,7 @@ export async function insertContentTool(
return
}
} else {
- fileContent = await fs.readFile(absolutePath, "utf8")
+ fileContent = await readFileWithEncodingDetection(absolutePath)
}
cline.consecutiveMistakeCount = 0
diff --git a/src/core/tools/multiApplyDiffTool.ts b/src/core/tools/multiApplyDiffTool.ts
index 08bce08ede..3cd8297ff6 100644
--- a/src/core/tools/multiApplyDiffTool.ts
+++ b/src/core/tools/multiApplyDiffTool.ts
@@ -2,6 +2,7 @@ import path from "path"
import fs from "fs/promises"
import { TelemetryService } from "@roo-code/telemetry"
+import { readFileWithEncodingDetection } from "../../utils/encoding"
import { DEFAULT_WRITE_DELAY_MS } from "@roo-code/types"
import { ClineSayTool } from "../../shared/ExtensionMessage"
@@ -300,7 +301,7 @@ Original error: ${errorMessage}`
let unified = ""
try {
- const original = await fs.readFile(opResult.absolutePath!, "utf-8")
+ const original = await readFileWithEncodingDetection(opResult.absolutePath!)
const processed = !cline.api.getModel().id.includes("claude")
? (opResult.diffItems || []).map((item) => ({
...item,
@@ -457,7 +458,7 @@ Original error: ${errorMessage}`
const fileExists = opResult.fileExists!
try {
- let originalContent: string | null = await fs.readFile(absolutePath, "utf-8")
+ let originalContent: string | null = await readFileWithEncodingDetection(absolutePath)
let beforeContent: string | null = originalContent
let successCount = 0
let formattedError = ""
@@ -611,7 +612,7 @@ ${errorDetails ? `\nTechnical details:\n${errorDetails}\n` : ""}
cline.diffViewProvider.scrollToFirstDiff()
} else {
// For direct save, we still need to set originalContent
- cline.diffViewProvider.originalContent = await fs.readFile(absolutePath, "utf-8")
+ cline.diffViewProvider.originalContent = await readFileWithEncodingDetection(absolutePath)
}
// Ask for approval (same for both flows)
@@ -646,7 +647,7 @@ ${errorDetails ? `\nTechnical details:\n${errorDetails}\n` : ""}
if (isPreventFocusDisruptionEnabled) {
// Direct file write without diff view or opening the file
cline.diffViewProvider.editType = "modify"
- cline.diffViewProvider.originalContent = await fs.readFile(absolutePath, "utf-8")
+ cline.diffViewProvider.originalContent = await readFileWithEncodingDetection(absolutePath)
await cline.diffViewProvider.saveDirectly(
relPath,
originalContent!,
diff --git a/src/core/tools/readFileTool.ts b/src/core/tools/readFileTool.ts
index 53f0643dbb..75284c7942 100644
--- a/src/core/tools/readFileTool.ts
+++ b/src/core/tools/readFileTool.ts
@@ -1,5 +1,4 @@
import path from "path"
-import { isBinaryFile } from "isbinaryfile"
import { Task } from "../task/Task"
import { ClineSayTool } from "../../shared/ExtensionMessage"
@@ -14,6 +13,7 @@ import { readLines } from "../../integrations/misc/read-lines"
import { extractTextFromFile, addLineNumbers, getSupportedBinaryFormats } from "../../integrations/misc/extract-text"
import { parseSourceCodeDefinitionsForFile } from "../../services/tree-sitter"
import { parseXml } from "../../utils/xml"
+import { isBinaryFileWithEncodingDetection } from "../../utils/encoding"
import {
DEFAULT_MAX_IMAGE_FILE_SIZE_MB,
DEFAULT_MAX_TOTAL_IMAGE_SIZE_MB,
@@ -456,7 +456,10 @@ export async function readFileTool(
// Process approved files
try {
- const [totalLines, isBinary] = await Promise.all([countFileLines(fullPath), isBinaryFile(fullPath)])
+ const [totalLines, isBinary] = await Promise.all([
+ countFileLines(fullPath),
+ isBinaryFileWithEncodingDetection(fullPath),
+ ])
// Handle binary files (but allow specific file types that extractTextFromFile can handle)
if (isBinary) {
diff --git a/src/core/tools/writeToFileTool.ts b/src/core/tools/writeToFileTool.ts
index b8e6da0caa..88130efcd0 100644
--- a/src/core/tools/writeToFileTool.ts
+++ b/src/core/tools/writeToFileTool.ts
@@ -4,6 +4,7 @@ import * as vscode from "vscode"
import fs from "fs/promises"
import { Task } from "../task/Task"
+import { readFileWithEncodingDetection } from "../../utils/encoding"
import { ClineSayTool } from "../../shared/ExtensionMessage"
import { formatResponse } from "../prompts/responses"
import { ToolUse, AskApproval, HandleError, PushToolResult, RemoveClosingTag } from "../../shared/tools"
@@ -178,7 +179,7 @@ export async function writeToFileTool(
cline.diffViewProvider.editType = fileExists ? "modify" : "create"
if (fileExists) {
const absolutePath = path.resolve(cline.cwd, relPath)
- cline.diffViewProvider.originalContent = await fs.readFile(absolutePath, "utf-8")
+ cline.diffViewProvider.originalContent = await readFileWithEncodingDetection(absolutePath)
} else {
cline.diffViewProvider.originalContent = ""
}
diff --git a/src/integrations/editor/DiffViewProvider.ts b/src/integrations/editor/DiffViewProvider.ts
index d42eba082c..5665cce80f 100644
--- a/src/integrations/editor/DiffViewProvider.ts
+++ b/src/integrations/editor/DiffViewProvider.ts
@@ -7,6 +7,7 @@ import { XMLBuilder } from "fast-xml-parser"
import delay from "delay"
import { createDirectoriesForFile } from "../../utils/fs"
+import { readFileWithEncodingDetection, writeFileWithEncodingPreservation } from "../../utils/encoding"
import { arePathsEqual, getReadablePath } from "../../utils/path"
import { formatResponse } from "../../core/prompts/responses"
import { diagnosticsToProblemsString, getNewDiagnostics } from "../diagnostics"
@@ -68,7 +69,7 @@ export class DiffViewProvider {
this.preDiagnostics = vscode.languages.getDiagnostics()
if (fileExists) {
- this.originalContent = await fs.readFile(absolutePath, "utf-8")
+ this.originalContent = await readFileWithEncodingDetection(absolutePath)
} else {
this.originalContent = ""
}
@@ -662,9 +663,9 @@ export class DiffViewProvider {
// Get diagnostics before editing the file
this.preDiagnostics = vscode.languages.getDiagnostics()
- // Write the content directly to the file
+ // Write the content directly to the file with encoding preservation
await createDirectoriesForFile(absolutePath)
- await fs.writeFile(absolutePath, content, "utf-8")
+ await writeFileWithEncodingPreservation(absolutePath, content)
// Open the document to ensure diagnostics are loaded
// When openFile is false (PREVENT_FOCUS_DISRUPTION enabled), we only open in memory
diff --git a/src/integrations/editor/__tests__/DiffViewProvider.spec.ts b/src/integrations/editor/__tests__/DiffViewProvider.spec.ts
index e99f7bf9c8..47f02a8d67 100644
--- a/src/integrations/editor/__tests__/DiffViewProvider.spec.ts
+++ b/src/integrations/editor/__tests__/DiffViewProvider.spec.ts
@@ -9,22 +9,39 @@ vi.mock("delay", () => ({
}))
// Mock fs/promises
-vi.mock("fs/promises", () => ({
-	readFile: vi.fn().mockResolvedValue("file content"),
-	writeFile: vi.fn().mockResolvedValue(undefined),
-}))
+vi.mock("fs/promises", async () => {
+	const actual = await vi.importActual("fs/promises")
+	return {
+		...actual,
+		readFile: vi.fn().mockResolvedValue("file content"),
+		writeFile: vi.fn().mockResolvedValue(undefined),
+		default: {
+			readFile: vi.fn().mockResolvedValue("file content"),
+			writeFile: vi.fn().mockResolvedValue(undefined),
+		},
+	}
+})
// Mock utils
vi.mock("../../../utils/fs", () => ({
	createDirectoriesForFile: vi.fn().mockResolvedValue([]),
}))
-// Mock path
-vi.mock("path", () => ({
-	resolve: vi.fn((cwd, relPath) => `${cwd}/${relPath}`),
-	basename: vi.fn((path) => path.split("/").pop()),
+// Mock encoding utilities
+vi.mock("../../../utils/encoding", () => ({
+	readFileWithEncodingDetection: vi.fn().mockResolvedValue("file content"),
+	writeFileWithEncodingPreservation: vi.fn().mockResolvedValue(undefined),
}))
+// Mock path
+vi.mock("path", async () => {
+	const actual = await vi.importActual("path")
+	return {
+		...actual,
+		resolve: vi.fn((cwd, relPath) => `${cwd}/${relPath}`),
+		basename: vi.fn((path) => path.split("/").pop()),
+	}
+})
// Mock vscode
vi.mock("vscode", () => ({
workspace: {
@@ -90,7 +108,6 @@ vi.mock("vscode", () => ({
parse: vi.fn((uri) => ({ with: vi.fn(() => ({})) })),
},
}))
-
// Mock DecorationController
vi.mock("../DecorationController", () => ({
DecorationController: vi.fn().mockImplementation(() => ({
@@ -371,8 +388,8 @@ describe("DiffViewProvider", () => {
const result = await diffViewProvider.saveDirectly("test.ts", "new content", true, true, 2000)
// Verify file was written
- const fs = await import("fs/promises")
- expect(fs.writeFile).toHaveBeenCalledWith(`${mockCwd}/test.ts`, "new content", "utf-8")
+ const { writeFileWithEncodingPreservation } = await import("../../../utils/encoding")
+ expect(writeFileWithEncodingPreservation).toHaveBeenCalledWith(`${mockCwd}/test.ts`, "new content")
// Verify file was opened without focus
expect(vscode.window.showTextDocument).toHaveBeenCalledWith(
@@ -394,8 +411,8 @@ describe("DiffViewProvider", () => {
await diffViewProvider.saveDirectly("test.ts", "new content", false, true, 1000)
// Verify file was written
- const fs = await import("fs/promises")
- expect(fs.writeFile).toHaveBeenCalledWith(`${mockCwd}/test.ts`, "new content", "utf-8")
+ const { writeFileWithEncodingPreservation } = await import("../../../utils/encoding")
+ expect(writeFileWithEncodingPreservation).toHaveBeenCalledWith(`${mockCwd}/test.ts`, "new content")
// Verify file was NOT opened
expect(vscode.window.showTextDocument).not.toHaveBeenCalled()
@@ -409,8 +426,8 @@ describe("DiffViewProvider", () => {
await diffViewProvider.saveDirectly("test.ts", "new content", true, false, 1000)
// Verify file was written
- const fs = await import("fs/promises")
- expect(fs.writeFile).toHaveBeenCalledWith(`${mockCwd}/test.ts`, "new content", "utf-8")
+ const { writeFileWithEncodingPreservation } = await import("../../../utils/encoding")
+ expect(writeFileWithEncodingPreservation).toHaveBeenCalledWith(`${mockCwd}/test.ts`, "new content")
// Verify delay was NOT called
expect(mockDelay).not.toHaveBeenCalled()
diff --git a/src/integrations/misc/__tests__/extract-text-large-files.spec.ts b/src/integrations/misc/__tests__/extract-text-large-files.spec.ts
index c9e2f181f5..de9d1da789 100644
--- a/src/integrations/misc/__tests__/extract-text-large-files.spec.ts
+++ b/src/integrations/misc/__tests__/extract-text-large-files.spec.ts
@@ -5,26 +5,30 @@ import * as fs from "fs/promises"
import { extractTextFromFile } from "../extract-text"
import { countFileLines } from "../line-counter"
import { readLines } from "../read-lines"
-import { isBinaryFile } from "isbinaryfile"
+import { isBinaryFileWithEncodingDetection, readFileWithEncodingDetection } from "../../../utils/encoding"
// Mock all dependencies
vi.mock("fs/promises")
vi.mock("../line-counter")
vi.mock("../read-lines")
-vi.mock("isbinaryfile")
+vi.mock("../../../utils/encoding", () => ({
+ isBinaryFileWithEncodingDetection: vi.fn(),
+ readFileWithEncodingDetection: vi.fn(),
+}))
describe("extractTextFromFile - Large File Handling", () => {
// Type the mocks
const mockedFs = vi.mocked(fs)
const mockedCountFileLines = vi.mocked(countFileLines)
const mockedReadLines = vi.mocked(readLines)
- const mockedIsBinaryFile = vi.mocked(isBinaryFile)
+ const mockedIsBinaryFileWithEncodingDetection = vi.mocked(isBinaryFileWithEncodingDetection)
+ const mockedReadFileWithEncodingDetection = vi.mocked(readFileWithEncodingDetection)
beforeEach(() => {
vi.clearAllMocks()
// Set default mock behavior
mockedFs.access.mockResolvedValue(undefined)
- mockedIsBinaryFile.mockResolvedValue(false)
+ mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(false)
})
it("should truncate files that exceed maxReadFileLine limit", async () => {
@@ -61,7 +65,7 @@ describe("extractTextFromFile - Large File Handling", () => {
.join("\n")
mockedCountFileLines.mockResolvedValue(50)
- mockedFs.readFile.mockResolvedValue(smallFileContent as any)
+ mockedReadFileWithEncodingDetection.mockResolvedValue(smallFileContent)
const result = await extractTextFromFile("/test/small-file.ts", 100)
@@ -80,7 +84,7 @@ describe("extractTextFromFile - Large File Handling", () => {
.join("\n")
mockedCountFileLines.mockResolvedValue(100)
- mockedFs.readFile.mockResolvedValue(exactFileContent as any)
+ mockedReadFileWithEncodingDetection.mockResolvedValue(exactFileContent)
const result = await extractTextFromFile("/test/exact-file.ts", 100)
@@ -98,7 +102,7 @@ describe("extractTextFromFile - Large File Handling", () => {
.map((_, i) => `Line ${i + 1}`)
.join("\n")
- mockedFs.readFile.mockResolvedValue(largeFileContent as any)
+ mockedReadFileWithEncodingDetection.mockResolvedValue(largeFileContent)
const result = await extractTextFromFile("/test/large-file.ts", undefined)
@@ -111,7 +115,7 @@ describe("extractTextFromFile - Large File Handling", () => {
})
it("should handle empty files", async () => {
- mockedFs.readFile.mockResolvedValue("" as any)
+ mockedReadFileWithEncodingDetection.mockResolvedValue("")
const result = await extractTextFromFile("/test/empty-file.ts", 100)
@@ -155,7 +159,7 @@ describe("extractTextFromFile - Large File Handling", () => {
it("should handle maxReadFileLine of 0 by throwing an error", async () => {
const fileContent = "Line 1\nLine 2\nLine 3"
- mockedFs.readFile.mockResolvedValue(fileContent as any)
+ mockedReadFileWithEncodingDetection.mockResolvedValue(fileContent)
// maxReadFileLine of 0 should throw an error
await expect(extractTextFromFile("/test/file.ts", 0)).rejects.toThrow(
@@ -166,7 +170,7 @@ describe("extractTextFromFile - Large File Handling", () => {
it("should handle negative maxReadFileLine by treating as undefined", async () => {
const fileContent = "Line 1\nLine 2\nLine 3"
- mockedFs.readFile.mockResolvedValue(fileContent as any)
+ mockedReadFileWithEncodingDetection.mockResolvedValue(fileContent)
const result = await extractTextFromFile("/test/file.ts", -1)
@@ -204,7 +208,7 @@ describe("extractTextFromFile - Large File Handling", () => {
})
it("should handle binary files by throwing an error", async () => {
- mockedIsBinaryFile.mockResolvedValue(true)
+ mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true)
await expect(extractTextFromFile("/test/binary.bin", 100)).rejects.toThrow(
"Cannot read text for file type: .bin",
diff --git a/src/integrations/misc/__tests__/read-lines.spec.ts b/src/integrations/misc/__tests__/read-lines.spec.ts
index 14456d24f1..d1bc93cb83 100644
--- a/src/integrations/misc/__tests__/read-lines.spec.ts
+++ b/src/integrations/misc/__tests__/read-lines.spec.ts
@@ -1,3 +1,4 @@
+import { describe, it, expect, beforeAll, afterAll, vi } from "vitest"
import { promises as fs } from "fs"
import path from "path"
import { readLines } from "../read-lines"
@@ -5,6 +6,17 @@ import { readLines } from "../read-lines"
describe("nthline", () => {
const testFile = path.join(__dirname, "test.txt")
+ // Helper function to create a temporary file, run a test, and clean up
+	async function withTempFile(filename: string, content: string, testFn: (filepath: string) => Promise<void>) {
+ const filepath = path.join(__dirname, filename)
+ await fs.writeFile(filepath, content)
+ try {
+ await testFn(filepath)
+ } finally {
+ await fs.unlink(filepath)
+ }
+ }
+
beforeAll(async () => {
// Create a test file with numbered lines
const content = Array.from({ length: 10 }, (_, i) => `Line ${i + 1}`).join("\n")
@@ -71,17 +83,6 @@ describe("nthline", () => {
await expect(readLines(testFile, 20, 15)).rejects.toThrow("does not exist")
})
- // Helper function to create a temporary file, run a test, and clean up
-	async function withTempFile(filename: string, content: string, testFn: (filepath: string) => Promise<void>) {
- const filepath = path.join(__dirname, filename)
- await fs.writeFile(filepath, content)
- try {
- await testFn(filepath)
- } finally {
- await fs.unlink(filepath)
- }
- }
-
it("should handle empty files", async () => {
await withTempFile("empty.txt", "", async (filepath) => {
await expect(readLines(filepath, 0, 0)).rejects.toThrow("does not exist")
@@ -129,4 +130,244 @@ describe("nthline", () => {
})
})
})
+
+ describe("bytesRead sampling for encoding detection", () => {
+ it("should sample exactly 64KB for encoding detection on large files", async () => {
+ // Create a large file with line breaks to test proper sampling
+ const lineContent = "This is a test line for large file sampling\n"
+ const linesNeeded = Math.ceil(100000 / lineContent.length) // Ensure > 64KB
+ const largeContent = lineContent.repeat(linesNeeded)
+
+ await withTempFile("large-file.txt", largeContent, async (filepath) => {
+ // For large files, the function should read and process correctly
+ // We'll verify the function works with large files that exceed 64KB
+ const lines = await readLines(filepath, 1) // Read first 2 lines (0-1)
+
+ // Verify that the content is read correctly
+ expect(lines).toContain("This is a test line for large file sampling")
+ // Should only contain 2 lines
+ const lineArray = lines.split("\n").filter((line) => line.length > 0)
+ expect(lineArray).toHaveLength(2)
+ })
+ })
+
+ it("should handle files smaller than 64KB sampling correctly", async () => {
+ const smallContent = "Line 1\nLine 2\nLine 3\n"
+
+ await withTempFile("small-file.txt", smallContent, async (filepath) => {
+ // For small files, the function should still attempt to read 64KB for encoding detection
+ // We'll just verify the function works correctly with small files
+ const lines = await readLines(filepath, 0) // Read first line (0)
+
+ // Verify that the content is read correctly
+ expect(lines).toContain("Line 1")
+ expect(lines).not.toContain("Line 2") // Should only read first line
+ })
+ })
+
+ it("should handle UTF-8 BOM in the 64KB sample correctly", async () => {
+ // Create content with UTF-8 BOM at the beginning
+ const bomBytes = Buffer.from([0xef, 0xbb, 0xbf])
+ const textContent = "Line 1 with UTF-8 content\nLine 2\nLine 3\n"
+ const contentWithBOM = Buffer.concat([bomBytes, Buffer.from(textContent, "utf8")])
+
+ await withTempFile("bom-file.txt", contentWithBOM.toString(), async (filepath) => {
+ // Write the actual binary content with BOM
+ await fs.writeFile(filepath, contentWithBOM)
+
+ const lines = await readLines(filepath, 0) // Read first line (0)
+
+ // Should successfully read the content, BOM should be handled by encoding detection
+ expect(lines).toContain("Line 1 with UTF-8 content")
+ })
+ })
+
+ it("should handle UTF-16 LE BOM in the 64KB sample correctly", async () => {
+ // Create content with UTF-16 LE BOM
+ const bomBytes = Buffer.from([0xff, 0xfe])
+ const textContent = "Line 1\nLine 2\n"
+ const utf16Content = Buffer.from(textContent, "utf16le")
+ const contentWithBOM = Buffer.concat([bomBytes, utf16Content])
+
+ await withTempFile("utf16le-bom-file.txt", "", async (filepath) => {
+ // Write the actual binary content with BOM
+ await fs.writeFile(filepath, contentWithBOM)
+
+ const lines = await readLines(filepath, 1)
+
+ // Should successfully read the content, BOM should be handled by encoding detection
+ expect(lines).toContain("Line 1")
+ })
+ })
+
+ it("should handle partial multi-byte characters at 64KB boundary", async () => {
+ // Create content where a multi-byte UTF-8 character might be split at 64KB boundary
+ const lineContent = "Line with content: 你好世界\n"
+ const linesNeeded = Math.ceil(65536 / lineContent.length) + 5 // Ensure > 64KB
+ const fullContent = lineContent.repeat(linesNeeded) + "Final line after boundary\n"
+
+ await withTempFile("multibyte-boundary.txt", fullContent, async (filepath) => {
+ // Read the last few lines to check the content after the boundary
+ const lines = await readLines(filepath, linesNeeded + 1, linesNeeded - 1) // Read the last two lines (endLine past EOF is tolerated)
+ expect(lines).toContain("Final line after boundary")
+ // The multi-byte characters should be preserved
+ expect(lines).toContain("你好世界")
+ })
+ })
+
+ it("should handle encoding detection failure gracefully with 64KB sampling", async () => {
+ // Create binary-like content that might confuse encoding detection
+ const binaryLikeContent = Buffer.alloc(70000) // Larger than 64KB
+ // Fill with values that might be detected as binary
+ for (let i = 0; i < binaryLikeContent.length; i++) {
+ binaryLikeContent[i] = i % 256
+ }
+ // Add some text at the end
+ const textPortion = Buffer.from("\nSome text at the end\n", "utf8")
+ const mixedContent = Buffer.concat([binaryLikeContent, textPortion])
+
+ await withTempFile("mixed-content.txt", "", async (filepath) => {
+ await fs.writeFile(filepath, mixedContent)
+
+ // Should either succeed with fallback encoding or handle gracefully
+ try {
+ const lines = await readLines(filepath, 0, 0)
+ // If it succeeds, it should contain the text portion
+ expect(typeof lines).toBe("string")
+ } catch (error) {
+ // If it fails, it should be a meaningful error about binary content
+ expect(error).toBeInstanceOf(Error)
+ }
+ })
+ })
+ })
+
+ describe("BOM preservation integration tests", () => {
+ it("should preserve UTF-8 BOM when reading lines from file", async () => {
+ // Create content with UTF-8 BOM
+ const bomBytes = Buffer.from([0xef, 0xbb, 0xbf])
+ const textContent = "First line with UTF-8 content\nSecond line\nThird line\n"
+ const contentWithBOM = Buffer.concat([bomBytes, Buffer.from(textContent, "utf8")])
+
+ await withTempFile("utf8-bom-integration.txt", "", async (filepath) => {
+ // Write the actual binary content with BOM
+ await fs.writeFile(filepath, contentWithBOM)
+
+ // Read the first two lines (0-1)
+ const firstLine = await readLines(filepath, 1)
+ expect(firstLine).toContain("First line with UTF-8 content")
+
+ // Read multiple lines
+ const multipleLines = await readLines(filepath, 2)
+ expect(multipleLines).toContain("First line with UTF-8 content")
+ expect(multipleLines).toContain("Second line")
+
+ // Read from specific line
+ const fromSecondLine = await readLines(filepath, 1, 1)
+ expect(fromSecondLine).toContain("Second line")
+ })
+ })
+
+ it("should preserve UTF-16 LE BOM when reading lines from file", async () => {
+ // Create content with UTF-16 LE BOM
+ const bomBytes = Buffer.from([0xff, 0xfe])
+ const textContent = "UTF-16 LE first line\nUTF-16 LE second line\n"
+ const utf16Content = Buffer.from(textContent, "utf16le")
+ const contentWithBOM = Buffer.concat([bomBytes, utf16Content])
+
+ await withTempFile("utf16le-bom-integration.txt", "", async (filepath) => {
+ // Write the actual binary content with BOM
+ await fs.writeFile(filepath, contentWithBOM)
+
+ // Read first line
+ const firstLine = await readLines(filepath, 0) // Read first line (0)
+ expect(firstLine).toContain("UTF-16 LE first line")
+
+ // Read multiple lines
+ const multipleLines = await readLines(filepath, 1) // Read first 2 lines (0-1)
+ expect(multipleLines).toContain("UTF-16 LE first line")
+ expect(multipleLines).toContain("UTF-16 LE second line")
+ })
+ })
+
+ it("should preserve UTF-16 BE BOM when reading lines from file", async () => {
+ // Create content with UTF-16 BE BOM
+ const bomBytes = Buffer.from([0xfe, 0xff])
+ const textContent = "UTF-16 BE first line\nUTF-16 BE second line\n"
+ // Manually create UTF-16 BE content
+ const utf16beBytes = []
+ for (let i = 0; i < textContent.length; i++) {
+ const charCode = textContent.charCodeAt(i)
+ utf16beBytes.push((charCode >> 8) & 0xff) // High byte first
+ utf16beBytes.push(charCode & 0xff) // Low byte second
+ }
+ const utf16Content = Buffer.from(utf16beBytes)
+ const contentWithBOM = Buffer.concat([bomBytes, utf16Content])
+
+ await withTempFile("utf16be-bom-integration.txt", "", async (filepath) => {
+ // Write the actual binary content with BOM
+ await fs.writeFile(filepath, contentWithBOM)
+
+ // Read first line
+ const firstLine = await readLines(filepath, 0) // Read first line (0)
+ expect(firstLine).toContain("UTF-16 BE first line")
+
+ // Read multiple lines
+ const multipleLines = await readLines(filepath, 1) // Read first 2 lines (0-1)
+ expect(multipleLines).toContain("UTF-16 BE first line")
+ expect(multipleLines).toContain("UTF-16 BE second line")
+ })
+ })
+
+ it("should handle BOM preservation with large files that exceed 64KB sampling", async () => {
+ // Create a large file with UTF-8 BOM that exceeds 64KB
+ const bomBytes = Buffer.from([0xef, 0xbb, 0xbf])
+ const lineContent = "This is a test line with UTF-8 content and BOM: 你好世界 🌍\n"
+ const linesNeeded = Math.ceil(65536 / lineContent.length) + 100 // Ensure > 64KB
+ const largeTextContent = lineContent.repeat(linesNeeded)
+ const contentWithBOM = Buffer.concat([bomBytes, Buffer.from(largeTextContent, "utf8")])
+
+ await withTempFile("large-utf8-bom-integration.txt", "", async (filepath) => {
+ // Write the actual binary content with BOM
+ await fs.writeFile(filepath, contentWithBOM)
+
+ // Read first few lines
+ const firstLines = await readLines(filepath, 3)
+ expect(firstLines).toContain("This is a test line with UTF-8 content and BOM: 你好世界 🌍")
+
+ // Read from middle of file (lines 49-51, 0-based)
+ const middleLines = await readLines(filepath, 51, 49) // Read lines 49-51 (0-based)
+ expect(middleLines).toContain("This is a test line with UTF-8 content and BOM: 你好世界 🌍")
+
+ // Verify the content is properly decoded despite BOM
+ const lines = firstLines.split("\n")
+ expect(lines[0]).not.toMatch(/^\uFEFF/) // BOM should not appear in decoded text
+ })
+ })
+
+ it("should handle mixed BOM and non-BOM files correctly", async () => {
+ // Test reading from a UTF-8 BOM file and then a regular UTF-8 file
+ const bomBytes = Buffer.from([0xef, 0xbb, 0xbf])
+ const bomContent = Buffer.concat([bomBytes, Buffer.from("BOM file content\nSecond line\n", "utf8")])
+ const regularContent = "Regular UTF-8 content\nAnother line\n"
+
+ await withTempFile("bom-file-mixed.txt", "", async (bomFilepath) => {
+ await fs.writeFile(bomFilepath, bomContent)
+
+ await withTempFile("regular-file-mixed.txt", regularContent, async (regularFilepath) => {
+ // Read from BOM file
+ const bomLines = await readLines(bomFilepath, 0) // Read first line (0)
+ expect(bomLines).toContain("BOM file content")
+
+ // Read from regular file
+ const regularLines = await readLines(regularFilepath, 0) // Read first line (0)
+ expect(regularLines).toContain("Regular UTF-8 content")
+
+ // Both should work correctly without interference
+ expect(bomLines).not.toContain("Regular UTF-8 content")
+ expect(regularLines).not.toContain("BOM file content")
+ })
+ })
+ })
+ })
})
diff --git a/src/integrations/misc/extract-text.ts b/src/integrations/misc/extract-text.ts
index bafa7a5bab..7b48bca24c 100644
--- a/src/integrations/misc/extract-text.ts
+++ b/src/integrations/misc/extract-text.ts
@@ -3,10 +3,10 @@ import * as path from "path"
import pdf from "pdf-parse/lib/pdf-parse"
import mammoth from "mammoth"
import fs from "fs/promises"
-import { isBinaryFile } from "isbinaryfile"
import { extractTextFromXLSX } from "./extract-text-from-xlsx"
import { countFileLines } from "./line-counter"
import { readLines } from "./read-lines"
+import { readFileWithEncodingDetection, isBinaryFileWithEncodingDetection } from "../../utils/encoding"
 async function extractTextFromPDF(filePath: string): Promise<string> {
const dataBuffer = await fs.readFile(filePath)
@@ -20,7 +20,7 @@ async function extractTextFromDOCX(filePath: string): Promise<string> {
}
 async function extractTextFromIPYNB(filePath: string): Promise<string> {
- const data = await fs.readFile(filePath, "utf8")
+ const data = await readFileWithEncodingDetection(filePath)
const notebook = JSON.parse(data)
let extractedText = ""
@@ -84,9 +84,8 @@ export async function extractTextFromFile(filePath: string, maxReadFileLine?: nu
if (extractor) {
return extractor(filePath)
}
-
- // Handle other files
- const isBinary = await isBinaryFile(filePath).catch(() => false)
+ // Handle other files - use unified binary file detection
+ const isBinary = await isBinaryFileWithEncodingDetection(filePath)
if (!isBinary) {
// Check if we need to apply line limit
@@ -103,7 +102,7 @@ export async function extractTextFromFile(filePath: string, maxReadFileLine?: nu
}
}
// Read the entire file if no limit or file is within limit
- return addLineNumbers(await fs.readFile(filePath, "utf8"))
+ return addLineNumbers(await readFileWithEncodingDetection(filePath))
} else {
throw new Error(`Cannot read text for file type: ${fileExtension}`)
}
diff --git a/src/integrations/misc/read-lines.ts b/src/integrations/misc/read-lines.ts
index 5a5eda9f83..c677518e7d 100644
--- a/src/integrations/misc/read-lines.ts
+++ b/src/integrations/misc/read-lines.ts
@@ -7,6 +7,9 @@
* Now you can read a range of lines from a file
*/
import { createReadStream } from "fs"
+import { open } from "fs/promises"
+import * as iconv from "iconv-lite"
+import { detectEncoding } from "../../utils/encoding"
const outOfRangeError = (filepath: string, n: number) => {
return new RangeError(`Line with index ${n} does not exist in '${filepath}'. Note that line indexing is zero-based`)
@@ -52,65 +55,97 @@ export function readLines(filepath: string, endLine?: number, startLine?: number
)
}
- // Set up stream
- const input = createReadStream(filepath)
- let buffer = ""
- let lineCount = 0
- let result = ""
-
- // Handle errors
- input.on("error", reject)
-
- // Process data chunks directly
- input.on("data", (chunk) => {
- // Add chunk to buffer
- buffer += chunk.toString()
-
- let pos = 0
- let nextNewline = buffer.indexOf("\n", pos)
-
- // Process complete lines in the buffer
- while (nextNewline !== -1) {
- // If we're in the target range, add this line to the result
- if (lineCount >= effectiveStartLine && (endLine === undefined || lineCount <= endLine)) {
- result += buffer.substring(pos, nextNewline + 1) // Include the newline
- }
-
- // Move position and increment line counter
- pos = nextNewline + 1
- lineCount++
-
- // If we've reached the end line, we can stop
- if (endLine !== undefined && lineCount > endLine) {
- input.destroy()
- resolve(result)
- return
+ // Sample the first 64KB for encoding detection
+ open(filepath, "r")
+ .then((fileHandle) => {
+ const sampleBuffer = Buffer.alloc(65536)
+ return fileHandle
+ .read(sampleBuffer, 0, sampleBuffer.length, 0)
+ .then(({ bytesRead }) => sampleBuffer.subarray(0, bytesRead))
+ .finally(() => fileHandle.close())
+ })
+ .then((sampleBuffer) => detectEncoding(sampleBuffer))
+ .then((encoding) => {
+ // Node.js native supported encodings
+ const nodeEncodings = ["utf8", "ascii", "latin1"]
+
+ let buffer = ""
+ let lineCount = 0
+ let result = ""
+
+ // Choose decoding method based on native support
+ let input: NodeJS.ReadableStream
+ if (nodeEncodings.includes(encoding.toLowerCase())) {
+ input = createReadStream(filepath, { encoding: encoding as BufferEncoding })
+ // Handle errors directly
+ input.on("error", reject)
+ } else {
+ // For non-native encodings, create streams and handle errors explicitly
+ const sourceStream = createReadStream(filepath)
+ const decodeStream = iconv.decodeStream(encoding)
+
+ // Handle errors from both streams
+ sourceStream.on("error", reject)
+ decodeStream.on("error", reject)
+
+ // Use pipe but with explicit error handling
+ input = sourceStream.pipe(decodeStream)
}
- // Find next newline
- nextNewline = buffer.indexOf("\n", pos)
- }
-
- // Trim buffer - keep only the incomplete line
- buffer = buffer.substring(pos)
- })
-
- // Handle end of file
- input.on("end", () => {
- // Process any remaining data in buffer (last line without newline)
- if (buffer.length > 0) {
- if (lineCount >= effectiveStartLine && (endLine === undefined || lineCount <= endLine)) {
- result += buffer
- }
- lineCount++
- }
-
- // Check if we found any lines in the requested range
- if (lineCount <= effectiveStartLine) {
- reject(outOfRangeError(filepath, effectiveStartLine))
- } else {
- resolve(result)
- }
- })
+ // Process data chunks directly
+ input.on("data", (chunk: string) => {
+ // Add chunk to buffer (chunk is already decoded using the detected encoding)
+ buffer += chunk
+
+ let pos = 0
+ let nextNewline = buffer.indexOf("\n", pos)
+
+ // Process complete lines in the buffer
+ while (nextNewline !== -1) {
+ // If we're in the target range, add this line to the result
+ if (lineCount >= effectiveStartLine && (endLine === undefined || lineCount <= endLine)) {
+ result += buffer.substring(pos, nextNewline + 1) // Include the newline
+ }
+
+ // Move position and increment line counter
+ pos = nextNewline + 1
+ lineCount++
+
+ // If we've reached the end line, we can stop
+ if (endLine !== undefined && lineCount > endLine) {
+ ;(input as any).destroy?.()
+ resolve(result)
+ return
+ }
+
+ // Find next newline
+ nextNewline = buffer.indexOf("\n", pos)
+ }
+
+ // Trim buffer - keep only the incomplete line
+ buffer = buffer.substring(pos)
+ })
+
+ // Handle end of file
+ input.on("end", () => {
+ // Process any remaining data in buffer (last line without newline)
+ if (buffer.length > 0) {
+ if (lineCount >= effectiveStartLine && (endLine === undefined || lineCount <= endLine)) {
+ result += buffer
+ }
+ lineCount++
+ }
+
+ // Check if we found any lines in the requested range
+ if (lineCount <= effectiveStartLine) {
+ reject(outOfRangeError(filepath, effectiveStartLine))
+ } else {
+ resolve(result)
+ }
+ })
+ })
+ .catch((error) => {
+ reject(error)
+ })
})
}
diff --git a/src/package.json b/src/package.json
index b7a5e42366..531e57c321 100644
--- a/src/package.json
+++ b/src/package.json
@@ -529,7 +529,9 @@
"web-tree-sitter": "^0.25.6",
"workerpool": "^9.2.0",
"yaml": "^2.8.0",
- "zod": "^3.25.61"
+ "zod": "^3.25.61",
+ "iconv-lite": "^0.6.3",
+ "jschardet": "^3.1.4"
},
"devDependencies": {
"@roo-code/build": "workspace:^",
diff --git a/src/services/code-index/processors/scanner.ts b/src/services/code-index/processors/scanner.ts
index 92a7d77c27..37cf36d07f 100644
--- a/src/services/code-index/processors/scanner.ts
+++ b/src/services/code-index/processors/scanner.ts
@@ -3,6 +3,8 @@ import { Ignore } from "ignore"
import { RooIgnoreController } from "../../../core/ignore/RooIgnoreController"
import { stat } from "fs/promises"
import * as path from "path"
+import * as iconv from "iconv-lite"
+import { detectEncoding } from "../../../utils/encoding"
import { generateNormalizedAbsolutePath, generateRelativeFilePath } from "../shared/get-relative-path"
import { getWorkspacePathForContext } from "../../../utils/path"
import { scannerExtensions } from "../shared/supported-extensions"
@@ -134,10 +136,11 @@ export class DirectoryScanner implements IDirectoryScanner {
return
}
- // Read file content
- const content = await vscode.workspace.fs
- .readFile(vscode.Uri.file(filePath))
- .then((buffer) => Buffer.from(buffer).toString("utf-8"))
+ // Read file content with encoding detection
+ const fileBuffer = await vscode.workspace.fs.readFile(vscode.Uri.file(filePath))
+ const buffer = Buffer.from(fileBuffer)
+ const encoding = await detectEncoding(buffer)
+ const content = iconv.decode(buffer, encoding)
// Calculate current hash
const currentFileHash = createHash("sha256").update(content).digest("hex")
diff --git a/src/services/tree-sitter/__tests__/markdownIntegration.spec.ts b/src/services/tree-sitter/__tests__/markdownIntegration.spec.ts
index 9ada01a078..772e07db16 100644
--- a/src/services/tree-sitter/__tests__/markdownIntegration.spec.ts
+++ b/src/services/tree-sitter/__tests__/markdownIntegration.spec.ts
@@ -1,18 +1,32 @@
// Mocks must come first, before imports
-vi.mock("fs/promises", () => ({
- readFile: vi.fn().mockImplementation(() => Promise.resolve("")),
- stat: vi.fn().mockImplementation(() => Promise.resolve({ isDirectory: () => false })),
-}))
+vi.mock("fs/promises", async () => {
+ const actual = await vi.importActual("fs/promises");
+ return {
+ ...actual,
+ readFile: vi.fn().mockImplementation(() => Promise.resolve("")),
+ stat: vi.fn().mockImplementation(() => Promise.resolve({ isDirectory: () => false })),
+ default: {
+ readFile: vi.fn().mockImplementation(() => Promise.resolve("")),
+ stat: vi.fn().mockImplementation(() => Promise.resolve({ isDirectory: () => false })),
+ }
+ }
+})
vi.mock("../../../utils/fs", () => ({
fileExistsAtPath: vi.fn().mockImplementation(() => Promise.resolve(true)),
}))
+vi.mock("../../../utils/encoding", () => ({
+ readFileWithEncodingDetection: vi.fn().mockImplementation((filePath) => {
+ return Promise.resolve("")
+ }),
+}))
+
// Then imports
import * as fs from "fs/promises"
import type { Mock } from "vitest"
-
+import { readFileWithEncodingDetection } from "../../../utils/encoding"
import { parseSourceCodeDefinitionsForFile } from "../index"
describe("Markdown Integration Tests", () => {
@@ -26,14 +40,14 @@ describe("Markdown Integration Tests", () => {
const markdownContent =
"# Main Header\n\nThis is some content under the main header.\nIt spans multiple lines to meet the minimum section length.\n\n## Section 1\n\nThis is content for section 1.\nIt also spans multiple lines.\n\n### Subsection 1.1\n\nThis is a subsection with enough lines\nto meet the minimum section length requirement.\n\n## Section 2\n\nFinal section content.\nWith multiple lines.\n"
- // Mock fs.readFile to return our markdown content
- ;(fs.readFile as Mock).mockImplementation(() => Promise.resolve(markdownContent))
+ // Mock readFileWithEncodingDetection to return our markdown content
+ ;(readFileWithEncodingDetection as Mock).mockImplementation(() => Promise.resolve(markdownContent))
// Call the function with a markdown file path
const result = await parseSourceCodeDefinitionsForFile("test.md")
- // Verify fs.readFile was called with the correct path
- expect(fs.readFile).toHaveBeenCalledWith("test.md", "utf8")
+ // Verify readFileWithEncodingDetection was called with the correct path
+ expect(readFileWithEncodingDetection).toHaveBeenCalledWith("test.md")
// Check the result formatting for definition listing
expect(result).toBeDefined()
@@ -48,14 +62,14 @@ describe("Markdown Integration Tests", () => {
// This test verifies behavior when no headers meet the minimum requirements
const markdownContent = "This is just some text.\nNo headers here.\nJust plain text."
- // Mock fs.readFile to return our markdown content
- ;(fs.readFile as Mock).mockImplementation(() => Promise.resolve(markdownContent))
+ // Mock readFileWithEncodingDetection to return our markdown content
+ ;(readFileWithEncodingDetection as Mock).mockImplementation(() => Promise.resolve(markdownContent))
// Call the function with a markdown file path
const result = await parseSourceCodeDefinitionsForFile("no-headers.md")
- // Verify fs.readFile was called with the correct path
- expect(fs.readFile).toHaveBeenCalledWith("no-headers.md", "utf8")
+ // Verify readFileWithEncodingDetection was called with the correct path
+ expect(readFileWithEncodingDetection).toHaveBeenCalledWith("no-headers.md")
// Check the result - should be undefined since no definitions found
expect(result).toBeUndefined()
diff --git a/src/services/tree-sitter/index.ts b/src/services/tree-sitter/index.ts
index 145ba84730..cfeabe8d24 100644
--- a/src/services/tree-sitter/index.ts
+++ b/src/services/tree-sitter/index.ts
@@ -6,6 +6,7 @@ import { fileExistsAtPath } from "../../utils/fs"
import { parseMarkdown } from "./markdownParser"
import { RooIgnoreController } from "../../core/ignore/RooIgnoreController"
import { QueryCapture } from "web-tree-sitter"
+import { readFileWithEncodingDetection } from "../../utils/encoding"
// Private constant
const DEFAULT_MIN_COMPONENT_LINES_VALUE = 4
@@ -120,7 +121,7 @@ export async function parseSourceCodeDefinitionsForFile(
}
// Read file content
- const fileContent = await fs.readFile(filePath, "utf8")
+ const fileContent = await readFileWithEncodingDetection(filePath)
// Split the file content into individual lines
const lines = fileContent.split("\n")
@@ -196,7 +197,7 @@ export async function parseSourceCodeForDefinitionsTopLevel(
try {
// Read file content
- const fileContent = await fs.readFile(file, "utf8")
+ const fileContent = await readFileWithEncodingDetection(file)
// Split the file content into individual lines
const lines = fileContent.split("\n")
@@ -386,7 +387,7 @@ async function parseFile(
}
// Read file content
- const fileContent = await fs.readFile(filePath, "utf8")
+ const fileContent = await readFileWithEncodingDetection(filePath)
const extLang = path.extname(filePath).toLowerCase().slice(1)
// Check if we have a parser for this file type
diff --git a/src/utils/__tests__/encoding.spec.ts b/src/utils/__tests__/encoding.spec.ts
new file mode 100644
index 0000000000..e127e70490
--- /dev/null
+++ b/src/utils/__tests__/encoding.spec.ts
@@ -0,0 +1,577 @@
+import { describe, it, expect, vi, beforeEach, afterEach } from "vitest"
+import * as jschardet from "jschardet"
+import * as iconv from "iconv-lite"
+import { isBinaryFile } from "isbinaryfile"
+import fs from "fs/promises"
+import path from "path"
+import {
+ detectEncoding,
+ readFileWithEncodingDetection,
+ detectFileEncoding,
+ writeFileWithEncodingPreservation,
+ isBinaryFileWithEncodingDetection,
+} from "../encoding"
+
+// Mock dependencies
+vi.mock("jschardet", () => ({
+ detect: vi.fn(),
+}))
+
+vi.mock("iconv-lite", () => ({
+ encodingExists: vi.fn(),
+ decode: vi.fn(),
+ encode: vi.fn(),
+}))
+
+vi.mock("isbinaryfile", () => ({
+ isBinaryFile: vi.fn(),
+}))
+
+vi.mock("fs/promises", () => ({
+ default: {
+ readFile: vi.fn(),
+ writeFile: vi.fn(),
+ },
+}))
+
+vi.mock("path", () => ({
+ default: {
+ extname: vi.fn(),
+ },
+}))
+
+const mockJschardet = vi.mocked(jschardet)
+const mockIconv = vi.mocked(iconv)
+const mockIsBinaryFile = vi.mocked(isBinaryFile)
+const mockFs = vi.mocked(fs)
+const mockPath = vi.mocked(path)
+
+describe("encoding", () => {
+ beforeEach(() => {
+ vi.clearAllMocks()
+ // Reset default mocks
+ mockPath.extname.mockReturnValue(".txt")
+ mockIconv.encodingExists.mockReturnValue(true)
+ mockIconv.decode.mockReturnValue("decoded content")
+ mockIconv.encode.mockReturnValue(Buffer.from("encoded content"))
+ })
+
+ afterEach(() => {
+ vi.restoreAllMocks()
+ })
+
+ describe("detectEncoding", () => {
+ it("should throw error for binary files", async () => {
+ const buffer = Buffer.from("binary content")
+ mockIsBinaryFile.mockResolvedValue(true)
+
+ await expect(detectEncoding(buffer, ".exe")).rejects.toThrow("Cannot read text for file type: .exe")
+ })
+
+ it("should call isBinaryFile with buffer and buffer length", async () => {
+ const buffer = Buffer.from("test content for binary check")
+ mockJschardet.detect.mockReturnValue({
+ encoding: "",
+ confidence: 0,
+ }) // No encoding detected
+ mockIsBinaryFile.mockResolvedValue(false)
+
+ await detectEncoding(buffer, ".txt")
+
+ expect(mockIsBinaryFile).toHaveBeenCalledWith(buffer, buffer.length)
+ })
+
+ it("should handle string detection result from jschardet", async () => {
+ const buffer = Buffer.from("utf8 content")
+ mockIsBinaryFile.mockResolvedValue(false)
+ mockJschardet.detect.mockReturnValue({
+ encoding: "utf8",
+ confidence: 0.9,
+ })
+
+ const result = await detectEncoding(buffer, ".txt")
+ expect(result).toBe("utf8")
+ })
+
+ it("should handle object detection result with high confidence", async () => {
+ const buffer = Buffer.from("gbk content")
+ mockIsBinaryFile.mockResolvedValue(false)
+ mockJschardet.detect.mockReturnValue({
+ encoding: "gbk",
+ confidence: 0.9,
+ })
+
+ const result = await detectEncoding(buffer, ".txt")
+ expect(result).toBe("gbk")
+ })
+
+ it("should handle ISO-8859-1 encoding", async () => {
+ const buffer = Buffer.from("iso-8859-1 content")
+ mockIsBinaryFile.mockResolvedValue(false)
+ mockJschardet.detect.mockReturnValue({
+ encoding: "iso-8859-1",
+ confidence: 0.9,
+ })
+ mockIconv.encodingExists.mockReturnValue(true)
+
+ const result = await detectEncoding(buffer, ".txt")
+ expect(result).toBe("iso-8859-1")
+ })
+
+ it("should handle Shift-JIS encoding", async () => {
+ const buffer = Buffer.from("shift-jis content")
+ mockIsBinaryFile.mockResolvedValue(false)
+ mockJschardet.detect.mockReturnValue({
+ encoding: "shift-jis",
+ confidence: 0.9,
+ })
+ mockIconv.encodingExists.mockReturnValue(true)
+
+ const result = await detectEncoding(buffer, ".txt")
+ expect(result).toBe("shift-jis")
+ })
+
+ it("should handle empty file gracefully", async () => {
+ const buffer = Buffer.alloc(0)
+ mockIsBinaryFile.mockResolvedValue(false)
+ mockJschardet.detect.mockReturnValue({
+ encoding: "",
+ confidence: 0,
+ })
+
+ const consoleSpy = vi.spyOn(console, "warn").mockImplementation(() => {})
+ const result = await detectEncoding(buffer, ".txt")
+
+ expect(result).toBe("utf8")
+ expect(consoleSpy).toHaveBeenCalledWith("No encoding detected, falling back to utf8")
+ })
+
+ it("should handle very small file (1 byte)", async () => {
+ const buffer = Buffer.from("a")
+ mockIsBinaryFile.mockResolvedValue(false)
+ mockJschardet.detect.mockReturnValue({
+ encoding: "",
+ confidence: 0,
+ })
+
+ const consoleSpy = vi.spyOn(console, "warn").mockImplementation(() => {})
+ const result = await detectEncoding(buffer, ".txt")
+
+ expect(result).toBe("utf8")
+ expect(consoleSpy).toHaveBeenCalledWith("No encoding detected, falling back to utf8")
+ })
+
+ it("should handle very small file (2 bytes)", async () => {
+ const buffer = Buffer.from("ab")
+ mockIsBinaryFile.mockResolvedValue(false)
+ mockJschardet.detect.mockReturnValue({
+ encoding: "utf8",
+ confidence: 0.3,
+ })
+
+ const consoleSpy = vi.spyOn(console, "warn").mockImplementation(() => {})
+ const result = await detectEncoding(buffer, ".txt")
+
+ expect(result).toBe("utf8")
+ expect(consoleSpy).toHaveBeenCalledWith(
+ "Low confidence encoding detection: utf8 (confidence: 0.3), falling back to utf8",
+ )
+ })
+
+ it("should fallback to utf8 for low confidence detection", async () => {
+ const buffer = Buffer.from("uncertain content")
+ mockIsBinaryFile.mockResolvedValue(false)
+ mockJschardet.detect.mockReturnValue({
+ encoding: "gbk",
+ confidence: 0.5,
+ })
+
+ const consoleSpy = vi.spyOn(console, "warn").mockImplementation(() => {})
+ const result = await detectEncoding(buffer, ".txt")
+
+ expect(result).toBe("utf8")
+ expect(consoleSpy).toHaveBeenCalledWith(
+ "Low confidence encoding detection: gbk (confidence: 0.5), falling back to utf8",
+ )
+ })
+
+ it("should fallback to utf8 when no encoding detected", async () => {
+ const buffer = Buffer.from("no encoding content")
+ mockIsBinaryFile.mockResolvedValue(false)
+ mockJschardet.detect.mockReturnValue({
+ encoding: "",
+ confidence: 0,
+ })
+
+ const consoleSpy = vi.spyOn(console, "warn").mockImplementation(() => {})
+ const result = await detectEncoding(buffer, ".txt")
+
+ expect(result).toBe("utf8")
+ expect(consoleSpy).toHaveBeenCalledWith("No encoding detected, falling back to utf8")
+ })
+
+ it("should fallback to utf8 for unsupported encodings", async () => {
+ const buffer = Buffer.from("unsupported encoding content")
+ mockIsBinaryFile.mockResolvedValue(false)
+ mockJschardet.detect.mockReturnValue({
+ encoding: "unsupported-encoding",
+ confidence: 0.9,
+ })
+ mockIconv.encodingExists.mockReturnValue(false)
+
+ const consoleSpy = vi.spyOn(console, "warn").mockImplementation(() => {})
+ const result = await detectEncoding(buffer, ".txt")
+
+ expect(result).toBe("utf8")
+ expect(consoleSpy).toHaveBeenCalledWith(
+ "Unsupported encoding detected: unsupported-encoding, falling back to utf8",
+ )
+ })
+
+ it("should handle unsupported encoding with original detection info", async () => {
+ const buffer = Buffer.from("unsupported encoding content")
+ mockIsBinaryFile.mockResolvedValue(false)
+ mockJschardet.detect.mockReturnValue({
+ encoding: "unsupported-encoding",
+ confidence: 0.9,
+ })
+ mockIconv.encodingExists.mockReturnValue(false)
+
+ const consoleSpy = vi.spyOn(console, "warn").mockImplementation(() => {})
+ await detectEncoding(buffer, ".txt")
+
+ expect(consoleSpy).toHaveBeenCalledWith(
+ "Unsupported encoding detected: unsupported-encoding, falling back to utf8",
+ )
+ })
+
+ it("should handle isBinaryFile error gracefully", async () => {
+ const buffer = Buffer.from("content")
+ mockIsBinaryFile.mockRejectedValue(new Error("Detection failed"))
+
+ const result = await detectEncoding(buffer, ".txt")
+ expect(result).toBe("utf8") // Should fallback to utf8
+ })
+
+ describe("BOM (Byte Order Mark) preservation", () => {
+ it("should preserve UTF-8 BOM in encoding detection", async () => {
+ // UTF-8 BOM: 0xEF 0xBB 0xBF
+ const bomBytes = Buffer.from([0xef, 0xbb, 0xbf])
+ const contentBytes = Buffer.from("Hello, world!", "utf8")
+ const bufferWithBOM = Buffer.concat([bomBytes, contentBytes])
+
+ mockIsBinaryFile.mockResolvedValue(false)
+ mockJschardet.detect.mockReturnValue({
+ encoding: "utf8",
+ confidence: 0.9,
+ })
+
+ const result = await detectEncoding(bufferWithBOM, ".txt")
+
+ expect(result).toBe("utf8")
+ expect(mockJschardet.detect).toHaveBeenCalledWith(bufferWithBOM)
+ // Verify the BOM is included in the buffer passed to jschardet
+ expect(mockJschardet.detect.mock.calls[0][0]).toEqual(bufferWithBOM)
+ })
+
+ it("should handle UTF-8 BOM with low confidence detection", async () => {
+ const bomBytes = Buffer.from([0xef, 0xbb, 0xbf])
+ const contentBytes = Buffer.from("Hello", "utf8")
+ const bufferWithBOM = Buffer.concat([bomBytes, contentBytes])
+
+ mockIsBinaryFile.mockResolvedValue(false)
+ mockJschardet.detect.mockReturnValue({
+ encoding: "utf8",
+ confidence: 0.5, // Low confidence
+ })
+
+ const consoleSpy = vi.spyOn(console, "warn").mockImplementation(() => {})
+ const result = await detectEncoding(bufferWithBOM, ".txt")
+
+ expect(result).toBe("utf8")
+ expect(consoleSpy).toHaveBeenCalledWith(
+ "Low confidence encoding detection: utf8 (confidence: 0.5), falling back to utf8",
+ )
+ })
+
+ it("should handle UTF-8 BOM with empty content", async () => {
+ // Only BOM, no content
+ const bomOnlyBuffer = Buffer.from([0xef, 0xbb, 0xbf])
+
+ mockIsBinaryFile.mockResolvedValue(false)
+ mockJschardet.detect.mockReturnValue({
+ encoding: "utf8",
+ confidence: 0.9,
+ })
+
+ const result = await detectEncoding(bomOnlyBuffer, ".txt")
+
+ expect(result).toBe("utf8")
+ expect(mockJschardet.detect).toHaveBeenCalledWith(bomOnlyBuffer)
+ })
+
+ it("should preserve UTF-16 LE BOM in encoding detection", async () => {
+ // UTF-16 LE BOM: 0xFF 0xFE
+ const bomBytes = Buffer.from([0xff, 0xfe])
+ const contentBytes = Buffer.from("Hello", "utf16le")
+ const bufferWithBOM = Buffer.concat([bomBytes, contentBytes])
+
+ mockIsBinaryFile.mockResolvedValue(false)
+ mockJschardet.detect.mockReturnValue({
+ encoding: "utf-16le",
+ confidence: 0.9,
+ })
+ mockIconv.encodingExists.mockReturnValue(true)
+
+ const result = await detectEncoding(bufferWithBOM, ".txt")
+
+ expect(result).toBe("utf-16le")
+ expect(mockJschardet.detect).toHaveBeenCalledWith(bufferWithBOM)
+ expect(mockJschardet.detect.mock.calls[0][0]).toEqual(bufferWithBOM)
+ })
+
+ it("should preserve UTF-16 BE BOM in encoding detection", async () => {
+ // UTF-16 BE BOM: 0xFE 0xFF
+ const bomBytes = Buffer.from([0xfe, 0xff])
+ // Create UTF-16 BE content manually since Node.js doesn't have utf16be encoding
+ const contentBytes = Buffer.from([0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f]) // "Hello" in UTF-16 BE
+ const bufferWithBOM = Buffer.concat([bomBytes, contentBytes])
+
+ mockIsBinaryFile.mockResolvedValue(false)
+ mockJschardet.detect.mockReturnValue({
+ encoding: "utf-16be",
+ confidence: 0.9,
+ })
+ mockIconv.encodingExists.mockReturnValue(true)
+
+ const result = await detectEncoding(bufferWithBOM, ".txt")
+
+ expect(result).toBe("utf-16be")
+ expect(mockJschardet.detect).toHaveBeenCalledWith(bufferWithBOM)
+ expect(mockJschardet.detect.mock.calls[0][0]).toEqual(bufferWithBOM)
+ })
+
+ it("should handle UTF-16 LE BOM with unsupported encoding fallback", async () => {
+ const bomBytes = Buffer.from([0xff, 0xfe])
+ const contentBytes = Buffer.from("Hello", "utf16le")
+ const bufferWithBOM = Buffer.concat([bomBytes, contentBytes])
+
+ mockIsBinaryFile.mockResolvedValue(false)
+ mockJschardet.detect.mockReturnValue({
+ encoding: "utf-16le",
+ confidence: 0.9,
+ })
+ mockIconv.encodingExists.mockReturnValue(false) // Simulate unsupported encoding
+
+ const consoleSpy = vi.spyOn(console, "warn").mockImplementation(() => {})
+ const result = await detectEncoding(bufferWithBOM, ".txt")
+
+ expect(result).toBe("utf8")
+ expect(consoleSpy).toHaveBeenCalledWith("Unsupported encoding detected: utf-16le, falling back to utf8")
+ })
+
+ it("should handle UTF-16 BE BOM with low confidence", async () => {
+ const bomBytes = Buffer.from([0xfe, 0xff])
+ const contentBytes = Buffer.from([0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f]) // "Hello" in UTF-16 BE
+ const bufferWithBOM = Buffer.concat([bomBytes, contentBytes])
+
+ mockIsBinaryFile.mockResolvedValue(false)
+ mockJschardet.detect.mockReturnValue({
+ encoding: "utf-16be",
+ confidence: 0.4, // Low confidence
+ })
+
+ const consoleSpy = vi.spyOn(console, "warn").mockImplementation(() => {})
+ const result = await detectEncoding(bufferWithBOM, ".txt")
+
+ expect(result).toBe("utf8")
+ expect(consoleSpy).toHaveBeenCalledWith(
+ "Low confidence encoding detection: utf-16be (confidence: 0.4), falling back to utf8",
+ )
+ })
+ })
+ })
+
+ describe("readFileWithEncodingDetection", () => {
+ it("should read file and detect encoding correctly", async () => {
+ const filePath = "/path/to/file.txt"
+ const buffer = Buffer.from("file content")
+ mockFs.readFile.mockResolvedValue(buffer)
+ mockIsBinaryFile.mockResolvedValue(false)
+ mockJschardet.detect.mockReturnValue({
+ encoding: "utf8",
+ confidence: 0.9,
+ })
+
+ const result = await readFileWithEncodingDetection(filePath)
+
+ expect(mockFs.readFile).toHaveBeenCalledWith(filePath)
+ expect(mockPath.extname).toHaveBeenCalledWith(filePath)
+ expect(mockIconv.decode).toHaveBeenCalledWith(buffer, "utf8")
+ expect(result).toBe("decoded content")
+ })
+
+ it("should handle binary file detection", async () => {
+ const filePath = "/path/to/file.exe"
+ const buffer = Buffer.from("binary content")
+ mockFs.readFile.mockResolvedValue(buffer)
+ mockIsBinaryFile.mockResolvedValue(true)
+ mockPath.extname.mockReturnValue(".exe")
+
+ await expect(readFileWithEncodingDetection(filePath)).rejects.toThrow(
+ "Cannot read text for file type: .exe",
+ )
+ })
+ })
+
+ describe("detectFileEncoding", () => {
+ it("should detect encoding for existing file", async () => {
+ const filePath = "/path/to/file.txt"
+ const buffer = Buffer.from("file content")
+ mockFs.readFile.mockResolvedValue(buffer)
+ mockIsBinaryFile.mockResolvedValue(false)
+ mockJschardet.detect.mockReturnValue({
+ encoding: "gbk",
+ confidence: 0.9,
+ })
+
+ const result = await detectFileEncoding(filePath)
+
+ expect(mockFs.readFile).toHaveBeenCalledWith(filePath)
+ expect(result).toBe("gbk")
+ })
+
+ it("should return utf8 for non-existent file", async () => {
+ const filePath = "/path/to/nonexistent.txt"
+ mockFs.readFile.mockRejectedValue(new Error("File not found"))
+
+ const result = await detectFileEncoding(filePath)
+
+ expect(result).toBe("utf8")
+ })
+
+ it("should return utf8 for unreadable file", async () => {
+ const filePath = "/path/to/unreadable.txt"
+ mockFs.readFile.mockRejectedValue(new Error("Permission denied"))
+
+ const result = await detectFileEncoding(filePath)
+
+ expect(result).toBe("utf8")
+ })
+ })
+
+ describe("writeFileWithEncodingPreservation", () => {
+ it("should write utf8 file directly when original is utf8", async () => {
+ const filePath = "/path/to/file.txt"
+ const content = "new content"
+ mockIsBinaryFile.mockResolvedValue(false)
+ mockJschardet.detect.mockReturnValue({
+ encoding: "utf8",
+ confidence: 0.9,
+ })
+
+ await writeFileWithEncodingPreservation(filePath, content)
+
+ expect(mockFs.writeFile).toHaveBeenCalledWith(filePath, content, "utf8")
+ })
+
+ it("should convert and write content for non-utf8 encoding", async () => {
+ const filePath = "/path/to/file.txt"
+ const content = "new content"
+ mockIsBinaryFile.mockResolvedValue(false)
+ mockJschardet.detect.mockReturnValue({
+ encoding: "gbk",
+ confidence: 0.9,
+ })
+
+ await writeFileWithEncodingPreservation(filePath, content)
+
+ expect(mockIconv.encode).toHaveBeenCalledWith(content, "gbk")
+ expect(mockFs.writeFile).toHaveBeenCalledWith(filePath, Buffer.from("encoded content"))
+ })
+
+ it("should handle new file (utf8) correctly", async () => {
+ const filePath = "/path/to/newfile.txt"
+ const content = "new content"
+ mockFs.readFile.mockRejectedValue(new Error("File not found"))
+
+ await writeFileWithEncodingPreservation(filePath, content)
+
+ expect(mockFs.writeFile).toHaveBeenCalledWith(filePath, content, "utf8")
+ })
+ })
+
+ describe("isBinaryFileWithEncodingDetection", () => {
+ it("should return false for text files that can be encoded", async () => {
+ const filePath = "/path/to/file.txt"
+ const buffer = Buffer.from("text content")
+ mockFs.readFile.mockResolvedValue(buffer)
+ mockPath.extname.mockReturnValue(".txt")
+ mockJschardet.detect.mockReturnValue({
+ encoding: "utf8",
+ confidence: 0.9,
+ })
+
+ const result = await isBinaryFileWithEncodingDetection(filePath)
+
+ expect(result).toBe(false)
+ expect(mockFs.readFile).toHaveBeenCalledWith(filePath)
+ })
+
+ it("should return true for files that fail encoding detection and are binary", async () => {
+ const filePath = "/path/to/file.exe"
+ const buffer = Buffer.from("binary content")
+ mockFs.readFile.mockResolvedValue(buffer)
+ mockPath.extname.mockReturnValue(".exe")
+ mockJschardet.detect.mockReturnValue({
+ encoding: "",
+ confidence: 0,
+ })
+ mockIsBinaryFile.mockResolvedValue(true)
+
+ const result = await isBinaryFileWithEncodingDetection(filePath)
+
+ expect(result).toBe(true)
+ })
+
+ it("should return false for file read errors", async () => {
+ const filePath = "/path/to/nonexistent.txt"
+ mockFs.readFile.mockRejectedValue(new Error("File not found"))
+
+ const result = await isBinaryFileWithEncodingDetection(filePath)
+
+ expect(result).toBe(false)
+ })
+
+ it("should return false when encoding detection succeeds even with low confidence", async () => {
+ const filePath = "/path/to/file.txt"
+ const buffer = Buffer.from("text content")
+ mockFs.readFile.mockResolvedValue(buffer)
+ mockPath.extname.mockReturnValue(".txt")
+ mockJschardet.detect.mockReturnValue({
+ encoding: "utf8",
+ confidence: 0.3,
+ })
+
+ const result = await isBinaryFileWithEncodingDetection(filePath)
+
+ expect(result).toBe(false)
+ })
+
+ it("should call isBinaryFile with buffer and buffer length when encoding detection fails", async () => {
+ const filePath = "/path/to/file.bin"
+ const buffer = Buffer.from("binary content for length test")
+ mockFs.readFile.mockResolvedValue(buffer)
+ mockPath.extname.mockReturnValue(".bin")
+ mockJschardet.detect.mockReturnValue({
+ encoding: "",
+ confidence: 0,
+ })
+ mockIsBinaryFile.mockResolvedValue(true)
+
+ await isBinaryFileWithEncodingDetection(filePath)
+
+ expect(mockIsBinaryFile).toHaveBeenCalledWith(buffer, buffer.length)
+ })
+ })
+})
diff --git a/src/utils/encoding.ts b/src/utils/encoding.ts
new file mode 100644
index 0000000000..07899b6c66
--- /dev/null
+++ b/src/utils/encoding.ts
@@ -0,0 +1,133 @@
+import * as jschardet from "jschardet"
+import * as iconv from "iconv-lite"
+import { isBinaryFile } from "isbinaryfile"
+import fs from "fs/promises"
+import path from "path"
+
+/**
+ * Detect the text encoding of a file buffer; throws for binary content when detection fails.
+ * @param fileBuffer The raw file contents to analyze
+ * @param fileExtension Optional file extension, used only in the binary-file error message
+ * @returns The detected encoding; "utf8" when detection fails, is low-confidence, or is unsupported by iconv-lite
+ */
+export async function detectEncoding(fileBuffer: Buffer, fileExtension?: string): Promise<string> {
+ // 1. Perform encoding detection first
+ const detected = jschardet.detect(fileBuffer)
+ let encoding: string
+ let originalEncoding: string | undefined
+
+ if (typeof detected === "string") {
+ encoding = detected
+ originalEncoding = detected
+ } else if (detected && detected.encoding) {
+ originalEncoding = detected.encoding
+ // Check confidence level, use default encoding if too low
+ // 0.7 is a conservative threshold that works well when UTF-8 is the dominant encoding
+ // and we prefer to fall back rather than risk mis-decoding
+ if (detected.confidence < 0.7) {
+ console.warn(
+ `Low confidence encoding detection: ${originalEncoding} (confidence: ${detected.confidence}), falling back to utf8`,
+ )
+ encoding = "utf8"
+ } else {
+ encoding = detected.encoding
+ }
+ } else {
+ // 2. Only check if it's a binary file when encoding detection fails
+ if (fileExtension) {
+ const isBinary = await isBinaryFile(fileBuffer, fileBuffer.length).catch(() => false)
+ if (isBinary) {
+ throw new Error(`Cannot read text for file type: ${fileExtension}`)
+ }
+ }
+ console.warn(`No encoding detected, falling back to utf8`)
+ encoding = "utf8"
+ }
+
+ // 3. Verify if the encoding is supported by iconv-lite
+ if (!iconv.encodingExists(encoding)) {
+ console.warn(
+ `Unsupported encoding detected: ${encoding}${originalEncoding && originalEncoding !== encoding ? ` (originally detected as: ${originalEncoding})` : ""}, falling back to utf8`,
+ )
+ encoding = "utf8"
+ }
+
+ return encoding
+}
+
+/**
+ * Read a text file, decoding it with the encoding detected from its bytes.
+ * @param filePath Path to the file
+ * @returns File content as a JavaScript string; throws if the file looks binary
+ */
+export async function readFileWithEncodingDetection(filePath: string): Promise<string> {
+ const buffer = await fs.readFile(filePath)
+ const fileExtension = path.extname(filePath).toLowerCase()
+
+ const encoding = await detectEncoding(buffer, fileExtension)
+ return iconv.decode(buffer, encoding)
+}
+
+/**
+ * Detect the encoding of an existing file on disk.
+ * @param filePath Path to the file
+ * @returns Detected encoding; "utf8" if the file does not exist or cannot be read
+ */
+export async function detectFileEncoding(filePath: string): Promise<string> {
+ try {
+ const buffer = await fs.readFile(filePath)
+ const fileExtension = path.extname(filePath).toLowerCase()
+ return await detectEncoding(buffer, fileExtension)
+ } catch (error) {
+ // File does not exist or cannot be read, default to UTF-8
+ return "utf8"
+ }
+}
+
+/**
+ * Smart binary file detection that tries encoding detection first
+ * @param filePath Path to the file
+ * @returns Promise<boolean> true if file is binary, false if it's text or if there's a read error
+ * @note Returns false on read errors to allow callers to handle file access issues explicitly
+ */
+export async function isBinaryFileWithEncodingDetection(filePath: string): Promise<boolean> {
+ try {
+ const fileBuffer = await fs.readFile(filePath)
+ const fileExtension = path.extname(filePath).toLowerCase()
+
+ // Try to detect encoding first
+ try {
+ await detectEncoding(fileBuffer, fileExtension)
+ // If detectEncoding succeeds, it's a text file
+ return false
+ } catch (error) {
+ // If detectEncoding fails, check if it's actually a binary file
+ return await isBinaryFile(fileBuffer, fileBuffer.length).catch(() => false)
+ }
+ } catch (error) {
+ // File read error, return false to let callers handle read errors explicitly
+ return false
+ }
+}
+
+/**
+ * Write file using the same encoding as the original file
+ * If the file is new, use UTF-8 encoding
+ * @param filePath Path to the file
+ * @param content Content to write (UTF-8 string)
+ * @returns Promise<void>
+ */
+export async function writeFileWithEncodingPreservation(filePath: string, content: string): Promise<void> {
+ // Detect original file encoding
+ const originalEncoding = await detectFileEncoding(filePath)
+
+ // If original file is UTF-8 or does not exist, write directly
+ if (originalEncoding === "utf8") {
+ await fs.writeFile(filePath, content, "utf8")
+ return
+ }
+
+ // Convert UTF-8 content to original file encoding
+ const encodedBuffer = iconv.encode(content, originalEncoding)
+ await fs.writeFile(filePath, encodedBuffer)
+}