diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index e803a1a72f..6151ee12e6 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -710,12 +710,18 @@ importers: i18next: specifier: ^25.0.0 version: 25.2.1(typescript@5.8.3) + iconv-lite: + specifier: ^0.6.3 + version: 0.6.3 ignore: specifier: ^7.0.3 version: 7.0.4 isbinaryfile: specifier: ^5.0.2 version: 5.0.4 + jschardet: + specifier: ^3.1.4 + version: 3.1.4 jwt-decode: specifier: ^4.0.0 version: 4.0.0 @@ -6944,6 +6950,10 @@ packages: jsbn@1.1.0: resolution: {integrity: sha512-4bYVV3aAMtDTTu4+xsDYa6sy9GyJ69/amsu9sYF2zqjiEoZA5xJi3BrfX3uY+/IekIu7MwdObdbDWpoZdBv3/A==} + jschardet@3.1.4: + resolution: {integrity: sha512-/kmVISmrwVwtyYU40iQUOp3SUPk2dhNCMsZBQX0R1/jZ8maaXJ/oZIzUOiyOqcgtLnETFKYChbJ5iDC/eWmFHg==} + engines: {node: '>=0.1.90'} + jsdom@26.1.0: resolution: {integrity: sha512-Cvc9WUhxSMEo4McES3P7oK3QaXldCfNWp7pl2NNeiIFlCoLr3kfq9kb1fxftiwk1FLV7CvpvDfonxtzUDeSOPg==} engines: {node: '>=18'} @@ -16988,6 +16998,8 @@ snapshots: jsbn@1.1.0: {} + jschardet@3.1.4: {} + jsdom@26.1.0: dependencies: cssstyle: 4.4.0 diff --git a/src/core/mentions/index.ts b/src/core/mentions/index.ts index f038b5b783..e93cd856bd 100644 --- a/src/core/mentions/index.ts +++ b/src/core/mentions/index.ts @@ -2,7 +2,7 @@ import fs from "fs/promises" import * as path from "path" import * as vscode from "vscode" -import { isBinaryFile } from "isbinaryfile" +import { isBinaryFileWithEncodingDetection } from "../../utils/encoding" import { mentionRegexGlobal, commandRegexGlobal, unescapeSpaces } from "../../shared/context-mentions" @@ -314,7 +314,7 @@ async function getFileOrFolderContent( fileContentPromises.push( (async () => { try { - const isBinary = await isBinaryFile(absoluteFilePath).catch(() => false) + const isBinary = await isBinaryFileWithEncodingDetection(absoluteFilePath) if (isBinary) { return undefined } diff --git a/src/core/tools/__tests__/readFileTool.spec.ts b/src/core/tools/__tests__/readFileTool.spec.ts index d693d6ba44..c3e53aaefd 
100644 --- a/src/core/tools/__tests__/readFileTool.spec.ts +++ b/src/core/tools/__tests__/readFileTool.spec.ts @@ -6,7 +6,7 @@ import { countFileLines } from "../../../integrations/misc/line-counter" import { readLines } from "../../../integrations/misc/read-lines" import { extractTextFromFile } from "../../../integrations/misc/extract-text" import { parseSourceCodeDefinitionsForFile } from "../../../services/tree-sitter" -import { isBinaryFile } from "isbinaryfile" +import { isBinaryFileWithEncodingDetection } from "../../../utils/encoding" import { ReadFileToolUse, ToolParamName, ToolResponse } from "../../../shared/tools" import { readFileTool } from "../readFileTool" import { formatResponse } from "../../prompts/responses" @@ -23,7 +23,9 @@ vi.mock("path", async () => { // Already mocked above with hoisted fsPromises -vi.mock("isbinaryfile") +vi.mock("../../../utils/encoding", () => ({ + isBinaryFileWithEncodingDetection: vi.fn(), +})) vi.mock("../../../integrations/misc/line-counter") vi.mock("../../../integrations/misc/read-lines") @@ -238,7 +240,7 @@ describe("read_file tool with maxReadFileLine setting", () => { const mockedExtractTextFromFile = vi.mocked(extractTextFromFile) const mockedParseSourceCodeDefinitionsForFile = vi.mocked(parseSourceCodeDefinitionsForFile) - const mockedIsBinaryFile = vi.mocked(isBinaryFile) + const mockedIsBinaryFileWithEncodingDetection = vi.mocked(isBinaryFileWithEncodingDetection) const mockedPathResolve = vi.mocked(path.resolve) let mockCline: any @@ -249,7 +251,7 @@ describe("read_file tool with maxReadFileLine setting", () => { // Clear specific mocks (not all mocks to preserve shared state) mockedCountFileLines.mockClear() mockedExtractTextFromFile.mockClear() - mockedIsBinaryFile.mockClear() + mockedIsBinaryFileWithEncodingDetection.mockClear() mockedPathResolve.mockClear() addLineNumbersMock.mockClear() extractTextFromFileMock.mockClear() @@ -264,7 +266,7 @@ describe("read_file tool with maxReadFileLine setting", () => 
{ setImageSupport(mockCline, false) mockedPathResolve.mockReturnValue(absoluteFilePath) - mockedIsBinaryFile.mockResolvedValue(false) + mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(false) mockInputContent = fileContent @@ -502,7 +504,7 @@ describe("read_file tool with maxReadFileLine setting", () => { describe("when file is binary", () => { it("should always use extractTextFromFile regardless of maxReadFileLine", async () => { // Setup - mockedIsBinaryFile.mockResolvedValue(true) + mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true) mockedCountFileLines.mockResolvedValue(3) mockedExtractTextFromFile.mockResolvedValue("") @@ -544,7 +546,7 @@ describe("read_file tool XML output structure", () => { const mockedCountFileLines = vi.mocked(countFileLines) const mockedExtractTextFromFile = vi.mocked(extractTextFromFile) - const mockedIsBinaryFile = vi.mocked(isBinaryFile) + const mockedIsBinaryFileWithEncodingDetection = vi.mocked(isBinaryFileWithEncodingDetection) const mockedPathResolve = vi.mocked(path.resolve) const mockedFsReadFile = vi.mocked(fsPromises.readFile) const imageBuffer = Buffer.from( @@ -560,7 +562,7 @@ describe("read_file tool XML output structure", () => { // Clear specific mocks (not all mocks to preserve shared state) mockedCountFileLines.mockClear() mockedExtractTextFromFile.mockClear() - mockedIsBinaryFile.mockClear() + mockedIsBinaryFileWithEncodingDetection.mockClear() mockedPathResolve.mockClear() addLineNumbersMock.mockClear() extractTextFromFileMock.mockClear() @@ -580,7 +582,7 @@ describe("read_file tool XML output structure", () => { setImageSupport(mockCline, true) mockedPathResolve.mockReturnValue(absoluteFilePath) - mockedIsBinaryFile.mockResolvedValue(false) + mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(false) // Set default implementation for extractTextFromFile mockedExtractTextFromFile.mockImplementation((filePath) => { @@ -617,7 +619,7 @@ describe("read_file tool XML output structure", () => { 
mockProvider.getState.mockResolvedValue({ maxReadFileLine, maxImageFileSize: 20, maxTotalImageSize: 20 }) mockedCountFileLines.mockResolvedValue(totalLines) - mockedIsBinaryFile.mockResolvedValue(isBinary) + mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(isBinary) mockCline.rooIgnoreController.validateAccess = vi.fn().mockReturnValue(validateAccess) let argsContent = `${testFilePath}` @@ -758,7 +760,7 @@ describe("read_file tool XML output structure", () => { it("should allow multiple images under the total memory limit", async () => { // Setup required mocks (don't clear all mocks - preserve API setup) - mockedIsBinaryFile.mockResolvedValue(true) + mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true) mockedCountFileLines.mockResolvedValue(0) fsPromises.readFile.mockResolvedValue( Buffer.from( @@ -831,7 +833,7 @@ describe("read_file tool XML output structure", () => { it("should skip images that would exceed the total memory limit", async () => { // Setup required mocks (don't clear all mocks) - mockedIsBinaryFile.mockResolvedValue(true) + mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true) mockedCountFileLines.mockResolvedValue(0) fsPromises.readFile.mockResolvedValue( Buffer.from( @@ -917,7 +919,7 @@ describe("read_file tool XML output structure", () => { // Setup mocks (don't clear all mocks) // Setup required mocks - mockedIsBinaryFile.mockResolvedValue(true) + mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true) mockedCountFileLines.mockResolvedValue(0) fsPromises.readFile.mockResolvedValue( Buffer.from( @@ -990,7 +992,7 @@ describe("read_file tool XML output structure", () => { // Setup mocks (don't clear all mocks) // Setup required mocks - mockedIsBinaryFile.mockResolvedValue(true) + mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true) mockedCountFileLines.mockResolvedValue(0) fsPromises.readFile.mockResolvedValue( Buffer.from( @@ -1084,7 +1086,7 @@ describe("read_file tool XML output structure", () => 
{ maxTotalImageSize: 20, // 20MB total }) - mockedIsBinaryFile.mockResolvedValue(true) + mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true) mockedCountFileLines.mockResolvedValue(0) mockedFsReadFile.mockResolvedValue(imageBuffer) @@ -1115,7 +1117,7 @@ describe("read_file tool XML output structure", () => { // Setup mocks (don't clear all mocks) // Setup required mocks for first batch - mockedIsBinaryFile.mockResolvedValue(true) + mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true) mockedCountFileLines.mockResolvedValue(0) fsPromises.readFile.mockResolvedValue( Buffer.from( @@ -1161,7 +1163,7 @@ describe("read_file tool XML output structure", () => { await executeReadMultipleImagesTool(firstBatch.map((img) => img.path)) // Setup second batch (don't clear all mocks) - mockedIsBinaryFile.mockResolvedValue(true) + mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true) mockedCountFileLines.mockResolvedValue(0) fsPromises.readFile.mockResolvedValue( Buffer.from( @@ -1203,7 +1205,7 @@ describe("read_file tool XML output structure", () => { // Clear and reset file system mocks for second batch fsPromises.stat.mockClear() fsPromises.readFile.mockClear() - mockedIsBinaryFile.mockClear() + mockedIsBinaryFileWithEncodingDetection.mockClear() mockedCountFileLines.mockClear() // Reset mocks for second batch @@ -1214,7 +1216,7 @@ describe("read_file tool XML output structure", () => { "base64", ), ) - mockedIsBinaryFile.mockResolvedValue(true) + mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true) mockedCountFileLines.mockResolvedValue(0) mockedPathResolve.mockImplementation((cwd, relPath) => `/${relPath}`) @@ -1241,7 +1243,7 @@ describe("read_file tool XML output structure", () => { ] // Setup mocks - mockedIsBinaryFile.mockResolvedValue(true) + mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true) mockedCountFileLines.mockResolvedValue(0) fsPromises.readFile.mockResolvedValue(imageBuffer) @@ -1289,7 +1291,7 @@ 
describe("read_file tool XML output structure", () => { // starts with fresh memory tracking // Setup mocks - mockedIsBinaryFile.mockResolvedValue(true) + mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true) mockedCountFileLines.mockResolvedValue(0) fsPromises.readFile.mockResolvedValue(imageBuffer) @@ -1394,7 +1396,7 @@ describe("read_file tool with image support", () => { const imageBuffer = Buffer.from(base64ImageData, "base64") const mockedCountFileLines = vi.mocked(countFileLines) - const mockedIsBinaryFile = vi.mocked(isBinaryFile) + const mockedIsBinaryFileWithEncodingDetection = vi.mocked(isBinaryFileWithEncodingDetection) const mockedPathResolve = vi.mocked(path.resolve) const mockedFsReadFile = vi.mocked(fsPromises.readFile) const mockedExtractTextFromFile = vi.mocked(extractTextFromFile) @@ -1406,7 +1408,7 @@ describe("read_file tool with image support", () => { beforeEach(() => { // Clear specific mocks (not all mocks to preserve shared state) mockedPathResolve.mockClear() - mockedIsBinaryFile.mockClear() + mockedIsBinaryFileWithEncodingDetection.mockClear() mockedCountFileLines.mockClear() mockedFsReadFile.mockClear() mockedExtractTextFromFile.mockClear() @@ -1425,7 +1427,7 @@ describe("read_file tool with image support", () => { setImageSupport(localMockCline, true) mockedPathResolve.mockReturnValue(absoluteImagePath) - mockedIsBinaryFile.mockResolvedValue(true) + mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true) mockedCountFileLines.mockResolvedValue(0) mockedFsReadFile.mockResolvedValue(imageBuffer) diff --git a/src/core/tools/applyDiffTool.ts b/src/core/tools/applyDiffTool.ts index 1077b7bf39..acc2b01b04 100644 --- a/src/core/tools/applyDiffTool.ts +++ b/src/core/tools/applyDiffTool.ts @@ -2,6 +2,7 @@ import path from "path" import fs from "fs/promises" import { TelemetryService } from "@roo-code/telemetry" +import { readFileWithEncodingDetection } from "../../utils/encoding" import { DEFAULT_WRITE_DELAY_MS } from 
"@roo-code/types" import { ClineSayTool } from "../../shared/ExtensionMessage" @@ -89,7 +90,7 @@ export async function applyDiffToolLegacy( return } - const originalContent: string = await fs.readFile(absolutePath, "utf-8") + const originalContent: string = await readFileWithEncodingDetection(absolutePath) // Apply the diff to the original content const diffResult = (await cline.diffStrategy?.applyDiff( diff --git a/src/core/tools/insertContentTool.ts b/src/core/tools/insertContentTool.ts index 38ca309a3b..767383eb2c 100644 --- a/src/core/tools/insertContentTool.ts +++ b/src/core/tools/insertContentTool.ts @@ -3,6 +3,7 @@ import fs from "fs/promises" import path from "path" import { getReadablePath } from "../../utils/path" +import { readFileWithEncodingDetection } from "../../utils/encoding" import { Task } from "../task/Task" import { ToolUse, AskApproval, HandleError, PushToolResult, RemoveClosingTag } from "../../shared/tools" import { formatResponse } from "../prompts/responses" @@ -93,7 +94,7 @@ export async function insertContentTool( return } } else { - fileContent = await fs.readFile(absolutePath, "utf8") + fileContent = await readFileWithEncodingDetection(absolutePath) } cline.consecutiveMistakeCount = 0 diff --git a/src/core/tools/multiApplyDiffTool.ts b/src/core/tools/multiApplyDiffTool.ts index 08bce08ede..3cd8297ff6 100644 --- a/src/core/tools/multiApplyDiffTool.ts +++ b/src/core/tools/multiApplyDiffTool.ts @@ -2,6 +2,7 @@ import path from "path" import fs from "fs/promises" import { TelemetryService } from "@roo-code/telemetry" +import { readFileWithEncodingDetection } from "../../utils/encoding" import { DEFAULT_WRITE_DELAY_MS } from "@roo-code/types" import { ClineSayTool } from "../../shared/ExtensionMessage" @@ -300,7 +301,7 @@ Original error: ${errorMessage}` let unified = "" try { - const original = await fs.readFile(opResult.absolutePath!, "utf-8") + const original = await readFileWithEncodingDetection(opResult.absolutePath!) 
const processed = !cline.api.getModel().id.includes("claude") ? (opResult.diffItems || []).map((item) => ({ ...item, @@ -457,7 +458,7 @@ Original error: ${errorMessage}` const fileExists = opResult.fileExists! try { - let originalContent: string | null = await fs.readFile(absolutePath, "utf-8") + let originalContent: string | null = await readFileWithEncodingDetection(absolutePath) let beforeContent: string | null = originalContent let successCount = 0 let formattedError = "" @@ -611,7 +612,7 @@ ${errorDetails ? `\nTechnical details:\n${errorDetails}\n` : ""} cline.diffViewProvider.scrollToFirstDiff() } else { // For direct save, we still need to set originalContent - cline.diffViewProvider.originalContent = await fs.readFile(absolutePath, "utf-8") + cline.diffViewProvider.originalContent = await readFileWithEncodingDetection(absolutePath) } // Ask for approval (same for both flows) @@ -646,7 +647,7 @@ ${errorDetails ? `\nTechnical details:\n${errorDetails}\n` : ""} if (isPreventFocusDisruptionEnabled) { // Direct file write without diff view or opening the file cline.diffViewProvider.editType = "modify" - cline.diffViewProvider.originalContent = await fs.readFile(absolutePath, "utf-8") + cline.diffViewProvider.originalContent = await readFileWithEncodingDetection(absolutePath) await cline.diffViewProvider.saveDirectly( relPath, originalContent!, diff --git a/src/core/tools/readFileTool.ts b/src/core/tools/readFileTool.ts index 53f0643dbb..75284c7942 100644 --- a/src/core/tools/readFileTool.ts +++ b/src/core/tools/readFileTool.ts @@ -1,5 +1,4 @@ import path from "path" -import { isBinaryFile } from "isbinaryfile" import { Task } from "../task/Task" import { ClineSayTool } from "../../shared/ExtensionMessage" @@ -14,6 +13,7 @@ import { readLines } from "../../integrations/misc/read-lines" import { extractTextFromFile, addLineNumbers, getSupportedBinaryFormats } from "../../integrations/misc/extract-text" import { parseSourceCodeDefinitionsForFile } from 
"../../services/tree-sitter" import { parseXml } from "../../utils/xml" +import { isBinaryFileWithEncodingDetection } from "../../utils/encoding" import { DEFAULT_MAX_IMAGE_FILE_SIZE_MB, DEFAULT_MAX_TOTAL_IMAGE_SIZE_MB, @@ -456,7 +456,10 @@ export async function readFileTool( // Process approved files try { - const [totalLines, isBinary] = await Promise.all([countFileLines(fullPath), isBinaryFile(fullPath)]) + const [totalLines, isBinary] = await Promise.all([ + countFileLines(fullPath), + isBinaryFileWithEncodingDetection(fullPath), + ]) // Handle binary files (but allow specific file types that extractTextFromFile can handle) if (isBinary) { diff --git a/src/core/tools/writeToFileTool.ts b/src/core/tools/writeToFileTool.ts index b8e6da0caa..88130efcd0 100644 --- a/src/core/tools/writeToFileTool.ts +++ b/src/core/tools/writeToFileTool.ts @@ -4,6 +4,7 @@ import * as vscode from "vscode" import fs from "fs/promises" import { Task } from "../task/Task" +import { readFileWithEncodingDetection } from "../../utils/encoding" import { ClineSayTool } from "../../shared/ExtensionMessage" import { formatResponse } from "../prompts/responses" import { ToolUse, AskApproval, HandleError, PushToolResult, RemoveClosingTag } from "../../shared/tools" @@ -178,7 +179,7 @@ export async function writeToFileTool( cline.diffViewProvider.editType = fileExists ? 
"modify" : "create" if (fileExists) { const absolutePath = path.resolve(cline.cwd, relPath) - cline.diffViewProvider.originalContent = await fs.readFile(absolutePath, "utf-8") + cline.diffViewProvider.originalContent = await readFileWithEncodingDetection(absolutePath) } else { cline.diffViewProvider.originalContent = "" } diff --git a/src/integrations/editor/DiffViewProvider.ts b/src/integrations/editor/DiffViewProvider.ts index d42eba082c..5665cce80f 100644 --- a/src/integrations/editor/DiffViewProvider.ts +++ b/src/integrations/editor/DiffViewProvider.ts @@ -7,6 +7,7 @@ import { XMLBuilder } from "fast-xml-parser" import delay from "delay" import { createDirectoriesForFile } from "../../utils/fs" +import { readFileWithEncodingDetection, writeFileWithEncodingPreservation } from "../../utils/encoding" import { arePathsEqual, getReadablePath } from "../../utils/path" import { formatResponse } from "../../core/prompts/responses" import { diagnosticsToProblemsString, getNewDiagnostics } from "../diagnostics" @@ -68,7 +69,7 @@ export class DiffViewProvider { this.preDiagnostics = vscode.languages.getDiagnostics() if (fileExists) { - this.originalContent = await fs.readFile(absolutePath, "utf-8") + this.originalContent = await readFileWithEncodingDetection(absolutePath) } else { this.originalContent = "" } @@ -662,9 +663,9 @@ export class DiffViewProvider { // Get diagnostics before editing the file this.preDiagnostics = vscode.languages.getDiagnostics() - // Write the content directly to the file + // Write the content directly to the file with encoding preservation await createDirectoriesForFile(absolutePath) - await fs.writeFile(absolutePath, content, "utf-8") + await writeFileWithEncodingPreservation(absolutePath, content) // Open the document to ensure diagnostics are loaded // When openFile is false (PREVENT_FOCUS_DISRUPTION enabled), we only open in memory diff --git a/src/integrations/editor/__tests__/DiffViewProvider.spec.ts 
b/src/integrations/editor/__tests__/DiffViewProvider.spec.ts index e99f7bf9c8..47f02a8d67 100644 --- a/src/integrations/editor/__tests__/DiffViewProvider.spec.ts +++ b/src/integrations/editor/__tests__/DiffViewProvider.spec.ts @@ -9,22 +9,40 @@ vi.mock("delay", () => ({ })) // Mock fs/promises -vi.mock("fs/promises", () => ({ - readFile: vi.fn().mockResolvedValue("file content"), - writeFile: vi.fn().mockResolvedValue(undefined), -})) +vi.mock("fs/promises", async () => { + const actual = await vi.importActual("fs/promises"); + return { + ...actual, + readFile: vi.fn().mockResolvedValue("file content"), + writeFile: vi.fn().mockResolvedValue(undefined), + default: { + readFile: vi.fn().mockResolvedValue("file content"), + writeFile: vi.fn().mockResolvedValue(undefined), + } + } +}) // Mock utils vi.mock("../../../utils/fs", () => ({ createDirectoriesForFile: vi.fn().mockResolvedValue([]), })) -// Mock path -vi.mock("path", () => ({ - resolve: vi.fn((cwd, relPath) => `${cwd}/${relPath}`), - basename: vi.fn((path) => path.split("/").pop()), +// Mock encoding utilities +vi.mock("../../../utils/encoding", () => ({ + readFileWithEncodingDetection: vi.fn().mockResolvedValue("file content"), + writeFileWithEncodingPreservation: vi.fn().mockResolvedValue(undefined), })) +// Mock path +vi.mock("path", async () => { + const actual = await vi.importActual("path"); + return { + ...actual, + resolve: vi.fn((cwd, relPath) => `${cwd}/${relPath}`), + basename: vi.fn((path) => path.split("/").pop()), + }; +}) +// Mock vscode // Mock vscode vi.mock("vscode", () => ({ workspace: { @@ -90,7 +108,6 @@ vi.mock("vscode", () => ({ parse: vi.fn((uri) => ({ with: vi.fn(() => ({})) })), }, })) - // Mock DecorationController vi.mock("../DecorationController", () => ({ DecorationController: vi.fn().mockImplementation(() => ({ @@ -371,8 +388,8 @@ describe("DiffViewProvider", () => { const result = await diffViewProvider.saveDirectly("test.ts", "new content", true, true, 2000) // Verify file was 
written - const fs = await import("fs/promises") - expect(fs.writeFile).toHaveBeenCalledWith(`${mockCwd}/test.ts`, "new content", "utf-8") + const { writeFileWithEncodingPreservation } = await import("../../../utils/encoding") + expect(writeFileWithEncodingPreservation).toHaveBeenCalledWith(`${mockCwd}/test.ts`, "new content") // Verify file was opened without focus expect(vscode.window.showTextDocument).toHaveBeenCalledWith( @@ -394,8 +411,8 @@ describe("DiffViewProvider", () => { await diffViewProvider.saveDirectly("test.ts", "new content", false, true, 1000) // Verify file was written - const fs = await import("fs/promises") - expect(fs.writeFile).toHaveBeenCalledWith(`${mockCwd}/test.ts`, "new content", "utf-8") + const { writeFileWithEncodingPreservation } = await import("../../../utils/encoding") + expect(writeFileWithEncodingPreservation).toHaveBeenCalledWith(`${mockCwd}/test.ts`, "new content") // Verify file was NOT opened expect(vscode.window.showTextDocument).not.toHaveBeenCalled() @@ -409,8 +426,8 @@ describe("DiffViewProvider", () => { await diffViewProvider.saveDirectly("test.ts", "new content", true, false, 1000) // Verify file was written - const fs = await import("fs/promises") - expect(fs.writeFile).toHaveBeenCalledWith(`${mockCwd}/test.ts`, "new content", "utf-8") + const { writeFileWithEncodingPreservation } = await import("../../../utils/encoding") + expect(writeFileWithEncodingPreservation).toHaveBeenCalledWith(`${mockCwd}/test.ts`, "new content") // Verify delay was NOT called expect(mockDelay).not.toHaveBeenCalled() diff --git a/src/integrations/misc/__tests__/extract-text-large-files.spec.ts b/src/integrations/misc/__tests__/extract-text-large-files.spec.ts index c9e2f181f5..de9d1da789 100644 --- a/src/integrations/misc/__tests__/extract-text-large-files.spec.ts +++ b/src/integrations/misc/__tests__/extract-text-large-files.spec.ts @@ -5,26 +5,30 @@ import * as fs from "fs/promises" import { extractTextFromFile } from "../extract-text" 
import { countFileLines } from "../line-counter" import { readLines } from "../read-lines" -import { isBinaryFile } from "isbinaryfile" +import { isBinaryFileWithEncodingDetection, readFileWithEncodingDetection } from "../../../utils/encoding" // Mock all dependencies vi.mock("fs/promises") vi.mock("../line-counter") vi.mock("../read-lines") -vi.mock("isbinaryfile") +vi.mock("../../../utils/encoding", () => ({ + isBinaryFileWithEncodingDetection: vi.fn(), + readFileWithEncodingDetection: vi.fn(), +})) describe("extractTextFromFile - Large File Handling", () => { // Type the mocks const mockedFs = vi.mocked(fs) const mockedCountFileLines = vi.mocked(countFileLines) const mockedReadLines = vi.mocked(readLines) - const mockedIsBinaryFile = vi.mocked(isBinaryFile) + const mockedIsBinaryFileWithEncodingDetection = vi.mocked(isBinaryFileWithEncodingDetection) + const mockedReadFileWithEncodingDetection = vi.mocked(readFileWithEncodingDetection) beforeEach(() => { vi.clearAllMocks() // Set default mock behavior mockedFs.access.mockResolvedValue(undefined) - mockedIsBinaryFile.mockResolvedValue(false) + mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(false) }) it("should truncate files that exceed maxReadFileLine limit", async () => { @@ -61,7 +65,7 @@ describe("extractTextFromFile - Large File Handling", () => { .join("\n") mockedCountFileLines.mockResolvedValue(50) - mockedFs.readFile.mockResolvedValue(smallFileContent as any) + mockedReadFileWithEncodingDetection.mockResolvedValue(smallFileContent) const result = await extractTextFromFile("/test/small-file.ts", 100) @@ -80,7 +84,7 @@ describe("extractTextFromFile - Large File Handling", () => { .join("\n") mockedCountFileLines.mockResolvedValue(100) - mockedFs.readFile.mockResolvedValue(exactFileContent as any) + mockedReadFileWithEncodingDetection.mockResolvedValue(exactFileContent) const result = await extractTextFromFile("/test/exact-file.ts", 100) @@ -98,7 +102,7 @@ describe("extractTextFromFile - Large 
File Handling", () => { .map((_, i) => `Line ${i + 1}`) .join("\n") - mockedFs.readFile.mockResolvedValue(largeFileContent as any) + mockedReadFileWithEncodingDetection.mockResolvedValue(largeFileContent) const result = await extractTextFromFile("/test/large-file.ts", undefined) @@ -111,7 +115,7 @@ describe("extractTextFromFile - Large File Handling", () => { }) it("should handle empty files", async () => { - mockedFs.readFile.mockResolvedValue("" as any) + mockedReadFileWithEncodingDetection.mockResolvedValue("") const result = await extractTextFromFile("/test/empty-file.ts", 100) @@ -155,7 +159,7 @@ describe("extractTextFromFile - Large File Handling", () => { it("should handle maxReadFileLine of 0 by throwing an error", async () => { const fileContent = "Line 1\nLine 2\nLine 3" - mockedFs.readFile.mockResolvedValue(fileContent as any) + mockedReadFileWithEncodingDetection.mockResolvedValue(fileContent) // maxReadFileLine of 0 should throw an error await expect(extractTextFromFile("/test/file.ts", 0)).rejects.toThrow( @@ -166,7 +170,7 @@ describe("extractTextFromFile - Large File Handling", () => { it("should handle negative maxReadFileLine by treating as undefined", async () => { const fileContent = "Line 1\nLine 2\nLine 3" - mockedFs.readFile.mockResolvedValue(fileContent as any) + mockedReadFileWithEncodingDetection.mockResolvedValue(fileContent) const result = await extractTextFromFile("/test/file.ts", -1) @@ -204,7 +208,7 @@ describe("extractTextFromFile - Large File Handling", () => { }) it("should handle binary files by throwing an error", async () => { - mockedIsBinaryFile.mockResolvedValue(true) + mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true) await expect(extractTextFromFile("/test/binary.bin", 100)).rejects.toThrow( "Cannot read text for file type: .bin", diff --git a/src/integrations/misc/__tests__/read-lines.spec.ts b/src/integrations/misc/__tests__/read-lines.spec.ts index 14456d24f1..d1bc93cb83 100644 --- 
a/src/integrations/misc/__tests__/read-lines.spec.ts +++ b/src/integrations/misc/__tests__/read-lines.spec.ts @@ -1,3 +1,4 @@ +import { describe, it, expect, beforeAll, afterAll, vi } from "vitest" import { promises as fs } from "fs" import path from "path" import { readLines } from "../read-lines" @@ -5,6 +6,17 @@ import { readLines } from "../read-lines" describe("nthline", () => { const testFile = path.join(__dirname, "test.txt") + // Helper function to create a temporary file, run a test, and clean up + async function withTempFile(filename: string, content: string, testFn: (filepath: string) => Promise) { + const filepath = path.join(__dirname, filename) + await fs.writeFile(filepath, content) + try { + await testFn(filepath) + } finally { + await fs.unlink(filepath) + } + } + beforeAll(async () => { // Create a test file with numbered lines const content = Array.from({ length: 10 }, (_, i) => `Line ${i + 1}`).join("\n") @@ -71,17 +83,6 @@ describe("nthline", () => { await expect(readLines(testFile, 20, 15)).rejects.toThrow("does not exist") }) - // Helper function to create a temporary file, run a test, and clean up - async function withTempFile(filename: string, content: string, testFn: (filepath: string) => Promise) { - const filepath = path.join(__dirname, filename) - await fs.writeFile(filepath, content) - try { - await testFn(filepath) - } finally { - await fs.unlink(filepath) - } - } - it("should handle empty files", async () => { await withTempFile("empty.txt", "", async (filepath) => { await expect(readLines(filepath, 0, 0)).rejects.toThrow("does not exist") @@ -129,4 +130,244 @@ describe("nthline", () => { }) }) }) + + describe("bytesRead sampling for encoding detection", () => { + it("should sample exactly 64KB for encoding detection on large files", async () => { + // Create a large file with line breaks to test proper sampling + const lineContent = "This is a test line for large file sampling\n" + const linesNeeded = Math.ceil(100000 / 
lineContent.length) // Ensure > 64KB + const largeContent = lineContent.repeat(linesNeeded) + + await withTempFile("large-file.txt", largeContent, async (filepath) => { + // For large files, the function should read and process correctly + // We'll verify the function works with large files that exceed 64KB + const lines = await readLines(filepath, 1) // Read first 2 lines (0-1) + + // Verify that the content is read correctly + expect(lines).toContain("This is a test line for large file sampling") + // Should only contain 2 lines + const lineArray = lines.split("\n").filter((line) => line.length > 0) + expect(lineArray).toHaveLength(2) + }) + }) + + it("should handle files smaller than 64KB sampling correctly", async () => { + const smallContent = "Line 1\nLine 2\nLine 3\n" + + await withTempFile("small-file.txt", smallContent, async (filepath) => { + // For small files, the function should still attempt to read 64KB for encoding detection + // We'll just verify the function works correctly with small files + const lines = await readLines(filepath, 0) // Read first line (0) + + // Verify that the content is read correctly + expect(lines).toContain("Line 1") + expect(lines).not.toContain("Line 2") // Should only read first line + }) + }) + + it("should handle UTF-8 BOM in the 64KB sample correctly", async () => { + // Create content with UTF-8 BOM at the beginning + const bomBytes = Buffer.from([0xef, 0xbb, 0xbf]) + const textContent = "Line 1 with UTF-8 content\nLine 2\nLine 3\n" + const contentWithBOM = Buffer.concat([bomBytes, Buffer.from(textContent, "utf8")]) + + await withTempFile("bom-file.txt", contentWithBOM.toString(), async (filepath) => { + // Write the actual binary content with BOM + await fs.writeFile(filepath, contentWithBOM) + + const lines = await readLines(filepath, 0) // Read first line (0) + + // Should successfully read the content, BOM should be handled by encoding detection + expect(lines).toContain("Line 1 with UTF-8 content") + }) + }) + + 
it("should handle UTF-16 LE BOM in the 64KB sample correctly", async () => { + // Create content with UTF-16 LE BOM + const bomBytes = Buffer.from([0xff, 0xfe]) + const textContent = "Line 1\nLine 2\n" + const utf16Content = Buffer.from(textContent, "utf16le") + const contentWithBOM = Buffer.concat([bomBytes, utf16Content]) + + await withTempFile("utf16le-bom-file.txt", "", async (filepath) => { + // Write the actual binary content with BOM + await fs.writeFile(filepath, contentWithBOM) + + const lines = await readLines(filepath, 1) + + // Should successfully read the content, BOM should be handled by encoding detection + expect(lines).toContain("Line 1") + }) + }) + + it("should handle partial multi-byte characters at 64KB boundary", async () => { + // Create content where a multi-byte UTF-8 character might be split at 64KB boundary + const lineContent = "Line with content: 你好世界\n" + const linesNeeded = Math.ceil(65536 / lineContent.length) + 5 // Ensure > 64KB + const fullContent = lineContent.repeat(linesNeeded) + "Final line after boundary\n" + + await withTempFile("multibyte-boundary.txt", fullContent, async (filepath) => { + // Read the last few lines to check the content after the boundary + const lines = await readLines(filepath, linesNeeded + 1, linesNeeded - 1) // Read last 3 lines + expect(lines).toContain("Final line after boundary") + // The multi-byte characters should be preserved + expect(lines).toContain("你好世界") + }) + }) + + it("should handle encoding detection failure gracefully with 64KB sampling", async () => { + // Create binary-like content that might confuse encoding detection + const binaryLikeContent = Buffer.alloc(70000) // Larger than 64KB + // Fill with values that might be detected as binary + for (let i = 0; i < binaryLikeContent.length; i++) { + binaryLikeContent[i] = i % 256 + } + // Add some text at the end + const textPortion = Buffer.from("\nSome text at the end\n", "utf8") + const mixedContent = Buffer.concat([binaryLikeContent, 
textPortion]) + + await withTempFile("mixed-content.txt", "", async (filepath) => { + await fs.writeFile(filepath, mixedContent) + + // Should either succeed with fallback encoding or handle gracefully + try { + const lines = await readLines(filepath, 0, 0) + // If it succeeds, it should contain the text portion + expect(typeof lines).toBe("string") + } catch (error) { + // If it fails, it should be a meaningful error about binary content + expect(error).toBeInstanceOf(Error) + } + }) + }) + }) + + describe("BOM preservation integration tests", () => { + it("should preserve UTF-8 BOM when reading lines from file", async () => { + // Create content with UTF-8 BOM + const bomBytes = Buffer.from([0xef, 0xbb, 0xbf]) + const textContent = "First line with UTF-8 content\nSecond line\nThird line\n" + const contentWithBOM = Buffer.concat([bomBytes, Buffer.from(textContent, "utf8")]) + + await withTempFile("utf8-bom-integration.txt", "", async (filepath) => { + // Write the actual binary content with BOM + await fs.writeFile(filepath, contentWithBOM) + + // Read first line + const firstLine = await readLines(filepath, 1) + expect(firstLine).toContain("First line with UTF-8 content") + + // Read multiple lines + const multipleLines = await readLines(filepath, 2) + expect(multipleLines).toContain("First line with UTF-8 content") + expect(multipleLines).toContain("Second line") + + // Read from specific line + const fromSecondLine = await readLines(filepath, 1, 1) + expect(fromSecondLine).toContain("Second line") + }) + }) + + it("should preserve UTF-16 LE BOM when reading lines from file", async () => { + // Create content with UTF-16 LE BOM + const bomBytes = Buffer.from([0xff, 0xfe]) + const textContent = "UTF-16 LE first line\nUTF-16 LE second line\n" + const utf16Content = Buffer.from(textContent, "utf16le") + const contentWithBOM = Buffer.concat([bomBytes, utf16Content]) + + await withTempFile("utf16le-bom-integration.txt", "", async (filepath) => { + // Write the actual 
binary content with BOM + await fs.writeFile(filepath, contentWithBOM) + + // Read first line + const firstLine = await readLines(filepath, 0) // Read first line (0) + expect(firstLine).toContain("UTF-16 LE first line") + + // Read multiple lines + const multipleLines = await readLines(filepath, 1) // Read first 2 lines (0-1) + expect(multipleLines).toContain("UTF-16 LE first line") + expect(multipleLines).toContain("UTF-16 LE second line") + }) + }) + + it("should preserve UTF-16 BE BOM when reading lines from file", async () => { + // Create content with UTF-16 BE BOM + const bomBytes = Buffer.from([0xfe, 0xff]) + const textContent = "UTF-16 BE first line\nUTF-16 BE second line\n" + // Manually create UTF-16 BE content + const utf16beBytes = [] + for (let i = 0; i < textContent.length; i++) { + const charCode = textContent.charCodeAt(i) + utf16beBytes.push((charCode >> 8) & 0xff) // High byte first + utf16beBytes.push(charCode & 0xff) // Low byte second + } + const utf16Content = Buffer.from(utf16beBytes) + const contentWithBOM = Buffer.concat([bomBytes, utf16Content]) + + await withTempFile("utf16be-bom-integration.txt", "", async (filepath) => { + // Write the actual binary content with BOM + await fs.writeFile(filepath, contentWithBOM) + + // Read first line + const firstLine = await readLines(filepath, 0) // Read first line (0) + expect(firstLine).toContain("UTF-16 BE first line") + + // Read multiple lines + const multipleLines = await readLines(filepath, 1) // Read first 2 lines (0-1) + expect(multipleLines).toContain("UTF-16 BE first line") + expect(multipleLines).toContain("UTF-16 BE second line") + }) + }) + + it("should handle BOM preservation with large files that exceed 64KB sampling", async () => { + // Create a large file with UTF-8 BOM that exceeds 64KB + const bomBytes = Buffer.from([0xef, 0xbb, 0xbf]) + const lineContent = "This is a test line with UTF-8 content and BOM: 你好世界 🌍\n" + const linesNeeded = Math.ceil(65536 / lineContent.length) + 100 
// Ensure > 64KB + const largeTextContent = lineContent.repeat(linesNeeded) + const contentWithBOM = Buffer.concat([bomBytes, Buffer.from(largeTextContent, "utf8")]) + + await withTempFile("large-utf8-bom-integration.txt", "", async (filepath) => { + // Write the actual binary content with BOM + await fs.writeFile(filepath, contentWithBOM) + + // Read first few lines + const firstLines = await readLines(filepath, 3) + expect(firstLines).toContain("This is a test line with UTF-8 content and BOM: 你好世界 🌍") + + // Read from middle of file (read 3 lines starting from line 49, 0-based) + const middleLines = await readLines(filepath, 51, 49) // Read lines 49-51 (0-based) + expect(middleLines).toContain("This is a test line with UTF-8 content and BOM: 你好世界 🌍") + + // Verify the content is properly decoded despite BOM + const lines = firstLines.split("\n") + expect(lines[0]).not.toMatch(/^\uFEFF/) // BOM should not appear in decoded text + }) + }) + + it("should handle mixed BOM and non-BOM files correctly", async () => { + // Test reading from a UTF-8 BOM file and then a regular UTF-8 file + const bomBytes = Buffer.from([0xef, 0xbb, 0xbf]) + const bomContent = Buffer.concat([bomBytes, Buffer.from("BOM file content\nSecond line\n", "utf8")]) + const regularContent = "Regular UTF-8 content\nAnother line\n" + + await withTempFile("bom-file-mixed.txt", "", async (bomFilepath) => { + await fs.writeFile(bomFilepath, bomContent) + + await withTempFile("regular-file-mixed.txt", regularContent, async (regularFilepath) => { + // Read from BOM file + const bomLines = await readLines(bomFilepath, 0) // Read first line (0) + expect(bomLines).toContain("BOM file content") + + // Read from regular file + const regularLines = await readLines(regularFilepath, 0) // Read first line (0) + expect(regularLines).toContain("Regular UTF-8 content") + + // Both should work correctly without interference + expect(bomLines).not.toContain("Regular UTF-8 content") + expect(regularLines).not.toContain("BOM file 
content") + }) + }) + }) + }) }) diff --git a/src/integrations/misc/extract-text.ts b/src/integrations/misc/extract-text.ts index bafa7a5bab..7b48bca24c 100644 --- a/src/integrations/misc/extract-text.ts +++ b/src/integrations/misc/extract-text.ts @@ -3,10 +3,10 @@ import * as path from "path" import pdf from "pdf-parse/lib/pdf-parse" import mammoth from "mammoth" import fs from "fs/promises" -import { isBinaryFile } from "isbinaryfile" import { extractTextFromXLSX } from "./extract-text-from-xlsx" import { countFileLines } from "./line-counter" import { readLines } from "./read-lines" +import { readFileWithEncodingDetection, isBinaryFileWithEncodingDetection } from "../../utils/encoding" async function extractTextFromPDF(filePath: string): Promise { const dataBuffer = await fs.readFile(filePath) @@ -20,7 +20,7 @@ async function extractTextFromDOCX(filePath: string): Promise { } async function extractTextFromIPYNB(filePath: string): Promise { - const data = await fs.readFile(filePath, "utf8") + const data = await readFileWithEncodingDetection(filePath) const notebook = JSON.parse(data) let extractedText = "" @@ -84,9 +84,8 @@ export async function extractTextFromFile(filePath: string, maxReadFileLine?: nu if (extractor) { return extractor(filePath) } - - // Handle other files - const isBinary = await isBinaryFile(filePath).catch(() => false) + // Handle other files - use unified binary file detection + const isBinary = await isBinaryFileWithEncodingDetection(filePath) if (!isBinary) { // Check if we need to apply line limit @@ -103,7 +102,7 @@ export async function extractTextFromFile(filePath: string, maxReadFileLine?: nu } } // Read the entire file if no limit or file is within limit - return addLineNumbers(await fs.readFile(filePath, "utf8")) + return addLineNumbers(await readFileWithEncodingDetection(filePath)) } else { throw new Error(`Cannot read text for file type: ${fileExtension}`) } diff --git a/src/integrations/misc/read-lines.ts 
b/src/integrations/misc/read-lines.ts index 5a5eda9f83..c677518e7d 100644 --- a/src/integrations/misc/read-lines.ts +++ b/src/integrations/misc/read-lines.ts @@ -7,6 +7,9 @@ * Now you can read a range of lines from a file */ import { createReadStream } from "fs" +import { open } from "fs/promises" +import * as iconv from "iconv-lite" +import { detectEncoding } from "../../utils/encoding" const outOfRangeError = (filepath: string, n: number) => { return new RangeError(`Line with index ${n} does not exist in '${filepath}'. Note that line indexing is zero-based`) @@ -52,65 +55,97 @@ export function readLines(filepath: string, endLine?: number, startLine?: number ) } - // Set up stream - const input = createReadStream(filepath) - let buffer = "" - let lineCount = 0 - let result = "" - - // Handle errors - input.on("error", reject) - - // Process data chunks directly - input.on("data", (chunk) => { - // Add chunk to buffer - buffer += chunk.toString() - - let pos = 0 - let nextNewline = buffer.indexOf("\n", pos) - - // Process complete lines in the buffer - while (nextNewline !== -1) { - // If we're in the target range, add this line to the result - if (lineCount >= effectiveStartLine && (endLine === undefined || lineCount <= endLine)) { - result += buffer.substring(pos, nextNewline + 1) // Include the newline - } - - // Move position and increment line counter - pos = nextNewline + 1 - lineCount++ - - // If we've reached the end line, we can stop - if (endLine !== undefined && lineCount > endLine) { - input.destroy() - resolve(result) - return + // Sample the first 64KB for encoding detection + open(filepath, "r") + .then((fileHandle) => { + const sampleBuffer = Buffer.alloc(65536) + return fileHandle + .read(sampleBuffer, 0, sampleBuffer.length, 0) + .then(({ bytesRead }) => sampleBuffer.subarray(0, bytesRead)) + .finally(() => fileHandle.close()) + }) + .then((sampleBuffer) => detectEncoding(sampleBuffer)) + .then((encoding) => { + // Node.js native supported 
encodings + const nodeEncodings = ["utf8", "ascii", "latin1"] + + let buffer = "" + let lineCount = 0 + let result = "" + + // Choose decoding method based on native support + let input: NodeJS.ReadableStream + if (nodeEncodings.includes(encoding.toLowerCase())) { + input = createReadStream(filepath, { encoding: encoding as BufferEncoding }) + // Handle errors directly + input.on("error", reject) + } else { + // For non-native encodings, create streams and handle errors explicitly + const sourceStream = createReadStream(filepath) + const decodeStream = iconv.decodeStream(encoding) + + // Handle errors from both streams + sourceStream.on("error", reject) + decodeStream.on("error", reject) + + // Use pipe but with explicit error handling + input = sourceStream.pipe(decodeStream) } - // Find next newline - nextNewline = buffer.indexOf("\n", pos) - } - - // Trim buffer - keep only the incomplete line - buffer = buffer.substring(pos) - }) - - // Handle end of file - input.on("end", () => { - // Process any remaining data in buffer (last line without newline) - if (buffer.length > 0) { - if (lineCount >= effectiveStartLine && (endLine === undefined || lineCount <= endLine)) { - result += buffer - } - lineCount++ - } - - // Check if we found any lines in the requested range - if (lineCount <= effectiveStartLine) { - reject(outOfRangeError(filepath, effectiveStartLine)) - } else { - resolve(result) - } - }) + // Process data chunks directly + input.on("data", (chunk: string) => { + // Add chunk to buffer (chunk is already decoded using the detected encoding) + buffer += chunk + + let pos = 0 + let nextNewline = buffer.indexOf("\n", pos) + + // Process complete lines in the buffer + while (nextNewline !== -1) { + // If we're in the target range, add this line to the result + if (lineCount >= effectiveStartLine && (endLine === undefined || lineCount <= endLine)) { + result += buffer.substring(pos, nextNewline + 1) // Include the newline + } + + // Move position and increment 
line counter + pos = nextNewline + 1 + lineCount++ + + // If we've reached the end line, we can stop + if (endLine !== undefined && lineCount > endLine) { + ;(input as any).destroy?.() + resolve(result) + return + } + + // Find next newline + nextNewline = buffer.indexOf("\n", pos) + } + + // Trim buffer - keep only the incomplete line + buffer = buffer.substring(pos) + }) + + // Handle end of file + input.on("end", () => { + // Process any remaining data in buffer (last line without newline) + if (buffer.length > 0) { + if (lineCount >= effectiveStartLine && (endLine === undefined || lineCount <= endLine)) { + result += buffer + } + lineCount++ + } + + // Check if we found any lines in the requested range + if (lineCount <= effectiveStartLine) { + reject(outOfRangeError(filepath, effectiveStartLine)) + } else { + resolve(result) + } + }) + }) + .catch((error) => { + reject(error) + }) }) } diff --git a/src/package.json b/src/package.json index b7a5e42366..531e57c321 100644 --- a/src/package.json +++ b/src/package.json @@ -529,7 +529,9 @@ "web-tree-sitter": "^0.25.6", "workerpool": "^9.2.0", "yaml": "^2.8.0", - "zod": "^3.25.61" + "zod": "^3.25.61", + "iconv-lite": "^0.6.3", + "jschardet": "^3.1.4" }, "devDependencies": { "@roo-code/build": "workspace:^", diff --git a/src/services/code-index/processors/scanner.ts b/src/services/code-index/processors/scanner.ts index 92a7d77c27..37cf36d07f 100644 --- a/src/services/code-index/processors/scanner.ts +++ b/src/services/code-index/processors/scanner.ts @@ -3,6 +3,8 @@ import { Ignore } from "ignore" import { RooIgnoreController } from "../../../core/ignore/RooIgnoreController" import { stat } from "fs/promises" import * as path from "path" +import * as iconv from "iconv-lite" +import { detectEncoding } from "../../../utils/encoding" import { generateNormalizedAbsolutePath, generateRelativeFilePath } from "../shared/get-relative-path" import { getWorkspacePathForContext } from "../../../utils/path" import { 
scannerExtensions } from "../shared/supported-extensions" @@ -134,10 +136,11 @@ export class DirectoryScanner implements IDirectoryScanner { return } - // Read file content - const content = await vscode.workspace.fs - .readFile(vscode.Uri.file(filePath)) - .then((buffer) => Buffer.from(buffer).toString("utf-8")) + // Read file content with encoding detection + const fileBuffer = await vscode.workspace.fs.readFile(vscode.Uri.file(filePath)) + const buffer = Buffer.from(fileBuffer) + const encoding = await detectEncoding(buffer) + const content = iconv.decode(buffer, encoding) // Calculate current hash const currentFileHash = createHash("sha256").update(content).digest("hex") diff --git a/src/services/tree-sitter/__tests__/markdownIntegration.spec.ts b/src/services/tree-sitter/__tests__/markdownIntegration.spec.ts index 9ada01a078..772e07db16 100644 --- a/src/services/tree-sitter/__tests__/markdownIntegration.spec.ts +++ b/src/services/tree-sitter/__tests__/markdownIntegration.spec.ts @@ -1,18 +1,32 @@ // Mocks must come first, before imports -vi.mock("fs/promises", () => ({ - readFile: vi.fn().mockImplementation(() => Promise.resolve("")), - stat: vi.fn().mockImplementation(() => Promise.resolve({ isDirectory: () => false })), -})) +vi.mock("fs/promises", async () => { + const actual = await vi.importActual("fs/promises"); + return { + ...actual, + readFile: vi.fn().mockImplementation(() => Promise.resolve("")), + stat: vi.fn().mockImplementation(() => Promise.resolve({ isDirectory: () => false })), + default: { + readFile: vi.fn().mockImplementation(() => Promise.resolve("")), + stat: vi.fn().mockImplementation(() => Promise.resolve({ isDirectory: () => false })), + } + } +}) vi.mock("../../../utils/fs", () => ({ fileExistsAtPath: vi.fn().mockImplementation(() => Promise.resolve(true)), })) +vi.mock("../../../utils/encoding", () => ({ + readFileWithEncodingDetection: vi.fn().mockImplementation((filePath) => { + return Promise.resolve("") + }), +})) + // Then 
imports import * as fs from "fs/promises" import type { Mock } from "vitest" - +import { readFileWithEncodingDetection } from "../../../utils/encoding" import { parseSourceCodeDefinitionsForFile } from "../index" describe("Markdown Integration Tests", () => { @@ -26,14 +40,14 @@ describe("Markdown Integration Tests", () => { const markdownContent = "# Main Header\n\nThis is some content under the main header.\nIt spans multiple lines to meet the minimum section length.\n\n## Section 1\n\nThis is content for section 1.\nIt also spans multiple lines.\n\n### Subsection 1.1\n\nThis is a subsection with enough lines\nto meet the minimum section length requirement.\n\n## Section 2\n\nFinal section content.\nWith multiple lines.\n" - // Mock fs.readFile to return our markdown content - ;(fs.readFile as Mock).mockImplementation(() => Promise.resolve(markdownContent)) + // Mock readFileWithEncodingDetection to return our markdown content + ;(readFileWithEncodingDetection as Mock).mockImplementation(() => Promise.resolve(markdownContent)) // Call the function with a markdown file path const result = await parseSourceCodeDefinitionsForFile("test.md") - // Verify fs.readFile was called with the correct path - expect(fs.readFile).toHaveBeenCalledWith("test.md", "utf8") + // Verify readFileWithEncodingDetection was called with the correct path + expect(readFileWithEncodingDetection).toHaveBeenCalledWith("test.md") // Check the result formatting for definition listing expect(result).toBeDefined() @@ -48,14 +62,14 @@ describe("Markdown Integration Tests", () => { // This test verifies behavior when no headers meet the minimum requirements const markdownContent = "This is just some text.\nNo headers here.\nJust plain text." 
- // Mock fs.readFile to return our markdown content - ;(fs.readFile as Mock).mockImplementation(() => Promise.resolve(markdownContent)) + // Mock readFileWithEncodingDetection to return our markdown content + ;(readFileWithEncodingDetection as Mock).mockImplementation(() => Promise.resolve(markdownContent)) // Call the function with a markdown file path const result = await parseSourceCodeDefinitionsForFile("no-headers.md") - // Verify fs.readFile was called with the correct path - expect(fs.readFile).toHaveBeenCalledWith("no-headers.md", "utf8") + // Verify readFileWithEncodingDetection was called with the correct path + expect(readFileWithEncodingDetection).toHaveBeenCalledWith("no-headers.md") // Check the result - should be undefined since no definitions found expect(result).toBeUndefined() diff --git a/src/services/tree-sitter/index.ts b/src/services/tree-sitter/index.ts index 145ba84730..cfeabe8d24 100644 --- a/src/services/tree-sitter/index.ts +++ b/src/services/tree-sitter/index.ts @@ -6,6 +6,7 @@ import { fileExistsAtPath } from "../../utils/fs" import { parseMarkdown } from "./markdownParser" import { RooIgnoreController } from "../../core/ignore/RooIgnoreController" import { QueryCapture } from "web-tree-sitter" +import { readFileWithEncodingDetection } from "../../utils/encoding" // Private constant const DEFAULT_MIN_COMPONENT_LINES_VALUE = 4 @@ -120,7 +121,7 @@ export async function parseSourceCodeDefinitionsForFile( } // Read file content - const fileContent = await fs.readFile(filePath, "utf8") + const fileContent = await readFileWithEncodingDetection(filePath) // Split the file content into individual lines const lines = fileContent.split("\n") @@ -196,7 +197,7 @@ export async function parseSourceCodeForDefinitionsTopLevel( try { // Read file content - const fileContent = await fs.readFile(file, "utf8") + const fileContent = await readFileWithEncodingDetection(file) // Split the file content into individual lines const lines = 
fileContent.split("\n") @@ -386,7 +387,7 @@ async function parseFile( } // Read file content - const fileContent = await fs.readFile(filePath, "utf8") + const fileContent = await readFileWithEncodingDetection(filePath) const extLang = path.extname(filePath).toLowerCase().slice(1) // Check if we have a parser for this file type diff --git a/src/utils/__tests__/encoding.spec.ts b/src/utils/__tests__/encoding.spec.ts new file mode 100644 index 0000000000..e127e70490 --- /dev/null +++ b/src/utils/__tests__/encoding.spec.ts @@ -0,0 +1,577 @@ +import { describe, it, expect, vi, beforeEach, afterEach } from "vitest" +import * as jschardet from "jschardet" +import * as iconv from "iconv-lite" +import { isBinaryFile } from "isbinaryfile" +import fs from "fs/promises" +import path from "path" +import { + detectEncoding, + readFileWithEncodingDetection, + detectFileEncoding, + writeFileWithEncodingPreservation, + isBinaryFileWithEncodingDetection, +} from "../encoding" + +// Mock dependencies +vi.mock("jschardet", () => ({ + detect: vi.fn(), +})) + +vi.mock("iconv-lite", () => ({ + encodingExists: vi.fn(), + decode: vi.fn(), + encode: vi.fn(), +})) + +vi.mock("isbinaryfile", () => ({ + isBinaryFile: vi.fn(), +})) + +vi.mock("fs/promises", () => ({ + default: { + readFile: vi.fn(), + writeFile: vi.fn(), + }, +})) + +vi.mock("path", () => ({ + default: { + extname: vi.fn(), + }, +})) + +const mockJschardet = vi.mocked(jschardet) +const mockIconv = vi.mocked(iconv) +const mockIsBinaryFile = vi.mocked(isBinaryFile) +const mockFs = vi.mocked(fs) +const mockPath = vi.mocked(path) + +describe("encoding", () => { + beforeEach(() => { + vi.clearAllMocks() + // Reset default mocks + mockPath.extname.mockReturnValue(".txt") + mockIconv.encodingExists.mockReturnValue(true) + mockIconv.decode.mockReturnValue("decoded content") + mockIconv.encode.mockReturnValue(Buffer.from("encoded content")) + }) + + afterEach(() => { + vi.restoreAllMocks() + }) + + describe("detectEncoding", () => { + 
it("should throw error for binary files", async () => { + const buffer = Buffer.from("binary content") + mockIsBinaryFile.mockResolvedValue(true) + + await expect(detectEncoding(buffer, ".exe")).rejects.toThrow("Cannot read text for file type: .exe") + }) + + it("should call isBinaryFile with buffer and buffer length", async () => { + const buffer = Buffer.from("test content for binary check") + mockJschardet.detect.mockReturnValue({ + encoding: "", + confidence: 0, + }) // No encoding detected + mockIsBinaryFile.mockResolvedValue(false) + + await detectEncoding(buffer, ".txt") + + expect(mockIsBinaryFile).toHaveBeenCalledWith(buffer, buffer.length) + }) + + it("should handle string detection result from jschardet", async () => { + const buffer = Buffer.from("utf8 content") + mockIsBinaryFile.mockResolvedValue(false) + mockJschardet.detect.mockReturnValue({ + encoding: "utf8", + confidence: 0.9, + }) + + const result = await detectEncoding(buffer, ".txt") + expect(result).toBe("utf8") + }) + + it("should handle object detection result with high confidence", async () => { + const buffer = Buffer.from("gbk content") + mockIsBinaryFile.mockResolvedValue(false) + mockJschardet.detect.mockReturnValue({ + encoding: "gbk", + confidence: 0.9, + }) + + const result = await detectEncoding(buffer, ".txt") + expect(result).toBe("gbk") + }) + + it("should handle ISO-8859-1 encoding", async () => { + const buffer = Buffer.from("iso-8859-1 content") + mockIsBinaryFile.mockResolvedValue(false) + mockJschardet.detect.mockReturnValue({ + encoding: "iso-8859-1", + confidence: 0.9, + }) + mockIconv.encodingExists.mockReturnValue(true) + + const result = await detectEncoding(buffer, ".txt") + expect(result).toBe("iso-8859-1") + }) + + it("should handle Shift-JIS encoding", async () => { + const buffer = Buffer.from("shift-jis content") + mockIsBinaryFile.mockResolvedValue(false) + mockJschardet.detect.mockReturnValue({ + encoding: "shift-jis", + confidence: 0.9, + }) + 
mockIconv.encodingExists.mockReturnValue(true) + + const result = await detectEncoding(buffer, ".txt") + expect(result).toBe("shift-jis") + }) + + it("should handle empty file gracefully", async () => { + const buffer = Buffer.alloc(0) + mockIsBinaryFile.mockResolvedValue(false) + mockJschardet.detect.mockReturnValue({ + encoding: "", + confidence: 0, + }) + + const consoleSpy = vi.spyOn(console, "warn").mockImplementation(() => {}) + const result = await detectEncoding(buffer, ".txt") + + expect(result).toBe("utf8") + expect(consoleSpy).toHaveBeenCalledWith("No encoding detected, falling back to utf8") + }) + + it("should handle very small file (1 byte)", async () => { + const buffer = Buffer.from("a") + mockIsBinaryFile.mockResolvedValue(false) + mockJschardet.detect.mockReturnValue({ + encoding: "", + confidence: 0, + }) + + const consoleSpy = vi.spyOn(console, "warn").mockImplementation(() => {}) + const result = await detectEncoding(buffer, ".txt") + + expect(result).toBe("utf8") + expect(consoleSpy).toHaveBeenCalledWith("No encoding detected, falling back to utf8") + }) + + it("should handle very small file (2 bytes)", async () => { + const buffer = Buffer.from("ab") + mockIsBinaryFile.mockResolvedValue(false) + mockJschardet.detect.mockReturnValue({ + encoding: "utf8", + confidence: 0.3, + }) + + const consoleSpy = vi.spyOn(console, "warn").mockImplementation(() => {}) + const result = await detectEncoding(buffer, ".txt") + + expect(result).toBe("utf8") + expect(consoleSpy).toHaveBeenCalledWith( + "Low confidence encoding detection: utf8 (confidence: 0.3), falling back to utf8", + ) + }) + + it("should fallback to utf8 for low confidence detection", async () => { + const buffer = Buffer.from("uncertain content") + mockIsBinaryFile.mockResolvedValue(false) + mockJschardet.detect.mockReturnValue({ + encoding: "gbk", + confidence: 0.5, + }) + + const consoleSpy = vi.spyOn(console, "warn").mockImplementation(() => {}) + const result = await 
detectEncoding(buffer, ".txt") + + expect(result).toBe("utf8") + expect(consoleSpy).toHaveBeenCalledWith( + "Low confidence encoding detection: gbk (confidence: 0.5), falling back to utf8", + ) + }) + + it("should fallback to utf8 when no encoding detected", async () => { + const buffer = Buffer.from("no encoding content") + mockIsBinaryFile.mockResolvedValue(false) + mockJschardet.detect.mockReturnValue({ + encoding: "", + confidence: 0, + }) + + const consoleSpy = vi.spyOn(console, "warn").mockImplementation(() => {}) + const result = await detectEncoding(buffer, ".txt") + + expect(result).toBe("utf8") + expect(consoleSpy).toHaveBeenCalledWith("No encoding detected, falling back to utf8") + }) + + it("should fallback to utf8 for unsupported encodings", async () => { + const buffer = Buffer.from("unsupported encoding content") + mockIsBinaryFile.mockResolvedValue(false) + mockJschardet.detect.mockReturnValue({ + encoding: "unsupported-encoding", + confidence: 0.9, + }) + mockIconv.encodingExists.mockReturnValue(false) + + const consoleSpy = vi.spyOn(console, "warn").mockImplementation(() => {}) + const result = await detectEncoding(buffer, ".txt") + + expect(result).toBe("utf8") + expect(consoleSpy).toHaveBeenCalledWith( + "Unsupported encoding detected: unsupported-encoding, falling back to utf8", + ) + }) + + it("should handle unsupported encoding with original detection info", async () => { + const buffer = Buffer.from("unsupported encoding content") + mockIsBinaryFile.mockResolvedValue(false) + mockJschardet.detect.mockReturnValue({ + encoding: "unsupported-encoding", + confidence: 0.9, + }) + mockIconv.encodingExists.mockReturnValue(false) + + const consoleSpy = vi.spyOn(console, "warn").mockImplementation(() => {}) + await detectEncoding(buffer, ".txt") + + expect(consoleSpy).toHaveBeenCalledWith( + "Unsupported encoding detected: unsupported-encoding, falling back to utf8", + ) + }) + + it("should handle isBinaryFile error gracefully", async () => { + const 
buffer = Buffer.from("content") + mockIsBinaryFile.mockRejectedValue(new Error("Detection failed")) + + const result = await detectEncoding(buffer, ".txt") + expect(result).toBe("utf8") // Should fallback to utf8 + }) + + describe("BOM (Byte Order Mark) preservation", () => { + it("should preserve UTF-8 BOM in encoding detection", async () => { + // UTF-8 BOM: 0xEF 0xBB 0xBF + const bomBytes = Buffer.from([0xef, 0xbb, 0xbf]) + const contentBytes = Buffer.from("Hello, world!", "utf8") + const bufferWithBOM = Buffer.concat([bomBytes, contentBytes]) + + mockIsBinaryFile.mockResolvedValue(false) + mockJschardet.detect.mockReturnValue({ + encoding: "utf8", + confidence: 0.9, + }) + + const result = await detectEncoding(bufferWithBOM, ".txt") + + expect(result).toBe("utf8") + expect(mockJschardet.detect).toHaveBeenCalledWith(bufferWithBOM) + // Verify the BOM is included in the buffer passed to jschardet + expect(mockJschardet.detect.mock.calls[0][0]).toEqual(bufferWithBOM) + }) + + it("should handle UTF-8 BOM with low confidence detection", async () => { + const bomBytes = Buffer.from([0xef, 0xbb, 0xbf]) + const contentBytes = Buffer.from("Hello", "utf8") + const bufferWithBOM = Buffer.concat([bomBytes, contentBytes]) + + mockIsBinaryFile.mockResolvedValue(false) + mockJschardet.detect.mockReturnValue({ + encoding: "utf8", + confidence: 0.5, // Low confidence + }) + + const consoleSpy = vi.spyOn(console, "warn").mockImplementation(() => {}) + const result = await detectEncoding(bufferWithBOM, ".txt") + + expect(result).toBe("utf8") + expect(consoleSpy).toHaveBeenCalledWith( + "Low confidence encoding detection: utf8 (confidence: 0.5), falling back to utf8", + ) + }) + + it("should handle UTF-8 BOM with empty content", async () => { + // Only BOM, no content + const bomOnlyBuffer = Buffer.from([0xef, 0xbb, 0xbf]) + + mockIsBinaryFile.mockResolvedValue(false) + mockJschardet.detect.mockReturnValue({ + encoding: "utf8", + confidence: 0.9, + }) + + const result = await 
detectEncoding(bomOnlyBuffer, ".txt")
+
+				expect(result).toBe("utf8")
+				expect(mockJschardet.detect).toHaveBeenCalledWith(bomOnlyBuffer)
+			})
+
+			it("should preserve UTF-16 LE BOM in encoding detection", async () => {
+				// UTF-16 LE BOM: 0xFF 0xFE
+				const bomBytes = Buffer.from([0xff, 0xfe])
+				const contentBytes = Buffer.from("Hello", "utf16le")
+				const bufferWithBOM = Buffer.concat([bomBytes, contentBytes])
+
+				mockIsBinaryFile.mockResolvedValue(false)
+				mockJschardet.detect.mockReturnValue({
+					encoding: "utf-16le",
+					confidence: 0.9,
+				})
+				mockIconv.encodingExists.mockReturnValue(true)
+
+				const result = await detectEncoding(bufferWithBOM, ".txt")
+
+				expect(result).toBe("utf-16le")
+				expect(mockJschardet.detect).toHaveBeenCalledWith(bufferWithBOM)
+				expect(mockJschardet.detect.mock.calls[0][0]).toEqual(bufferWithBOM)
+			})
+
+			it("should preserve UTF-16 BE BOM in encoding detection", async () => {
+				// UTF-16 BE BOM: 0xFE 0xFF
+				const bomBytes = Buffer.from([0xfe, 0xff])
+				// Create UTF-16 BE content manually since Node.js doesn't have utf16be encoding
+				const contentBytes = Buffer.from([0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f]) // "Hello" in UTF-16 BE
+				const bufferWithBOM = Buffer.concat([bomBytes, contentBytes])
+
+				mockIsBinaryFile.mockResolvedValue(false)
+				mockJschardet.detect.mockReturnValue({
+					encoding: "utf-16be",
+					confidence: 0.9,
+				})
+				mockIconv.encodingExists.mockReturnValue(true)
+
+				const result = await detectEncoding(bufferWithBOM, ".txt")
+
+				expect(result).toBe("utf-16be")
+				expect(mockJschardet.detect).toHaveBeenCalledWith(bufferWithBOM)
+				expect(mockJschardet.detect.mock.calls[0][0]).toEqual(bufferWithBOM)
+			})
+
+			it("should handle UTF-16 LE BOM with unsupported encoding fallback", async () => {
+				const bomBytes = Buffer.from([0xff, 0xfe])
+				const contentBytes = Buffer.from("Hello", "utf16le")
+				const bufferWithBOM = Buffer.concat([bomBytes, contentBytes])
+
+				mockIsBinaryFile.mockResolvedValue(false)
+				mockJschardet.detect.mockReturnValue({
+					encoding: "utf-16le",
+					confidence: 0.9,
+				})
+				mockIconv.encodingExists.mockReturnValue(false) // Simulate unsupported encoding
+
+				const consoleSpy = vi.spyOn(console, "warn").mockImplementation(() => {})
+				const result = await detectEncoding(bufferWithBOM, ".txt")
+
+				expect(result).toBe("utf8")
+				expect(consoleSpy).toHaveBeenCalledWith("Unsupported encoding detected: utf-16le, falling back to utf8")
+			})
+
+			it("should handle UTF-16 BE BOM with low confidence", async () => {
+				const bomBytes = Buffer.from([0xfe, 0xff])
+				const contentBytes = Buffer.from([0x00, 0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f]) // "Hello" in UTF-16 BE
+				const bufferWithBOM = Buffer.concat([bomBytes, contentBytes])
+
+				mockIsBinaryFile.mockResolvedValue(false)
+				mockJschardet.detect.mockReturnValue({
+					encoding: "utf-16be",
+					confidence: 0.4, // Low confidence
+				})
+
+				const consoleSpy = vi.spyOn(console, "warn").mockImplementation(() => {})
+				const result = await detectEncoding(bufferWithBOM, ".txt")
+
+				expect(result).toBe("utf8")
+				expect(consoleSpy).toHaveBeenCalledWith(
+					"Low confidence encoding detection: utf-16be (confidence: 0.4), falling back to utf8",
+				)
+			})
+		})
+	})
+
+	describe("readFileWithEncodingDetection", () => {
+		it("should read file and detect encoding correctly", async () => {
+			const filePath = "/path/to/file.txt"
+			const buffer = Buffer.from("file content")
+			mockFs.readFile.mockResolvedValue(buffer)
+			mockIsBinaryFile.mockResolvedValue(false)
+			mockJschardet.detect.mockReturnValue({
+				encoding: "utf8",
+				confidence: 0.9,
+			})
+
+			const result = await readFileWithEncodingDetection(filePath)
+
+			expect(mockFs.readFile).toHaveBeenCalledWith(filePath)
+			expect(mockPath.extname).toHaveBeenCalledWith(filePath)
+			expect(mockIconv.decode).toHaveBeenCalledWith(buffer, "utf8")
+			expect(result).toBe("decoded content")
+		})
+
+		it("should handle binary file detection", async () => {
+			const filePath = "/path/to/file.exe"
+			const buffer = Buffer.from("binary content")
+			mockFs.readFile.mockResolvedValue(buffer)
+			mockIsBinaryFile.mockResolvedValue(true)
+			mockPath.extname.mockReturnValue(".exe")
+
+			await expect(readFileWithEncodingDetection(filePath)).rejects.toThrow(
+				"Cannot read text for file type: .exe",
+			)
+		})
+	})
+
+	describe("detectFileEncoding", () => {
+		it("should detect encoding for existing file", async () => {
+			const filePath = "/path/to/file.txt"
+			const buffer = Buffer.from("file content")
+			mockFs.readFile.mockResolvedValue(buffer)
+			mockIsBinaryFile.mockResolvedValue(false)
+			mockJschardet.detect.mockReturnValue({
+				encoding: "gbk",
+				confidence: 0.9,
+			})
+
+			const result = await detectFileEncoding(filePath)
+
+			expect(mockFs.readFile).toHaveBeenCalledWith(filePath)
+			expect(result).toBe("gbk")
+		})
+
+		it("should return utf8 for non-existent file", async () => {
+			const filePath = "/path/to/nonexistent.txt"
+			mockFs.readFile.mockRejectedValue(new Error("File not found"))
+
+			const result = await detectFileEncoding(filePath)
+
+			expect(result).toBe("utf8")
+		})
+
+		it("should return utf8 for unreadable file", async () => {
+			const filePath = "/path/to/unreadable.txt"
+			mockFs.readFile.mockRejectedValue(new Error("Permission denied"))
+
+			const result = await detectFileEncoding(filePath)
+
+			expect(result).toBe("utf8")
+		})
+	})
+
+	describe("writeFileWithEncodingPreservation", () => {
+		it("should write utf8 file directly when original is utf8", async () => {
+			const filePath = "/path/to/file.txt"
+			const content = "new content"
+			mockIsBinaryFile.mockResolvedValue(false)
+			mockJschardet.detect.mockReturnValue({
+				encoding: "utf8",
+				confidence: 0.9,
+			})
+
+			await writeFileWithEncodingPreservation(filePath, content)
+
+			expect(mockFs.writeFile).toHaveBeenCalledWith(filePath, content, "utf8")
+		})
+
+		it("should convert and write content for non-utf8 encoding", async () => {
+			const filePath = "/path/to/file.txt"
+			const content = "new content"
+			mockIsBinaryFile.mockResolvedValue(false)
+			mockJschardet.detect.mockReturnValue({
+				encoding: "gbk",
+				confidence: 0.9,
+			})
+
+			await writeFileWithEncodingPreservation(filePath, content)
+
+			expect(mockIconv.encode).toHaveBeenCalledWith(content, "gbk")
+			expect(mockFs.writeFile).toHaveBeenCalledWith(filePath, Buffer.from("encoded content"))
+		})
+
+		it("should handle new file (utf8) correctly", async () => {
+			const filePath = "/path/to/newfile.txt"
+			const content = "new content"
+			mockFs.readFile.mockRejectedValue(new Error("File not found"))
+
+			await writeFileWithEncodingPreservation(filePath, content)
+
+			expect(mockFs.writeFile).toHaveBeenCalledWith(filePath, content, "utf8")
+		})
+	})
+
+	describe("isBinaryFileWithEncodingDetection", () => {
+		it("should return false for text files that can be encoded", async () => {
+			const filePath = "/path/to/file.txt"
+			const buffer = Buffer.from("text content")
+			mockFs.readFile.mockResolvedValue(buffer)
+			mockPath.extname.mockReturnValue(".txt")
+			mockJschardet.detect.mockReturnValue({
+				encoding: "utf8",
+				confidence: 0.9,
+			})
+
+			const result = await isBinaryFileWithEncodingDetection(filePath)
+
+			expect(result).toBe(false)
+			expect(mockFs.readFile).toHaveBeenCalledWith(filePath)
+		})
+
+		it("should return true for files that fail encoding detection and are binary", async () => {
+			const filePath = "/path/to/file.exe"
+			const buffer = Buffer.from("binary content")
+			mockFs.readFile.mockResolvedValue(buffer)
+			mockPath.extname.mockReturnValue(".exe")
+			mockJschardet.detect.mockReturnValue({
+				encoding: "",
+				confidence: 0,
+			})
+			mockIsBinaryFile.mockResolvedValue(true)
+
+			const result = await isBinaryFileWithEncodingDetection(filePath)
+
+			expect(result).toBe(true)
+		})
+
+		it("should return false for file read errors", async () => {
+			const filePath = "/path/to/nonexistent.txt"
+			mockFs.readFile.mockRejectedValue(new Error("File not found"))
+
+			const result = await
isBinaryFileWithEncodingDetection(filePath)
+
+			expect(result).toBe(false)
+		})
+
+		it("should return false when encoding detection succeeds even with low confidence", async () => {
+			const filePath = "/path/to/file.txt"
+			const buffer = Buffer.from("text content")
+			mockFs.readFile.mockResolvedValue(buffer)
+			mockPath.extname.mockReturnValue(".txt")
+			mockJschardet.detect.mockReturnValue({
+				encoding: "utf8",
+				confidence: 0.3,
+			})
+
+			const result = await isBinaryFileWithEncodingDetection(filePath)
+
+			expect(result).toBe(false)
+		})
+
+		it("should call isBinaryFile with buffer and buffer length when encoding detection fails", async () => {
+			const filePath = "/path/to/file.bin"
+			const buffer = Buffer.from("binary content for length test")
+			mockFs.readFile.mockResolvedValue(buffer)
+			mockPath.extname.mockReturnValue(".bin")
+			mockJschardet.detect.mockReturnValue({
+				encoding: "",
+				confidence: 0,
+			})
+			mockIsBinaryFile.mockResolvedValue(true)
+
+			await isBinaryFileWithEncodingDetection(filePath)
+
+			expect(mockIsBinaryFile).toHaveBeenCalledWith(buffer, buffer.length)
+		})
+	})
+})
diff --git a/src/utils/encoding.ts b/src/utils/encoding.ts
new file mode 100644
index 0000000000..07899b6c66
--- /dev/null
+++ b/src/utils/encoding.ts
@@ -0,0 +1,133 @@
+import * as jschardet from "jschardet"
+import * as iconv from "iconv-lite"
+import { isBinaryFile } from "isbinaryfile"
+import fs from "fs/promises"
+import path from "path"
+
+/**
+ * Detect the encoding of a file buffer
+ * @param fileBuffer The file buffer
+ * @param fileExtension Optional file extension
+ * @returns The detected encoding
+ */
+export async function detectEncoding(fileBuffer: Buffer, fileExtension?: string): Promise<string> {
+	// 1. Perform encoding detection first
+	const detected = jschardet.detect(fileBuffer)
+	let encoding: string
+	let originalEncoding: string | undefined
+
+	if (typeof detected === "string") {
+		encoding = detected
+		originalEncoding = detected
+	} else if (detected && detected.encoding) {
+		originalEncoding = detected.encoding
+		// Check confidence level, use default encoding if too low
+		// 0.7 is a conservative threshold that works well when UTF-8 is the dominant encoding
+		// and we prefer to fall back rather than risk mis-decoding
+		if (detected.confidence < 0.7) {
+			console.warn(
+				`Low confidence encoding detection: ${originalEncoding} (confidence: ${detected.confidence}), falling back to utf8`,
+			)
+			encoding = "utf8"
+		} else {
+			encoding = detected.encoding
+		}
+	} else {
+		// 2. Only check if it's a binary file when encoding detection fails
+		if (fileExtension) {
+			const isBinary = await isBinaryFile(fileBuffer, fileBuffer.length).catch(() => false)
+			if (isBinary) {
+				throw new Error(`Cannot read text for file type: ${fileExtension}`)
+			}
+		}
+		console.warn(`No encoding detected, falling back to utf8`)
+		encoding = "utf8"
+	}
+
+	// 3. Verify if the encoding is supported by iconv-lite
+	if (!iconv.encodingExists(encoding)) {
+		console.warn(
+			`Unsupported encoding detected: ${encoding}${originalEncoding && originalEncoding !== encoding ? ` (originally detected as: ${originalEncoding})` : ""}, falling back to utf8`,
+		)
+		encoding = "utf8"
+	}
+
+	return encoding
+}
+
+/**
+ * Read file with automatic encoding detection
+ * @param filePath Path to the file
+ * @returns File content as string
+ */
+export async function readFileWithEncodingDetection(filePath: string): Promise<string> {
+	const buffer = await fs.readFile(filePath)
+	const fileExtension = path.extname(filePath).toLowerCase()
+
+	const encoding = await detectEncoding(buffer, fileExtension)
+	return iconv.decode(buffer, encoding)
+}
+
+/**
+ * Detect the encoding of an existing file
+ * @param filePath Path to the file
+ * @returns Detected encoding, returns 'utf8' if file does not exist
+ */
+export async function detectFileEncoding(filePath: string): Promise<string> {
+	try {
+		const buffer = await fs.readFile(filePath)
+		const fileExtension = path.extname(filePath).toLowerCase()
+		return await detectEncoding(buffer, fileExtension)
+	} catch (error) {
+		// File does not exist or cannot be read, default to UTF-8
+		return "utf8"
+	}
+}
+
+/**
+ * Smart binary file detection that tries encoding detection first
+ * @param filePath Path to the file
+ * @returns Promise<boolean> true if file is binary, false if it's text or if there's a read error
+ * @note Returns false on read errors to allow callers to handle file access issues explicitly
+ */
+export async function isBinaryFileWithEncodingDetection(filePath: string): Promise<boolean> {
+	try {
+		const fileBuffer = await fs.readFile(filePath)
+		const fileExtension = path.extname(filePath).toLowerCase()
+
+		// Try to detect encoding first
+		try {
+			await detectEncoding(fileBuffer, fileExtension)
+			// If detectEncoding succeeds, it's a text file
+			return false
+		} catch (error) {
+			// If detectEncoding fails, check if it's actually a binary file
+			return await isBinaryFile(fileBuffer, fileBuffer.length).catch(() => false)
+		}
+	} catch (error) {
+		// File read error, return false to let callers handle read errors explicitly
+		return false
+	}
+}
+
+/**
+ * Write file using the same encoding as the original file
+ * If the file is new, use UTF-8 encoding
+ * @param filePath Path to the file
+ * @param content Content to write (UTF-8 string)
+ * @returns Promise<void>
+ */
+export async function writeFileWithEncodingPreservation(filePath: string, content: string): Promise<void> {
+	// Detect original file encoding
+	const originalEncoding = await detectFileEncoding(filePath)
+
+	// If original file is UTF-8 or does not exist, write directly
+	if (originalEncoding === "utf8") {
+		await fs.writeFile(filePath, content, "utf8")
+		return
+	}
+
+	// Convert UTF-8 content to original file encoding
+	const encodedBuffer = iconv.encode(content, originalEncoding)
+	await fs.writeFile(filePath, encodedBuffer)
+}