diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 82e7df9fca..98efded985 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -660,12 +660,18 @@ importers: i18next: specifier: ^24.2.2 version: 24.2.3(typescript@5.8.3) + iconv-lite: + specifier: ^0.6.3 + version: 0.6.3 ignore: specifier: ^7.0.3 version: 7.0.4 isbinaryfile: specifier: ^5.0.2 version: 5.0.4 + jschardet: + specifier: ^3.1.4 + version: 3.1.4 lodash.debounce: specifier: ^4.0.8 version: 4.0.8 @@ -7567,6 +7573,10 @@ packages: jsbn@1.1.0: resolution: {integrity: sha512-4bYVV3aAMtDTTu4+xsDYa6sy9GyJ69/amsu9sYF2zqjiEoZA5xJi3BrfX3uY+/IekIu7MwdObdbDWpoZdBv3/A==} + jschardet@3.1.4: + resolution: {integrity: sha512-/kmVISmrwVwtyYU40iQUOp3SUPk2dhNCMsZBQX0R1/jZ8maaXJ/oZIzUOiyOqcgtLnETFKYChbJ5iDC/eWmFHg==} + engines: {node: '>=0.1.90'} + jsdoc-type-pratt-parser@4.1.0: resolution: {integrity: sha512-Hicd6JK5Njt2QB6XYFS7ok9e37O8AYk3jTcppG4YVQnYjOemymvTcmc7OWsmq/Qqj5TdRFO5/x/tIPmBeRtGHg==} engines: {node: '>=12.0.0'} @@ -11516,7 +11526,7 @@ snapshots: '@babel/traverse': 7.27.1 '@babel/types': 7.27.1 convert-source-map: 2.0.0 - debug: 4.4.1(supports-color@8.1.1) + debug: 4.4.1 gensync: 1.0.0-beta.2 json5: 2.2.3 semver: 6.3.1 @@ -11684,7 +11694,7 @@ snapshots: '@babel/parser': 7.27.2 '@babel/template': 7.27.2 '@babel/types': 7.27.1 - debug: 4.4.1(supports-color@8.1.1) + debug: 4.4.1 globals: 11.12.0 transitivePeerDependencies: - supports-color @@ -12528,7 +12538,7 @@ snapshots: '@kwsites/file-exists@1.1.1': dependencies: - debug: 4.4.1(supports-color@8.1.1) + debug: 4.4.1 transitivePeerDependencies: - supports-color @@ -12855,7 +12865,7 @@ snapshots: '@puppeteer/browsers@2.10.5': dependencies: - debug: 4.4.1(supports-color@8.1.1) + debug: 4.4.1 extract-zip: 2.0.1 progress: 2.0.3 proxy-agent: 6.5.0 @@ -12868,7 +12878,7 @@ snapshots: '@puppeteer/browsers@2.6.1': dependencies: - debug: 4.4.1(supports-color@8.1.1) + debug: 4.4.1 extract-zip: 2.0.1 progress: 2.0.3 proxy-agent: 6.5.0 @@ -15196,7 +15206,7 @@ snapshots: agent-base@6.0.2: dependencies: - debug: 4.4.1(supports-color@8.1.1) + debug: 4.4.1 transitivePeerDependencies: - supports-color @@ -15502,7 +15512,7 @@ snapshots: dependencies: bytes: 3.1.2 content-type: 1.0.5 - debug: 4.4.1(supports-color@8.1.1) + debug: 4.4.1 http-errors: 2.0.0 iconv-lite: 0.6.3 on-finished: 2.4.1 @@ -16221,6 +16231,10 @@ snapshots: dependencies: ms: 2.1.3 + debug@4.4.1: + dependencies: + ms: 2.1.3 + debug@4.4.1(supports-color@8.1.1): dependencies: ms: 2.1.3 @@ -16984,7 +16998,7 @@ snapshots: content-type: 1.0.5 cookie: 0.7.2 cookie-signature: 1.2.2 - debug: 4.4.1(supports-color@8.1.1) + debug: 4.4.1 encodeurl: 2.0.0 escape-html: 1.0.3 etag: 1.8.1 @@ -17022,7 +17036,7 @@ snapshots: extract-zip@2.0.1: dependencies: - debug: 4.4.1(supports-color@8.1.1) + debug: 4.4.1 get-stream: 5.2.0 yauzl: 2.10.0 optionalDependencies: @@ -17121,7 +17135,7 @@ snapshots: finalhandler@2.1.0: dependencies: - debug: 4.4.1(supports-color@8.1.1) + debug: 4.4.1 encodeurl: 2.0.0 escape-html: 1.0.3 on-finished: 2.4.1 @@ -17342,7 +17356,7 @@ snapshots: dependencies: basic-ftp: 5.0.5 data-uri-to-buffer: 6.0.2 - debug: 4.4.1(supports-color@8.1.1) + debug: 4.4.1 transitivePeerDependencies: - supports-color @@ -17565,28 +17579,28 @@ snapshots: dependencies: '@tootallnate/once': 2.0.0 agent-base: 6.0.2 - debug: 4.4.1(supports-color@8.1.1) + debug: 4.4.1 transitivePeerDependencies: - supports-color http-proxy-agent@7.0.2: dependencies: agent-base: 7.1.3 - debug: 4.4.1(supports-color@8.1.1) + debug: 4.4.1 transitivePeerDependencies: - supports-color https-proxy-agent@5.0.1: dependencies: agent-base: 6.0.2 - debug: 4.4.1(supports-color@8.1.1) + debug: 4.4.1 transitivePeerDependencies: - supports-color https-proxy-agent@7.0.6: dependencies: agent-base: 7.1.3 - debug: 4.4.1(supports-color@8.1.1) + debug: 4.4.1 transitivePeerDependencies: - supports-color @@ -17942,7 +17956,7 @@ snapshots: istanbul-lib-source-maps@4.0.1: dependencies: - debug: 4.4.1(supports-color@8.1.1) + debug: 4.4.1 istanbul-lib-coverage: 3.2.2 source-map: 0.6.1 transitivePeerDependencies: @@ -18424,6 +18438,8 @@ snapshots: jsbn@1.1.0: {} + jschardet@3.1.4: {} + jsdoc-type-pratt-parser@4.1.0: {} jsdom@20.0.3: @@ -19787,7 +19803,7 @@ snapshots: dependencies: '@tootallnate/quickjs-emscripten': 0.23.0 agent-base: 7.1.3 - debug: 4.4.1(supports-color@8.1.1) + debug: 4.4.1 get-uri: 6.0.4 http-proxy-agent: 7.0.2 https-proxy-agent: 7.0.6 @@ -20125,7 +20141,7 @@ snapshots: proxy-agent@6.5.0: dependencies: agent-base: 7.1.3 - debug: 4.4.1(supports-color@8.1.1) + debug: 4.4.1 http-proxy-agent: 7.0.2 https-proxy-agent: 7.0.6 lru-cache: 7.18.3 @@ -20170,7 +20186,7 @@ snapshots: dependencies: '@puppeteer/browsers': 2.6.1 chromium-bidi: 0.11.0(devtools-protocol@0.0.1367902) - debug: 4.4.1(supports-color@8.1.1) + debug: 4.4.1 devtools-protocol: 0.0.1367902 typed-query-selector: 2.12.0 ws: 8.18.2 @@ -20638,7 +20654,7 @@ snapshots: router@2.2.0: dependencies: - debug: 4.4.1(supports-color@8.1.1) + debug: 4.4.1 depd: 2.0.0 is-promise: 4.0.0 parseurl: 1.3.3 @@ -20713,7 +20729,7 @@ snapshots: send@1.2.0: dependencies: - debug: 4.4.1(supports-color@8.1.1) + debug: 4.4.1 encodeurl: 2.0.0 escape-html: 1.0.3 etag: 1.8.1 @@ -20873,7 +20889,7 @@ snapshots: dependencies: '@kwsites/file-exists': 1.1.1 '@kwsites/promise-deferred': 1.1.1 - debug: 4.4.1(supports-color@8.1.1) + debug: 4.4.1 transitivePeerDependencies: - supports-color @@ -20905,7 +20921,7 @@ snapshots: socks-proxy-agent@8.0.5: dependencies: agent-base: 7.1.3 - debug: 4.4.1(supports-color@8.1.1) + debug: 4.4.1 socks: 2.8.4 transitivePeerDependencies: - supports-color @@ -21488,7 +21504,7 @@ snapshots: cac: 6.7.14 chokidar: 4.0.3 consola: 3.4.2 - debug: 4.4.1(supports-color@8.1.1) + debug: 4.4.1 esbuild: 0.25.5 fix-dts-default-cjs-exports: 1.0.1 joycon: 3.1.1 @@ -21884,7 +21900,7 @@ snapshots: vite-node@3.1.3(@types/node@20.17.50)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.19.4)(yaml@2.8.0): dependencies: cac: 6.7.14 - debug: 4.4.1(supports-color@8.1.1) + debug: 4.4.1 es-module-lexer: 1.7.0 pathe: 2.0.3 vite: 6.3.5(@types/node@20.17.50)(jiti@2.4.2)(lightningcss@1.30.1)(tsx@4.19.4)(yaml@2.8.0) @@ -22039,7 +22055,7 @@ snapshots: '@vitest/spy': 3.1.3 '@vitest/utils': 3.1.3 chai: 5.2.0 - debug: 4.4.1(supports-color@8.1.1) + debug: 4.4.1 expect-type: 1.2.1 magic-string: 0.30.17 pathe: 2.0.3 diff --git a/src/core/tools/applyDiffTool.ts b/src/core/tools/applyDiffTool.ts index 500c7a92c3..66d5cd3546 100644 --- a/src/core/tools/applyDiffTool.ts +++ b/src/core/tools/applyDiffTool.ts @@ -1,5 +1,5 @@ import path from "path" -import fs from "fs/promises" +import { readFileWithEncoding } from "../../integrations/misc/readFileWithEncoding" import { TelemetryService } from "@roo-code/telemetry" @@ -86,7 +86,7 @@ export async function applyDiffTool( return } - let originalContent: string | null = await fs.readFile(absolutePath, "utf-8") + let originalContent: string | null = (await readFileWithEncoding(absolutePath)).content // Apply the diff to the original content const diffResult = (await cline.diffStrategy?.applyDiff( diff --git a/src/core/tools/insertContentTool.ts b/src/core/tools/insertContentTool.ts index 0963bc78cc..2f343449b3 100644 --- a/src/core/tools/insertContentTool.ts +++ b/src/core/tools/insertContentTool.ts @@ -1,5 +1,5 @@ import delay from "delay" -import fs from "fs/promises" +import { readFileWithEncoding } from "../../integrations/misc/readFileWithEncoding" import path from "path" import { getReadablePath } from "../../utils/path" @@ -89,7 +89,7 @@ export async function insertContentTool( cline.consecutiveMistakeCount = 0 // Read the file - const fileContent = await fs.readFile(absolutePath, "utf8") + const fileContent = (await readFileWithEncoding(absolutePath)).content cline.diffViewProvider.editType = "modify" cline.diffViewProvider.originalContent = fileContent const lines = fileContent.split("\n") diff --git a/src/core/tools/searchAndReplaceTool.ts b/src/core/tools/searchAndReplaceTool.ts index 58d246b133..5bd1589cf9 100644 --- a/src/core/tools/searchAndReplaceTool.ts +++ b/src/core/tools/searchAndReplaceTool.ts @@ -1,6 +1,6 @@ // Core Node.js imports import path from "path" -import fs from "fs/promises" +import { readFileWithEncoding } from "../../integrations/misc/readFileWithEncoding" import delay from "delay" // Internal imports @@ -143,7 +143,7 @@ export async function searchAndReplaceTool( // Read and process file content let fileContent: string try { - fileContent = await fs.readFile(absolutePath, "utf-8") + fileContent = (await readFileWithEncoding(absolutePath)).content } catch (error) { cline.consecutiveMistakeCount++ cline.recordToolError("search_and_replace") diff --git a/src/integrations/editor/DiffViewProvider.ts b/src/integrations/editor/DiffViewProvider.ts index 3ab0419618..64da606e33 100644 --- a/src/integrations/editor/DiffViewProvider.ts +++ b/src/integrations/editor/DiffViewProvider.ts @@ -11,6 +11,7 @@ import { formatResponse } from "../../core/prompts/responses" import { diagnosticsToProblemsString, getNewDiagnostics } from "../diagnostics" import { ClineSayTool } from "../../shared/ExtensionMessage" import { Task } from "../../core/task/Task" +import { readFileWithEncoding } from "../misc/readFileWithEncoding" import { DecorationController } from "./DecorationController" @@ -59,7 +60,7 @@ export class DiffViewProvider { this.preDiagnostics = vscode.languages.getDiagnostics() if (fileExists) { - this.originalContent = await fs.readFile(absolutePath, "utf-8") + this.originalContent = (await readFileWithEncoding(absolutePath)).content } else { this.originalContent = "" } diff --git a/src/integrations/misc/extract-text.ts b/src/integrations/misc/extract-text.ts index bd8b9ce9d0..7977f57058 100644 --- a/src/integrations/misc/extract-text.ts +++ b/src/integrations/misc/extract-text.ts @@ -3,7 +3,7 @@ import * as path from "path" import pdf from "pdf-parse/lib/pdf-parse" import mammoth from "mammoth" import fs from "fs/promises" -import { isBinaryFile } from "isbinaryfile" +import { readFileWithEncoding } from "./readFileWithEncoding" async function extractTextFromPDF(filePath: string): Promise { const dataBuffer = await fs.readFile(filePath) @@ -61,12 +61,11 @@ export async function extractTextFromFile(filePath: string): Promise { return extractor(filePath) } - // Handle other files - const isBinary = await isBinaryFile(filePath).catch(() => false) - - if (!isBinary) { - return addLineNumbers(await fs.readFile(filePath, "utf8")) - } else { + // Handle other files using readFileWithEncoding with binary detection + try { + const { content } = await readFileWithEncoding(filePath, { acceptTextOnly: true }) + return addLineNumbers(content) + } catch (error) { throw new Error(`Cannot read text for file type: ${fileExtension}`) } } diff --git a/src/integrations/misc/readFileWithEncoding.ts b/src/integrations/misc/readFileWithEncoding.ts new file mode 100644 index 0000000000..c6ce6e8704 --- /dev/null +++ b/src/integrations/misc/readFileWithEncoding.ts @@ -0,0 +1,735 @@ +import * as fs from "fs" +import * as fsPromises from "fs/promises" +import * as iconv from "iconv-lite" +import * as jschardet from "jschardet" + +// BOM (Byte Order Mark) constants +const UTF8_BOM = [0xef, 0xbb, 0xbf] +const UTF16BE_BOM = [0xfe, 0xff] +const UTF16LE_BOM = [0xff, 0xfe] + +// Encoding constants +const UTF8 = "utf8" +const UTF8_WITH_BOM = "utf8bom" +const UTF16BE = "utf16be" +const UTF16LE = "utf16le" + +// Thresholds for detection +const ZERO_BYTE_DETECTION_BUFFER_MAX_LEN = 512 // For binary detection +const AUTO_ENCODING_GUESS_MIN_BYTES = 512 * 8 // Minimum bytes for encoding guess +const AUTO_ENCODING_GUESS_MAX_BYTES = 512 * 128 // Maximum bytes for encoding guess + +// Supported encodings map (matches encoding.ts) +export const SUPPORTED_ENCODINGS = { + utf8: { + labelLong: "UTF-8", + labelShort: "UTF-8", + order: 1, + alias: "utf8bom", + guessableName: "UTF-8", + }, + utf8bom: { + labelLong: "UTF-8 with BOM", + labelShort: "UTF-8 with BOM", + encodeOnly: true, + order: 2, + alias: "utf8", + }, + utf16le: { + labelLong: "UTF-16 LE", + labelShort: "UTF-16 LE", + order: 3, + guessableName: "UTF-16LE", + }, + utf16be: { + labelLong: "UTF-16 BE", + labelShort: "UTF-16 BE", + order: 4, + guessableName: "UTF-16BE", + }, + windows1252: { + labelLong: "Western (Windows 1252)", + labelShort: "Windows 1252", + order: 5, + guessableName: "windows-1252", + }, + iso88591: { + labelLong: "Western (ISO 8859-1)", + labelShort: "ISO 8859-1", + order: 6, + }, + iso88593: { + labelLong: "Western (ISO 8859-3)", + labelShort: "ISO 8859-3", + order: 7, + }, + iso885915: { + labelLong: "Western (ISO 8859-15)", + labelShort: "ISO 8859-15", + order: 8, + }, + macroman: { + labelLong: "Western (Mac Roman)", + labelShort: "Mac Roman", + order: 9, + }, + cp437: { + labelLong: "DOS (CP 437)", + labelShort: "CP437", + order: 10, + }, + windows1256: { + labelLong: "Arabic (Windows 1256)", + labelShort: "Windows 1256", + order: 11, + }, + iso88596: { + labelLong: "Arabic (ISO 8859-6)", + labelShort: "ISO 8859-6", + order: 12, + }, + windows1257: { + labelLong: "Baltic (Windows 1257)", + labelShort: "Windows 1257", + order: 13, + }, + iso88594: { + labelLong: "Baltic (ISO 8859-4)", + labelShort: "ISO 8859-4", + order: 14, + }, + iso885914: { + labelLong: "Celtic (ISO 8859-14)", + labelShort: "ISO 8859-14", + order: 15, + }, + windows1250: { + labelLong: "Central European (Windows 1250)", + labelShort: "Windows 1250", + order: 16, + guessableName: "windows-1250", + }, + iso88592: { + labelLong: "Central European (ISO 8859-2)", + labelShort: "ISO 8859-2", + order: 17, + guessableName: "ISO-8859-2", + }, + cp852: { + labelLong: "Central European (CP 852)", + labelShort: "CP 852", + order: 18, + }, + windows1251: { + labelLong: "Cyrillic (Windows 1251)", + labelShort: "Windows 1251", + order: 19, + guessableName: "windows-1251", + }, + cp866: { + labelLong: "Cyrillic (CP 866)", + labelShort: "CP 866", + order: 20, + guessableName: "IBM866", + }, + cp1125: { + labelLong: "Cyrillic (CP 1125)", + labelShort: "CP 1125", + order: 21, + guessableName: "IBM1125", + }, + iso88595: { + labelLong: "Cyrillic (ISO 8859-5)", + labelShort: "ISO 8859-5", + order: 22, + guessableName: "ISO-8859-5", + }, + koi8r: { + labelLong: "Cyrillic (KOI8-R)", + labelShort: "KOI8-R", + order: 23, + guessableName: "KOI8-R", + }, + koi8u: { + labelLong: "Cyrillic (KOI8-U)", + labelShort: "KOI8-U", + order: 24, + }, + iso885913: { + labelLong: "Estonian (ISO 8859-13)", + labelShort: "ISO 8859-13", + order: 25, + }, + windows1253: { + labelLong: "Greek (Windows 1253)", + labelShort: "Windows 1253", + order: 26, + guessableName: "windows-1253", + }, + iso88597: { + labelLong: "Greek (ISO 8859-7)", + labelShort: "ISO 8859-7", + order: 27, + guessableName: "ISO-8859-7", + }, + windows1255: { + labelLong: "Hebrew (Windows 1255)", + labelShort: "Windows 1255", + order: 28, + guessableName: "windows-1255", + }, + iso88598: { + labelLong: "Hebrew (ISO 8859-8)", + labelShort: "ISO 8859-8", + order: 29, + guessableName: "ISO-8859-8", + }, + iso885910: { + labelLong: "Nordic (ISO 8859-10)", + labelShort: "ISO 8859-10", + order: 30, + }, + iso885916: { + labelLong: "Romanian (ISO 8859-16)", + labelShort: "ISO 8859-16", + order: 31, + }, + windows1254: { + labelLong: "Turkish (Windows 1254)", + labelShort: "Windows 1254", + order: 32, + }, + iso88599: { + labelLong: "Turkish (ISO 8859-9)", + labelShort: "ISO 8859-9", + order: 33, + }, + windows1258: { + labelLong: "Vietnamese (Windows 1258)", + labelShort: "Windows 1258", + order: 34, + }, + gbk: { + labelLong: "Simplified Chinese (GBK)", + labelShort: "GBK", + order: 35, + }, + gb18030: { + labelLong: "Simplified Chinese (GB18030)", + labelShort: "GB18030", + order: 36, + }, + cp950: { + labelLong: "Traditional Chinese (Big5)", + labelShort: "Big5", + order: 37, + guessableName: "Big5", + }, + big5hkscs: { + labelLong: "Traditional Chinese (Big5-HKSCS)", + labelShort: "Big5-HKSCS", + order: 38, + }, + shiftjis: { + labelLong: "Japanese (Shift JIS)", + labelShort: "Shift JIS", + order: 39, + guessableName: "SHIFT_JIS", + }, + eucjp: { + labelLong: "Japanese (EUC-JP)", + labelShort: "EUC-JP", + order: 40, + guessableName: "EUC-JP", + }, + euckr: { + labelLong: "Korean (EUC-KR)", + labelShort: "EUC-KR", + order: 41, + guessableName: "EUC-KR", + }, + windows874: { + labelLong: "Thai (Windows 874)", + labelShort: "Windows 874", + order: 42, + }, + iso885911: { + labelLong: "Latin/Thai (ISO 8859-11)", + labelShort: "ISO 8859-11", + order: 43, + }, + koi8ru: { + labelLong: "Cyrillic (KOI8-RU)", + labelShort: "KOI8-RU", + order: 44, + }, + koi8t: { + labelLong: "Tajik (KOI8-T)", + labelShort: "KOI8-T", + order: 45, + }, + gb2312: { + labelLong: "Simplified Chinese (GB 2312)", + labelShort: "GB 2312", + order: 46, + guessableName: "GB2312", + }, + cp865: { + labelLong: "Nordic DOS (CP 865)", + labelShort: "CP 865", + order: 47, + }, + cp850: { + labelLong: "Western European DOS (CP 850)", + labelShort: "CP 850", + order: 48, + }, +} + +// Create a map of guessable encodings +export const GUESSABLE_ENCODINGS = (() => { + const guessableEncodings: any = {} + for (const encoding in SUPPORTED_ENCODINGS) { + if ((SUPPORTED_ENCODINGS as any)[encoding].guessableName) { + guessableEncodings[encoding] = (SUPPORTED_ENCODINGS as any)[encoding] + } + } + return guessableEncodings +})() + +// Encodings we explicitly ignore from auto guessing +const IGNORE_ENCODINGS = ["ascii", "utf-16", "utf-32"] + +/** + * Detects encoding by BOM (Byte Order Mark) + * @param buffer - File content buffer + * @returns Detected encoding or null if no BOM found + */ +function detectEncodingByBOM(buffer: Buffer | null, bytesRead: number): string | null { + if (!buffer || bytesRead < UTF16BE_BOM.length) { + return null + } + + const b0 = buffer[0] + const b1 = buffer[1] + + // UTF-16 BE + if (b0 === UTF16BE_BOM[0] && b1 === UTF16BE_BOM[1]) { + return UTF16BE + } + + // UTF-16 LE + if (b0 === UTF16LE_BOM[0] && b1 === UTF16LE_BOM[1]) { + return UTF16LE + } + + if (bytesRead < UTF8_BOM.length) { + return null + } + + const b2 = buffer[2] + + // UTF-8 with BOM + if (b0 === UTF8_BOM[0] && b1 === UTF8_BOM[1] && b2 === UTF8_BOM[2]) { + return UTF8_WITH_BOM + } + + return null +} + +/** + * Checks if buffer seems to be binary by looking for zero bytes + * @param buffer - File content buffer + * @param bytesRead - Number of bytes read from the buffer + * @returns Object with binary detection and encoding if UTF-16 is detected + */ +function checkBinaryAndUTF16(buffer: Buffer, bytesRead: number): { seemsBinary: boolean; encoding: string | null } { + let seemsBinary = false + let encoding: string | null = null + + // Detect 0 bytes to see if file is binary or UTF-16 LE/BE + if (buffer) { + let couldBeUTF16LE = true // e.g. 0xAA 0x00 + let couldBeUTF16BE = true // e.g. 0x00 0xAA + let containsZeroByte = false + + // This is a simplified guess to detect UTF-16 BE or LE by just checking if + // the first 512 bytes have the 0-byte at a specific location. For UTF-16 LE + // this would be the odd byte index and for UTF-16 BE the even one. + for (let i = 0; i < bytesRead && i < ZERO_BYTE_DETECTION_BUFFER_MAX_LEN; i++) { + const isEndian = i % 2 === 1 // assume 2-byte sequences typical for UTF-16 + const isZeroByte = buffer[i] === 0 + + if (isZeroByte) { + containsZeroByte = true + } + + // UTF-16 LE: expect e.g. 0xAA 0x00 + if (couldBeUTF16LE && ((isEndian && !isZeroByte) || (!isEndian && isZeroByte))) { + couldBeUTF16LE = false + } + + // UTF-16 BE: expect e.g. 0x00 0xAA + if (couldBeUTF16BE && ((isEndian && isZeroByte) || (!isEndian && !isZeroByte))) { + couldBeUTF16BE = false + } + + // Return if this is neither UTF16-LE nor UTF16-BE and thus treat as binary + if (isZeroByte && !couldBeUTF16LE && !couldBeUTF16BE) { + seemsBinary = true + break + } + } + + // Handle case of 0-byte included + if (containsZeroByte) { + if (couldBeUTF16LE) { + encoding = UTF16LE + } else if (couldBeUTF16BE) { + encoding = UTF16BE + } else { + seemsBinary = true + } + } + } + + return { seemsBinary, encoding } +} + +/** + * Guesses encoding from buffer using jschardet + * @param buffer - File content buffer + * @returns Guessed encoding or null if detection fails + */ +/** + * Converts a buffer to a Latin1 string + * @param buffer - Buffer to convert + * @returns Latin1 string representation of the buffer + */ +function encodeLatin1(buffer: Uint8Array): string { + let result = "" + for (let i = 0; i < buffer.length; i++) { + result += String.fromCharCode(buffer[i]) + } + return result +} + +async function guessEncodingByBuffer(buffer: Buffer): Promise { + if (!buffer || buffer.length === 0) { + return null + } + + // Ensure to limit buffer for guessing + const limitedBuffer = buffer.slice(0, AUTO_ENCODING_GUESS_MAX_BYTES) + + try { + // Convert buffer to binary string as jschardet expects + const binaryString = encodeLatin1(limitedBuffer) + const result = jschardet.detect(binaryString) + + if (!result || !result.encoding) { + return null + } + + const enc = result.encoding.toLowerCase() + + // Ignore some encodings + if (IGNORE_ENCODINGS.includes(enc)) { + return null + } + + return enc + } catch (error) { + return null // jschardet throws for unknown encodings + } +} + +/** + * Detects encoding from buffer using multiple strategies + * @param buffer - File content buffer + * @param bytesRead - Number of bytes read from the buffer + * @param autoGuessEncoding - Whether to use jschardet for additional guessing + * @returns Object with detected encoding and whether file seems binary + */ +async function detectEncodingFromBuffer( + buffer: Buffer, + bytesRead: number, + autoGuessEncoding: boolean = true, +): Promise<{ encoding: string | null; seemsBinary: boolean }> { + // Always first check for BOM to find out about encoding + let encoding = detectEncodingByBOM(buffer, bytesRead) + + // Detect 0 bytes to see if file is binary or UTF-16 LE/BE + // unless we already know that this file has a UTF-16 encoding + let seemsBinary = false + if (encoding !== UTF16BE && encoding !== UTF16LE) { + const result = checkBinaryAndUTF16(buffer, bytesRead) + seemsBinary = result.seemsBinary + + // If we detected UTF-16 through zero bytes, use that + if (!encoding && result.encoding) { + encoding = result.encoding + } + } + + // Auto guess encoding if configured + if (autoGuessEncoding && !seemsBinary && !encoding && buffer) { + encoding = await guessEncodingByBuffer(buffer) + } + + return { seemsBinary, encoding } +} + +/** + * Creates a decoder for the specified encoding + */ +class TextDecoder { + private encoding: string + private bomLength: number + private remainingBytes: Buffer | null = null + + constructor(encoding: string) { + this.encoding = encoding + + // Determine BOM length for skipping + if (encoding === UTF8_WITH_BOM) { + this.bomLength = 3 + } else if (encoding === UTF16BE || encoding === UTF16LE) { + this.bomLength = 2 + } else { + this.bomLength = 0 + } + } + + decode(chunk: Buffer, isFirst: boolean = false, isLast: boolean = false): string { + // Skip BOM on first chunk if needed + let buffer = chunk + if (isFirst && this.bomLength > 0) { + buffer = chunk.slice(this.bomLength) + } + + // Handle any remaining bytes from previous chunks + if (this.remainingBytes) { + buffer = Buffer.concat([this.remainingBytes, buffer]) + this.remainingBytes = null + } + + // For multi-byte encodings, ensure we don't cut in the middle of a character + if (!isLast && this.encoding !== UTF8 && this.encoding !== "utf-8") { + // For UTF-16, ensure we have an even number of bytes + if (this.encoding === UTF16BE || this.encoding === UTF16LE) { + if (buffer.length % 2 !== 0) { + this.remainingBytes = buffer.slice(buffer.length - 1) + buffer = buffer.slice(0, buffer.length - 1) + } + } + } + + // Decode the buffer + try { + if (this.encoding === UTF8 || this.encoding === "utf-8") { + return buffer.toString("utf8") + } else if (this.encoding === UTF8_WITH_BOM) { + return buffer.toString("utf8") + } else { + return iconv.decode(buffer, this.encoding) + } + } catch (error) { + // Fallback to UTF-8 if decoding fails + return buffer.toString("utf8") + } + } +} + +/** + * Reads a file in chunks and detects its encoding + * @param filePath - Path to the file + * @param options - Reading options + * @returns Buffer with enough bytes for encoding detection and detected encoding + */ +async function readFileForEncodingDetection( + filePath: string, + options: { autoGuessEncoding?: boolean } = {}, +): Promise<{ buffer: Buffer; encoding: string | null; seemsBinary: boolean }> { + return new Promise((resolve, reject) => { + const detectionBuffer = Buffer.alloc(AUTO_ENCODING_GUESS_MIN_BYTES) + let bytesRead = 0 + let fileStream: fs.ReadStream | null = null + let isResolved = false + + // Ensure stream is properly closed in all cases + const cleanup = () => { + if (fileStream) { + fileStream.removeAllListeners() + fileStream.destroy() + fileStream = null + } + } + + // Handle result with proper resource cleanup + const handleResult = async (buffer: Buffer, bytesCount: number) => { + if (isResolved) return + + try { + const { encoding, seemsBinary } = await detectEncodingFromBuffer( + buffer, + bytesCount, + options.autoGuessEncoding !== false, + ) + + isResolved = true + resolve({ + buffer: buffer.slice(0, bytesCount), + encoding, + seemsBinary, + }) + } catch (err) { + cleanup() + isResolved = true + reject(err) + } + } + + try { + fileStream = fs.createReadStream(filePath, { + highWaterMark: 4096, // 4KB chunks + start: 0, + end: AUTO_ENCODING_GUESS_MIN_BYTES - 1, + }) + + fileStream.on("data", (chunk: Buffer | string) => { + // Ensure chunk is a Buffer + const bufferChunk = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk) + const bytesToCopy = Math.min(bufferChunk.length, detectionBuffer.length - bytesRead) + bufferChunk.copy(detectionBuffer, bytesRead, 0, bytesToCopy) + bytesRead += bytesToCopy + + // If we've read enough bytes, close the stream + if (bytesRead >= AUTO_ENCODING_GUESS_MIN_BYTES) { + cleanup() + handleResult(detectionBuffer.slice(0, bytesRead), bytesRead) + } + }) + + fileStream.on("end", () => { + if (!isResolved) { + handleResult(detectionBuffer.slice(0, bytesRead), bytesRead) + } + }) + + fileStream.on("error", (err) => { + cleanup() + if (!isResolved) { + isResolved = true + reject(err) + } + }) + } catch (err) { + cleanup() + reject(err) + } + }) +} + +/** + * Reads file with automatic encoding detection and streaming decoding + * Inspired by VSCode's encoding detection mechanism + * @param filePath - Path to file + * @param options - Options for reading file + * @returns Decoded file content and detected encoding + */ +export async function readFileWithEncoding( + filePath: string, + options: { + autoGuessEncoding?: boolean + acceptTextOnly?: boolean + } = {}, +): Promise<{ content: string; encoding: string }> { + // Get file stats to check size + const stats = await fsPromises.stat(filePath) + const fileSize = stats.size + + // For small files, use the original implementation for simplicity and performance + if (fileSize <= AUTO_ENCODING_GUESS_MIN_BYTES) { + const buffer = await fsPromises.readFile(filePath) + const bytesRead = buffer.length + + // Detect encoding + const { encoding: detectedEncoding, seemsBinary } = await detectEncodingFromBuffer( + buffer, + bytesRead, + options.autoGuessEncoding !== false, + ) + + // If file seems binary and we only accept text, throw error + if (seemsBinary && options.acceptTextOnly) { + throw new Error(`File seems to be binary but only text is accepted: ${filePath}`) + } + + // Decode the buffer based on detected encoding + let content: string + let finalEncoding = detectedEncoding || UTF8 + + // Create a decoder to handle the content + const decoder = new TextDecoder(finalEncoding) + try { + content = decoder.decode(buffer, true, true) + } catch (error) { + // Fallback to UTF-8 if decoding fails + content = buffer.toString("utf8") + finalEncoding = UTF8 + } + + return { content, encoding: finalEncoding } + } + + // For larger files, use streaming approach + // First, read enough bytes to detect encoding + const { encoding: detectedEncoding, seemsBinary } = await readFileForEncodingDetection(filePath, { + autoGuessEncoding: options.autoGuessEncoding, + }) + + // If file seems binary and we only accept text, throw error + if (seemsBinary && options.acceptTextOnly) { + throw new Error(`File seems to be binary but only text is accepted: ${filePath}`) + } + + const finalEncoding = detectedEncoding || UTF8 + const decoder = new TextDecoder(finalEncoding) + + // Now read the entire file in streaming mode + return new Promise((resolve, reject) => { + let content = "" + let isFirstChunk = true + let fileStream: fs.ReadStream | null = null + + try { + fileStream = fs.createReadStream(filePath, { + highWaterMark: 64 * 1024, // 64KB chunks + }) + + fileStream.on("data", (chunk: Buffer | string) => { + // Ensure chunk is a Buffer + const bufferChunk = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk) + // Decode chunk and append to content + content += decoder.decode(bufferChunk, isFirstChunk, false) + isFirstChunk = false + }) + + fileStream.on("end", () => { + // Decode any remaining bytes + content += decoder.decode(Buffer.alloc(0), false, true) + resolve({ content, encoding: finalEncoding }) + }) + + fileStream.on("error", (err) => { + reject(err) + }) + } catch (err) { + if (fileStream) { + fileStream.destroy() + } + reject(err) + } + }) +} diff --git a/src/package.json b/src/package.json index 0f3af8e763..d78daffb39 100644 --- a/src/package.json +++ b/src/package.json @@ -384,8 +384,10 @@ "get-folder-size": "^5.0.0", "google-auth-library": "^9.15.1", "i18next": "^24.2.2", + "iconv-lite": "^0.6.3", "ignore": "^7.0.3", "isbinaryfile": "^5.0.2", + "jschardet": "^3.1.4", "lodash.debounce": "^4.0.8", "mammoth": "^1.8.0", "monaco-vscode-textmate-theme-converter": "^0.1.7",