Skip to content

Commit cd41603

Browse files
committed
refactor: Reconstruct the file encoding detection logic to first attempt encoding detection and then determine the binary file
1 parent c7c0018 commit cd41603

File tree

7 files changed

+180
-94
lines changed

7 files changed

+180
-94
lines changed

src/core/mentions/index.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import fs from "fs/promises"
22
import * as path from "path"
33

44
import * as vscode from "vscode"
5-
import { isBinaryFile } from "isbinaryfile"
5+
import { isBinaryFileWithEncodingDetection } from "../../utils/encoding"
66

77
import { mentionRegexGlobal, commandRegexGlobal, unescapeSpaces } from "../../shared/context-mentions"
88

@@ -314,7 +314,7 @@ async function getFileOrFolderContent(
314314
fileContentPromises.push(
315315
(async () => {
316316
try {
317-
const isBinary = await isBinaryFile(absoluteFilePath).catch(() => false)
317+
const isBinary = await isBinaryFileWithEncodingDetection(absoluteFilePath)
318318
if (isBinary) {
319319
return undefined
320320
}

src/core/tools/__tests__/readFileTool.spec.ts

Lines changed: 26 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import { countFileLines } from "../../../integrations/misc/line-counter"
66
import { readLines } from "../../../integrations/misc/read-lines"
77
import { extractTextFromFile } from "../../../integrations/misc/extract-text"
88
import { parseSourceCodeDefinitionsForFile } from "../../../services/tree-sitter"
9-
import { isBinaryFile } from "isbinaryfile"
9+
import { isBinaryFileWithEncodingDetection } from "../../../utils/encoding"
1010
import { ReadFileToolUse, ToolParamName, ToolResponse } from "../../../shared/tools"
1111
import { readFileTool } from "../readFileTool"
1212
import { formatResponse } from "../../prompts/responses"
@@ -23,7 +23,9 @@ vi.mock("path", async () => {
2323

2424
// Already mocked above with hoisted fsPromises
2525

26-
vi.mock("isbinaryfile")
26+
vi.mock("../../../utils/encoding", () => ({
27+
isBinaryFileWithEncodingDetection: vi.fn(),
28+
}))
2729

2830
vi.mock("../../../integrations/misc/line-counter")
2931
vi.mock("../../../integrations/misc/read-lines")
@@ -238,7 +240,7 @@ describe("read_file tool with maxReadFileLine setting", () => {
238240
const mockedExtractTextFromFile = vi.mocked(extractTextFromFile)
239241
const mockedParseSourceCodeDefinitionsForFile = vi.mocked(parseSourceCodeDefinitionsForFile)
240242

241-
const mockedIsBinaryFile = vi.mocked(isBinaryFile)
243+
const mockedIsBinaryFileWithEncodingDetection = vi.mocked(isBinaryFileWithEncodingDetection)
242244
const mockedPathResolve = vi.mocked(path.resolve)
243245

244246
let mockCline: any
@@ -249,7 +251,7 @@ describe("read_file tool with maxReadFileLine setting", () => {
249251
// Clear specific mocks (not all mocks to preserve shared state)
250252
mockedCountFileLines.mockClear()
251253
mockedExtractTextFromFile.mockClear()
252-
mockedIsBinaryFile.mockClear()
254+
mockedIsBinaryFileWithEncodingDetection.mockClear()
253255
mockedPathResolve.mockClear()
254256
addLineNumbersMock.mockClear()
255257
extractTextFromFileMock.mockClear()
@@ -264,7 +266,7 @@ describe("read_file tool with maxReadFileLine setting", () => {
264266
setImageSupport(mockCline, false)
265267

266268
mockedPathResolve.mockReturnValue(absoluteFilePath)
267-
mockedIsBinaryFile.mockResolvedValue(false)
269+
mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(false)
268270

269271
mockInputContent = fileContent
270272

@@ -502,7 +504,7 @@ describe("read_file tool with maxReadFileLine setting", () => {
502504
describe("when file is binary", () => {
503505
it("should always use extractTextFromFile regardless of maxReadFileLine", async () => {
504506
// Setup
505-
mockedIsBinaryFile.mockResolvedValue(true)
507+
mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true)
506508
mockedCountFileLines.mockResolvedValue(3)
507509
mockedExtractTextFromFile.mockResolvedValue("")
508510

@@ -544,7 +546,7 @@ describe("read_file tool XML output structure", () => {
544546

545547
const mockedCountFileLines = vi.mocked(countFileLines)
546548
const mockedExtractTextFromFile = vi.mocked(extractTextFromFile)
547-
const mockedIsBinaryFile = vi.mocked(isBinaryFile)
549+
const mockedIsBinaryFileWithEncodingDetection = vi.mocked(isBinaryFileWithEncodingDetection)
548550
const mockedPathResolve = vi.mocked(path.resolve)
549551
const mockedFsReadFile = vi.mocked(fsPromises.readFile)
550552
const imageBuffer = Buffer.from(
@@ -560,7 +562,7 @@ describe("read_file tool XML output structure", () => {
560562
// Clear specific mocks (not all mocks to preserve shared state)
561563
mockedCountFileLines.mockClear()
562564
mockedExtractTextFromFile.mockClear()
563-
mockedIsBinaryFile.mockClear()
565+
mockedIsBinaryFileWithEncodingDetection.mockClear()
564566
mockedPathResolve.mockClear()
565567
addLineNumbersMock.mockClear()
566568
extractTextFromFileMock.mockClear()
@@ -580,7 +582,7 @@ describe("read_file tool XML output structure", () => {
580582
setImageSupport(mockCline, true)
581583

582584
mockedPathResolve.mockReturnValue(absoluteFilePath)
583-
mockedIsBinaryFile.mockResolvedValue(false)
585+
mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(false)
584586

585587
// Set default implementation for extractTextFromFile
586588
mockedExtractTextFromFile.mockImplementation((filePath) => {
@@ -617,7 +619,7 @@ describe("read_file tool XML output structure", () => {
617619

618620
mockProvider.getState.mockResolvedValue({ maxReadFileLine, maxImageFileSize: 20, maxTotalImageSize: 20 })
619621
mockedCountFileLines.mockResolvedValue(totalLines)
620-
mockedIsBinaryFile.mockResolvedValue(isBinary)
622+
mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(isBinary)
621623
mockCline.rooIgnoreController.validateAccess = vi.fn().mockReturnValue(validateAccess)
622624

623625
let argsContent = `<file><path>${testFilePath}</path></file>`
@@ -758,7 +760,7 @@ describe("read_file tool XML output structure", () => {
758760

759761
it("should allow multiple images under the total memory limit", async () => {
760762
// Setup required mocks (don't clear all mocks - preserve API setup)
761-
mockedIsBinaryFile.mockResolvedValue(true)
763+
mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true)
762764
mockedCountFileLines.mockResolvedValue(0)
763765
fsPromises.readFile.mockResolvedValue(
764766
Buffer.from(
@@ -831,7 +833,7 @@ describe("read_file tool XML output structure", () => {
831833

832834
it("should skip images that would exceed the total memory limit", async () => {
833835
// Setup required mocks (don't clear all mocks)
834-
mockedIsBinaryFile.mockResolvedValue(true)
836+
mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true)
835837
mockedCountFileLines.mockResolvedValue(0)
836838
fsPromises.readFile.mockResolvedValue(
837839
Buffer.from(
@@ -917,7 +919,7 @@ describe("read_file tool XML output structure", () => {
917919
// Setup mocks (don't clear all mocks)
918920

919921
// Setup required mocks
920-
mockedIsBinaryFile.mockResolvedValue(true)
922+
mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true)
921923
mockedCountFileLines.mockResolvedValue(0)
922924
fsPromises.readFile.mockResolvedValue(
923925
Buffer.from(
@@ -990,7 +992,7 @@ describe("read_file tool XML output structure", () => {
990992
// Setup mocks (don't clear all mocks)
991993

992994
// Setup required mocks
993-
mockedIsBinaryFile.mockResolvedValue(true)
995+
mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true)
994996
mockedCountFileLines.mockResolvedValue(0)
995997
fsPromises.readFile.mockResolvedValue(
996998
Buffer.from(
@@ -1084,7 +1086,7 @@ describe("read_file tool XML output structure", () => {
10841086
maxTotalImageSize: 20, // 20MB total
10851087
})
10861088

1087-
mockedIsBinaryFile.mockResolvedValue(true)
1089+
mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true)
10881090
mockedCountFileLines.mockResolvedValue(0)
10891091
mockedFsReadFile.mockResolvedValue(imageBuffer)
10901092

@@ -1115,7 +1117,7 @@ describe("read_file tool XML output structure", () => {
11151117
// Setup mocks (don't clear all mocks)
11161118

11171119
// Setup required mocks for first batch
1118-
mockedIsBinaryFile.mockResolvedValue(true)
1120+
mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true)
11191121
mockedCountFileLines.mockResolvedValue(0)
11201122
fsPromises.readFile.mockResolvedValue(
11211123
Buffer.from(
@@ -1161,7 +1163,7 @@ describe("read_file tool XML output structure", () => {
11611163
await executeReadMultipleImagesTool(firstBatch.map((img) => img.path))
11621164

11631165
// Setup second batch (don't clear all mocks)
1164-
mockedIsBinaryFile.mockResolvedValue(true)
1166+
mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true)
11651167
mockedCountFileLines.mockResolvedValue(0)
11661168
fsPromises.readFile.mockResolvedValue(
11671169
Buffer.from(
@@ -1203,7 +1205,7 @@ describe("read_file tool XML output structure", () => {
12031205
// Clear and reset file system mocks for second batch
12041206
fsPromises.stat.mockClear()
12051207
fsPromises.readFile.mockClear()
1206-
mockedIsBinaryFile.mockClear()
1208+
mockedIsBinaryFileWithEncodingDetection.mockClear()
12071209
mockedCountFileLines.mockClear()
12081210

12091211
// Reset mocks for second batch
@@ -1214,7 +1216,7 @@ describe("read_file tool XML output structure", () => {
12141216
"base64",
12151217
),
12161218
)
1217-
mockedIsBinaryFile.mockResolvedValue(true)
1219+
mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true)
12181220
mockedCountFileLines.mockResolvedValue(0)
12191221
mockedPathResolve.mockImplementation((cwd, relPath) => `/${relPath}`)
12201222

@@ -1241,7 +1243,7 @@ describe("read_file tool XML output structure", () => {
12411243
]
12421244

12431245
// Setup mocks
1244-
mockedIsBinaryFile.mockResolvedValue(true)
1246+
mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true)
12451247
mockedCountFileLines.mockResolvedValue(0)
12461248
fsPromises.readFile.mockResolvedValue(imageBuffer)
12471249

@@ -1289,7 +1291,7 @@ describe("read_file tool XML output structure", () => {
12891291
// starts with fresh memory tracking
12901292

12911293
// Setup mocks
1292-
mockedIsBinaryFile.mockResolvedValue(true)
1294+
mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true)
12931295
mockedCountFileLines.mockResolvedValue(0)
12941296
fsPromises.readFile.mockResolvedValue(imageBuffer)
12951297

@@ -1394,7 +1396,7 @@ describe("read_file tool with image support", () => {
13941396
const imageBuffer = Buffer.from(base64ImageData, "base64")
13951397

13961398
const mockedCountFileLines = vi.mocked(countFileLines)
1397-
const mockedIsBinaryFile = vi.mocked(isBinaryFile)
1399+
const mockedIsBinaryFileWithEncodingDetection = vi.mocked(isBinaryFileWithEncodingDetection)
13981400
const mockedPathResolve = vi.mocked(path.resolve)
13991401
const mockedFsReadFile = vi.mocked(fsPromises.readFile)
14001402
const mockedExtractTextFromFile = vi.mocked(extractTextFromFile)
@@ -1406,7 +1408,7 @@ describe("read_file tool with image support", () => {
14061408
beforeEach(() => {
14071409
// Clear specific mocks (not all mocks to preserve shared state)
14081410
mockedPathResolve.mockClear()
1409-
mockedIsBinaryFile.mockClear()
1411+
mockedIsBinaryFileWithEncodingDetection.mockClear()
14101412
mockedCountFileLines.mockClear()
14111413
mockedFsReadFile.mockClear()
14121414
mockedExtractTextFromFile.mockClear()
@@ -1425,7 +1427,7 @@ describe("read_file tool with image support", () => {
14251427
setImageSupport(localMockCline, true)
14261428

14271429
mockedPathResolve.mockReturnValue(absoluteImagePath)
1428-
mockedIsBinaryFile.mockResolvedValue(true)
1430+
mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true)
14291431
mockedCountFileLines.mockResolvedValue(0)
14301432
mockedFsReadFile.mockResolvedValue(imageBuffer)
14311433

src/core/tools/readFileTool.ts

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import path from "path"
2-
import { isBinaryFile } from "isbinaryfile"
32

43
import { Task } from "../task/Task"
54
import { ClineSayTool } from "../../shared/ExtensionMessage"
@@ -14,6 +13,7 @@ import { readLines } from "../../integrations/misc/read-lines"
1413
import { extractTextFromFile, addLineNumbers, getSupportedBinaryFormats } from "../../integrations/misc/extract-text"
1514
import { parseSourceCodeDefinitionsForFile } from "../../services/tree-sitter"
1615
import { parseXml } from "../../utils/xml"
16+
import { isBinaryFileWithEncodingDetection } from "../../utils/encoding"
1717
import {
1818
DEFAULT_MAX_IMAGE_FILE_SIZE_MB,
1919
DEFAULT_MAX_TOTAL_IMAGE_SIZE_MB,
@@ -456,7 +456,10 @@ export async function readFileTool(
456456

457457
// Process approved files
458458
try {
459-
const [totalLines, isBinary] = await Promise.all([countFileLines(fullPath), isBinaryFile(fullPath)])
459+
const [totalLines, isBinary] = await Promise.all([
460+
countFileLines(fullPath),
461+
isBinaryFileWithEncodingDetection(fullPath),
462+
])
460463

461464
// Handle binary files (but allow specific file types that extractTextFromFile can handle)
462465
if (isBinary) {

src/integrations/misc/__tests__/extract-text-large-files.spec.ts

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,26 +5,30 @@ import * as fs from "fs/promises"
55
import { extractTextFromFile } from "../extract-text"
66
import { countFileLines } from "../line-counter"
77
import { readLines } from "../read-lines"
8-
import { isBinaryFile } from "isbinaryfile"
8+
import { isBinaryFileWithEncodingDetection, readFileWithEncodingDetection } from "../../../utils/encoding"
99

1010
// Mock all dependencies
1111
vi.mock("fs/promises")
1212
vi.mock("../line-counter")
1313
vi.mock("../read-lines")
14-
vi.mock("isbinaryfile")
14+
vi.mock("../../../utils/encoding", () => ({
15+
isBinaryFileWithEncodingDetection: vi.fn(),
16+
readFileWithEncodingDetection: vi.fn(),
17+
}))
1518

1619
describe("extractTextFromFile - Large File Handling", () => {
1720
// Type the mocks
1821
const mockedFs = vi.mocked(fs)
1922
const mockedCountFileLines = vi.mocked(countFileLines)
2023
const mockedReadLines = vi.mocked(readLines)
21-
const mockedIsBinaryFile = vi.mocked(isBinaryFile)
24+
const mockedIsBinaryFileWithEncodingDetection = vi.mocked(isBinaryFileWithEncodingDetection)
25+
const mockedReadFileWithEncodingDetection = vi.mocked(readFileWithEncodingDetection)
2226

2327
beforeEach(() => {
2428
vi.clearAllMocks()
2529
// Set default mock behavior
2630
mockedFs.access.mockResolvedValue(undefined)
27-
mockedIsBinaryFile.mockResolvedValue(false)
31+
mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(false)
2832
})
2933

3034
it("should truncate files that exceed maxReadFileLine limit", async () => {
@@ -61,7 +65,7 @@ describe("extractTextFromFile - Large File Handling", () => {
6165
.join("\n")
6266

6367
mockedCountFileLines.mockResolvedValue(50)
64-
mockedFs.readFile.mockResolvedValue(smallFileContent as any)
68+
mockedReadFileWithEncodingDetection.mockResolvedValue(smallFileContent)
6569

6670
const result = await extractTextFromFile("/test/small-file.ts", 100)
6771

@@ -80,7 +84,7 @@ describe("extractTextFromFile - Large File Handling", () => {
8084
.join("\n")
8185

8286
mockedCountFileLines.mockResolvedValue(100)
83-
mockedFs.readFile.mockResolvedValue(exactFileContent as any)
87+
mockedReadFileWithEncodingDetection.mockResolvedValue(exactFileContent)
8488

8589
const result = await extractTextFromFile("/test/exact-file.ts", 100)
8690

@@ -98,7 +102,7 @@ describe("extractTextFromFile - Large File Handling", () => {
98102
.map((_, i) => `Line ${i + 1}`)
99103
.join("\n")
100104

101-
mockedFs.readFile.mockResolvedValue(largeFileContent as any)
105+
mockedReadFileWithEncodingDetection.mockResolvedValue(largeFileContent)
102106

103107
const result = await extractTextFromFile("/test/large-file.ts", undefined)
104108

@@ -111,7 +115,7 @@ describe("extractTextFromFile - Large File Handling", () => {
111115
})
112116

113117
it("should handle empty files", async () => {
114-
mockedFs.readFile.mockResolvedValue("" as any)
118+
mockedReadFileWithEncodingDetection.mockResolvedValue("")
115119

116120
const result = await extractTextFromFile("/test/empty-file.ts", 100)
117121

@@ -155,7 +159,7 @@ describe("extractTextFromFile - Large File Handling", () => {
155159
it("should handle maxReadFileLine of 0 by throwing an error", async () => {
156160
const fileContent = "Line 1\nLine 2\nLine 3"
157161

158-
mockedFs.readFile.mockResolvedValue(fileContent as any)
162+
mockedReadFileWithEncodingDetection.mockResolvedValue(fileContent)
159163

160164
// maxReadFileLine of 0 should throw an error
161165
await expect(extractTextFromFile("/test/file.ts", 0)).rejects.toThrow(
@@ -166,7 +170,7 @@ describe("extractTextFromFile - Large File Handling", () => {
166170
it("should handle negative maxReadFileLine by treating as undefined", async () => {
167171
const fileContent = "Line 1\nLine 2\nLine 3"
168172

169-
mockedFs.readFile.mockResolvedValue(fileContent as any)
173+
mockedReadFileWithEncodingDetection.mockResolvedValue(fileContent)
170174

171175
const result = await extractTextFromFile("/test/file.ts", -1)
172176

@@ -204,7 +208,7 @@ describe("extractTextFromFile - Large File Handling", () => {
204208
})
205209

206210
it("should handle binary files by throwing an error", async () => {
207-
mockedIsBinaryFile.mockResolvedValue(true)
211+
mockedIsBinaryFileWithEncodingDetection.mockResolvedValue(true)
208212

209213
await expect(extractTextFromFile("/test/binary.bin", 100)).rejects.toThrow(
210214
"Cannot read text for file type: .bin",

src/integrations/misc/extract-text.ts

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,10 @@ import * as path from "path"
33
import pdf from "pdf-parse/lib/pdf-parse"
44
import mammoth from "mammoth"
55
import fs from "fs/promises"
6-
import { isBinaryFile } from "isbinaryfile"
76
import { extractTextFromXLSX } from "./extract-text-from-xlsx"
87
import { countFileLines } from "./line-counter"
98
import { readLines } from "./read-lines"
10-
import { readFileWithEncodingDetection } from "../../utils/encoding"
9+
import { readFileWithEncodingDetection, isBinaryFileWithEncodingDetection } from "../../utils/encoding"
1110

1211
async function extractTextFromPDF(filePath: string): Promise<string> {
1312
const dataBuffer = await fs.readFile(filePath)
@@ -85,9 +84,8 @@ export async function extractTextFromFile(filePath: string, maxReadFileLine?: nu
8584
if (extractor) {
8685
return extractor(filePath)
8786
}
88-
89-
// Handle other files
90-
const isBinary = await isBinaryFile(filePath).catch(() => false)
87+
// Handle other files - use unified binary file detection
88+
const isBinary = await isBinaryFileWithEncodingDetection(filePath)
9189

9290
if (!isBinary) {
9391
// Check if we need to apply line limit

0 commit comments

Comments
 (0)