Skip to content

Commit 305baf8

Browse files
authored
fix(encoding): optimize binary file detection and encoding confidence (#428)
* fix(encoding): optimize binary file detection and encoding confidence
* test(encoding): adjust confidence threshold and test cases
1 parent ac74a16 commit 305baf8

File tree

2 files changed

+21
-85
lines changed

2 files changed

+21
-85
lines changed

src/utils/__tests__/encoding.spec.ts

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -91,20 +91,20 @@ describe("encoding", () => {
9191
expect(result).toBe("utf8")
9292
})
9393

94-
it("should fallback to utf8 for low confidence detection (< 0.9)", async () => {
94+
it("should fallback to utf8 for low confidence detection (< 0.7)", async () => {
9595
const buffer = Buffer.from("uncertain content")
9696
mockIsBinaryFile.mockResolvedValue(false)
9797
mockJschardet.detect.mockReturnValue({
9898
encoding: "gbk",
99-
confidence: 0.8, // Below new threshold 0.9
99+
confidence: 0.6, // Below threshold 0.7
100100
})
101101

102102
const consoleSpy = vi.spyOn(console, "warn").mockImplementation(() => {})
103103
const result = await detectEncoding(buffer, ".txt")
104104

105105
expect(result).toBe("utf8")
106106
expect(consoleSpy).toHaveBeenCalledWith(
107-
"Low confidence encoding detection: gbk (confidence: 0.8), falling back to utf8",
107+
"Low confidence encoding detection: gbk (confidence: 0.6), falling back to utf8",
108108
)
109109
})
110110

@@ -499,20 +499,20 @@ describe("encoding", () => {
499499
expect(result).toBe(true)
500500
})
501501

502-
it("should return true for files with low confidence encoding detection", async () => {
502+
it("should return false for files with low confidence encoding detection", async () => {
503503
const filePath = "/path/to/file.txt"
504504
const buffer = Buffer.from("ambiguous content")
505505
mockFs.readFile.mockResolvedValue(buffer)
506506
mockPath.extname.mockReturnValue(".txt")
507507
mockJschardet.detect.mockReturnValue({
508508
encoding: "utf8",
509-
confidence: 0.8, // Below new threshold 0.9
509+
confidence: 0.8, // Above threshold 0.7, detectEncoding will succeed
510510
})
511-
mockIsBinaryFile.mockResolvedValue(true)
511+
mockIsBinaryFile.mockResolvedValue(false)
512512

513513
const result = await isBinaryFileWithEncodingDetection(filePath)
514514

515-
expect(result).toBe(true)
515+
expect(result).toBe(false) // When detectEncoding succeeds, it's considered a text file
516516
})
517517

518518
it("should return false for files with high confidence encoding detection", async () => {
@@ -641,7 +641,12 @@ describe("encoding", () => {
641641
const buffer = Buffer.from("text content with some \x00 null bytes")
642642
mockFs.readFile.mockResolvedValue(buffer)
643643
mockPath.extname.mockReturnValue(".txt")
644-
mockIsBinaryFile.mockResolvedValue(false)
644+
// Make jschardet report no usable encoding (empty name, zero confidence) for a file with null bytes, so the result depends on isBinaryFile
645+
mockJschardet.detect.mockReturnValue({
646+
encoding: "",
647+
confidence: 0,
648+
})
649+
mockIsBinaryFile.mockResolvedValue(true) // isBinaryFile should detect it as binary
645650

646651
const result = await isBinaryFileWithEncodingDetection(filePath)
647652

src/utils/encoding.ts

Lines changed: 8 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -121,56 +121,6 @@ export const BINARY_MAGIC_NUMBERS = [
121121
{ magic: Buffer.from([0xcf, 0xfa, 0xed, 0xfe]), description: "Mach-O executable (64-bit reverse)" },
122122
]
123123

124-
/**
125-
* Analyze file content characteristics to determine if it's a binary file
126-
* @param buffer File buffer
127-
* @returns Whether it's a binary file
128-
*/
129-
function analyzeContentCharacteristics(buffer: Buffer): boolean {
130-
if (buffer.length === 0) {
131-
return false
132-
}
133-
134-
// Check for null bytes (typical characteristic of binary files)
135-
const nullByteCount = (buffer.toString().match(/\0/g) || []).length
136-
if (nullByteCount > buffer.length * 0.01) {
137-
// More than 1% null bytes
138-
return true
139-
}
140-
141-
// Check the ratio of non-printable characters
142-
let nonPrintableCount = 0
143-
for (let i = 0; i < Math.min(buffer.length, 1024); i++) {
144-
const byte = buffer[i]
145-
if (byte < 32 && byte !== 9 && byte !== 10 && byte !== 13) {
146-
// Not tab, newline, or carriage return
147-
nonPrintableCount++
148-
}
149-
}
150-
151-
const nonPrintableRatio = nonPrintableCount / Math.min(buffer.length, 1024)
152-
if (nonPrintableRatio > 0.3) {
153-
// More than 30% non-printable characters
154-
return true
155-
}
156-
157-
// Check for consecutive high byte values (text files with UTF-16 encoding typically don't do this)
158-
let highByteSequence = 0
159-
for (let i = 0; i < buffer.length - 1; i++) {
160-
if (buffer[i] > 127 && buffer[i + 1] > 127) {
161-
highByteSequence++
162-
if (highByteSequence > 10) {
163-
// More than 10 consecutive high bytes
164-
return true
165-
}
166-
} else {
167-
highByteSequence = 0
168-
}
169-
}
170-
171-
return false
172-
}
173-
174124
/**
175125
* Detect the encoding of a file buffer
176126
* @param fileBuffer The file buffer
@@ -194,7 +144,7 @@ export async function detectEncoding(fileBuffer: Buffer, fileExtension?: string)
194144
} else if (detected && detected.encoding) {
195145
originalEncoding = detected.encoding
196146
// Fall back to utf8 when detection confidence is below the 0.7 threshold
197-
if (detected.confidence < 0.9) {
147+
if (detected.confidence < 0.7) {
198148
console.warn(
199149
`Low confidence encoding detection: ${originalEncoding} (confidence: ${detected.confidence}), falling back to utf8`,
200150
)
@@ -261,8 +211,8 @@ export async function detectFileEncoding(filePath: string): Promise<string> {
261211
*/
262212
export async function isBinaryFileWithEncodingDetection(filePath: string): Promise<boolean> {
263213
try {
264-
// 1. First check file extension
265214
const fileExtension = path.extname(filePath).toLowerCase()
215+
// 1. First check file extension
266216
if (BINARY_EXTENSIONS.has(fileExtension)) {
267217
return true
268218
}
@@ -276,36 +226,17 @@ export async function isBinaryFileWithEncodingDetection(filePath: string): Promi
276226
return true
277227
}
278228
}
279-
280-
// 4. Analyze content characteristics
281-
if (analyzeContentCharacteristics(fileBuffer)) {
282-
return true
283-
}
284-
285-
// 5. Use isBinaryFile library for quick check
286-
const isBinaryByLibrary = await isBinaryFile(fileBuffer).catch(() => false)
287-
if (isBinaryByLibrary) {
288-
return true
289-
}
290-
291-
// 6. Finally perform encoding detection (only for files that might be text)
229+
// Try to detect encoding first
292230
try {
293-
const encoding = await detectEncoding(fileBuffer, fileExtension)
294-
295-
// Even if encoding detection succeeds, check confidence
296-
const detected = jschardet.detect(fileBuffer)
297-
if (detected && typeof detected === "object" && detected.confidence < 0.9) {
298-
// Low confidence, confirm again with isBinaryFile
299-
return await isBinaryFile(fileBuffer).catch(() => true)
300-
}
301-
231+
await detectEncoding(fileBuffer, fileExtension)
232+
// If detectEncoding succeeds, it's a text file
302233
return false
303234
} catch (error) {
304-
// Encoding detection failed, consider it as binary file
305-
return true
235+
// If detectEncoding fails, check if it's actually a binary file
236+
return await isBinaryFile(fileBuffer).catch(() => false)
306237
}
307238
} catch (error) {
308-
// File read error, consider it as binary file
239+
// File read error, assume it's binary
309240
return true
310241
}
311242
}

0 commit comments

Comments
 (0)