fix(encoding): optimize binary file detection and encoding confidence (#428)

mini2s · web-flow · commit 305baf8bdd90 · 2025-09-11T22:37:09.000+08:00
* fix(encoding): optimize binary file detection and encoding confidence

* test(encoding): adjust confidence threshold and test cases
diff --git a/src/utils/__tests__/encoding.spec.ts b/src/utils/__tests__/encoding.spec.ts
@@ -91,20 +91,20 @@ describe("encoding", () => {
 			expect(result).toBe("utf8")
 		})
 
-		it("should fallback to utf8 for low confidence detection (< 0.9)", async () => {
+		it("should fallback to utf8 for low confidence detection (< 0.7)", async () => {
 			const buffer = Buffer.from("uncertain content")
 			mockIsBinaryFile.mockResolvedValue(false)
 			mockJschardet.detect.mockReturnValue({
 				encoding: "gbk",
-				confidence: 0.8, // Below new threshold 0.9
+				confidence: 0.6, // Below threshold 0.7
 			})
 
 			const consoleSpy = vi.spyOn(console, "warn").mockImplementation(() => {})
 			const result = await detectEncoding(buffer, ".txt")
 
 			expect(result).toBe("utf8")
 			expect(consoleSpy).toHaveBeenCalledWith(
-				"Low confidence encoding detection: gbk (confidence: 0.8), falling back to utf8",
+				"Low confidence encoding detection: gbk (confidence: 0.6), falling back to utf8",
 			)
 		})
 
@@ -499,20 +499,20 @@ describe("encoding", () => {
 			expect(result).toBe(true)
 		})
 
-		it("should return true for files with low confidence encoding detection", async () => {
+		it("should return false for files with low confidence encoding detection", async () => {
 			const filePath = "/path/to/file.txt"
 			const buffer = Buffer.from("ambiguous content")
 			mockFs.readFile.mockResolvedValue(buffer)
 			mockPath.extname.mockReturnValue(".txt")
 			mockJschardet.detect.mockReturnValue({
 				encoding: "utf8",
-				confidence: 0.8, // Below new threshold 0.9
+				confidence: 0.8, // Above threshold 0.7, detectEncoding will succeed
 			})
-			mockIsBinaryFile.mockResolvedValue(true)
+			mockIsBinaryFile.mockResolvedValue(false)
 
 			const result = await isBinaryFileWithEncodingDetection(filePath)
 
-			expect(result).toBe(true)
+			expect(result).toBe(false) // When detectEncoding succeeds, it's considered a text file
 		})
 
 		it("should return false for files with high confidence encoding detection", async () => {
@@ -641,7 +641,12 @@ describe("encoding", () => {
 			const buffer = Buffer.from("text content with some \x00 null bytes")
 			mockFs.readFile.mockResolvedValue(buffer)
 			mockPath.extname.mockReturnValue(".txt")
-			mockIsBinaryFile.mockResolvedValue(false)
+			// Make jschardet report no usable encoding (empty string, confidence 0) for this null-byte content
+			mockJschardet.detect.mockReturnValue({
+				encoding: "",
+				confidence: 0,
+			})
+			mockIsBinaryFile.mockResolvedValue(true) // isBinaryFile should detect it as binary
 
 			const result = await isBinaryFileWithEncodingDetection(filePath)
 
diff --git a/src/utils/encoding.ts b/src/utils/encoding.ts
@@ -121,56 +121,6 @@ export const BINARY_MAGIC_NUMBERS = [
 	{ magic: Buffer.from([0xcf, 0xfa, 0xed, 0xfe]), description: "Mach-O executable (64-bit reverse)" },
 ]
 
-/**
- * Analyze file content characteristics to determine if it's a binary file
- * @param buffer File buffer
- * @returns Whether it's a binary file
- */
-function analyzeContentCharacteristics(buffer: Buffer): boolean {
-	if (buffer.length === 0) {
-		return false
-	}
-
-	// Check for null bytes (typical characteristic of binary files)
-	const nullByteCount = (buffer.toString().match(/\0/g) || []).length
-	if (nullByteCount > buffer.length * 0.01) {
-		// More than 1% null bytes
-		return true
-	}
-
-	// Check the ratio of non-printable characters
-	let nonPrintableCount = 0
-	for (let i = 0; i < Math.min(buffer.length, 1024); i++) {
-		const byte = buffer[i]
-		if (byte < 32 && byte !== 9 && byte !== 10 && byte !== 13) {
-			// Not tab, newline, or carriage return
-			nonPrintableCount++
-		}
-	}
-
-	const nonPrintableRatio = nonPrintableCount / Math.min(buffer.length, 1024)
-	if (nonPrintableRatio > 0.3) {
-		// More than 30% non-printable characters
-		return true
-	}
-
-	// Check for consecutive high byte values (text files with UTF-16 encoding typically don't do this)
-	let highByteSequence = 0
-	for (let i = 0; i < buffer.length - 1; i++) {
-		if (buffer[i] > 127 && buffer[i + 1] > 127) {
-			highByteSequence++
-			if (highByteSequence > 10) {
-				// More than 10 consecutive high bytes
-				return true
-			}
-		} else {
-			highByteSequence = 0
-		}
-	}
-
-	return false
-}
-
 /**
  * Detect the encoding of a file buffer
  * @param fileBuffer The file buffer
@@ -194,7 +144,7 @@ export async function detectEncoding(fileBuffer: Buffer, fileExtension?: string)
 	} else if (detected && detected.encoding) {
 		originalEncoding = detected.encoding
 		// Increase confidence threshold from 0.7 to 0.9
-		if (detected.confidence < 0.9) {
+		if (detected.confidence < 0.7) {
 			console.warn(
 				`Low confidence encoding detection: ${originalEncoding} (confidence: ${detected.confidence}), falling back to utf8`,
 			)
@@ -261,8 +211,8 @@ export async function detectFileEncoding(filePath: string): Promise<string> {
  */
 export async function isBinaryFileWithEncodingDetection(filePath: string): Promise<boolean> {
 	try {
-		// 1. First check file extension
 		const fileExtension = path.extname(filePath).toLowerCase()
+		// 1. First check file extension
 		if (BINARY_EXTENSIONS.has(fileExtension)) {
 			return true
 		}
@@ -276,36 +226,17 @@ export async function isBinaryFileWithEncodingDetection(filePath: string): Promi
 				return true
 			}
 		}
-
-		// 4. Analyze content characteristics
-		if (analyzeContentCharacteristics(fileBuffer)) {
-			return true
-		}
-
-		// 5. Use isBinaryFile library for quick check
-		const isBinaryByLibrary = await isBinaryFile(fileBuffer).catch(() => false)
-		if (isBinaryByLibrary) {
-			return true
-		}
-
-		// 6. Finally perform encoding detection (only for files that might be text)
+		// Try to detect encoding first
 		try {
-			const encoding = await detectEncoding(fileBuffer, fileExtension)
-
-			// Even if encoding detection succeeds, check confidence
-			const detected = jschardet.detect(fileBuffer)
-			if (detected && typeof detected === "object" && detected.confidence < 0.9) {
-				// Low confidence, confirm again with isBinaryFile
-				return await isBinaryFile(fileBuffer).catch(() => true)
-			}
-
+			await detectEncoding(fileBuffer, fileExtension)
+			// If detectEncoding succeeds, it's a text file
 			return false
 		} catch (error) {
-			// Encoding detection failed, consider it as binary file
-			return true
+			// If detectEncoding fails, check if it's actually a binary file
+			return await isBinaryFile(fileBuffer).catch(() => false)
 		}
 	} catch (error) {
-		// File read error, consider it as binary file
+		// File read error, assume it's binary
 		return true
 	}
 }