@@ -121,56 +121,6 @@ export const BINARY_MAGIC_NUMBERS = [
121121 { magic : Buffer . from ( [ 0xcf , 0xfa , 0xed , 0xfe ] ) , description : "Mach-O executable (64-bit reverse)" } ,
122122]
123123
/**
 * Heuristically decide whether a buffer holds binary (non-text) content.
 *
 * Three checks run in order; the first one that trips returns `true`:
 * 1. more than 1% of all bytes are NUL (0x00);
 * 2. more than 30% of the first 1024 bytes are control characters other than
 *    tab, line feed and carriage return;
 * 3. the buffer contains a run of 12 or more consecutive bytes above 0x7F
 *    AND is not valid UTF-8. Valid UTF-8 is exempt because multi-byte
 *    characters (CJK, Cyrillic, emoji, ...) are encoded entirely with bytes
 *    above 0x7F, so ordinary non-English text produces long runs.
 *
 * NOTE(review): check 1 also trips on UTF-16/UTF-32 text that is mostly ASCII,
 * because every such character contributes NUL bytes — confirm whether those
 * encodings need to be recognised before this function is called.
 *
 * @param buffer File buffer
 * @returns Whether it's a binary file
 */
function analyzeContentCharacteristics(buffer: Buffer): boolean {
	if (buffer.length === 0) {
		return false
	}

	// Count NUL bytes on the raw bytes; decoding to a string first is unnecessary work
	let nullByteCount = 0
	for (const byte of buffer) {
		if (byte === 0) {
			nullByteCount++
		}
	}
	if (nullByteCount > buffer.length * 0.01) {
		// More than 1% null bytes
		return true
	}

	// Check the ratio of non-printable characters in the first 1024 bytes only
	const sample = buffer.subarray(0, 1024)
	let nonPrintableCount = 0
	for (const byte of sample) {
		// Tab (9), line feed (10) and carriage return (13) are normal in text
		if (byte < 32 && byte !== 9 && byte !== 10 && byte !== 13) {
			nonPrintableCount++
		}
	}
	if (nonPrintableCount / sample.length > 0.3) {
		// More than 30% non-printable characters
		return true
	}

	// The cheap run scan goes first so pure-ASCII content never pays for UTF-8 validation
	return hasLongHighByteRun(buffer) && !isWellFormedUtf8(buffer)
}

/**
 * Report whether the buffer contains 12 or more consecutive bytes above 0x7F.
 * @param buffer File buffer
 * @returns `true` when such a run exists
 */
function hasLongHighByteRun(buffer: Buffer): boolean {
	const minRunLength = 12
	let runLength = 0
	for (const byte of buffer) {
		if (byte > 127) {
			runLength++
			if (runLength >= minRunLength) {
				return true
			}
		} else {
			runLength = 0
		}
	}
	return false
}

/**
 * Report whether the buffer decodes as UTF-8 without errors.
 * @param buffer File buffer
 * @returns `true` when strict decoding succeeds, `false` when it throws
 */
function isWellFormedUtf8(buffer: Buffer): boolean {
	try {
		// `fatal` makes malformed sequences throw instead of being replaced;
		// `stream` keeps a multi-byte character cut off at the very end of the
		// buffer from being reported as malformed.
		new TextDecoder("utf-8", { fatal: true }).decode(buffer, { stream: true })
		return true
	} catch {
		// Any failure (including a runtime that rejects the `fatal` option)
		// leaves the high-byte-run heuristic in charge.
		return false
	}
}
173-
174124/**
175125 * Detect the encoding of a file buffer
176126 * @param fileBuffer The file buffer
@@ -194,7 +144,7 @@ export async function detectEncoding(fileBuffer: Buffer, fileExtension?: string)
194144 } else if ( detected && detected . encoding ) {
195145 originalEncoding = detected . encoding
196146 // Treat low-confidence detections as unreliable: warn and fall back to utf8
197- if ( detected . confidence < 0.9 ) {
147+ if ( detected . confidence < 0.7 ) {
198148 console . warn (
199149 `Low confidence encoding detection: ${ originalEncoding } (confidence: ${ detected . confidence } ), falling back to utf8` ,
200150 )
@@ -261,8 +211,8 @@ export async function detectFileEncoding(filePath: string): Promise<string> {
261211 */
262212export async function isBinaryFileWithEncodingDetection ( filePath : string ) : Promise < boolean > {
263213 try {
264- // 1. First check file extension
265214 const fileExtension = path . extname ( filePath ) . toLowerCase ( )
215+ // 1. First check file extension
266216 if ( BINARY_EXTENSIONS . has ( fileExtension ) ) {
267217 return true
268218 }
@@ -276,36 +226,17 @@ export async function isBinaryFileWithEncodingDetection(filePath: string): Promi
276226 return true
277227 }
278228 }
279-
280- // 4. Analyze content characteristics
281- if ( analyzeContentCharacteristics ( fileBuffer ) ) {
282- return true
283- }
284-
285- // 5. Use isBinaryFile library for quick check
286- const isBinaryByLibrary = await isBinaryFile ( fileBuffer ) . catch ( ( ) => false )
287- if ( isBinaryByLibrary ) {
288- return true
289- }
290-
291- // 6. Finally perform encoding detection (only for files that might be text)
229+ // Try to detect encoding first
292230 try {
293- const encoding = await detectEncoding ( fileBuffer , fileExtension )
294-
295- // Even if encoding detection succeeds, check confidence
296- const detected = jschardet . detect ( fileBuffer )
297- if ( detected && typeof detected === "object" && detected . confidence < 0.9 ) {
298- // Low confidence, confirm again with isBinaryFile
299- return await isBinaryFile ( fileBuffer ) . catch ( ( ) => true )
300- }
301-
231+ await detectEncoding ( fileBuffer , fileExtension )
232+ // If detectEncoding succeeds, it's a text file
302233 return false
303234 } catch ( error ) {
304- // Encoding detection failed, consider it as binary file
305- return true
235+ // If detectEncoding fails, check if it's actually a binary file
236+ return await isBinaryFile ( fileBuffer ) . catch ( ( ) => false )
306237 }
307238 } catch ( error ) {
308- // File read error, consider it as binary file
239+ // File read error, assume it's binary
309240 return true
310241 }
311242}
0 commit comments