1
1
import logging
2
2
from os import PathLike
3
- from typing import Any , BinaryIO , List , Optional , Set
3
+ from typing import BinaryIO , List , Optional , Set , Union
4
4
5
5
from .cd import (
6
6
coherence_ratio ,
31
31
32
32
33
33
def from_bytes (
34
- sequences : bytes ,
34
+ sequences : Union [ bytes , bytearray ] ,
35
35
steps : int = 5 ,
36
36
chunk_size : int = 512 ,
37
37
threshold : float = 0.2 ,
@@ -40,6 +40,7 @@ def from_bytes(
40
40
preemptive_behaviour : bool = True ,
41
41
explain : bool = False ,
42
42
language_threshold : float = 0.1 ,
43
+ enable_fallback : bool = True ,
43
44
) -> CharsetMatches :
44
45
"""
45
46
Given a raw bytes sequence, return the best possibles charset usable to render str objects.
@@ -158,6 +159,8 @@ def from_bytes(
158
159
159
160
results : CharsetMatches = CharsetMatches ()
160
161
162
+ early_stop_results : CharsetMatches = CharsetMatches ()
163
+
161
164
sig_encoding , sig_payload = identify_sig_or_bom (sequences )
162
165
163
166
if sig_encoding is not None :
@@ -220,16 +223,20 @@ def from_bytes(
220
223
try :
221
224
if is_too_large_sequence and is_multi_byte_decoder is False :
222
225
str (
223
- sequences [: int (50e4 )]
224
- if strip_sig_or_bom is False
225
- else sequences [len (sig_payload ) : int (50e4 )],
226
+ (
227
+ sequences [: int (50e4 )]
228
+ if strip_sig_or_bom is False
229
+ else sequences [len (sig_payload ) : int (50e4 )]
230
+ ),
226
231
encoding = encoding_iana ,
227
232
)
228
233
else :
229
234
decoded_payload = str (
230
- sequences
231
- if strip_sig_or_bom is False
232
- else sequences [len (sig_payload ) :],
235
+ (
236
+ sequences
237
+ if strip_sig_or_bom is False
238
+ else sequences [len (sig_payload ) :]
239
+ ),
233
240
encoding = encoding_iana ,
234
241
)
235
242
except (UnicodeDecodeError , LookupError ) as e :
@@ -361,11 +368,18 @@ def from_bytes(
361
368
)
362
369
# Preparing those fallbacks in case we got nothing.
363
370
if (
364
- encoding_iana in ["ascii" , "utf_8" , specified_encoding ]
371
+ enable_fallback
372
+ and encoding_iana in ["ascii" , "utf_8" , specified_encoding ]
365
373
and not lazy_str_hard_failure
366
374
):
367
375
fallback_entry = CharsetMatch (
368
- sequences , encoding_iana , threshold , False , [], decoded_payload
376
+ sequences ,
377
+ encoding_iana ,
378
+ threshold ,
379
+ False ,
380
+ [],
381
+ decoded_payload ,
382
+ preemptive_declaration = specified_encoding ,
369
383
)
370
384
if encoding_iana == specified_encoding :
371
385
fallback_specified = fallback_entry
@@ -419,28 +433,58 @@ def from_bytes(
419
433
),
420
434
)
421
435
422
- results .append (
423
- CharsetMatch (
424
- sequences ,
425
- encoding_iana ,
426
- mean_mess_ratio ,
427
- bom_or_sig_available ,
428
- cd_ratios_merged ,
429
- decoded_payload ,
430
- )
436
+ current_match = CharsetMatch (
437
+ sequences ,
438
+ encoding_iana ,
439
+ mean_mess_ratio ,
440
+ bom_or_sig_available ,
441
+ cd_ratios_merged ,
442
+ (
443
+ decoded_payload
444
+ if (
445
+ is_too_large_sequence is False
446
+ or encoding_iana in [specified_encoding , "ascii" , "utf_8" ]
447
+ )
448
+ else None
449
+ ),
450
+ preemptive_declaration = specified_encoding ,
431
451
)
432
452
453
+ results .append (current_match )
454
+
433
455
if (
434
456
encoding_iana in [specified_encoding , "ascii" , "utf_8" ]
435
457
and mean_mess_ratio < 0.1
436
458
):
459
+ # If md says nothing to worry about, then... stop immediately!
460
+ if mean_mess_ratio == 0.0 :
461
+ logger .debug (
462
+ "Encoding detection: %s is most likely the one." ,
463
+ current_match .encoding ,
464
+ )
465
+ if explain :
466
+ logger .removeHandler (explain_handler )
467
+ logger .setLevel (previous_logger_level )
468
+ return CharsetMatches ([current_match ])
469
+
470
+ early_stop_results .append (current_match )
471
+
472
+ if (
473
+ len (early_stop_results )
474
+ and (specified_encoding is None or specified_encoding in tested )
475
+ and "ascii" in tested
476
+ and "utf_8" in tested
477
+ ):
478
+ probable_result : CharsetMatch = early_stop_results .best () # type: ignore[assignment]
437
479
logger .debug (
438
- "Encoding detection: %s is most likely the one." , encoding_iana
480
+ "Encoding detection: %s is most likely the one." ,
481
+ probable_result .encoding ,
439
482
)
440
483
if explain :
441
484
logger .removeHandler (explain_handler )
442
485
logger .setLevel (previous_logger_level )
443
- return CharsetMatches ([results [encoding_iana ]])
486
+
487
+ return CharsetMatches ([probable_result ])
444
488
445
489
if encoding_iana == sig_encoding :
446
490
logger .debug (
@@ -507,6 +551,7 @@ def from_fp(
507
551
preemptive_behaviour : bool = True ,
508
552
explain : bool = False ,
509
553
language_threshold : float = 0.1 ,
554
+ enable_fallback : bool = True ,
510
555
) -> CharsetMatches :
511
556
"""
512
557
Same thing than the function from_bytes but using a file pointer that is already ready.
@@ -522,11 +567,12 @@ def from_fp(
522
567
preemptive_behaviour ,
523
568
explain ,
524
569
language_threshold ,
570
+ enable_fallback ,
525
571
)
526
572
527
573
528
574
def from_path (
529
- path : "PathLike[Any]" ,
575
+ path : Union [ str , bytes , PathLike ], # type: ignore[type-arg]
530
576
steps : int = 5 ,
531
577
chunk_size : int = 512 ,
532
578
threshold : float = 0.20 ,
@@ -535,6 +581,7 @@ def from_path(
535
581
preemptive_behaviour : bool = True ,
536
582
explain : bool = False ,
537
583
language_threshold : float = 0.1 ,
584
+ enable_fallback : bool = True ,
538
585
) -> CharsetMatches :
539
586
"""
540
587
Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
@@ -551,4 +598,71 @@ def from_path(
551
598
preemptive_behaviour ,
552
599
explain ,
553
600
language_threshold ,
601
+ enable_fallback ,
554
602
)
603
+
604
+
605
def is_binary(
    fp_or_path_or_payload: Union[PathLike, str, BinaryIO, bytes, bytearray],  # type: ignore[type-arg]
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = False,
) -> bool:
    """
    Tell whether the given input looks like binary content, i.e. not text.

    The input is routed to the matching detection entry point:

    - ``str`` or ``PathLike``  -> ``from_path`` (treated as a filesystem path)
    - ``bytes`` or ``bytearray`` -> ``from_bytes`` (treated as the raw payload)
    - anything else             -> ``from_fp`` (expected to be a binary file object)

    Every other argument is forwarded unchanged to that entry point. The only
    difference with calling it directly is the default of ``enable_fallback``,
    which is ``False`` here so that fallback matches are not produced, making
    the check stricter for content that is ASCII-compatible but unlikely to be
    a string.

    :return: ``not guesses`` -- ``True`` when the detection result is falsy
        (presumably: no plausible charset match was found), ``False`` otherwise.
    """
    if isinstance(fp_or_path_or_payload, (str, PathLike)):
        # A path: let from_path deal with the file itself.
        guesses = from_path(
            fp_or_path_or_payload,
            steps=steps,
            chunk_size=chunk_size,
            threshold=threshold,
            cp_isolation=cp_isolation,
            cp_exclusion=cp_exclusion,
            preemptive_behaviour=preemptive_behaviour,
            explain=explain,
            language_threshold=language_threshold,
            enable_fallback=enable_fallback,
        )
    elif isinstance(fp_or_path_or_payload, (bytes, bytearray)):
        # NOTE: a bytes value is always handled as the payload here, never as
        # a path, even though from_path is annotated to accept bytes paths.
        guesses = from_bytes(
            fp_or_path_or_payload,
            steps=steps,
            chunk_size=chunk_size,
            threshold=threshold,
            cp_isolation=cp_isolation,
            cp_exclusion=cp_exclusion,
            preemptive_behaviour=preemptive_behaviour,
            explain=explain,
            language_threshold=language_threshold,
            enable_fallback=enable_fallback,
        )
    else:
        # Neither a path nor a payload: assume an already opened binary stream.
        guesses = from_fp(
            fp_or_path_or_payload,
            steps=steps,
            chunk_size=chunk_size,
            threshold=threshold,
            cp_isolation=cp_isolation,
            cp_exclusion=cp_exclusion,
            preemptive_behaviour=preemptive_behaviour,
            explain=explain,
            language_threshold=language_threshold,
            enable_fallback=enable_fallback,
        )

    return not guesses
0 commit comments