11import logging
22from os import PathLike
3- from typing import Any , BinaryIO , List , Optional , Set
3+ from typing import BinaryIO , List , Optional , Set , Union
44
55from .cd import (
66 coherence_ratio ,
3131
3232
3333def from_bytes (
34- sequences : bytes ,
34+ sequences : Union [ bytes , bytearray ] ,
3535 steps : int = 5 ,
3636 chunk_size : int = 512 ,
3737 threshold : float = 0.2 ,
@@ -40,6 +40,7 @@ def from_bytes(
4040 preemptive_behaviour : bool = True ,
4141 explain : bool = False ,
4242 language_threshold : float = 0.1 ,
43+ enable_fallback : bool = True ,
4344) -> CharsetMatches :
4445 """
4546 Given a raw bytes sequence, return the best possible charsets usable to render str objects.
@@ -158,6 +159,8 @@ def from_bytes(
158159
159160 results : CharsetMatches = CharsetMatches ()
160161
162+ early_stop_results : CharsetMatches = CharsetMatches ()
163+
161164 sig_encoding , sig_payload = identify_sig_or_bom (sequences )
162165
163166 if sig_encoding is not None :
@@ -220,16 +223,20 @@ def from_bytes(
220223 try :
221224 if is_too_large_sequence and is_multi_byte_decoder is False :
222225 str (
223- sequences [: int (50e4 )]
224- if strip_sig_or_bom is False
225- else sequences [len (sig_payload ) : int (50e4 )],
226+ (
227+ sequences [: int (50e4 )]
228+ if strip_sig_or_bom is False
229+ else sequences [len (sig_payload ) : int (50e4 )]
230+ ),
226231 encoding = encoding_iana ,
227232 )
228233 else :
229234 decoded_payload = str (
230- sequences
231- if strip_sig_or_bom is False
232- else sequences [len (sig_payload ) :],
235+ (
236+ sequences
237+ if strip_sig_or_bom is False
238+ else sequences [len (sig_payload ) :]
239+ ),
233240 encoding = encoding_iana ,
234241 )
235242 except (UnicodeDecodeError , LookupError ) as e :
@@ -361,11 +368,18 @@ def from_bytes(
361368 )
362369 # Preparing those fallbacks in case we got nothing.
363370 if (
364- encoding_iana in ["ascii" , "utf_8" , specified_encoding ]
371+ enable_fallback
372+ and encoding_iana in ["ascii" , "utf_8" , specified_encoding ]
365373 and not lazy_str_hard_failure
366374 ):
367375 fallback_entry = CharsetMatch (
368- sequences , encoding_iana , threshold , False , [], decoded_payload
376+ sequences ,
377+ encoding_iana ,
378+ threshold ,
379+ False ,
380+ [],
381+ decoded_payload ,
382+ preemptive_declaration = specified_encoding ,
369383 )
370384 if encoding_iana == specified_encoding :
371385 fallback_specified = fallback_entry
@@ -419,28 +433,58 @@ def from_bytes(
419433 ),
420434 )
421435
422- results .append (
423- CharsetMatch (
424- sequences ,
425- encoding_iana ,
426- mean_mess_ratio ,
427- bom_or_sig_available ,
428- cd_ratios_merged ,
429- decoded_payload ,
430- )
436+ current_match = CharsetMatch (
437+ sequences ,
438+ encoding_iana ,
439+ mean_mess_ratio ,
440+ bom_or_sig_available ,
441+ cd_ratios_merged ,
442+ (
443+ decoded_payload
444+ if (
445+ is_too_large_sequence is False
446+ or encoding_iana in [specified_encoding , "ascii" , "utf_8" ]
447+ )
448+ else None
449+ ),
450+ preemptive_declaration = specified_encoding ,
431451 )
432452
453+ results .append (current_match )
454+
433455 if (
434456 encoding_iana in [specified_encoding , "ascii" , "utf_8" ]
435457 and mean_mess_ratio < 0.1
436458 ):
459+ # If md says nothing to worry about, then... stop immediately!
460+ if mean_mess_ratio == 0.0 :
461+ logger .debug (
462+ "Encoding detection: %s is most likely the one." ,
463+ current_match .encoding ,
464+ )
465+ if explain :
466+ logger .removeHandler (explain_handler )
467+ logger .setLevel (previous_logger_level )
468+ return CharsetMatches ([current_match ])
469+
470+ early_stop_results .append (current_match )
471+
472+ if (
473+ len (early_stop_results )
474+ and (specified_encoding is None or specified_encoding in tested )
475+ and "ascii" in tested
476+ and "utf_8" in tested
477+ ):
478+ probable_result : CharsetMatch = early_stop_results .best () # type: ignore[assignment]
437479 logger .debug (
438- "Encoding detection: %s is most likely the one." , encoding_iana
480+ "Encoding detection: %s is most likely the one." ,
481+ probable_result .encoding ,
439482 )
440483 if explain :
441484 logger .removeHandler (explain_handler )
442485 logger .setLevel (previous_logger_level )
443- return CharsetMatches ([results [encoding_iana ]])
486+
487+ return CharsetMatches ([probable_result ])
444488
445489 if encoding_iana == sig_encoding :
446490 logger .debug (
@@ -507,6 +551,7 @@ def from_fp(
507551 preemptive_behaviour : bool = True ,
508552 explain : bool = False ,
509553 language_threshold : float = 0.1 ,
554+ enable_fallback : bool = True ,
510555) -> CharsetMatches :
511556 """
512557 Same thing as the function from_bytes but using a file pointer that is already ready.
@@ -522,11 +567,12 @@ def from_fp(
522567 preemptive_behaviour ,
523568 explain ,
524569 language_threshold ,
570+ enable_fallback ,
525571 )
526572
527573
528574def from_path (
529- path : "PathLike[Any]" ,
575+ path : Union [ str , bytes , PathLike ], # type: ignore[type-arg]
530576 steps : int = 5 ,
531577 chunk_size : int = 512 ,
532578 threshold : float = 0.20 ,
@@ -535,6 +581,7 @@ def from_path(
535581 preemptive_behaviour : bool = True ,
536582 explain : bool = False ,
537583 language_threshold : float = 0.1 ,
584+ enable_fallback : bool = True ,
538585) -> CharsetMatches :
539586 """
540587 Same thing as the function from_bytes but with one extra step: opening and reading the given file path in binary mode.
@@ -551,4 +598,71 @@ def from_path(
551598 preemptive_behaviour ,
552599 explain ,
553600 language_threshold ,
601+ enable_fallback ,
554602 )
603+
604+
def is_binary(
    fp_or_path_or_payload: Union[PathLike, str, BinaryIO, bytes, bytearray],  # type: ignore[type-arg]
    steps: int = 5,
    chunk_size: int = 512,
    threshold: float = 0.20,
    cp_isolation: Optional[List[str]] = None,
    cp_exclusion: Optional[List[str]] = None,
    preemptive_behaviour: bool = True,
    explain: bool = False,
    language_threshold: float = 0.1,
    enable_fallback: bool = False,
) -> bool:
    """
    Detect whether the given input (path, raw payload, or file pointer) is binary, i.e. not text.

    The input is dispatched on its runtime type:
      - ``str`` or ``PathLike``    -> treated as a file path and handed to ``from_path``;
      - ``bytes`` or ``bytearray`` -> treated as the payload itself and handed to ``from_bytes``;
      - anything else              -> handed to ``from_fp`` as a file pointer.

    Note that a ``bytes`` value is always interpreted as a payload here, never as a path.

    All detection keyword arguments are forwarded unchanged to the underlying function. The only
    difference with calling those functions directly is the default of ``enable_fallback``: it is
    False here (True in from_bytes/from_fp/from_path), so fallback matches are not produced. This
    makes the check stricter for content that is ASCII-compatible but unlikely to be a string.

    :return: True when detection yields no match (``not guesses``), False otherwise.
    """
    # Guard-clause dispatch: each branch delegates with the very same keyword arguments.
    if isinstance(fp_or_path_or_payload, (str, PathLike)):
        return not from_path(
            fp_or_path_or_payload,
            steps=steps,
            chunk_size=chunk_size,
            threshold=threshold,
            cp_isolation=cp_isolation,
            cp_exclusion=cp_exclusion,
            preemptive_behaviour=preemptive_behaviour,
            explain=explain,
            language_threshold=language_threshold,
            enable_fallback=enable_fallback,
        )

    if isinstance(fp_or_path_or_payload, (bytes, bytearray)):
        return not from_bytes(
            fp_or_path_or_payload,
            steps=steps,
            chunk_size=chunk_size,
            threshold=threshold,
            cp_isolation=cp_isolation,
            cp_exclusion=cp_exclusion,
            preemptive_behaviour=preemptive_behaviour,
            explain=explain,
            language_threshold=language_threshold,
            enable_fallback=enable_fallback,
        )

    # Neither a path nor an in-memory payload: the value is passed on as a file pointer.
    return not from_fp(
        fp_or_path_or_payload,
        steps=steps,
        chunk_size=chunk_size,
        threshold=threshold,
        cp_isolation=cp_isolation,
        cp_exclusion=cp_exclusion,
        preemptive_behaviour=preemptive_behaviour,
        explain=explain,
        language_threshold=language_threshold,
        enable_fallback=enable_fallback,
    )
0 commit comments