|
| 1 | + |
| 2 | +import enum |
| 3 | +import os |
| 4 | +from typing import Final, Union |
| 5 | + |
| 6 | +from ._chardet import chardet_detect |
| 7 | + |
| 8 | + |
# Default number of leading bytes sampled from a file (2 KiB).
_default_starting_chunk_len: Final = 2048

# Control characters that are accepted as part of ordinary text:
# line feed, carriage return, tab, form feed and backspace.
_control_chars: Final = b"\n\r\t\f\b"
# The control characters above plus the printable ASCII range 0x20-0x7E.
_printable_ascii: Final = _control_chars + bytes(range(32, 127))
# DEL (0x7F) and every byte with the high bit set (0x80-0xFF).
_printable_high_ascii: Final = bytes(range(127, 256))
| 14 | + |
| 15 | + |
def get_starting_chunk(
    filename: Union[str, os.PathLike], /, *, chunk_len: int = _default_starting_chunk_len
) -> bytes:
    """
    Read the leading bytes of a file.

    :param filename: Path of the file to open in binary mode.
    :param chunk_len: Maximum number of bytes to read; defaults to
        ``_default_starting_chunk_len``.
    :return: The bytes read from the start of the file.
    """
    with open(filename, "rb") as stream:
        head = stream.read(chunk_len)
    return head
| 26 | + |
| 27 | + |
class BinaryLikeliness(enum.Enum):
    """Coarse estimate of how likely a chunk of bytes is to be binary data."""

    HIGH = enum.auto()
    MID = enum.auto()
    LOW = enum.auto()

    @property
    def likely(self) -> bool:
        """Whether this level is ``MID`` or ``HIGH``."""
        return self in (BinaryLikeliness.MID, BinaryLikeliness.HIGH)
| 36 | + |
| 37 | + |
def is_likely_binary(bytes_to_check: bytes, /) -> BinaryLikeliness:
    """
    Estimate from byte-class ratios how likely a chunk is to be binary.

    :param bytes_to_check: A chunk of bytes to check.
    :return: ``BinaryLikeliness.HIGH``, ``MID`` or ``LOW``. An empty chunk
        yields ``LOW``.
    """
    total = len(bytes_to_check)
    if not total:
        # Nothing to classify; also avoids dividing by zero below.
        return BinaryLikeliness.LOW

    # Share of bytes that remain after deleting everything in
    # _printable_ascii, i.e. bytes that are neither printable ASCII nor one
    # of the accepted control characters.
    nontext_ratio1 = len(bytes_to_check.translate(None, _printable_ascii)) / total

    # Share of bytes that remain after deleting everything in
    # _printable_high_ascii, i.e. bytes in the range 0x00-0x7E.
    # Background, from https://en.wikipedia.org/wiki/UTF-8:
    # If the bytes are random, the chances of a byte with the high bit set
    # starting a valid UTF-8 character is only 6.64%. The chances of finding 7
    # of these without finding an invalid sequence is actually lower than the
    # chance of the first three bytes randomly being the UTF-8 BOM.
    nontext_ratio2 = len(bytes_to_check.translate(None, _printable_high_ascii)) / total

    if nontext_ratio1 > 0.9 and nontext_ratio2 > 0.9:
        return BinaryLikeliness.HIGH

    if nontext_ratio1 > 0.3 and nontext_ratio2 < 0.05:
        return BinaryLikeliness.MID
    if nontext_ratio1 > 0.8 and nontext_ratio2 > 0.8:
        return BinaryLikeliness.MID
    return BinaryLikeliness.LOW
| 68 | + |
| 69 | + |
def is_decodable_as_unicode(bytes_to_check: bytes, /) -> bool:
    """
    Check whether the chunk decodes with a confidently detected encoding.

    :param bytes_to_check: A chunk of bytes to check.
    :return: True if ``chardet_detect`` reports an encoding other than
        ``"ascii"`` with confidence above 0.9 and the chunk decodes with it,
        False otherwise.
    """
    guess = chardet_detect(bytes_to_check)

    # Only a confident, non-ASCII detection is worth a decode attempt.
    if not guess["confidence"] > 0.9 or guess["encoding"] == "ascii":
        return False

    try:
        bytes_to_check.decode(encoding=guess["encoding"])
    except (LookupError, UnicodeDecodeError):
        # Unknown codec name or bytes invalid for the detected encoding.
        return False
    return True
| 89 | + |
| 90 | + |
def has_null_bytes(bytes_to_check: bytes, /) -> bool:
    """
    Look for the marker bytes 0x00 and 0xFF in a chunk.

    :param bytes_to_check: A chunk of bytes to check.
    :return: True if the chunk contains a 0x00 or a 0xFF byte, False otherwise.
    """
    markers = (b"\x00", b"\xff")
    return any(marker in bytes_to_check for marker in markers)
| 97 | + |
| 98 | + |
def is_binary_string(bytes_to_check: bytes, /) -> bool:
    """
    Uses a simplified version of the Perl detection algorithm,
    based roughly on Eli Bendersky's translation to Python:
    https://eli.thegreenplace.net/2011/10/19/perls-guess-if-file-is-text-or-binary-implemented-in-python/

    This is biased slightly more in favour of deeming files as text
    files than the Perl algorithm, since all ASCII compatible character
    sets are accepted as text, not just utf-8.

    :param bytes_to_check: A chunk of bytes to check.
    :return: True if the chunk appears to be binary (not text), False otherwise.
    """
    # An empty chunk counts as text.
    if not bytes_to_check:
        return False

    verdict = is_likely_binary(bytes_to_check)
    if verdict is BinaryLikeliness.HIGH:
        # Strong signal: no need to attempt encoding detection.
        return True

    text_like = is_decodable_as_unicode(bytes_to_check)

    if verdict.likely:
        # Suspicious byte ratios: binary unless it decodes cleanly.
        return not text_like

    # Unsuspicious ratios: fall back to the marker-byte check only when the
    # chunk could not be decoded.
    return False if text_like else has_null_bytes(bytes_to_check)
| 130 | + |
| 131 | + |
def is_binary_file(
    filename: Union[str, os.PathLike], /, *, starting_chunk_len: int = _default_starting_chunk_len
) -> bool:
    """
    Decide whether a file looks binary by inspecting its leading bytes.

    :param filename: File to check.
    :param starting_chunk_len: Number of bytes to read; defaults to
        ``_default_starting_chunk_len``.
    :return: True if it's a binary file, otherwise False. A symbolic link
        whose target does not exist is reported as binary.
    :raises FileNotFoundError: If the file is missing and is not a dangling
        symbolic link.
    """
    try:
        head = get_starting_chunk(filename, chunk_len=starting_chunk_len)
    except FileNotFoundError:
        dangling_link = os.path.islink(filename) and not os.path.exists(filename)
        if not dangling_link:
            raise
        return True

    return is_binary_string(head)
| 149 | + |
| 150 | + |
# Names exported by ``from <module> import *``.
__all__ = (
    "get_starting_chunk",
    "BinaryLikeliness",
    "is_likely_binary",
    "is_decodable_as_unicode",
    "has_null_bytes",
    "is_binary_string",
    "is_binary_file",
)
0 commit comments