Skip to content

Commit 5338391

Browse files
Add isbinary as a library.
1 parent c2ee529 commit 5338391

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

67 files changed

+1207
-16
lines changed

requirements-dev.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ pytest-html
1515
pytest-order
1616
packaging >= 24.2
1717
hatch-fancy-pypi-readme
18+
hypothesis # dependency for isbinary unit tests
1819
# To make a release you need asciidoc3 (a2x3)
1920
# asciidoc3
2021
# Include also normal project requirements.

src/robotide/controller/tablecontrollers.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from ..robotapi import is_list_var, is_scalar_var, is_dict_var
2121
from ..utils import variablematcher
2222
from .basecontroller import ControllerWithParent
23-
from . import macrocontrollers # TestCaseController, UserKeywordController
23+
# from . import macrocontrollers # TestCaseController, UserKeywordController
2424
from .settingcontrollers import MetadataController, import_controller, VariableController
2525

2626

@@ -317,9 +317,10 @@ def _configure_controller(self, ctrl, config):
317317

318318

319319
class TestCaseTableController(_MacroTable):
320+
from .macrocontrollers import TestCaseController
320321
__test__ = False
321322
item_type = 'Test case'
322-
_controller_class = macrocontrollers.TestCaseController
323+
_controller_class = TestCaseController
323324

324325
@property
325326
def items(self):
@@ -350,8 +351,9 @@ def restore_test_order(self, rlist):
350351

351352

352353
class KeywordTableController(_MacroTable):
354+
from .macrocontrollers import UserKeywordController
353355
item_type = 'User keyword'
354-
_controller_class = macrocontrollers.UserKeywordController
356+
_controller_class = UserKeywordController
355357

356358
@property
357359
def items(self):

src/robotide/editor/gridcolorizer.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
import wx
1717

1818
from ..controller.cellinfo import CellType, ContentType
19-
from ..controller.macrocontrollers import UserKeywordController
19+
# from ..controller.macrocontrollers import UserKeywordController
2020

2121

2222
# this import fails in HUDSON
@@ -58,6 +58,7 @@ def _coloring_task(self, task_index, selection_content, row=0, col=0):
5858
self._coloring_task(task_index, selection_content, row+1, 0)
5959

6060
def _colorize_cell(self, row, col, selection_content):
61+
from ..controller.macrocontrollers import UserKeywordController
6162
cell_info = self._controller.get_cell_info(row, col)
6263
if cell_info is None:
6364
self._set_default_colors(row, col)
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
BSD 3-Clause License
2+
3+
Copyright (c) 2013, Audrey Roy
4+
Copyright (c) 2022, Matthew Gamble
5+
6+
Redistribution and use in source and binary forms, with or without
7+
modification, are permitted provided that the following conditions are met:
8+
9+
1. Redistributions of source code must retain the above copyright notice, this
10+
list of conditions and the following disclaimer.
11+
12+
2. Redistributions in binary form must reproduce the above copyright notice,
13+
this list of conditions and the following disclaimer in the documentation
14+
and/or other materials provided with the distribution.
15+
16+
3. Neither the name of the copyright holder nor the names of its
17+
contributors may be used to endorse or promote products derived from
18+
this software without specific prior written permission.
19+
20+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
=======================================================================================
2+
These files were copied and adapted from https://github.com/djmattyg007/python-isbinary
3+
=======================================================================================
4+
5+
-----
6+
7+
========
8+
isbinary
9+
========
10+
11+
.. image:: https://github.com/djmattyg007/python-isbinary/workflows/CI/badge.svg?branch=main
12+
:target: https://github.com/djmattyg007/python-isbinary/actions?query=branch%3Amain+workflow%3ACI
13+
:alt: CI
14+
15+
.. image:: https://codecov.io/gh/djmattyg007/python-isbinary/branch/main/graph/badge.svg
16+
:target: https://codecov.io/gh/djmattyg007/python-isbinary
17+
:alt: Coverage
18+
19+
.. image:: https://img.shields.io/pypi/v/isbinary.svg
20+
:target: https://pypi.org/pypi/isbinary
21+
:alt: PyPI
22+
23+
.. image:: https://img.shields.io/pypi/l/isbinary.svg
24+
:target: https://pypi.org/project/isbinary
25+
:alt: BSD License
26+
27+
.. image:: https://readthedocs.org/projects/isbinary/badge/?version=latest
28+
:target: https://isbinary.readthedocs.io/en/latest/?badge=latest
29+
:alt: Documentation Status
30+
31+
Lightweight pure Python package to guess whether a file is binary or text,
32+
using a heuristic similar to Perl's `pp_fttext` and its analysis by @eliben.
33+
34+
* Free software: BSD license
35+
* Documentation: https://isbinary.readthedocs.io/
36+
37+
Status
38+
------
39+
40+
It works, and people are using this package in various places. But it doesn't cover all edge cases yet.
41+
42+
The code could be improved. Pull requests welcome! As of now, it is based on these snippets, but that may change:
43+
44+
* https://stackoverflow.com/questions/898669/how-can-i-detect-if-a-file-is-binary-non-text-in-python
45+
* https://stackoverflow.com/questions/1446549/how-to-identify-binary-and-text-files-using-python
46+
* https://code.activestate.com/recipes/173220/
47+
* https://eli.thegreenplace.net/2011/10/19/perls-guess-if-file-is-text-or-binary-implemented-in-python/
48+
49+
Features
50+
--------
51+
52+
Has tests for these file types:
53+
54+
* Text: .txt, .css, .json, .svg, .js, .lua, .pl, .rst
55+
* Binary: .png, .gif, .jpg, .tiff, .bmp, .DS_Store, .eot, .otf, .ttf, .woff, .rgb
56+
57+
Has tests for numerous encodings.
58+
59+
Why?
60+
----
61+
62+
You may be thinking, "I can write this in 2 lines of code?!"
63+
64+
It's actually not that easy. Here's a great article about how Perl's
65+
heuristic to guess file types works: https://eli.thegreenplace.net/2011/10/19/perls-guess-if-file-is-text-or-binary-implemented-in-python/
66+
67+
And that's just where we started. Over time, we've found more edge cases and
68+
our heuristic has gotten more complex.
69+
70+
Also, this package saves you from having to write and thoroughly test
71+
your code with all sorts of weird file types and encodings, cross-platform.
72+
73+
History
74+
-------
75+
76+
This is a long-term fork of `binaryornot <https://github.com/audreyfeldroy/binaryornot>`_. It was created in
77+
May 2022 primarily because it appeared that upstream had been abandoned. There were a few other smaller issues:
78+
79+
1. Lack of type annotations.
80+
2. Lack of stricter modern code quality tools used in CI.
81+
3. Improved contributor experience by using Github Actions for CI.
82+
4. Possibility for optimisation with optional dependency on `cchardet`.
83+
5. Removal of Python 2 support, and explicit support for newer versions of Python 3.
84+
85+
Credits
86+
-------
87+
88+
* Audrey and Danny Roy Greenfeld, as the previous maintainers of this code.
89+
* Special thanks to Eli Bendersky (@eliben) for his writeup explaining the heuristic and his implementation, which this is largely based on.
90+
* Source code from the portion of Perl's `pp_fttext` that checks for textiness: https://github.com/Perl/perl5/blob/v5.23.1/pp_sys.c#L3527-L3587
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
2+
from .check import (
3+
BinaryLikeliness,
4+
get_starting_chunk,
5+
has_null_bytes,
6+
is_binary_file,
7+
is_binary_string,
8+
is_decodable_as_unicode,
9+
is_likely_binary,
10+
)
11+
12+
13+
__version__ = "1.0.1"
14+
15+
16+
__all__ = (
17+
"get_starting_chunk",
18+
"BinaryLikeliness",
19+
"is_likely_binary",
20+
"is_decodable_as_unicode",
21+
"has_null_bytes",
22+
"is_binary_string",
23+
"is_binary_file",
24+
"__version__",
25+
)
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
2+
from .cli import main
3+
4+
5+
main()
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
2+
from typing import Callable, TypedDict, cast
3+
4+
5+
class DetectResult(TypedDict):
    """Typed shape of the mapping produced by a ``detect`` callable."""

    # Name of the guessed character encoding.
    encoding: str
    # Detector's confidence in the guess.
    confidence: float
    # Language reported by the detector.
    language: str
9+
10+
11+
Detect = Callable[[bytes], DetectResult]
12+
13+
14+
def _get_chardet_detect() -> Detect:
    """
    Pick the ``detect`` function of the available charset-detection backend.

    ``cchardet`` is tried first; if importing it raises ``ImportError`` the
    ``chardet`` package is imported instead. An ``ImportError`` from that
    second import is not handled here and propagates to the caller.

    :return: The backend's ``detect`` callable, typed as ``Detect``.
    """
    try:
        backend = __import__("cchardet")
    except ImportError:
        backend = __import__("chardet")

    return cast(Detect, backend.detect)
21+
22+
23+
chardet_detect = _get_chardet_detect()
24+
25+
26+
__all__ = ("DetectResult", "chardet_detect")

src/robotide/lib/isbinary/check.py

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
2+
import enum
3+
import os
4+
from typing import Final, Union
5+
6+
from ._chardet import chardet_detect
7+
8+
9+
# Number of leading bytes sampled from a file by default (2 KiB).
_default_starting_chunk_len: Final = 2048

# Control characters that routinely occur in text: LF, CR, TAB, FF and BS.
_control_chars: Final = b"\n\r\t\f\b"
# Bytes counted as printable ASCII: the controls above plus 0x20-0x7E.
_printable_ascii: Final = _control_chars + bytes(range(32, 127))
# Bytes 0x7F-0xFF, the "high" range used by the binary-likeliness heuristic.
_printable_high_ascii: Final = bytes(range(127, 256))
14+
15+
16+
def get_starting_chunk(
    filename: Union[str, os.PathLike], /, *, chunk_len: int = _default_starting_chunk_len
) -> bytes:
    """
    Read the leading bytes of a file opened in binary mode.

    :param filename: Path of the file to open.
    :param chunk_len: Maximum number of bytes to read; defaults to the
        module-level ``_default_starting_chunk_len``.
    :return: The bytes read from the start of the file.
    """
    with open(filename, "rb") as stream:
        head = stream.read(chunk_len)
    return head
26+
27+
28+
class BinaryLikeliness(enum.Enum):
    """Three-level verdict on how likely a chunk of bytes is binary data."""

    HIGH = enum.auto()
    MID = enum.auto()
    LOW = enum.auto()

    @property
    def likely(self) -> bool:
        """Whether this level is ``MID`` or ``HIGH``."""
        return self in (BinaryLikeliness.MID, BinaryLikeliness.HIGH)
36+
37+
38+
def is_likely_binary(bytes_to_check: bytes, /) -> BinaryLikeliness:
    """
    Grade how likely a chunk of bytes is to be binary data.

    :param bytes_to_check: A chunk of bytes to check.
    :return: ``BinaryLikeliness.HIGH`` or ``BinaryLikeliness.MID`` when the
        byte distribution looks binary, ``BinaryLikeliness.LOW`` otherwise.
        An empty chunk yields ``BinaryLikeliness.LOW``.
    """
    total = len(bytes_to_check)
    if total == 0:
        # Nothing to measure: the ratios below would divide by zero.
        # is_binary_string() treats empty input as text, so report LOW.
        return BinaryLikeliness.LOW

    # Share of bytes that are neither printable ASCII nor one of the common
    # text control characters (everything left after deleting
    # _printable_ascii).
    low_chars = bytes_to_check.translate(None, _printable_ascii)
    nontext_ratio1 = len(low_chars) / total

    # Share of bytes left after deleting everything in _printable_high_ascii.
    # Background, from https://en.wikipedia.org/wiki/UTF-8:
    # If the bytes are random, the chances of a byte with the high bit set
    # starting a valid UTF-8 character is only 6.64%. The chances of finding 7
    # of these without finding an invalid sequence is actually lower than the
    # chance of the first three bytes randomly being the UTF-8 BOM.
    high_chars = bytes_to_check.translate(None, _printable_high_ascii)
    nontext_ratio2 = len(high_chars) / total

    if nontext_ratio1 > 0.9 and nontext_ratio2 > 0.9:
        return BinaryLikeliness.HIGH

    if nontext_ratio1 > 0.3 and nontext_ratio2 < 0.05:
        return BinaryLikeliness.MID
    if nontext_ratio1 > 0.8 and nontext_ratio2 > 0.8:
        return BinaryLikeliness.MID
    return BinaryLikeliness.LOW
68+
69+
70+
def is_decodable_as_unicode(bytes_to_check: bytes, /) -> bool:
    """
    Check whether the chunk decodes cleanly with a confidently detected,
    non-ASCII encoding.

    :param bytes_to_check: A chunk of bytes to check.
    :return: True if the detector reports a non-``"ascii"`` encoding with a
        confidence above 0.9 and the chunk decodes with it, False otherwise
        (including when the detector reports no encoding or no confidence).
    """
    # Ask the charset detector for its best guess.
    detected = chardet_detect(bytes_to_check)
    encoding = detected["encoding"]
    confidence = detected["confidence"]

    # A detector may report None for either field when it cannot make a
    # guess; comparing None with a float would raise TypeError.
    if encoding is None or confidence is None:
        return False

    if confidence > 0.9 and encoding != "ascii":
        try:
            bytes_to_check.decode(encoding=encoding)
        except (LookupError, UnicodeDecodeError):
            # Unknown codec name or undecodable bytes: not decodable.
            return False
        return True

    return False
89+
90+
91+
def has_null_bytes(bytes_to_check: bytes, /) -> bool:
    """
    Look for the marker bytes 0x00 (NUL) and 0xFF in a chunk.

    :param bytes_to_check: A chunk of bytes to check.
    :return: True if the chunk contains a 0x00 or a 0xFF byte, False otherwise.
    """
    return any(marker in bytes_to_check for marker in (b"\x00", b"\xff"))
97+
98+
99+
def is_binary_string(bytes_to_check: bytes, /) -> bool:
    """
    Decide whether a chunk of bytes is binary rather than text.

    Uses a simplified version of the Perl detection algorithm,
    based roughly on Eli Bendersky's translation to Python:
    https://eli.thegreenplace.net/2011/10/19/perls-guess-if-file-is-text-or-binary-implemented-in-python/

    This is biased slightly more in favour of deeming files as text
    files than the Perl algorithm, since all ASCII compatible character
    sets are accepted as text, not just utf-8.

    :param bytes_to_check: A chunk of bytes to check.
    :return: True if the chunk appears to be binary (not text), False otherwise.
    """
    # An empty chunk counts as text.
    if not bytes_to_check:
        return False

    verdict = is_likely_binary(bytes_to_check)
    if verdict == BinaryLikeliness.HIGH:
        # Strong signal: no need to consult the charset detector.
        return True

    text_like = is_decodable_as_unicode(bytes_to_check)

    if verdict.likely:
        # Looks binary, unless it decodes cleanly in a detected encoding.
        return not text_like

    # Looks like text: only the marker-byte check can still flag it, and
    # only when no encoding was confidently detected.
    return (not text_like) and has_null_bytes(bytes_to_check)
130+
131+
132+
def is_binary_file(
    filename: Union[str, os.PathLike], /, *, starting_chunk_len: int = _default_starting_chunk_len
) -> bool:
    """
    Decide whether a file is binary by inspecting its leading bytes.

    :param filename: File to check.
    :param starting_chunk_len: Number of bytes to read; defaults to the
        module-level ``_default_starting_chunk_len``.
    :return: True if it's a binary file, otherwise False. A symbolic link
        whose target does not exist is reported as binary.
    :raises FileNotFoundError: If the file is missing and is not a broken
        symbolic link.
    """
    try:
        head = get_starting_chunk(filename, chunk_len=starting_chunk_len)
    except FileNotFoundError:
        dangling_link = os.path.islink(filename) and not os.path.exists(filename)
        if not dangling_link:
            raise
        return True
    else:
        # Classify the file from its starting chunk.
        return is_binary_string(head)
149+
150+
151+
__all__ = (
152+
"get_starting_chunk",
153+
"BinaryLikeliness",
154+
"is_likely_binary",
155+
"is_decodable_as_unicode",
156+
"has_null_bytes",
157+
"is_binary_string",
158+
"is_binary_file",
159+
)

0 commit comments

Comments
 (0)