Skip to content

Commit 55db71a

Browse files
authored
Version 2.0.0b2 (#108)
1 parent 9e80f56 commit 55db71a

File tree

11 files changed

+131
-42
lines changed

11 files changed

+131
-42
lines changed

README.rst

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -157,9 +157,8 @@ Acknowledgements
157157

158158
Gary C. Kessler
159159

160-
161160
For use of his File Signature Tables, available at:
162-
http://www.garykessler.net/library/file_sigs.html
161+
https://filesig.search.org/
163162

164163
Freedesktop.org
165164

puremagic/main.py

Lines changed: 36 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,9 @@
99
Acknowledgements
1010
Gary C. Kessler
1111
For use of his File Signature Tables, available at:
12-
http://www.garykessler.net/library/file_sigs.html
12+
https://filesig.search.org/
1313
"""
1414

15-
from __future__ import annotations
16-
1715
import json
1816
import os
1917
from binascii import unhexlify
@@ -22,10 +20,10 @@
2220
from pathlib import Path
2321

2422
if os.getenv("PUREMAGIC_DEEPSCAN") != "0":
25-
from puremagic.scanners import zip_scanner, pdf_scanner, text_scanner
23+
from puremagic.scanners import zip_scanner, pdf_scanner, text_scanner, json_scanner, python_scanner
2624

2725
__author__ = "Chris Griffith"
28-
__version__ = "2.0.0b1"
26+
__version__ = "2.0.0b2"
2927
__all__ = [
3028
"magic_file",
3129
"magic_string",
@@ -262,7 +260,7 @@ def ext_from_filename(filename: os.PathLike | str) -> str:
262260
all_exts = [x.extension for x in chain(magic_header_array, magic_footer_array)]
263261

264262
if base[-4:].startswith("."):
265-
# For double extensions like like .tar.gz
263+
# For double extensions like .tar.gz
266264
long_ext = base[-4:] + ext
267265
if long_ext in all_exts:
268266
return long_ext
@@ -320,7 +318,7 @@ def from_stream(stream, mime: bool = False, filename: os.PathLike | str | None =
320318
def magic_file(filename: os.PathLike | str) -> list[PureMagicWithConfidence]:
321319
"""
322320
Returns list of (num_of_matches, array_of_matches)
323-
arranged highest confidence match first.
321+
arranged by highest confidence match first.
324322
325323
:param filename: path to file
326324
:return: list of possible matches, highest confidence first
@@ -341,7 +339,7 @@ def magic_file(filename: os.PathLike | str) -> list[PureMagicWithConfidence]:
341339
def magic_string(string, filename: os.PathLike | str | None = None) -> list[PureMagicWithConfidence]:
342340
"""
343341
Returns tuple of (num_of_matches, array_of_matches)
344-
arranged highest confidence match first
342+
arranged by highest confidence match first
345343
If filename is provided it will be used in the computation.
346344
347345
:param string: string representation to check
@@ -362,7 +360,7 @@ def magic_stream(
362360
filename: os.PathLike | None = None,
363361
) -> list[PureMagicWithConfidence]:
364362
"""Returns tuple of (num_of_matches, array_of_matches)
365-
arranged highest confidence match first
363+
arranged by highest confidence match first
366364
If filename is provided it will be used in the computation.
367365
368366
:param stream: stream representation to check
@@ -393,14 +391,27 @@ def _single_deep_scan(
393391
return zip_scanner.main(filename, head, foot)
394392
case pdf_scanner.match_bytes:
395393
return pdf_scanner.main(filename, head, foot)
396-
case None | b"":
397-
for scanner in (text_scanner, pdf_scanner):
398-
result = scanner.main(filename, head, foot)
399-
if result:
400-
return result
394+
395+
# First match wins, so text_scanner should always be last
396+
for scanner in (pdf_scanner, python_scanner, json_scanner):
397+
result = scanner.main(filename, head, foot)
398+
if result:
399+
return result
401400
return None
402401

403402

403+
def _catch_all_deep_scan(
    filename: os.PathLike | str,
    head=None,
    foot=None,
):
    """
    Last-resort deep scan that hands the file to the text scanner.

    :param filename: path to the file; anything that is not an os.PathLike is wrapped in a Path
    :param head: passed through unchanged to text_scanner.main
    :param foot: passed through unchanged to text_scanner.main
    :return: None when the PUREMAGIC_DEEPSCAN environment variable is "0",
             otherwise whatever text_scanner.main returns
    """
    deep_scan_disabled = os.getenv("PUREMAGIC_DEEPSCAN") == "0"
    if deep_scan_disabled:
        return None
    target = filename if isinstance(filename, os.PathLike) else Path(filename)
    return text_scanner.main(target, head, foot)
413+
414+
404415
def _run_deep_scan(
405416
matches: list[PureMagicWithConfidence],
406417
filename: os.PathLike | str,
@@ -425,10 +436,18 @@ def _run_deep_scan(
425436
name=result.name,
426437
)
427438
]
439+
try:
440+
result = _catch_all_deep_scan(filename, head, foot)
441+
except Exception:
442+
pass
443+
else:
444+
if result:
445+
return [result]
428446
if raise_on_none:
429447
raise PureError("Could not identify file")
430448

431449
for pure_magic_match in matches:
450+
# noinspection PyBroadException
432451
try:
433452
result = _single_deep_scan(pure_magic_match.byte_match, filename, head, foot)
434453
except Exception:
@@ -484,7 +503,7 @@ def command_line_entry(*args):
484503
if i == 0:
485504
print("\n\tBest Match")
486505
else:
487-
print(f"\tAlertnative Match #{i}")
506+
print(f"\tAlternative Match #{i}")
488507
print(f"\tName: {result.name}")
489508
print(f"\tConfidence: {int(result.confidence * 100)}%")
490509
print(f"\tExtension: {result.extension}")
@@ -526,14 +545,14 @@ def what(file: os.PathLike | str | None, h: bytes | None = None, imghdr_strict:
526545
527546
imghdr_strict enables bug-for-bug compatibility between imghdr.what() and puremagic.what() when the imghdr returns
528547
a match but puremagic returns None. We believe that imghdr is delivering a "false positive" in each of these
529-
scenerios but we want puremagic.what()'s default behavior to match imghdr.what()'s false positives so we do not
548+
scenarios, but we want puremagic.what()'s default behavior to match imghdr.what()'s false positives so we do not
530549
break existing applications.
531550
532551
If imghdr_strict is True (the default) then a lookup will be done to deliver a matching result on all known false
533552
positives. If imghdr_strict is False then puremagic's algorithms will determine the image type. True is more
534553
compatible while False is more correct.
535554
536-
NOTE: This compatibility effort only deals false positives and we are not interested to track the opposite
555+
NOTE: This compatibility effort only deals with false positives, and we are not interested in tracking the opposite
537556
situation where puremagic delivers a match while imghdr would have returned None. Also, puremagic.what() can
538557
recognize many more file types than the twelve image file types that imghdr focused on.
539558
"""

puremagic/scanners/json_scanner.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import os
2+
import json
3+
4+
from puremagic.scanners.helpers import Match
5+
6+
match_bytes = b"{"
7+
8+
9+
def main(file_path: os.PathLike | str, head: bytes, foot: bytes) -> Match | None:
    """
    Identify a JSON object file by fully parsing it.

    A cheap pre-check on ``head`` and ``foot`` (ignoring surrounding whitespace)
    requires the data to open with ``{`` and close with ``}`` before the whole
    file is read and parsed.

    :param file_path: path to the file to parse
    :param head: bytes that must start with ``{`` once stripped
    :param foot: bytes that must end with ``}`` once stripped
    :return: a JSON Match with confidence 1.0, or None if the file is not a readable, valid JSON object
    """
    if not (head.strip().startswith(b"{") and foot.strip().endswith(b"}")):
        return None
    try:
        with open(file_path, "rb") as file:
            json.load(file)
    except (ValueError, OSError, RecursionError):
        # ValueError covers json.JSONDecodeError and also the UnicodeDecodeError
        # raised when the bytes are not valid UTF-8/16/32; RecursionError is
        # raised for pathologically nested documents. All of them simply mean
        # "not a JSON file we can vouch for".
        return None
    return Match(
        extension=".json",
        name="JSON File",
        mime_type="application/json",
        confidence=1.0,
    )

puremagic/scanners/pdf_scanner.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,9 @@
1-
from __future__ import annotations
2-
3-
from typing import Optional
4-
51
from puremagic.scanners.helpers import Match
62

73
match_bytes = b"%PDF"
84

95

10-
def main(_, head: bytes, foot: bytes) -> Optional[Match]:
6+
def main(_, head: bytes, foot: bytes) -> Match | None:
117
if b"%PDF-" in head and b"startxref" in foot:
128
return Match(".pdf", "PDF document", "application/pdf")
139
return None
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
import ast
2+
import os
3+
4+
from puremagic.scanners.helpers import Match
5+
6+
7+
def main(file_path: os.PathLike | str, *_, **__) -> Match | None:
    """
    Identify a Python source file by parsing it with ``ast``.

    Files larger than 1,000,000 bytes are skipped, as are files smaller than
    100 bytes unless the path ends with ``.py``.

    :param file_path: path to the file to parse
    :return: a Python Match with confidence 1.0, or None if the file cannot be read or does not parse
    """
    try:
        file_size = os.path.getsize(file_path)
    except OSError:
        # Missing or unreadable path: report "no match" instead of raising.
        return None
    if file_size > 1_000_000:
        return None
    if not str(file_path).endswith(".py") and file_size < 100:
        return None

    # Best-effort detection: any failure to read or parse just means "not Python".
    # noinspection PyBroadException
    try:
        # Read raw bytes so ast.parse applies Python's own source-encoding rules
        # (UTF-8 default, BOM, PEP 263 coding cookie) rather than the locale's
        # default text encoding.
        with open(file_path, "rb") as file:
            content = file.read()
        ast.parse(content)
    except Exception:
        return None
    return Match(
        extension=".py",
        name="Python Script",
        mime_type="text/x-python",
        confidence=1.0,
    )

puremagic/scanners/text_scanner.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,5 @@
1-
from __future__ import annotations
2-
31
import os
42
import re
5-
from typing import Optional
63

74
from puremagic.scanners.helpers import Match
85

@@ -11,7 +8,7 @@
118
cr_pattern = re.compile(rb"\r(?!\n)")
129

1310

14-
def main(file_path: os.PathLike | str, _, __) -> Optional[Match]:
11+
def main(file_path: os.PathLike | str, _, __) -> Match | None:
1512
with open(file_path, "rb") as file:
1613
head = file.read(1_000_000)
1714
if len(head) < 8:

puremagic/scanners/zip_scanner.py

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,5 @@
1-
from __future__ import annotations
2-
31
import re
42
import os
5-
from typing import Optional
63
from zipfile import ZipFile
74

85
from puremagic.scanners.helpers import Match
@@ -13,7 +10,7 @@
1310
application_re = re.compile(b"<Application>(.*)</Application>")
1411

1512

16-
def open_office_check(internal_files: list[str], zip_file: ZipFile, extension: str | None = None) -> Optional[Match]:
13+
def open_office_check(internal_files: list[str], zip_file: ZipFile, extension: str | None = None) -> Match | None:
1714
if "content.xml" not in internal_files:
1815
return None
1916
if "mimetype" not in internal_files:
@@ -35,7 +32,7 @@ def open_office_check(internal_files: list[str], zip_file: ZipFile, extension: s
3532
return None
3633

3734

38-
def office_check(internal_files: list[str], zip_file: ZipFile, extension: str | None = None) -> Optional[Match]:
35+
def office_check(internal_files: list[str], zip_file: ZipFile, extension: str | None = None) -> Match | None:
3936
if "[Content_Types].xml" not in internal_files:
4037
return None
4138
if "docProps/app.xml" not in internal_files:
@@ -107,7 +104,7 @@ def office_check(internal_files: list[str], zip_file: ZipFile, extension: str |
107104
return None
108105

109106

110-
def jar_check(internal_files: list[str], zip_file: ZipFile) -> Optional[Match]:
107+
def jar_check(internal_files: list[str], zip_file: ZipFile) -> Match | None:
111108

112109
if "META-INF/MANIFEST.MF" not in internal_files:
113110
return None
@@ -119,21 +116,21 @@ def jar_check(internal_files: list[str], zip_file: ZipFile) -> Optional[Match]:
119116
return None
120117

121118

122-
def apk_check(internal_files: list[str]) -> Optional[Match]:
119+
def apk_check(internal_files: list[str]) -> Match | None:
123120
if "META-INF/MANIFEST.MF" not in internal_files:
124121
return None
125122
if "AndroidManifest.xml" in internal_files:
126123
return Match(".apk", "Android Package", "application/vnd.android.package-archive")
127124
return None
128125

129126

130-
def xpi_check(internal_files: list[str], zip_file: ZipFile) -> Optional[Match]:
127+
def xpi_check(internal_files: list[str], zip_file: ZipFile) -> Match | None:
131128
if "install.rdf" in internal_files and b"mozilla:install-manifest" in zip_file.read("install.rdf"):
132129
return Match(".xpi", "Mozilla Firefox Add-on", "application/x-xpinstall")
133130
return None
134131

135132

136-
def fb2_check(internal_files: list[str], zip_file: ZipFile, file_path: os.PathLike) -> Optional[Match]:
133+
def fb2_check(internal_files: list[str], zip_file: ZipFile, file_path: os.PathLike) -> Match | None:
137134
if (
138135
len(internal_files) == 1
139136
and internal_files[0].endswith(".fb2")
@@ -148,7 +145,7 @@ def fb2_check(internal_files: list[str], zip_file: ZipFile, file_path: os.PathLi
148145
return None
149146

150147

151-
def cbz_check(internal_files: list[str], extension: str) -> Optional[Match]:
148+
def cbz_check(internal_files: list[str], extension: str) -> Match | None:
152149
if extension != "cbz":
153150
return None
154151
image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif")
@@ -158,7 +155,7 @@ def cbz_check(internal_files: list[str], extension: str) -> Optional[Match]:
158155
return Match(".cbz", "Comic Book Archive", "application/vnd.comicbook+zip")
159156

160157

161-
def main(file_path: os.PathLike, _, __) -> Optional[Match]:
158+
def main(file_path: os.PathLike, _, __) -> Match | None:
162159
extension = str(file_path).split(".")[-1].lower()
163160
if extension == "zip" and not str(file_path).endswith(".fb2.zip"):
164161
return Match(".zip", "ZIP archive", "application/zip")

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,11 @@ lint.extend-ignore = [
3838
"EXE001",
3939
"F401",
4040
"F403",
41+
"FA102",
4142
"FBT",
4243
"FIX002",
4344
"I001",
45+
"INP001",
4446
"N817",
4547
"PERF401",
4648
"PGH003",

test/resources/system/test.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"Test": "Script"}

test/resources/system/test.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Very simple test python file
2+
3+
4+
def main():
5+
print("Hello World")
6+
7+
8+
if __name__ == "__main__":
9+
main()

0 commit comments

Comments
 (0)