Skip to content

Commit 971c780

Browse files
committed
Add dynamic checks
1 parent 2a98164 commit 971c780

File tree

9 files changed

+322
-56
lines changed

9 files changed

+322
-56
lines changed

puremagic/magic_data.json

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1747,9 +1747,6 @@
17471747
["0a0d0d0a", 0, ".pcapng", "application/octet-stream", "pcapng capture file"],
17481748
["05000000", 0, "", "", "INFO2 Windows recycle bin"],
17491749
["34cdb2a1", 0, "", "", "Tcpdump capture file"],
1750-
["fffe0000", 0, "", "", "UTF-32|UCS-4 file"],
1751-
["efbbbf", 0, "", "", "UTF8 file"],
1752-
["feff", 0, "", "", "UTF-16|UCS-2 file"],
17531750
["6f3c", 0, "", "", "SMS text (SIM)"],
17541751
["aced", 0, "", "", "Java serialization data"],
17551752
[

puremagic/main.py

Lines changed: 29 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
import puremagic
2323

2424
if os.getenv("PUREMAGIC_DEEPSCAN") != "0":
25-
from puremagic.scanners import (zip_scanner, pdf_scanner, text_scanner, json_scanner, python_scanner, sndhdr_scanner)
25+
from puremagic.scanners import zip_scanner, pdf_scanner, text_scanner, json_scanner, python_scanner, sndhdr_scanner
2626

2727
__author__ = "Chris Griffith"
2828
__version__ = "2.0.0b5"
@@ -68,6 +68,10 @@ class PureError(LookupError):
6868
"""Do not have that type of file in our databanks"""
6969

7070

71+
class PureValueError(ValueError):
    """
    Invalid input.

    Raised in place of a bare ``ValueError`` when the data handed to
    puremagic is empty (message: "Input was empty"). Because it subclasses
    ``ValueError``, existing ``except ValueError`` handlers keep working.
    """
73+
74+
7175
def magic_data(
7276
filename: os.PathLike | str = os.path.join(here, "magic_data.json"),
7377
) -> tuple[list[PureMagic], list[PureMagic], list[PureMagic], dict[bytes, list[PureMagic]]]:
@@ -193,7 +197,7 @@ def identify_all(header: bytes, footer: bytes, ext=None) -> list[PureMagicWithCo
193197
def perform_magic(header: bytes, footer: bytes, mime: bool, ext=None, filename=None) -> str:
194198
"""Discover what type of file it is based on the incoming string"""
195199
if not header:
196-
raise ValueError("Input was empty")
200+
raise PureValueError("Input was empty")
197201
infos = identify_all(header, footer, ext)
198202
if filename and os.getenv("PUREMAGIC_DEEPSCAN") != "0":
199203
results = run_deep_scan(infos, filename, header, footer, raise_on_none=True)
@@ -322,7 +326,7 @@ def magic_file(filename: os.PathLike | str) -> list[PureMagicWithConfidence]:
322326
"""
323327
head, foot = file_details(filename)
324328
if not head:
325-
raise ValueError("Input was empty")
329+
raise PureValueError("Input was empty")
326330
try:
327331
info = identify_all(head, foot, ext_from_filename(filename))
328332
except PureError:
@@ -344,7 +348,7 @@ def magic_string(string, filename: os.PathLike | str | None = None) -> list[Pure
344348
:return: list of possible matches, highest confidence first
345349
"""
346350
if not string:
347-
raise ValueError("Input was empty")
351+
raise PureValueError("Input was empty")
348352
head, foot = string_details(string)
349353
ext = ext_from_filename(filename) if filename else None
350354
info = identify_all(head, foot, ext)
@@ -366,7 +370,7 @@ def magic_stream(
366370
"""
367371
head, foot = stream_details(stream)
368372
if not head:
369-
raise ValueError("Input was empty")
373+
raise PureValueError("Input was empty")
370374
ext = ext_from_filename(filename) if filename else None
371375
info = identify_all(head, foot, ext)
372376
info.sort(key=lambda x: x.confidence, reverse=True)
@@ -389,17 +393,13 @@ def single_deep_scan(
389393
return zip_scanner.main(filename, head, foot)
390394
case pdf_scanner.match_bytes:
391395
return pdf_scanner.main(filename, head, foot)
392-
case (
393-
sndhdr_scanner.hcom_match_bytes
394-
| sndhdr_scanner.fssd_match_bytes
395-
| sndhdr_scanner.sndr_match_bytes
396-
):
396+
case sndhdr_scanner.hcom_match_bytes | sndhdr_scanner.fssd_match_bytes | sndhdr_scanner.sndr_match_bytes:
397397
# sndr is a loose confidence and other results may be better
398398
result = sndhdr_scanner.main(filename, head, foot)
399399
if result and result.confidence > confidence:
400400
return result
401401

402-
# The first match wins, so text_scanner should always be the last
402+
# The first match wins
403403
for scanner in (pdf_scanner, python_scanner, json_scanner):
404404
result = scanner.main(filename, head, foot)
405405
if result:
@@ -446,7 +446,7 @@ def run_deep_scan(
446446
try:
447447
result = catch_all_deep_scan(filename, head, foot)
448448
except Exception:
449-
pass
449+
raise
450450
else:
451451
if result:
452452
return [result]
@@ -491,19 +491,29 @@ def command_line_entry(*args):
491491
help="Return the mime type instead of file type",
492492
)
493493
parser.add_argument("-v", "--verbose", action="store_true", dest="verbose", help="Print verbose output")
494-
parser.add_argument("files", nargs="+")
494+
parser.add_argument("files", nargs="+", type=Path)
495495
parser.add_argument("--version", action="version", version=puremagic.__version__)
496496
args = parser.parse_args(args if args else sys.argv[1:])
497497

498498
for fn in args.files:
499-
if not os.path.exists(fn):
499+
if not fn.exists():
500500
print(f"File '{fn}' does not exist!")
501501
continue
502-
try:
503-
print(f"'{fn}' : {from_file(fn, args.mime)}")
504-
except PureError:
505-
print(f"'{fn}' : could not be Identified")
506-
continue
502+
if fn.is_dir():
503+
for file in fn.iterdir():
504+
if not file.is_file():
505+
continue
506+
try:
507+
print(f"'{file}' : {from_file(file, args.mime)}")
508+
except (PureError, PureValueError):
509+
print(f"'{file}' : could not be Identified")
510+
continue
511+
else:
512+
try:
513+
print(f"'{fn}' : {from_file(fn, args.mime)}")
514+
except (PureError, PureValueError):
515+
print(f"'{fn}' : could not be Identified")
516+
continue
507517
if args.verbose:
508518
matches = magic_file(fn)
509519
print(f"Total Possible Matches: {len(matches)}")

puremagic/scanners/json_scanner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88

99
def main(file_path: os.PathLike | str, head: bytes, foot: bytes) -> Match | None:
10-
if not (head.strip().startswith(b"{") and foot.strip().endswith(b"}")):
10+
if not (head.strip().startswith(b"{") and foot.strip().endswith(b"}")) and not (head.strip().startswith(b"[") and foot.strip().endswith(b"]")):
1111
return None
1212
try:
1313
with open(file_path, "rb") as file:
Lines changed: 84 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,106 @@
11
import ast
22
import os
3+
import re
34

45
from puremagic.scanners.helpers import Match
56

7+
# Keyword probes consulted by is_substantial_python_code(); each pattern is
# counted at most once no matter how often it occurs in the text.
#
# NOTE: these must be raw strings. In an ordinary string literal "\b" is the
# backspace character (0x08), not the regex word-boundary assertion, so the
# non-raw form never matches normal source text.
python_common_keywords = [
    re.compile(r"\bdef\b"),
    re.compile(r"\bclass\b"),
    re.compile(r"\bimport\b"),
    re.compile(r"\belif\b"),
    re.compile(r"\bwhile\b"),
    re.compile(r"\bexcept\b"),
    re.compile(r"\bfinally\b"),
    re.compile(r"\breturn\b"),
    re.compile(r"\byield\b"),
    re.compile(r"\blambda\b"),
    re.compile(r"\bTrue\b"),
    re.compile(r"\bFalse\b"),
    re.compile(r"\bNone\b"),
    re.compile(r"\b__version__\b"),
    re.compile(r"__main__"),
]
624

7-
def main(file_path: os.PathLike | str, *_, **__) -> Match | None:
25+
# Structural probes for Python source: each regex targets one common
# construct. Compiled once at import time, in the order listed.
python_patterns = [
    re.compile(regex)
    for regex in (
        r"\bdef\s+\w+\s*\(",  # Function definitions
        r"\bclass\s+\w+\s*[\(:]",  # Class definitions
        r"\bimport\s+\w+",  # Import statements
        r"\bfrom\s+\w+\s+import",  # From-import statements
        r"\bif\s+.*:",  # If statements
        r"\bfor\s+\w+\s+in\s+.*:",  # For loops
        r"\bwhile\s+.*:",  # While loops
        r"\btry\s*:",  # Try blocks
        r"\.append\(",  # Method calls
        r"\.join\(",  # String operations
        r"print\s*\(",  # Print statements
    )
]
38+
39+
40+
def main(file_path: os.PathLike | str, _, __) -> Match | None:
    """
    Decide whether ``file_path`` holds Python source code.

    The two unnamed positional parameters are accepted and ignored so this
    scanner can be invoked the same way as the others
    (``scanner.main(filename, head, foot)``).

    :param file_path: path of the file to inspect
    :return: a ``Match`` for ``.py`` / "Python Script" with confidence 1.0,
        or ``None`` when any of the following holds:

        * the file is larger than 1,000,000 bytes;
        * the name does not end in ``.py`` and the file is under 100 bytes;
        * the file cannot be opened or decoded as UTF-8;
        * the content does not parse as Python;
        * the name does not end in ``.py`` and
          ``is_substantial_python_code`` rejects the content.
    :raises OSError: if ``os.path.getsize`` fails (that call is made outside
        the ``try`` block, so its errors propagate to the caller)
    """
    file_size = os.path.getsize(file_path)
    has_py_suffix = str(file_path).endswith(".py")

    if file_size > 1_000_000:
        return None
    if not has_py_suffix and file_size < 100:
        return None

    try:
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()

        # Parse to ensure it's valid Python syntax
        ast.parse(content)
    except (SyntaxError, ValueError, RecursionError, OSError):
        # ValueError covers UnicodeDecodeError and is also what ast.parse
        # raises for source containing null bytes on some Python versions.
        # RecursionError can be raised for very deeply nested input.
        # OSError covers PermissionError and other read failures.
        # The scanner is fed arbitrary files, so none of these may escape.
        return None

    # Without a .py suffix, merely parsing is weak evidence (plenty of plain
    # text is syntactically valid Python), so require real code indicators.
    if not has_py_suffix and not is_substantial_python_code(content):
        return None

    return Match(
        extension=".py",
        name="Python Script",
        mime_type="text/x-python",
        confidence=1.0,
    )
)
67+
68+
69+
def is_substantial_python_code(content: str) -> bool:
    """
    Heuristically judge whether ``content`` looks like meaningful Python code.

    The text passes only if all three checks succeed:

    1. at least two lines remain non-empty after comment stripping;
    2. at least two distinct entries of ``python_common_keywords`` match;
    3. at least one entry of ``python_patterns`` matches.

    :param content: full text of the candidate file
    :return: True when every check above passes, otherwise False
    """
    # Basic comment removal: cut each line at its first "#". This does not
    # account for "#" characters that appear inside string literals.
    stripped_lines = (raw.split("#")[0].strip() for raw in content.splitlines())
    code_lines = [text for text in stripped_lines if text]

    # If too few substantial lines, it's probably not real code
    if len(code_lines) < 2:
        return False

    code_text = " ".join(code_lines)

    # Each keyword pattern contributes at most one hit, however often it occurs
    keyword_hits = sum(1 for keyword in python_common_keywords if keyword.search(code_text))
    if keyword_hits < 2:
        return False

    # A single structural pattern is enough once the keyword bar is met
    return any(pattern.search(code_text) for pattern in python_patterns)

puremagic/scanners/sndhdr_scanner.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ def test_hcom(head: bytes) -> Optional[Match]:
3030
)
3131
return None
3232

33+
3334
def main(_, head: bytes, __) -> Optional[Match]:
3435
try:
3536
rate = get_short_le(head[2:4])

0 commit comments

Comments
 (0)