Skip to content

Commit 87629cb

Browse files
authored
Improve numpy compatibility and non-pickle handling (#58)
Address #57
1 parent b0292af commit 87629cb

File tree

4 files changed

+51
-6
lines changed

4 files changed

+51
-6
lines changed

src/picklescan/scanner.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -400,11 +400,14 @@ def scan_pickle_bytes(data: IO[bytes], file_id, multiple_pickles=True) -> ScanRe
400400
try:
401401
raw_globals = _list_globals(data, multiple_pickles)
402402
except GenOpsError as e:
403-
_log.error(f"ERROR: parsing pickle in {file_id}: {e}", exc_info=_log.isEnabledFor(logging.DEBUG))
404403
if e.globals is not None:
404+
# Found some globals before error - could be a malicious partial pickle
405+
_log.error(f"ERROR: parsing pickle in {file_id}: {e}", exc_info=_log.isEnabledFor(logging.DEBUG))
405406
return _build_scan_result_from_raw_globals(e.globals, file_id, scan_err=True)
406407
else:
407-
return ScanResult([], scan_err=True)
408+
# No globals found - likely not a pickle file at all
409+
_log.warning(f"WARNING: could not parse {file_id} as pickle: {e}")
410+
return ScanResult([], scanned_files=1, scan_err=False)
408411

409412
_log.debug("Global imports in %s: %s", file_id, raw_globals)
410413

@@ -487,8 +490,12 @@ def scan_numpy(data: IO[bytes], file_id) -> ScanResult:
487490
# .npy file
488491

489492
version = np.lib.format.read_magic(data)
490-
np.lib.format._check_version(version)
491-
_, _, dtype = np.lib.format._read_array_header(data, version)
493+
if version == (1, 0):
494+
_, _, dtype = np.lib.format.read_array_header_1_0(data)
495+
elif version in [(2, 0), (3, 0)]:
496+
_, _, dtype = np.lib.format.read_array_header_2_0(data)
497+
else:
498+
raise ValueError(f"Unsupported numpy format version: {version}")
492499

493500
if dtype.hasobject:
494501
return scan_pickle_bytes(data, file_id)

tests/data/not_a_pickle.bin

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
cmod
2+

tests/init_data_files.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -469,6 +469,26 @@ def initialize_corrupt_zip_file_crc(path: str, file_name: str, data: bytes):
469469
print(f"Initialized file {path}.")
470470

471471

472+
def initialize_not_a_pickle_file(path: str):
473+
"""Create a binary file that starts with pickle GLOBAL opcode (0x63 = 'c') but contains
474+
invalid UTF-8 bytes, causing a "'utf-8' codec can't decode byte" error.
475+
This reproduces the issue seen with files like vitpose_h_wholebody_data.bin.
476+
"""
477+
if os.path.exists(path):
478+
print(f"File {path} already exists, skipping initialization.")
479+
return
480+
481+
# Byte 0x63 is 'c' which is the GLOBAL opcode in pickle protocol 0/1/2
482+
# The GLOBAL opcode expects to read two newline-terminated strings (module and name)
483+
# We provide a valid module name "mod" followed by newline, then an invalid UTF-8 byte (0xf8)
484+
# This triggers: 'utf-8' codec can't decode byte 0xf8 in position 0: invalid start byte
485+
data = b"cmod\n\xf8\n"
486+
487+
with open(path, "wb") as file:
488+
file.write(data)
489+
print(f"Initialized file {path}.")
490+
491+
472492
def initialize_pickle_files():
473493
os.makedirs(f"{_root_path}/data", exist_ok=True)
474494

@@ -796,6 +816,8 @@ def initialize_pickle_files():
796816
initialize_pickle_file_from_reduce("io_FileIO.pkl", reduce_io_FileIO)
797817
initialize_pickle_file_from_reduce("urllib_request_urlopen.pkl", reduce_urllib_request_urlopen)
798818

819+
initialize_not_a_pickle_file(f"{_root_path}/data/not_a_pickle.bin")
820+
799821

800822
def initialize_numpy_files():
801823
import numpy as np

tests/test_scanner.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -287,7 +287,9 @@ def test_scan_file_path():
287287
malicious10 = ScanResult([Global("__builtin__", "exec", SafetyLevel.Dangerous)], 1, 1, 1)
288288
compare_scan_results(scan_file_path(f"{_root_path}/data/malicious10.pkl"), malicious10)
289289

290-
bad_pytorch = ScanResult([], 0, 0, 0, True)
290+
# bad_pytorch.pt is a PNG file with .pt extension - scanner should recognize it's not a valid pickle
291+
# and report it as scanned (scanned_files=1) but without errors (scan_err=False) since no threats were found
292+
bad_pytorch = ScanResult([], 1, 0, 0, False)
291293
compare_scan_results(scan_file_path(f"{_root_path}/data/bad_pytorch.pt"), bad_pytorch)
292294

293295
malicious14 = ScanResult([Global("runpy", "_run_code", SafetyLevel.Dangerous)], 1, 1, 1)
@@ -508,9 +510,11 @@ def test_scan_directory_path():
508510
Global("builtins", "eval", SafetyLevel.Dangerous),
509511
Global("builtins", "eval", SafetyLevel.Dangerous),
510512
],
511-
scanned_files=42,
513+
scanned_files=44,
512514
issues_count=43,
513515
infected_files=37,
516+
# scan_err=True because some files (broken_model.pkl, malicious-invalid-bytes.pkl) have partial parsing errors
517+
scan_err=True,
514518
)
515519
compare_scan_results(scan_directory_path(f"{_root_path}/data/"), sr)
516520

@@ -556,3 +560,13 @@ def test_invalid_bytes_err():
556560
scan_pickle_bytes(file, f"{_root_path}/data/malicious-invalid-bytes.pkl"),
557561
malicious_invalid_bytes,
558562
)
563+
564+
565+
def test_not_a_pickle_file():
566+
"""Test scanning a binary file that starts with the pickle GLOBAL opcode but contains invalid UTF-8 bytes.
567+
This reproduces the 'utf-8' codec can't decode byte error seen with files like vitpose_h_wholebody_data.bin.
568+
The scanner should handle this gracefully: file is scanned, no threats found, no error.
569+
"""
570+
# File is not a valid pickle, but scanner should not error - just report no threats
571+
not_a_pickle = ScanResult([], scanned_files=1, issues_count=0, infected_files=0, scan_err=False)
572+
compare_scan_results(scan_file_path(f"{_root_path}/data/not_a_pickle.bin"), not_a_pickle)

0 commit comments

Comments
 (0)