Skip to content

Commit 9f8b998

Browse files
authored
Users/mmaitre/include exclude (#63)
1 parent b3d7a79 commit 9f8b998

File tree

5 files changed

+213
-6
lines changed

5 files changed

+213
-6
lines changed

README.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,37 @@ picklescan --url https://huggingface.co/sshleifer/tiny-distilbert-base-cased-dis
4040

4141
To scan Numpy's `.npy` files, pip install the `numpy` package first.
4242

43+
## Usage
44+
45+
### Exit codes
46+
4347
The scanner exit status codes are (a-la [ClamAV](https://www.clamav.net/)):
4448
- `0`: scan did not find malware
4549
- `1`: scan found malware
4650
- `2`: scan failed
4751

52+
### Filtering files and directories
53+
54+
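When scanning directories, files and subdirectories can be filtered using regular expressions (again modeled after ClamAV). Patterns are searched for anywhere in the full path, so anchor them with `^` or `$` when an exact match is needed. Each option can be specified multiple times: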
55+
56+
| Option | Description |
57+
|---|---|
58+
| `--exclude=REGEX` | Don't scan files whose path matches the regex |
59+
| `--include=REGEX` | Only scan files whose path matches the regex |
60+
| `--exclude-dir=REGEX` | Don't descend into directories whose path matches the regex |
61+
| `--include-dir=REGEX` | Only descend into directories whose path matches the regex |
62+
63+
Key behaviors:
64+
- **Excludes always win over includes.** A file or directory matching both an exclude and an include pattern is skipped.
65+
- **Multiple patterns OR together.** A file is included if it matches *any* `--include` pattern.
66+
- **No includes = everything eligible.** Include patterns only narrow the scan when specified.
67+
- **`--exclude-dir` prunes traversal.** The directory and all of its contents are skipped entirely.
68+
69+
```bash
70+
# Only scan .pkl files, skip every directory whose path contains "cache"
71+
picklescan --path models/ --include='\.pkl$' --exclude-dir='cache'
72+
```
73+
4874
## Develop
4975

5076
Create and activate the conda environment ([miniconda](https://docs.conda.io/en/latest/miniconda.html) is sufficient):

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[metadata]
22
name = picklescan
3-
version = 1.0.1
3+
version = 1.0.2
44
author = Matthieu Maitre
55
author_email = mmaitre314@users.noreply.github.com
66
description = Security scanner detecting Python Pickle files performing suspicious actions

src/picklescan/cli.py

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
import argparse
22
import logging
33
import os
4+
import re
45
import sys
56

6-
from .scanner import ScanResult, scan_directory_path
7+
from .scanner import ScanFilter, ScanResult, scan_directory_path
78
from .scanner import scan_file_path
89
from .scanner import scan_url
910
from .scanner import scan_huggingface_model
@@ -40,6 +41,36 @@ def main():
4041
)
4142
parser.add_argument("-g", "--globals", help="list all globals found", action="store_true")
4243
parser.set_defaults(globals=False)
44+
parser.add_argument(
45+
"--exclude",
46+
action="append",
47+
default=[],
48+
metavar="REGEX",
49+
help="Don't scan file names matching regular expression. Can be used multiple times.",
50+
)
51+
parser.add_argument(
52+
"--include",
53+
action="append",
54+
default=[],
55+
metavar="REGEX",
56+
help="Only scan file names matching regular expression. Can be used multiple times.",
57+
)
58+
parser.add_argument(
59+
"--exclude-dir",
60+
action="append",
61+
default=[],
62+
metavar="REGEX",
63+
dest="exclude_dir",
64+
help="Don't scan directory names matching regular expression. Can be used multiple times.",
65+
)
66+
parser.add_argument(
67+
"--include-dir",
68+
action="append",
69+
default=[],
70+
metavar="REGEX",
71+
dest="include_dir",
72+
help="Only scan directory names matching regular expression. Can be used multiple times.",
73+
)
4374
parser.add_argument(
4475
"-l",
4576
"--log",
@@ -53,13 +84,20 @@ def main():
5384
if "log_level" in args and args.log_level is not None:
5485
_log.setLevel(getattr(logging, args.log_level))
5586

87+
scan_filter = ScanFilter(
88+
exclude=[re.compile(p) for p in args.exclude],
89+
include=[re.compile(p) for p in args.include],
90+
exclude_dir=[re.compile(p) for p in args.exclude_dir],
91+
include_dir=[re.compile(p) for p in args.include_dir],
92+
)
93+
5694
try:
5795
if args.path is not None:
5896
path = os.path.abspath(args.path)
5997
if not os.path.exists(path):
6098
raise FileNotFoundError(f"Path {path} does not exist")
6199
if os.path.isdir(path):
62-
scan_result = scan_directory_path(path)
100+
scan_result = scan_directory_path(path, scan_filter=scan_filter)
63101
else:
64102
scan_result = scan_file_path(path)
65103
elif args.url is not None:

src/picklescan/scanner.py

Lines changed: 52 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1-
from dataclasses import dataclass
1+
from dataclasses import dataclass, field
22
from enum import Enum
33
import http.client
44
import io
55
import json
66
import logging
77
import os
88
import pickletools
9+
import re
910
from tarfile import TarError
1011
from tempfile import TemporaryDirectory
1112
from typing import IO, List, Optional, Set, Tuple
@@ -51,6 +52,25 @@ def merge(self, sr: "ScanResult"):
5152
self.scan_err = self.scan_err or sr.scan_err
5253

5354

55+
@dataclass
class ScanFilter:
    """Filtering options for directory scans, modeled after ClamAV clamscan.

    - ``exclude``: regexes matched against the full file path; matching files are skipped.
    - ``include``: regexes matched against the full file path; when set, only matching files are scanned.
    - ``exclude_dir``: regexes matched against directory paths; matching directories are not traversed.
    - ``include_dir``: regexes matched against directory paths; when set, only matching directories are traversed.

    Excludes always take precedence over includes.
    Multiple patterns of the same kind are combined with logical OR.

    Each field accepts compiled patterns or plain regex strings, either as a
    list or as a single value. Everything is normalised to a list of compiled
    patterns at construction time, so consumers can call ``.search`` on every
    entry.

    Raises:
        re.error: if a regex string cannot be compiled.
    """

    exclude: List[re.Pattern] = field(default_factory=list)
    include: List[re.Pattern] = field(default_factory=list)
    exclude_dir: List[re.Pattern] = field(default_factory=list)
    include_dir: List[re.Pattern] = field(default_factory=list)

    def __post_init__(self):
        for name in ("exclude", "include", "exclude_dir", "include_dir"):
            value = getattr(self, name)
            # A bare string would otherwise be iterated character by character.
            if isinstance(value, (str, re.Pattern)):
                value = [value]
            # Compile strings eagerly so a bad regex fails here, not mid-scan.
            setattr(
                self,
                name,
                [re.compile(p) if isinstance(p, str) else p for p in value],
            )
72+
73+
5474
class GenOpsError(Exception):
5575
def __init__(self, msg: str, globals: Optional[Set[Tuple[str, str]]]):
5676
self.msg = msg
@@ -593,12 +613,31 @@ def scan_huggingface_model(repo_id):
593613
return scan_result
594614

595615

596-
def scan_directory_path(path) -> ScanResult:
616+
def _matches_any(patterns: List[re.Pattern], text: str) -> bool:
    """Tell whether any compiled regex in *patterns* is found somewhere in *text*."""
    for regex in patterns:
        if regex.search(text):
            return True
    return False
619+
620+
621+
def scan_directory_path(path, scan_filter: Optional[ScanFilter] = None) -> ScanResult:
597622
_log.debug(f"scan_directory_path({path})")
598623

599624
scan_result = ScanResult([])
600625

601-
for base_path, _, file_names in os.walk(path):
626+
for base_path, dir_names, file_names in os.walk(path):
627+
# --- directory filtering (prune in-place so os.walk skips them) ---
628+
if scan_filter is not None:
629+
filtered_dirs = []
630+
for d in dir_names:
631+
dir_path = os.path.join(base_path, d)
632+
if _matches_any(scan_filter.exclude_dir, dir_path):
633+
_log.debug("Excluding directory %s (matched --exclude-dir)", dir_path)
634+
continue
635+
if scan_filter.include_dir and not _matches_any(scan_filter.include_dir, dir_path):
636+
_log.debug("Skipping directory %s (no --include-dir match)", dir_path)
637+
continue
638+
filtered_dirs.append(d)
639+
dir_names[:] = filtered_dirs
640+
602641
for file_name in file_names:
603642
file_ext = os.path.splitext(file_name)[1]
604643
if (
@@ -608,6 +647,16 @@ def scan_directory_path(path) -> ScanResult:
608647
):
609648
continue
610649
file_path = os.path.join(base_path, file_name)
650+
651+
# --- file filtering ---
652+
if scan_filter is not None:
653+
if _matches_any(scan_filter.exclude, file_path):
654+
_log.debug("Excluding file %s (matched --exclude)", file_path)
655+
continue
656+
if scan_filter.include and not _matches_any(scan_filter.include, file_path):
657+
_log.debug("Skipping file %s (no --include match)", file_path)
658+
continue
659+
611660
_log.debug("Scanning file %s", file_path)
612661
with open(file_path, "rb") as file:
613662
scan_result.merge(scan_bytes(file, file_path, file_ext))

tests/test_scanner.py

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import io
44
import os
55
import pickle
6+
import re
67
import sys
78
from typing import Union
89
from unittest import TestCase
@@ -14,6 +15,7 @@
1415
from picklescan.scanner import (
1516
Global,
1617
SafetyLevel,
18+
ScanFilter,
1719
ScanResult,
1820
_http_get,
1921
_list_globals,
@@ -586,3 +588,95 @@ def test_not_a_pickle_file():
586588
# File is not a valid pickle, but scanner should not error - just report no threats
587589
not_a_pickle = ScanResult([], scanned_files=1, issues_count=0, infected_files=0, scan_err=False)
588590
compare_scan_results(scan_file_path(f"{_root_path}/data/not_a_pickle.bin"), not_a_pickle)
591+
592+
593+
# ---------------------------------------------------------------------------
594+
# Tests for scan_directory_path with ScanFilter (--include/--exclude support)
595+
# ---------------------------------------------------------------------------
596+
597+
598+
def test_scan_directory_exclude_file():
    """Files whose full path matches an --exclude regex are left out of the scan."""
    data_dir = f"{_root_path}/data/"

    # Drop every .zip file; the other supported extensions are still scanned.
    no_zip = ScanFilter(exclude=[re.compile(r"\.zip$")])
    filtered = scan_directory_path(data_dir, scan_filter=no_zip)
    assert filtered.scanned_files > 0

    # Rather than pinning an exact count, check that the filter removed
    # at least one file compared with an unfiltered scan.
    baseline = scan_directory_path(data_dir)
    assert filtered.scanned_files < baseline.scanned_files
609+
610+
611+
def test_scan_directory_include_file():
    """Only files whose path matches an --include regex are scanned."""
    # Narrow the scan down to a single benign pickle.
    only_benign = ScanFilter(include=[re.compile(r"benign0_v3\.pkl$")])
    result = scan_directory_path(f"{_root_path}/data/", scan_filter=only_benign)
    assert result.scanned_files == 1
    assert result.issues_count == 0
618+
619+
620+
def test_scan_directory_exclude_wins_over_include():
    """A file matching both an include and an exclude is skipped (ClamAV semantics)."""
    include_patterns = [re.compile(r"benign0_v3\.pkl$")]
    exclude_patterns = [re.compile(r"benign")]
    conflicting = ScanFilter(include=include_patterns, exclude=exclude_patterns)

    result = scan_directory_path(f"{_root_path}/data/", scan_filter=conflicting)
    assert result.scanned_files == 0
628+
629+
630+
def test_scan_directory_exclude_dir():
    """Directories matching --exclude-dir are never descended into."""
    tests_dir = f"{_root_path}/"

    # Walk the parent tests/ directory with 'data2' pruned.
    without_data2 = scan_directory_path(
        tests_dir,
        scan_filter=ScanFilter(exclude_dir=[re.compile(r"data2")]),
    )
    # Files under data/ must still be picked up.
    assert without_data2.scanned_files > 0

    # Reference scan: allow traversal into data/ only.
    only_data = scan_directory_path(
        tests_dir,
        scan_filter=ScanFilter(include_dir=[re.compile(r"/data$")]),
    )
    assert only_data.scanned_files > 0

    # Both filters should leave the same number of scanned files (data/ only).
    assert without_data2.scanned_files == only_data.scanned_files
643+
644+
645+
def test_scan_directory_include_dir():
    """--include-dir limits traversal to the matching directories."""
    tests_dir = f"{_root_path}/"

    # Descend into data2/ only.
    data2_filter = ScanFilter(include_dir=[re.compile(r"data2")])
    data2_result = scan_directory_path(tests_dir, scan_filter=data2_filter)
    assert data2_result.scanned_files > 0

    # Scan data/ on its own and compare: the file counts are expected to
    # differ, which shows the data2 scan did not also cover data/.
    data_filter = ScanFilter(include_dir=[re.compile(r"/data$")])
    data_result = scan_directory_path(tests_dir, scan_filter=data_filter)
    assert data2_result.scanned_files != data_result.scanned_files
657+
658+
659+
def test_scan_directory_multiple_patterns():
    """Several patterns of one kind are OR-ed: matching any of them is enough."""
    v3_pattern = re.compile(r"benign0_v3\.pkl$")
    v4_pattern = re.compile(r"benign0_v4\.pkl$")
    either = ScanFilter(include=[v3_pattern, v4_pattern])

    result = scan_directory_path(f"{_root_path}/data/", scan_filter=either)
    assert result.scanned_files == 2
    assert result.issues_count == 0
667+
668+
669+
def test_scan_directory_no_filter():
    """An explicit ``scan_filter=None`` matches the result of omitting the argument."""
    data_dir = f"{_root_path}/data/"
    explicit_none = scan_directory_path(data_dir, scan_filter=None)
    omitted = scan_directory_path(data_dir)

    assert explicit_none.scanned_files == omitted.scanned_files
    assert explicit_none.issues_count == omitted.issues_count
675+
676+
677+
def test_scan_directory_empty_filter():
    """A ScanFilter with no patterns scans the same number of files as no filter."""
    data_dir = f"{_root_path}/data/"
    with_empty_filter = scan_directory_path(data_dir, scan_filter=ScanFilter())
    without_filter = scan_directory_path(data_dir)

    assert with_empty_filter.scanned_files == without_filter.scanned_files

0 commit comments

Comments
 (0)