Skip to content

Commit 9812b2e

Browse files
committed
feat: add pyperscan support
Hyperscan runs all version checkers simultaneously on a file, which reduces processing time. The pyperscan package is used instead of the better-known hyperscan package because pyperscan allows a tag to be attached to each pattern. This feature makes it easy to retrieve the checker associated with the matched pattern. On my local machine, running a scan on an embedded firmware takes 220 seconds with pyperscan instead of 326 seconds. However, pyperscan is slower on a single file and unsupported on Windows, so add a --pyperscan option (disabled by default). Fix #2485 Signed-off-by: Fabrice Fontaine <[email protected]>
1 parent c5bc81b commit 9812b2e

File tree

8 files changed

+122
-23
lines changed

8 files changed

+122
-23
lines changed

.github/actions/spelling/allow.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,7 @@ https
310310
hunspell
311311
hur
312312
hwloc
313+
hyperscan
313314
i
314315
icecast
315316
icu
@@ -618,6 +619,7 @@ pybabel
618619
pycon
619620
pycqa
620621
pypa
622+
pyperscan
621623
pypi
622624
pytest
623625
pythex
@@ -796,6 +798,7 @@ uuid
796798
uwsgi
797799
v
798800
varnish
801+
vectorscan
799802
venv
800803
VEXs
801804
vextype

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -543,6 +543,7 @@ Checkers:
543543
<a href="https://github.com/intel/cve-bin-tool/blob/main/doc/MANUAL.md#-s-skips---skips-skips">-s SKIPS, --skips SKIPS</a>
544544
comma-separated list of checkers to disable
545545
<a href="https://github.com/intel/cve-bin-tool/blob/main/doc/MANUAL.md#-r-checkers---runs-checkers">-r RUNS, --runs RUNS</a> comma-separated list of checkers to enable
546+
<a href="https://github.com/intel/cve-bin-tool/blob/main/doc/MANUAL.md#--pyperscan">--pyperscan</a> use pyperscan for binary checkers (unsupported on Windows)
546547

547548
Database Management:
548549
--import-json IMPORT_JSON

cve_bin_tool/cli.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -489,6 +489,12 @@ def main(argv=None):
489489
help="comma-separated list of checkers to enable",
490490
default="",
491491
)
492+
checker_group.add_argument(
493+
"--pyperscan",
494+
action="store_true",
495+
help="Use pyperscan for binary checkers (unsupported on Windows)",
496+
default=False,
497+
)
492498

493499
database_group = parser.add_argument_group("Database Management")
494500
database_group.add_argument(
@@ -1129,10 +1135,12 @@ def main(argv=None):
11291135
version_scanner = VersionScanner(
11301136
should_extract=args["extract"],
11311137
exclude_folders=args["exclude"],
1138+
logger=LOGGER,
11321139
error_mode=error_mode,
11331140
validate=not args["disable_validation_check"],
11341141
sources=enabled_sources,
11351142
no_scan=args["no_scan"],
1143+
pyperscan=args["pyperscan"],
11361144
)
11371145
version_scanner.remove_skiplist(skips)
11381146
LOGGER.info(f"Number of checkers: {version_scanner.number_of_checkers()}")

cve_bin_tool/version_scanner.py

Lines changed: 81 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from pathlib import Path
99
from typing import Iterator
1010

11-
from cve_bin_tool.checkers import BUILTIN_CHECKERS, Checker
11+
from cve_bin_tool.checkers import BUILTIN_CHECKERS, Checker, VersionMatchInfo
1212
from cve_bin_tool.cvedb import CVEDB
1313
from cve_bin_tool.egg_updater import IS_DEVELOP, update_egg
1414
from cve_bin_tool.error_handler import ErrorMode
@@ -19,6 +19,9 @@
1919
from cve_bin_tool.strings import parse_strings
2020
from cve_bin_tool.util import DirWalk, ProductInfo, ScanInfo, inpath
2121

22+
if sys.platform != "win32":
23+
from pyperscan import Pattern, Scan, StreamDatabase
24+
2225
if sys.version_info >= (3, 10):
2326
from importlib import metadata as importlib_metadata
2427
else:
@@ -45,6 +48,7 @@ def __init__(
4548
validate: bool = True,
4649
sources=None,
4750
no_scan=False,
51+
pyperscan=False,
4852
):
4953
self.no_scan = no_scan
5054
self.logger = logger or LOGGER.getChild(self.__class__.__name__)
@@ -74,6 +78,17 @@ def __init__(
7478
)
7579
self.language_checkers = valid_files
7680
self.language_checkers_names = self.available_language_checkers()
81+
if pyperscan:
82+
self.logger.warning("pyperscan requested")
83+
if sys.platform == "win32":
84+
self.logger.error("pyperscan unsupported on Windows")
85+
self.pyperscan = False
86+
else:
87+
self.logger.warning("pyperscan enabled")
88+
self.pyperscan = True
89+
else:
90+
self.pyperscan = False
91+
self.pyperscan_db = None
7792

7893
if self.no_scan:
7994
self.cve_db = None
@@ -281,36 +296,79 @@ def scan_file(self, filename: str) -> Iterator[ScanInfo]:
281296

282297
yield from self.run_checkers(filename, lines)
283298

299+
def build_pyperscan_database(self, checkers: Checker) -> None:
300+
# Database built only once to improve performance
301+
if self.pyperscan_db is None:
302+
patterns = []
303+
for dummy_checker_name, checker in self.checkers.items():
304+
checker = checker()
305+
checker.dummy_checker_name = dummy_checker_name
306+
for pattern in checker.VERSION_PATTERNS + checker.CONTAINS_PATTERNS:
307+
patterns.append(Pattern(pattern.pattern.encode(), tag=checker))
308+
if patterns:
309+
self.pyperscan_db = StreamDatabase(*patterns)
310+
311+
@staticmethod
312+
def pyperscan_match(
313+
pyperscan_matches: dict, checker: Checker, offset: int, end: int
314+
) -> Scan:
315+
pyperscan_matches[checker.dummy_checker_name] = checker, offset, end
316+
return Scan.Continue
317+
284318
def run_checkers(self, filename: str, lines: str) -> Iterator[ScanInfo]:
285319
"""process a Set of checker objects, run them on file lines,
286320
and yield information about detected products and versions.
287321
It uses logging to provide debug and error information along the way."""
288322
LOGGER.info(f"filename = {filename}")
289-
# tko
290-
for dummy_checker_name, checker in self.checkers.items():
291-
checker = checker()
292-
version_results = checker.get_versions(lines, filename)
293-
294-
if version_results.matched_filename or version_results.matched_contains:
295-
for version in version_results.versions:
296-
if version == "UNKNOWN":
297-
file_path = "".join(self.file_stack)
298-
self.logger.debug(
299-
f"{dummy_checker_name} was detected with version UNKNOWN in file {file_path}"
300-
)
301-
else:
302-
file_path = "".join(self.file_stack)
303-
self.logger.debug(
304-
f"{file_path} matched {dummy_checker_name} {version} ({version_results.matched_filename=}, {version_results.matched_contains=})"
305-
)
306-
for vendor, product in checker.VENDOR_PRODUCT:
307-
yield ScanInfo(
308-
ProductInfo(vendor, product, version),
309-
file_path,
310-
)
323+
if self.pyperscan:
324+
self.build_pyperscan_database(self.checkers)
325+
326+
pyperscan_matches = dict()
327+
scanner = self.pyperscan_db.build(pyperscan_matches, self.pyperscan_match)
328+
scanner.scan(lines.encode())
329+
330+
for dummy_checker_name, (checker, offset, end) in pyperscan_matches.items():
331+
# Confirm pyperscan match with get_versions as pyperscan doesn't support
332+
# group capture. SOM_LEFTMOST is not enabled (offset is always 0)
333+
version_results = checker.get_versions(lines[offset:end], filename)
334+
yield from self.parse_version_match(
335+
dummy_checker_name, checker, version_results
336+
)
337+
else:
338+
# tko
339+
for dummy_checker_name, checker in self.checkers.items():
340+
checker = checker()
341+
version_results = checker.get_versions(lines, filename)
342+
yield from self.parse_version_match(
343+
dummy_checker_name, checker, version_results
344+
)
311345

312346
self.logger.debug(f"Done scanning file: {filename}")
313347

348+
def parse_version_match(
349+
self,
350+
dummy_checker_name: str,
351+
checker: Checker,
352+
version_results: VersionMatchInfo,
353+
):
354+
if version_results.matched_filename or version_results.matched_contains:
355+
for version in version_results.versions:
356+
if version == "UNKNOWN":
357+
file_path = "".join(self.file_stack)
358+
self.logger.debug(
359+
f"{dummy_checker_name} was detected with version UNKNOWN in file {file_path}"
360+
)
361+
else:
362+
file_path = "".join(self.file_stack)
363+
self.logger.debug(
364+
f"{file_path} matched {dummy_checker_name} {version} ({version_results.matched_filename=}, {version_results.matched_contains=})"
365+
)
366+
for vendor, product in checker.VENDOR_PRODUCT:
367+
yield ScanInfo(
368+
ProductInfo(vendor, product, version),
369+
file_path,
370+
)
371+
314372
@staticmethod
315373
def clean_file_path(filepath: str) -> str:
316374
"""Returns a cleaner filepath by removing temp path from filepath"""

doc/MANUAL.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
- [Checkers Arguments](#checkers-arguments)
3838
- [-s SKIPS, --skips SKIPS](#-s-skips---skips-skips)
3939
- [-r CHECKERS, --runs CHECKERS](#-r-checkers---runs-checkers)
40+
- [--pyperscan](#--pyperscan)
4041
- [Input Arguments](#input-arguments)
4142
- [directory (positional argument)](#directory-positional-argument)
4243
- [-i INPUT\_FILE, --input-file INPUT\_FILE](#-i-input_file---input-file-input_file)
@@ -214,6 +215,7 @@ which is useful if you're trying the latest code from
214215
-s SKIPS, --skips SKIPS
215216
comma-separated list of checkers to disable
216217
-r RUNS, --runs RUNS comma-separated list of checkers to enable
218+
--pyperscan use pyperscan for binary checkers (unsupported on Windows)
217219

218220
Database Management:
219221
--import-json IMPORT_JSON
@@ -889,6 +891,19 @@ This option allows one to skip (disable) a comma-separated list of checkers and
889891

890892
This option allows one to enable a comma-separated list of checkers.
891893

894+
### --pyperscan
895+
896+
The pyperscan flag enables pyperscan support in the CVE Binary Tool. [pyperscan](https://github.com/vlaci/pyperscan) is an opinionated Python binding for [Hyperscan](https://www.hyperscan.io) focusing on ease of use and safety.
897+
898+
When the pyperscan flag is enabled, the tool leverages the Hyperscan high-performance regular expression matching library to run all binary version checkers simultaneously on a file, which significantly reduces processing time.
899+
900+
The pyperscan package is used instead of the better-known hyperscan package because pyperscan allows a tag to be attached to each pattern. This feature makes it easy to retrieve the binary checker associated with the matched pattern.
901+
902+
> **Note**: pyperscan is unsupported on Windows.
903+
904+
> **Note**: [Default](https://github.com/vlaci/pyperscan/issues/35) configuration of pyperscan uses [vectorscan](https://github.com/VectorCamp/vectorscan), a fork of Intel's Hyperscan, modified to run on more platforms.
905+
906+
892907
## Input Arguments
893908

894909
### directory (positional argument)

requirements.csv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,4 @@ the_purl_authors_not_in_db,packageurl-python
2626
h2non,filetype
2727
python,setuptools
2828
jaraco,zipp
29+
vlaci_not_in_db,pyperscan

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ lib4vex>=0.2.0
1414
packageurl-python
1515
packaging>=22.0
1616
plotly
17+
pyperscan; sys_platform != 'win32' # pyperscan unsupported on Windows
1718
python-gnupg
1819
pyyaml>=5.4
1920
requests>=2.32.2

test/test_cli.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,18 @@ def test_runs(self, caplog):
236236
main(["cve-bin-tool", test_path, "-r", ",".join(runs)])
237237
self.check_checkers_log(caplog, skip_checkers, runs)
238238

239+
def test_pyperscan(self, caplog):
240+
test_path = str(Path(__file__).parent.resolve() / "csv")
241+
242+
with caplog.at_level(logging.INFO):
243+
main(["cve-bin-tool", "--pyperscan", test_path])
244+
assert (
245+
"cve_bin_tool",
246+
logging.WARNING,
247+
"pyperscan requested",
248+
) in caplog.record_tuples
249+
caplog.clear()
250+
239251
@pytest.mark.skipif(not LONG_TESTS(), reason="Update flag tests are long tests")
240252
def test_update(self, caplog):
241253
test_path = str(Path(__file__).parent.resolve() / "csv")

0 commit comments

Comments
 (0)