Skip to content

Commit cced951

Browse files
committed
feat: add pyperscan support
Hyperscan runs all version checkers simultaneously on a file, which reduces processing time. The pyperscan package is used instead of the better-known hyperscan package because pyperscan allows a tag to be attached to each pattern. This feature makes it easy to retrieve the checker associated with the matched pattern. On my local machine, running a scan on an embedded firmware takes 220 seconds with pyperscan instead of 326 seconds. However, pyperscan is slower on a single file and unsupported on Windows, so add a --pyperscan option (disabled by default). Fix #2485 Signed-off-by: Fabrice Fontaine <[email protected]>
1 parent 8f8acb3 commit cced951

File tree

8 files changed

+122
-23
lines changed

8 files changed

+122
-23
lines changed

.github/actions/spelling/allow.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,7 @@ https
303303
hunspell
304304
hur
305305
hwloc
306+
hyperscan
306307
i
307308
icecast
308309
icu
@@ -606,6 +607,7 @@ pybabel
606607
pycon
607608
pycqa
608609
pypa
610+
pyperscan
609611
pypi
610612
pytest
611613
pythex
@@ -782,6 +784,7 @@ uuid
782784
uwsgi
783785
v
784786
varnish
787+
vectorscan
785788
venv
786789
VEXs
787790
vextype

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -541,6 +541,7 @@ Checkers:
541541
<a href="https://github.com/intel/cve-bin-tool/blob/main/doc/MANUAL.md#-s-skips---skips-skips">-s SKIPS, --skips SKIPS</a>
542542
comma-separated list of checkers to disable
543543
<a href="https://github.com/intel/cve-bin-tool/blob/main/doc/MANUAL.md#-r-checkers---runs-checkers">-r RUNS, --runs RUNS</a> comma-separated list of checkers to enable
544+
<a href="https://github.com/intel/cve-bin-tool/blob/main/doc/MANUAL.md#--pyperscan">--pyperscan</a> use pyperscan for binary checkers (unsupported on Windows)
544545

545546
Database Management:
546547
--import-json IMPORT_JSON

cve_bin_tool/cli.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -488,6 +488,12 @@ def main(argv=None):
488488
help="comma-separated list of checkers to enable",
489489
default="",
490490
)
491+
checker_group.add_argument(
492+
"--pyperscan",
493+
action="store_true",
494+
help="Use pyperscan for binary checkers (unsupported on Windows)",
495+
default=False,
496+
)
491497

492498
database_group = parser.add_argument_group("Database Management")
493499
database_group.add_argument(
@@ -1122,10 +1128,12 @@ def main(argv=None):
11221128
version_scanner = VersionScanner(
11231129
should_extract=args["extract"],
11241130
exclude_folders=args["exclude"],
1131+
logger=LOGGER,
11251132
error_mode=error_mode,
11261133
validate=not args["disable_validation_check"],
11271134
sources=enabled_sources,
11281135
no_scan=args["no_scan"],
1136+
pyperscan=args["pyperscan"],
11291137
)
11301138
version_scanner.remove_skiplist(skips)
11311139
LOGGER.info(f"Number of checkers: {version_scanner.number_of_checkers()}")

cve_bin_tool/version_scanner.py

Lines changed: 81 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from pathlib import Path
99
from typing import Iterator
1010

11-
from cve_bin_tool.checkers import BUILTIN_CHECKERS, Checker
11+
from cve_bin_tool.checkers import BUILTIN_CHECKERS, Checker, VersionMatchInfo
1212
from cve_bin_tool.cvedb import CVEDB
1313
from cve_bin_tool.egg_updater import IS_DEVELOP, update_egg
1414
from cve_bin_tool.error_handler import ErrorMode
@@ -19,6 +19,9 @@
1919
from cve_bin_tool.strings import parse_strings
2020
from cve_bin_tool.util import DirWalk, ProductInfo, ScanInfo, inpath
2121

22+
if sys.platform != "win32":
23+
from pyperscan import Pattern, Scan, StreamDatabase
24+
2225
if sys.version_info >= (3, 10):
2326
from importlib import metadata as importlib_metadata
2427
else:
@@ -45,6 +48,7 @@ def __init__(
4548
validate: bool = True,
4649
sources=None,
4750
no_scan=False,
51+
pyperscan=False,
4852
):
4953
self.no_scan = no_scan
5054
self.logger = logger or LOGGER.getChild(self.__class__.__name__)
@@ -73,6 +77,17 @@ def __init__(
7377
self.logger.info("Checkers loaded: %s" % (", ".join(self.checkers.keys())))
7478
self.language_checkers = valid_files
7579
self.language_checkers_names = self.available_language_checkers()
80+
if pyperscan:
81+
self.logger.warning("pyperscan requested")
82+
if sys.platform == "win32":
83+
self.logger.error("pyperscan unsupported on Windows")
84+
self.pyperscan = False
85+
else:
86+
self.logger.warning("pyperscan enabled")
87+
self.pyperscan = True
88+
else:
89+
self.pyperscan = False
90+
self.pyperscan_db = None
7691

7792
@classmethod
7893
def load_checkers(cls) -> dict[str, type[Checker]]:
@@ -276,36 +291,79 @@ def scan_file(self, filename: str) -> Iterator[ScanInfo]:
276291

277292
yield from self.run_checkers(filename, lines)
278293

294+
def build_pyperscan_database(self, checkers: dict[str, type[Checker]]) -> None:
    """Compile the patterns of every loaded checker into one pyperscan database.

    The result is stored in ``self.pyperscan_db``. It is built on the first
    call only; once a database exists, later calls return immediately.

    Args:
        checkers: Mapping of checker name to checker class, as passed by
            ``run_checkers``. NOTE(review): the patterns are read from
            ``self.checkers``, not from this argument; the parameter is
            kept so existing callers keep working.
    """
    # Database built only once to improve performance
    if self.pyperscan_db is not None:
        return

    patterns = []
    for dummy_checker_name, checker_class in self.checkers.items():
        checker = checker_class()
        # Keep the name on the instance: the instance is the pattern tag, and
        # the match callback keys its results by this name.
        checker.dummy_checker_name = dummy_checker_name
        patterns.extend(
            Pattern(pattern.pattern.encode(), tag=checker)
            for pattern in checker.VERSION_PATTERNS + checker.CONTAINS_PATTERNS
        )

    # Without any pattern the database is left as None.
    if patterns:
        self.pyperscan_db = StreamDatabase(*patterns)
305+
306+
@staticmethod
def pyperscan_match(
    pyperscan_matches: dict, checker: Checker, offset: int, end: int
) -> Scan:
    """Record one pyperscan hit and tell the scanner to continue.

    The hit is stored in ``pyperscan_matches`` under the checker's
    ``dummy_checker_name`` as a ``(checker, offset, end)`` tuple, so a later
    hit for the same checker replaces the earlier one.
    """
    hit = (checker, offset, end)
    pyperscan_matches[checker.dummy_checker_name] = hit
    return Scan.Continue
312+
279313
def run_checkers(self, filename: str, lines: str) -> Iterator[ScanInfo]:
    """process a Set of checker objects, run them on file lines,
    and yield information about detected products and versions.
    It uses logging to provide debug and error information along the way.

    When ``self.pyperscan`` is set, all checker patterns are matched in a
    single pyperscan pass and only the checkers that produced a hit are then
    run through ``get_versions``. Otherwise every checker is instantiated and
    run on ``lines`` in turn.
    """
    LOGGER.info(f"filename = {filename}")
    if self.pyperscan:
        self.build_pyperscan_database(self.checkers)

        # The database stays None when no checker contributed a pattern;
        # there is then nothing to match, so skip the scan instead of
        # failing on ``None.build``.
        if self.pyperscan_db is not None:
            pyperscan_matches: dict = {}
            scanner = self.pyperscan_db.build(
                pyperscan_matches, self.pyperscan_match
            )
            scanner.scan(lines.encode())

            for dummy_checker_name, (
                checker,
                offset,
                end,
            ) in pyperscan_matches.items():
                # Confirm pyperscan match with get_versions as pyperscan doesn't support
                # group capture. SOM_LEFTMOST is not enabled (offset is always 0)
                version_results = checker.get_versions(lines[offset:end], filename)
                yield from self.parse_version_match(
                    dummy_checker_name, checker, version_results
                )
    else:
        # tko
        for dummy_checker_name, checker in self.checkers.items():
            checker = checker()
            version_results = checker.get_versions(lines, filename)
            yield from self.parse_version_match(
                dummy_checker_name, checker, version_results
            )

    self.logger.debug(f"Done scanning file: {filename}")
308342

343+
def parse_version_match(
    self,
    dummy_checker_name: str,
    checker: Checker,
    version_results: VersionMatchInfo,
):
    """Turn one checker's version results into ``ScanInfo`` records.

    Nothing is produced unless the results matched on filename or on
    contents. A version of "UNKNOWN" is only logged; any other version is
    logged and yielded once per (vendor, product) pair of the checker.
    """
    if not (version_results.matched_filename or version_results.matched_contains):
        return

    for version in version_results.versions:
        # Both outcomes report against the current position in the file stack.
        file_path = "".join(self.file_stack)
        if version == "UNKNOWN":
            self.logger.debug(
                f"{dummy_checker_name} was detected with version UNKNOWN in file {file_path}"
            )
            continue

        self.logger.debug(
            f"{file_path} matched {dummy_checker_name} {version} ({version_results.matched_filename=}, {version_results.matched_contains=})"
        )
        for vendor, product in checker.VENDOR_PRODUCT:
            yield ScanInfo(
                ProductInfo(vendor, product, version),
                file_path,
            )
366+
309367
@staticmethod
310368
def clean_file_path(filepath: str) -> str:
311369
"""Returns a cleaner filepath by removing temp path from filepath"""

doc/MANUAL.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
- [Checkers Arguments](#checkers-arguments)
3838
- [-s SKIPS, --skips SKIPS](#-s-skips---skips-skips)
3939
- [-r CHECKERS, --runs CHECKERS](#-r-checkers---runs-checkers)
40+
- [--pyperscan](#--pyperscan)
4041
- [Input Arguments](#input-arguments)
4142
- [directory (positional argument)](#directory-positional-argument)
4243
- [-i INPUT\_FILE, --input-file INPUT\_FILE](#-i-input_file---input-file-input_file)
@@ -214,6 +215,7 @@ which is useful if you're trying the latest code from
214215
-s SKIPS, --skips SKIPS
215216
comma-separated list of checkers to disable
216217
-r RUNS, --runs RUNS comma-separated list of checkers to enable
218+
--pyperscan use pyperscan for binary checkers (unsupported on Windows)
217219

218220
Database Management:
219221
--import-json IMPORT_JSON
@@ -887,6 +889,19 @@ This option allows one to skip (disable) a comma-separated list of checkers and
887889

888890
This option allows one to enable a comma-separated list of checkers.
889891

892+
### --pyperscan
893+
894+
The pyperscan flag enables pyperscan support in the CVE Binary Tool. [pyperscan](https://github.com/vlaci/pyperscan) is an opinionated Python binding for [Hyperscan](https://www.hyperscan.io) focusing on ease of use and safety.
895+
896+
When the pyperscan flag is enabled, the tool leverages the Hyperscan high-performance regular expression matching library to run all binary version checkers simultaneously on a file, which significantly reduces processing time.
897+
898+
The pyperscan package is used instead of the better-known hyperscan package because pyperscan allows a tag to be attached to each pattern. This feature makes it easy to retrieve the binary checker associated with the matched pattern.
899+
900+
> **Note**: pyperscan is unsupported on Windows.
901+
902+
> **Note**: The [default](https://github.com/vlaci/pyperscan/issues/35) configuration of pyperscan uses [vectorscan](https://github.com/VectorCamp/vectorscan), a fork of Intel's Hyperscan, modified to run on more platforms.
903+
904+
890905
## Input Arguments
891906

892907
### directory (positional argument)

requirements.csv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,4 @@ the_purl_authors_not_in_db,packageurl-python
2626
h2non,filetype
2727
python,setuptools
2828
jaraco,zipp
29+
vlaci_not_in_db,pyperscan

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ lib4vex>=0.2.0
1414
packageurl-python
1515
packaging>=22.0
1616
plotly
17+
pyperscan; sys_platform != 'win32' # pyperscan unsupported on Windows
1718
python-gnupg
1819
pyyaml>=5.4
1920
requests>=2.32.2

test/test_cli.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,18 @@ def test_runs(self, caplog):
236236
main(["cve-bin-tool", test_path, "-r", ",".join(runs)])
237237
self.check_checkers_log(caplog, skip_checkers, runs)
238238

239+
def test_pyperscan(self, caplog):
    """Run the CLI with --pyperscan and check the request is logged."""
    csv_dir = Path(__file__).parent.resolve() / "csv"
    expected_record = ("cve_bin_tool", logging.WARNING, "pyperscan requested")

    with caplog.at_level(logging.INFO):
        main(["cve-bin-tool", "--pyperscan", str(csv_dir)])
    assert expected_record in caplog.record_tuples
    caplog.clear()
250+
239251
@pytest.mark.skipif(not LONG_TESTS(), reason="Update flag tests are long tests")
240252
def test_update(self, caplog):
241253
test_path = str(Path(__file__).parent.resolve() / "csv")

0 commit comments

Comments
 (0)