Skip to content

Commit a238db8

Browse files
committed
feat: add pyperscan support
Hyperscan runs all version checkers simultaneously on a file, which reduces processing time. The pyperscan package is used instead of the better-known hyperscan package because pyperscan allows a tag to be attached to each pattern. This makes it easy to retrieve the checker associated with a matched pattern. On my local machine, running a scan on an embedded firmware takes 220 seconds with pyperscan instead of 326 seconds. However, pyperscan is slower on a single file and unsupported on Windows, so add a --pyperscan option (disabled by default). Fix #2485 Signed-off-by: Fabrice Fontaine <[email protected]>
1 parent 8f8acb3 commit a238db8

File tree

7 files changed

+101
-23
lines changed

7 files changed

+101
-23
lines changed

.github/actions/spelling/allow.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,7 @@ https
303303
hunspell
304304
hur
305305
hwloc
306+
hyperscan
306307
i
307308
icecast
308309
icu
@@ -606,6 +607,7 @@ pybabel
606607
pycon
607608
pycqa
608609
pypa
610+
pyperscan
609611
pypi
610612
pytest
611613
pythex

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -541,6 +541,7 @@ Checkers:
541541
<a href="https://github.com/intel/cve-bin-tool/blob/main/doc/MANUAL.md#-s-skips---skips-skips">-s SKIPS, --skips SKIPS</a>
542542
comma-separated list of checkers to disable
543543
<a href="https://github.com/intel/cve-bin-tool/blob/main/doc/MANUAL.md#-r-checkers---runs-checkers">-r RUNS, --runs RUNS</a> comma-separated list of checkers to enable
544+
<a href="https://github.com/intel/cve-bin-tool/blob/main/doc/MANUAL.md#--pyperscan">--pyperscan</a> use pyperscan for binary checkers (unsupported on Windows)
544545

545546
Database Management:
546547
--import-json IMPORT_JSON

cve_bin_tool/cli.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -488,6 +488,12 @@ def main(argv=None):
488488
help="comma-separated list of checkers to enable",
489489
default="",
490490
)
491+
checker_group.add_argument(
492+
"--pyperscan",
493+
action="store_true",
494+
help="Use pyperscan for binary checkers (unsupported on Windows)",
495+
default=False,
496+
)
491497

492498
database_group = parser.add_argument_group("Database Management")
493499
database_group.add_argument(
@@ -1126,6 +1132,7 @@ def main(argv=None):
11261132
validate=not args["disable_validation_check"],
11271133
sources=enabled_sources,
11281134
no_scan=args["no_scan"],
1135+
pyperscan=args["pyperscan"],
11291136
)
11301137
version_scanner.remove_skiplist(skips)
11311138
LOGGER.info(f"Number of checkers: {version_scanner.number_of_checkers()}")

cve_bin_tool/version_scanner.py

Lines changed: 76 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from pathlib import Path
99
from typing import Iterator
1010

11-
from cve_bin_tool.checkers import BUILTIN_CHECKERS, Checker
11+
from cve_bin_tool.checkers import BUILTIN_CHECKERS, Checker, VersionMatchInfo
1212
from cve_bin_tool.cvedb import CVEDB
1313
from cve_bin_tool.egg_updater import IS_DEVELOP, update_egg
1414
from cve_bin_tool.error_handler import ErrorMode
@@ -19,6 +19,9 @@
1919
from cve_bin_tool.strings import parse_strings
2020
from cve_bin_tool.util import DirWalk, ProductInfo, ScanInfo, inpath
2121

22+
if sys.platform != "win32":
23+
from pyperscan import Pattern, Scan, StreamDatabase
24+
2225
if sys.version_info >= (3, 10):
2326
from importlib import metadata as importlib_metadata
2427
else:
@@ -45,6 +48,7 @@ def __init__(
4548
validate: bool = True,
4649
sources=None,
4750
no_scan=False,
51+
pyperscan=False,
4852
):
4953
self.no_scan = no_scan
5054
self.logger = logger or LOGGER.getChild(self.__class__.__name__)
@@ -73,6 +77,12 @@ def __init__(
7377
self.logger.info("Checkers loaded: %s" % (", ".join(self.checkers.keys())))
7478
self.language_checkers = valid_files
7579
self.language_checkers_names = self.available_language_checkers()
80+
if sys.platform == "win32" and pyperscan:
81+
self.logger.error("pyperscan unsupported on Windows")
82+
self.pyperscan = False
83+
else:
84+
self.pyperscan = pyperscan
85+
self.pyperscan_db = None
7686

7787
@classmethod
7888
def load_checkers(cls) -> dict[str, type[Checker]]:
@@ -276,36 +286,79 @@ def scan_file(self, filename: str) -> Iterator[ScanInfo]:
276286

277287
yield from self.run_checkers(filename, lines)
278288

289+
def build_pyperscan_database(self, checkers: dict[str, type[Checker]]) -> None:
    """Compile the patterns of every checker into a single pyperscan database.

    The database is built on the first call only and cached in
    ``self.pyperscan_db``; subsequent calls return immediately. If no
    checker contributes a pattern, ``self.pyperscan_db`` is left as ``None``.

    Args:
        checkers: mapping of checker name to checker class whose
            ``VERSION_PATTERNS`` and ``CONTAINS_PATTERNS`` are compiled.
    """
    # Database built only once to improve performance
    if self.pyperscan_db is not None:
        return

    patterns = []
    for dummy_checker_name, checker_class in checkers.items():
        checker = checker_class()
        # The instance is used as the pattern tag; remember its name on it so
        # the match callback can key its results by checker name.
        checker.dummy_checker_name = dummy_checker_name
        patterns.extend(
            Pattern(regex.pattern.encode(), tag=checker)
            for regex in checker.VERSION_PATTERNS + checker.CONTAINS_PATTERNS
        )

    if patterns:
        self.pyperscan_db = StreamDatabase(*patterns)
300+
301+
@staticmethod
def pyperscan_match(
    pyperscan_matches: dict, checker: Checker, offset: int, end: int
) -> Scan:
    """Record a pattern match reported by pyperscan and keep scanning.

    Args:
        pyperscan_matches: dict collecting the results, keyed by checker
            name; each value is a ``(checker, offset, end)`` tuple.
        checker: the tag attached to the matched pattern (a checker
            instance carrying ``dummy_checker_name``).
        offset: start offset reported for the match.
        end: end offset reported for the match.

    Returns:
        ``Scan.Continue`` so that the scan carries on after this match.
    """
    name = checker.dummy_checker_name
    previous = pyperscan_matches.get(name)
    if previous is not None:
        # Several patterns share the same checker tag: keep the widest span
        # seen so far instead of relying on the order of the callbacks.
        _, previous_offset, previous_end = previous
        offset = min(offset, previous_offset)
        end = max(end, previous_end)
    pyperscan_matches[name] = checker, offset, end
    return Scan.Continue
307+
279308
def run_checkers(self, filename: str, lines: str) -> Iterator[ScanInfo]:
    """Run the checkers on the contents of a file.

    Yields information about detected products and versions, and logs
    debug information along the way.

    Args:
        filename: name of the file being scanned.
        lines: text extracted from the file, handed to the checkers.

    Yields:
        The ``ScanInfo`` entries produced by ``parse_version_match``.
    """
    LOGGER.info(f"filename = {filename}")
    if self.pyperscan:
        self.build_pyperscan_database(self.checkers)

        pyperscan_matches: dict = {}
        # The database stays None when no checker provided any pattern; in
        # that case there is nothing to scan for and nothing can match.
        if self.pyperscan_db is not None:
            scanner = self.pyperscan_db.build(
                pyperscan_matches, self.pyperscan_match
            )
            scanner.scan(lines.encode())

        for dummy_checker_name, (checker, offset, end) in pyperscan_matches.items():
            # Confirm pyperscan match with get_versions as pyperscan doesn't support
            # group capture. SOM_LEFTMOST is not enabled (offset is always 0)
            # NOTE(review): offsets presumably index the encoded bytes while
            # the slice below indexes characters - confirm this is intended.
            version_results = checker.get_versions(lines[offset:end], filename)
            yield from self.parse_version_match(
                dummy_checker_name, checker, version_results
            )
    else:
        # Default path: run every checker in turn on the whole text
        for dummy_checker_name, checker_class in self.checkers.items():
            checker = checker_class()
            version_results = checker.get_versions(lines, filename)
            yield from self.parse_version_match(
                dummy_checker_name, checker, version_results
            )

    self.logger.debug(f"Done scanning file: {filename}")

338+
def parse_version_match(
    self,
    dummy_checker_name: str,
    checker: Checker,
    version_results: VersionMatchInfo,
) -> Iterator[ScanInfo]:
    """Turn the result of a checker into ``ScanInfo`` entries.

    Nothing is yielded unless the checker matched the filename or the
    file contents.

    Args:
        dummy_checker_name: name of the checker, used in log messages.
        checker: checker providing the ``VENDOR_PRODUCT`` pairs.
        version_results: result returned by ``checker.get_versions``.

    Yields:
        One ``ScanInfo`` per (vendor, product) pair for each detected
        version other than "UNKNOWN".
    """
    if not (version_results.matched_filename or version_results.matched_contains):
        return

    for version in version_results.versions:
        file_path = "".join(self.file_stack)
        if version == "UNKNOWN":
            # NOTE(review): UNKNOWN versions are assumed to be logged only,
            # not reported - confirm against the original nesting.
            self.logger.debug(
                f"{dummy_checker_name} was detected with version UNKNOWN in file {file_path}"
            )
            continue

        self.logger.debug(
            f"{file_path} matched {dummy_checker_name} {version} ({version_results.matched_filename=}, {version_results.matched_contains=})"
        )
        for vendor, product in checker.VENDOR_PRODUCT:
            yield ScanInfo(
                ProductInfo(vendor, product, version),
                file_path,
            )
361+
309362
@staticmethod
310363
def clean_file_path(filepath: str) -> str:
311364
"""Returns a cleaner filepath by removing temp path from filepath"""

doc/MANUAL.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
- [Checkers Arguments](#checkers-arguments)
3838
- [-s SKIPS, --skips SKIPS](#-s-skips---skips-skips)
3939
- [-r CHECKERS, --runs CHECKERS](#-r-checkers---runs-checkers)
40+
- [--pyperscan](#--pyperscan)
4041
- [Input Arguments](#input-arguments)
4142
- [directory (positional argument)](#directory-positional-argument)
4243
- [-i INPUT\_FILE, --input-file INPUT\_FILE](#-i-input_file---input-file-input_file)
@@ -214,6 +215,7 @@ which is useful if you're trying the latest code from
214215
-s SKIPS, --skips SKIPS
215216
comma-separated list of checkers to disable
216217
-r RUNS, --runs RUNS comma-separated list of checkers to enable
218+
--pyperscan use pyperscan for binary checkers (unsupported on Windows)
217219

218220
Database Management:
219221
--import-json IMPORT_JSON
@@ -887,6 +889,17 @@ This option allows one to skip (disable) a comma-separated list of checkers and
887889

888890
This option allows one to enable a comma-separated list of checkers.
889891

892+
### --pyperscan
893+
894+
The pyperscan flag enables pyperscan support in the CVE Bin Tool. [pyperscan](https://github.com/vlaci/pyperscan) is an opinionated Python binding for [Hyperscan](https://www.hyperscan.io) focusing on ease of use and safety.
895+
896+
When the pyperscan flag is enabled, the tool leverages the Hyperscan high-performance regular expression matching library to run all binary version checkers simultaneously on a file, which can significantly reduce processing time when scanning many files (a scan of a single file may be slower).
897+
898+
The pyperscan package is used instead of the better-known hyperscan package because pyperscan allows a tag to be attached to each pattern. This makes it easy to retrieve the binary checker associated with a matched pattern.
899+
900+
> **Note**: pyperscan is unsupported on Windows.
901+
902+
890903
## Input Arguments
891904

892905
### directory (positional argument)

requirements.csv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,4 @@ the_purl_authors_not_in_db,packageurl-python
2626
h2non,filetype
2727
python,setuptools
2828
jaraco,zipp
29+
vlaci_not_in_db,pyperscan

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ lib4vex>=0.2.0
1414
packageurl-python
1515
packaging>=22.0
1616
plotly
17+
pyperscan; sys_platform != 'win32' # pyperscan unsupported on Windows
1718
python-gnupg
1819
pyyaml>=5.4
1920
requests>=2.32.2

0 commit comments

Comments
 (0)