Skip to content

Commit 9c73442

Browse files
authored
feat: find common strings in CONTAINS_PATTERNS from helper_scripts.py (closes #1353) (#1586)
* closes #1353
1 parent 1635fa4 commit 9c73442

File tree

5 files changed

+195
-38
lines changed

5 files changed

+195
-38
lines changed

cve_bin_tool/checkers/README.md

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -278,13 +278,16 @@ that include this product. For our example all listings except
278278
example SQL query).
279279

280280
## Helper-Script
281-
Helper-Script is a tool that takes *packages*(i.e. busybox_1.30.1-4ubuntu9_amd64.deb) as input and returns:
281+
Helper-Script is a tool that takes a *package* (e.g. busybox_1.30.1-4ubuntu9_amd64.deb) as input and returns:
282282

283283
> 1. `CONTAINS_PATTERNS` - list of commonly found strings in the binary of the product
284284
> 2. `FILENAME_PATTERNS` - list of different filenames for the product
285285
> 3. `VERSION_PATTERNS` - list of version patterns found in binary of the product.
286286
> 4. `VENDOR_PRODUCT` - list of vendor product pairs for the product as they appear in NVD.
287287
288+
Helper-Script can also take multiple packages and `PRODUCT_NAME` (required) as input and return
289+
common strings for `CONTAINS_PATTERNS`.
290+
288291
Usage: `python -m cve_bin_tool.helper_script`
289292

290293
```
@@ -357,6 +360,22 @@ class BusyboxChecker(Checker):
357360

358361
Try this against a few more `busybox` packages across different `distros` and see which strings are common among the following. Then follow the above steps to create the checker.
359362

363+
To get common strings for `CONTAINS_PATTERNS` in multiple `busybox` packages, we can use the script like this:
364+
365+
```
366+
windows > python3 -m cve_bin_tool.helper_script busybox_1.30.1-4ubuntu6_amd64.deb busybox-1.33.0-3.fc34.x86_64.rpm --product busybox
367+
linux $ python3 -m cve_bin_tool.helper_script busybox_1.30.1-4ubuntu6_amd64.deb busybox-1.33.0-3.fc34.x86_64.rpm --product busybox
368+
─────────────────────────────────────────────────────── Common CONTAINS_PATTERNS strings for BusyboxChecker──────────────────────────
369+
370+
class BusyboxChecker(Checker):
371+
CONTAINS_PATTERNS = [
372+
r"BusyBox is a multi-call binary that combines many common Unix",
373+
r"BusyBox is copyrighted by many authors between 1998-2015.",
374+
r"link to busybox for each function they wish to use and BusyBox",
375+
]
376+
─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
377+
```
378+
360379
> _***NOTE:*** If you look at our existing checkers, you'll see that some strings are commented out in `CONTAINS_PATTERNS`. These strings are kept there as potential fallbacks in case the currently used strings stop working in future versions. If you find more than 2-3 strings, it's recommended to comment the extra ones out for future reference._
361380
362381
Currently, if you receive multiple vendor-product pairs, select the appropriate vendor-product pair from the following pairs obtained manually. In this case, it is `[('busybox', 'busybox')]`.

cve_bin_tool/helper_script.py

Lines changed: 100 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,44 @@
11
# Copyright (C) 2021 Intel Corporation
22
# SPDX-License-Identifier: GPL-3.0-or-later
33

4+
from __future__ import annotations
5+
46
import argparse
57
import os
68
import re
79
import sys
810
import textwrap
911
from collections import ChainMap
12+
from logging import Logger
1013

1114
from rich import print as rprint
1215
from rich.console import Console
1316

1417
from cve_bin_tool.cvedb import CVEDB, DBNAME, DISK_LOCATION_DEFAULT
1518
from cve_bin_tool.error_handler import ErrorHandler, ErrorMode, UnknownArchiveType
16-
from cve_bin_tool.extractor import Extractor
19+
from cve_bin_tool.extractor import Extractor, TempDirExtractorContext
1720
from cve_bin_tool.log import LOGGER
1821
from cve_bin_tool.util import DirWalk
1922
from cve_bin_tool.version_scanner import VersionScanner
2023

24+
WARNED = False
25+
2126

2227
class HelperScript:
2328
"""Helps contributors who want to write a new cve-bin-tool checker find common filenames, version strings, and other necessary data for building a binary checker"""
2429

2530
CONSOLE = Console()
26-
LOGGER = LOGGER.getChild("HelperScript")
31+
LOGGER: Logger = LOGGER.getChild("HelperScript")
2732

2833
def __init__(
29-
self, filename, product_name=None, version_number=None, string_length=40
34+
self,
35+
filename: str,
36+
product_name: str | None = None,
37+
version_number: str | None = None,
38+
string_length: int = 40,
3039
):
3140
self.filename = filename
32-
self.extractor = Extractor()
41+
self.extractor: TempDirExtractorContext = Extractor()
3342
self.product_name, self.version_number = self.parse_filename(filename)
3443
if product_name:
3544
self.product_name = product_name
@@ -45,20 +54,20 @@ def __init__(
4554
self.walker = DirWalk().walk
4655

4756
# for output (would use in future)
48-
self.contains_patterns = []
49-
self.filename_pattern = []
50-
self.version_pattern = []
51-
self.vendor_product = self.find_vendor_product()
57+
self.contains_patterns: list[str] = []
58+
self.filename_pattern: list[str] = []
59+
self.version_pattern: list[str] = []
60+
self.vendor_product: list[tuple[str, str]] | None = self.find_vendor_product()
5261

5362
# for scanning files versions
5463
self.version_scanner = VersionScanner()
5564

56-
def extract_and_parse_file(self, filename):
65+
def extract_and_parse_file(self, filename: str) -> list[str] | None:
5766
"""extracts and parses the file for common patterns, version strings and common filename patterns"""
5867

5968
with self.extractor as ectx:
6069
if ectx.can_extract(filename):
61-
binary_string_list = []
70+
binary_string_list: list[str] = []
6271
for filepath in self.walker([ectx.extract(filename)]):
6372
clean_path = self.version_scanner.clean_file_path(filepath)
6473
LOGGER.debug(f"checking whether {clean_path} is binary")
@@ -100,7 +109,7 @@ def extract_and_parse_file(self, filename):
100109
return self.contains_patterns
101110
return binary_string_list
102111

103-
def search_pattern(self, file_content, pattern):
112+
def search_pattern(self, file_content: str, pattern: str) -> list[str]:
104113
"""find strings for CONTAINS_PATTERNS with product_name in them"""
105114

106115
file_content_list = file_content.split("\n")
@@ -112,7 +121,7 @@ def search_pattern(self, file_content, pattern):
112121
) # TODO: regex highlight in these matched strings?
113122
return matches
114123

115-
def search_version_string(self, matched_list):
124+
def search_version_string(self, matched_list: list[str]) -> list[str]:
116125
"""finds version strings from matched list"""
117126

118127
# TODO: add multiline string finding
@@ -142,7 +151,7 @@ def search_version_string(self, matched_list):
142151
) # TODO: regex highlight in these matched strings?
143152
return version_strings
144153

145-
def parse_filename(self, filename):
154+
def parse_filename(self, filename: str) -> tuple[str, str]:
146155
"""
147156
returns package_name/product_name from package_filename of types .rpm, .deb, etc.
148157
Example: package_filename = openssh-client_8.4p1-5ubuntu1_amd64.deb
@@ -160,7 +169,7 @@ def parse_filename(self, filename):
160169
# example: libarchive-3.5.1-1-aarch64.pkg.tar.xz
161170
elif filename.endswith(".deb") or filename.endswith(".ipk"):
162171
product_name = filename.rsplit("_")[0]
163-
version_number = filename.rsplit("_")[1]
172+
version_number = filename.rsplit("_")[1].rsplit("-")[0].rsplit("+")[0]
164173
# example: varnish_6.4.0-3_amd64.deb
165174
else:
166175
product_name = filename.rsplit("-", 2)[0]
@@ -175,7 +184,7 @@ def parse_filename(self, filename):
175184
with ErrorHandler(mode=ErrorMode.NoTrace, logger=LOGGER):
176185
raise UnknownArchiveType(filename)
177186

178-
def find_vendor_product(self):
187+
def find_vendor_product(self) -> list[tuple[str, str]] | None:
179188
"""find vendor-product pairs from database"""
180189

181190
LOGGER.debug(
@@ -197,7 +206,8 @@ def find_vendor_product(self):
197206
# checking if (vendor, product) was found in the database
198207
if data:
199208
# warning the user to select the vendor-product pairs manually if multiple pairs are found
200-
if len(data) != 1:
209+
global WARNED
210+
if len(data) != 1 and not WARNED:
201211
LOGGER.warning(
202212
textwrap.dedent(
203213
f"""
@@ -208,6 +218,7 @@ def find_vendor_product(self):
208218
"""
209219
)
210220
)
221+
WARNED = True # prevent same warning multiple times
211222
return data # [('vendor', 'product')]
212223
else:
213224
if self.product_name:
@@ -236,7 +247,7 @@ def find_vendor_product(self):
236247

237248
CVEDB.db_close(self)
238249

239-
def output(self):
250+
def output_single(self) -> None:
240251
"""display beautiful output for Helper-Script"""
241252

242253
self.CONSOLE.rule(f"[bold dark_magenta]{self.product_name.capitalize()}Checker")
@@ -312,8 +323,78 @@ def output(self):
312323

313324
self.CONSOLE.rule()
314325

326+
@staticmethod
def output_common(common_strings: list[str], product_name: str) -> None:
    """Print a Checker skeleton listing the common CONTAINS_PATTERNS strings.

    The strings are printed in sorted order. Any string containing ".debug"
    or ".so" is shown in red with a "not recommended" note; every other
    string is shown in green as a ready-to-copy list entry.
    """

    checker_name = f"{product_name.capitalize()}Checker"

    HelperScript.CONSOLE.rule(
        f"[bold dark_magenta]Common CONTAINS_PATTERNS strings for {checker_name}"
    )
    rprint(f"[red]class[/] [blue]{checker_name}[/](Checker):")

    print("\tCONTAINS_PATTERNS = [")
    for pattern in sorted(common_strings):
        # both markers get the same treatment, so test them together
        discouraged = ".debug" in pattern or ".so" in pattern
        if discouraged:
            rprint(
                f'\t\t[red]r"{pattern}"[/] <--- not recommended to use this form of strings'
            )
        else:
            rprint(f'\t\t[green]r"{pattern}"[/],')
    print("\t]")

    HelperScript.CONSOLE.rule()
350+
351+
352+
def scan_files(args) -> None:
    """Scan the given package files and print the result on the console.

    With a single file, the file is extracted and parsed and its Checker
    skeleton is printed through ``HelperScript.output_single``. With several
    files, every file is extracted and parsed and only the
    ``CONTAINS_PATTERNS`` strings found in all of them are printed through
    ``HelperScript.output_common``; ``product_name`` is required in that case.

    Args:
        args: mapping providing the ``filenames``, ``product_name``,
            ``version_number`` and ``string_length`` entries.
    """

    filenames = args["filenames"]
    LOGGER.debug(f"Given filenames: {filenames}")
    LOGGER.info("Scanning files")

    multiple_files = len(filenames) > 1

    # Check the arguments before building any HelperScript: each constructor
    # calls find_vendor_product(), which is wasted work if we are going to
    # bail out anyway.
    if multiple_files and not args["product_name"]:
        LOGGER.error("PRODUCT_NAME not in arguments")
        return None

    hs_list: list[HelperScript] = [
        HelperScript(
            filename,
            product_name=args["product_name"],
            version_number=args["version_number"],
            string_length=args["string_length"],
        )
        for filename in filenames
    ]

    if multiple_files and args["version_number"]:
        LOGGER.warning(
            "VERSION_NUMBER in arguments, common strings may not be found if files have different versions"
        )

    for hs in hs_list:
        hs.extract_and_parse_file(hs.filename)

    if not multiple_files:  # one file is given
        for hs in hs_list:
            hs.output_single()
        return None

    # more than one file is given - keep only the strings present in every file
    # (output_common sorts the strings, so set ordering does not matter here)
    common_strings = set.intersection(
        *(set(hs.contains_patterns) for hs in hs_list)
    )

    HelperScript.output_common(list(common_strings), hs_list[0].product_name)
    return None
395+
396+
397+
def main(argv=None) -> None:
317398

318399
argv = argv or sys.argv
319400

@@ -383,20 +464,7 @@ def main(argv=None):
383464

384465
LOGGER.setLevel(args["log_level"].upper())
385466

386-
LOGGER.debug(f"Given filenames: {args['filenames']}")
387-
LOGGER.info(f"Scanning only the first filename: '{args['filenames'][0]}'")
388-
hs = HelperScript(
389-
args["filenames"][0],
390-
product_name=args["product_name"],
391-
version_number=args["version_number"],
392-
string_length=args["string_length"],
393-
)
394-
395-
# Parsing, Extracting and Searching for version-strings
396-
hs.extract_and_parse_file(args["filenames"][0])
397-
398-
# output on console
399-
hs.output()
467+
scan_files(args)
400468

401469

402470
if __name__ == "__main__":
5.01 MB
Binary file not shown.
2.89 MB
Binary file not shown.

0 commit comments

Comments
 (0)