|
7 | 7 | # See https://aboutcode.org for more information about nexB OSS projects. |
8 | 8 | # |
9 | 9 | import logging |
10 | | -import os |
11 | | -from hashlib import sha512 |
| 10 | +from itertools import groupby |
12 | 11 | from pathlib import Path |
13 | 12 |
|
14 | 13 | import saneyaml |
15 | 14 | from django.core.management.base import BaseCommand |
16 | 15 | from django.core.management.base import CommandError |
17 | 16 | from packageurl import PackageURL |
18 | 17 |
|
| 18 | +from aboutcode import hashid |
19 | 19 | from vulnerabilities.models import Package |
20 | 20 |
|
21 | 21 | logger = logging.getLogger(__name__) |
22 | 22 |
|
23 | 23 |
|
def serialize_severity(sev):
    """
    Return a plain data mapping serialized from the ``sev`` severity object.

    The reference attached to the severity is inlined as a nested mapping under
    the "reference" key rather than being exported as a separate object.
    """
    reference = sev.reference
    return {
        "score": sev.value,
        "scoring_system": sev.scoring_system,
        "scoring_elements": sev.scoring_elements,
        "published_at": sev.published_at,
        "reference": {
            "url": reference.url,
            "reference_type": reference.reference_type,
            "reference_id": reference.reference_id,
        },
    }
| 40 | + |
| 41 | + |
def serialize_vulnerability(vuln):
    """
    Return a plain data mapping serialized from the ``vuln`` Vulnerability instance.

    The mapping holds the vulnerability id, its aliases, summary, serialized
    severities, weaknesses (as returned by each weakness ``cwe`` attribute) and
    references reduced to their url, type and id.
    """
    reference_fields = ("url", "reference_type", "reference_id")
    return {
        "vulnerability_id": vuln.vcid,
        "aliases": list(vuln.aliases.values_list("alias", flat=True)),
        "summary": vuln.summary,
        "severities": [serialize_severity(severity) for severity in vuln.severities],
        "weaknesses": [weakness.cwe for weakness in vuln.weaknesses.all()],
        "references": list(vuln.references.values(*reference_fields)),
    }
| 66 | + |
| 67 | + |
class Command(BaseCommand):
    help = """Export vulnerability and package data as YAML for use in FederatedCode

    This command exports the data in a tree of directories and YAML files designed such that
    it is possible to access directly a vulnerability data file by only knowing its VCID, and that
    it is possible to access directly the package data files by only knowing its PURL.
    """

    def add_arguments(self, parser):
        """
        Declare the single positional ``path`` argument: the export directory.
        """
        parser.add_argument(
            "path",
            help="Path to a directory where to export data.",
        )

    def handle(self, *args, **options):
        """
        Validate the ``path`` option and export all data under that directory.

        Raise a CommandError if ``path`` is empty or is not an existing directory.
        """
        path = options["path"]
        if not path or not Path(path).is_dir():
            raise CommandError("Enter a valid directory path")

        base_path = Path(path)
        self.stdout.write("Exporting vulnerablecode Package and Vulnerability data.")
        self.export_data(base_path)
        self.stdout.write(self.style.SUCCESS(f"Successfully exported data to {base_path}."))

    def export_data(self, base_path: Path):
        """
        Export vulnerablecode data to ``base_path``.

        Iterate over all packages grouped by type, namespace and name and write,
        for each group, the package YAML files and the YAML file of each
        vulnerability not yet written. Report progress every 100 package groups.
        """
        # Stays at 0 when there is no package at all, for the final summary.
        package_count = 0
        # VCIDs already written, shared across all packages to write each file once.
        seen_vcid = set()

        groups = enumerate(packages_by_type_ns_name(), 1)
        for package_count, (purl_without_version, package_versions) in groups:
            self._export_package(
                base_path=base_path,
                package_versions=package_versions,
                seen_vcid=seen_vcid,
            )
            if package_count % 100 == 0:
                self.stdout.write(
                    f"Processed {package_count} package. Last PURL: {purl_without_version}"
                )

        self.stdout.write(
            f"Exported data for: {package_count} package and {len(seen_vcid)} vulnerabilities."
        )

    def _export_package(self, base_path, package_versions, seen_vcid):
        """
        Write the PURLs file and the vulnerabilities file for one group of
        ``package_versions`` (all versions of the same package), and the files of
        their not-yet-seen vulnerabilities.

        Any failure is re-raised as an Exception naming the package version that
        was being processed, chained to the original error.
        """
        pkg_version = None
        try:
            package_urls = []
            package_vulnerabilities = []
            for pkg_version in package_versions:
                purl = pkg_version.package_url
                package_urls.append(purl)
                package_vulnerabilities.append(
                    {
                        "purl": purl,
                        "affected_by_vulnerabilities": list(
                            pkg_version.affected_by.values_list("vulnerability_id", flat=True)
                        ),
                        "fixing_vulnerabilities": list(
                            pkg_version.fixing.values_list("vulnerability_id", flat=True)
                        ),
                    }
                )
                self._export_vulnerabilities(
                    base_path=base_path,
                    pkg_version=pkg_version,
                    seen_vcid=seen_vcid,
                )

            # ``purl`` is the PURL of the last version of the group.
            # NOTE(review): presumably the hashid path helpers ignore the version
            # so any version of the group yields the same path; confirm.
            ppath = hashid.get_package_purls_yml_file_path(purl)
            write_file(base_path=base_path, file_path=ppath, data=package_urls)

            pvpath = hashid.get_package_vulnerabilities_yml_file_path(purl)
            write_file(base_path=base_path, file_path=pvpath, data=package_vulnerabilities)

        except Exception as e:
            raise Exception(f"Failed to process Package: {pkg_version}") from e

    def _export_vulnerabilities(self, base_path, pkg_version, seen_vcid):
        """
        Write one YAML file for each vulnerability of ``pkg_version`` whose VCID is
        not yet in ``seen_vcid``, adding each written VCID to ``seen_vcid``.
        Report progress every 100 written vulnerabilities.
        """
        for vuln in pkg_version.vulnerabilities.all():
            vcid = vuln.vulnerability_id
            # do not write twice the same file
            if vcid in seen_vcid:
                continue

            seen_vcid.add(vcid)
            vulnerability = serialize_vulnerability(vuln)
            vpath = hashid.get_vcid_yml_file_path(vcid)
            write_file(base_path=base_path, file_path=vpath, data=vulnerability)
            if (lv := len(seen_vcid)) % 100 == 0:
                self.stdout.write(f"Processed {lv} vulnerabilities. Last VCID: {vcid}")
| 145 | + |
| 146 | + |
def by_purl_type_ns_name(package):
    """
    Return the (type, namespace, name) tuple of ``package``, for use as a
    sorting or grouping key that ignores the package version.
    """
    key = (package.type, package.namespace, package.name)
    return key
124 | 152 |
|
125 | 153 |
|
126 | | -def get_purl_hash(purl: PackageURL, length: int = 3) -> str: |
def packages_by_type_ns_name():
    """
    Yield two-tuples of (versionless PackageURL, iterator of Packages) for all
    Packages, grouped by type, namespace and name, ignoring the version.

    The inner iterators come from ``itertools.groupby`` and share the underlying
    query iterator: each group must be consumed before advancing to the next one.
    """
    related = (
        "vulnerabilities",
        "vulnerabilities__references",
        "vulnerabilities__weaknesses",
        "vulnerabilities__references__vulnerabilityseverity_set",
    )
    # The ordering starts with the grouping key fields, as groupby() only
    # groups consecutive items.
    ordered = Package.objects.order_by("type", "namespace", "name", "version")
    packages = ordered.prefetch_related(*related).paginated()

    for type_ns_name, versions in groupby(packages, key=by_purl_type_ns_name):
        purl_without_version = PackageURL(*type_ns_name)
        yield purl_without_version, versions
| 171 | + |
| 172 | + |
def write_file(base_path: Path, file_path: Path, data: "dict | list"):
    """
    Write the ``data`` as YAML to the ``file_path`` in the ``base_path`` root directory.
    Create directories in the path as needed.

    ``data`` is a mapping or a list of plain data: the callers in this module
    pass both. An existing file at the same location is overwritten.
    """
    # Serialize before touching the filesystem: opening the file in "w" mode
    # truncates it, so a serialization failure after the open would leave an
    # empty file behind or clobber a previously exported one.
    text = saneyaml.dump(data)

    write_to = base_path / file_path
    write_to.parent.mkdir(parents=True, exist_ok=True)
    with open(write_to, encoding="utf-8", mode="w") as f:
        f.write(text)
0 commit comments