Skip to content

Commit ab31565

Browse files
committed
Refactor export command
- Improve memory usage of main querysets - Do not leak internal ids in serialized data - Work towards reusing serializers Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent c4e617d commit ab31565

File tree

6 files changed

+244
-169
lines changed

6 files changed

+244
-169
lines changed

vulnerabilities/management/commands/export.py

Lines changed: 138 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -7,127 +7,175 @@
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
99
import logging
10-
import os
11-
from hashlib import sha512
10+
from itertools import groupby
1211
from pathlib import Path
1312

1413
import saneyaml
1514
from django.core.management.base import BaseCommand
1615
from django.core.management.base import CommandError
1716
from packageurl import PackageURL
1817

18+
from aboutcode import hashid
1919
from vulnerabilities.models import Package
2020

2121
logger = logging.getLogger(__name__)
2222

2323

24+
def serialize_severity(sev):
25+
# inline the reference fields in the serialized severity
26+
ref = sev.reference
27+
sevref = {
28+
"url": ref.url,
29+
"reference_type": ref.reference_type,
30+
"reference_id": ref.reference_id,
31+
}
32+
33+
return {
34+
"score": sev.value,
35+
"scoring_system": sev.scoring_system,
36+
"scoring_elements": sev.scoring_elements,
37+
"published_at": sev.published_at,
38+
"reference": sevref,
39+
}
40+
41+
42+
def serialize_vulnerability(vuln):
43+
"""
44+
Return a plain data mapping serialized from the ``vuln`` Vulnerability instance.
45+
"""
46+
aliases = list(vuln.aliases.values_list("alias", flat=True))
47+
severities = [serialize_severity(sev) for sev in vuln.severities]
48+
weaknesses = [wkns.cwe for wkns in vuln.weaknesses.all()]
49+
50+
references = list(
51+
vuln.references.values(
52+
"url",
53+
"reference_type",
54+
"reference_id",
55+
)
56+
)
57+
58+
return {
59+
"vulnerability_id": vuln.vcid,
60+
"aliases": aliases,
61+
"summary": vuln.summary,
62+
"severities": severities,
63+
"weaknesses": weaknesses,
64+
"references": references,
65+
}
66+
67+
2468
class Command(BaseCommand):
25-
help = "export vulnerablecode data"
69+
help = """Export vulnerability and package data as YAML for use in FederatedCode
70+
71+
This command exports the data in a tree of directories and YAML files designed such that
72+
it is possible to access directly a vulnerability data file by only knowing its VCID, and that
73+
it is possible to access directly the package data files by only knowing its PURL.
74+
"""
2675

2776
def add_arguments(self, parser):
28-
parser.add_argument("path")
77+
parser.add_argument(
78+
"path",
79+
help="Path to a directory where to export data.",
80+
)
2981

3082
def handle(self, *args, **options):
31-
if options["path"]:
32-
git_path = Path(options["path"])
33-
if not git_path.is_dir():
34-
raise CommandError("Please enter a valid path")
83+
if path := options["path"]:
84+
base_path = Path(path)
3585

36-
self.export_data(git_path)
86+
if not path or not base_path.is_dir():
87+
raise CommandError("Enter a valid directory path")
3788

38-
self.stdout.write(self.style.SUCCESS("Successfully exported vulnerablecode data"))
89+
self.stdout.write("Exporting vulnerablecode Package and Vulnerability data.")
90+
self.export_data(base_path)
91+
self.stdout.write(self.style.SUCCESS(f"Successfully exported data to {base_path}."))
3992

40-
def export_data(self, git_path):
93+
def export_data(self, base_path: Path):
4194
"""
42-
export vulnerablecode data
43-
by running `python manage.py export /path/vulnerablecode-data`
95+
Export vulnerablecode data to ``base_path``.
4496
"""
45-
self.stdout.write("Exporting vulnerablecode data")
46-
47-
ecosystems = [pkg.type for pkg in Package.objects.distinct("type")]
48-
49-
for ecosystem in ecosystems:
50-
package_files = {} # {"package path": "data" }
51-
vul_files = {} # {"vulnerability path": "data" }
52-
53-
for purl in (
54-
Package.objects.filter(type=ecosystem)
55-
.prefetch_related("vulnerabilities")
56-
.paginated()
57-
):
58-
purl_without_version = PackageURL(
59-
type=purl.type,
60-
namespace=purl.namespace,
61-
name=purl.name,
62-
)
63-
64-
# ./aboutcode-packages-ed5/maven/org.apache.log4j/log4j-core/versions/vulnerabilities.yml
65-
pkg_filepath = (
66-
f"./aboutcode-packages-{get_purl_hash(purl_without_version)}/{purl.type}/{purl.namespace}/{purl.name}"
67-
f"/versions/vulnerabilities.yml"
68-
)
69-
70-
package_data = {
71-
"purl": str(purl),
72-
"affected_by_vulnerabilities": [
73-
vuln.vulnerability_id for vuln in purl.affected_by
74-
],
75-
"fixing_vulnerabilities": [vuln.vulnerability_id for vuln in purl.fixing],
76-
}
77-
78-
if pkg_filepath in package_files:
79-
package_files[pkg_filepath]["versions"].append(package_data)
80-
else:
81-
package_files[pkg_filepath] = {
82-
"package": str(purl_without_version),
83-
"versions": [package_data],
97+
i = 0
98+
seen_vcid = set()
99+
100+
for i, (purl_without_version, package_versions) in enumerate(packages_by_type_ns_name(), 1):
101+
pkg_version = None
102+
try:
103+
package_urls = []
104+
package_vulnerabilities = []
105+
for pkg_version in package_versions:
106+
purl = pkg_version.package_url
107+
package_urls.append(purl)
108+
package_data = {
109+
"purl": purl,
110+
"affected_by_vulnerabilities": list(
111+
pkg_version.affected_by.values_list("vulnerability_id", flat=True)
112+
),
113+
"fixing_vulnerabilities": list(
114+
pkg_version.fixing.values_list("vulnerability_id", flat=True)
115+
),
84116
}
117+
package_vulnerabilities.append(package_data)
85118

86-
for vul in purl.vulnerabilities.all():
87-
vulnerability_id = vul.vulnerability_id
88-
# ./aboutcode-vulnerabilities-12/34/VCID-1223-3434-34343/VCID-1223-3434-34343.yml
89-
vul_filepath = (
90-
f"./aboutcode-vulnerabilities-{vulnerability_id[5:7]}/{vulnerability_id[10:12]}"
91-
f"/{vulnerability_id}/{vulnerability_id}.yml"
92-
)
93-
vul_files[vul_filepath] = {
94-
"vulnerability_id": vul.vulnerability_id,
95-
"aliases": [alias.alias for alias in vul.get_aliases],
96-
"summary": vul.summary,
97-
"severities": [severity for severity in vul.severities.values()],
98-
"references": [ref for ref in vul.references.values()],
99-
"weaknesses": [
100-
"CWE-" + str(weakness["cwe_id"]) for weakness in vul.weaknesses.values()
101-
],
102-
}
119+
for vuln in pkg_version.vulnerabilities.all():
120+
vcid = vuln.vulnerability_id
121+
# do not write the same file twice
122+
if vcid in seen_vcid:
123+
continue
124+
125+
seen_vcid.add(vcid)
126+
vulnerability = serialize_vulnerability(vuln)
127+
vpath = hashid.get_vcid_yml_file_path(vcid)
128+
write_file(base_path=base_path, file_path=vpath, data=vulnerability)
129+
if (lv := len(seen_vcid)) % 100 == 0:
130+
self.stdout.write(f"Processed {lv} vulnerabilities. Last VCID: {vcid}")
131+
132+
ppath = hashid.get_package_purls_yml_file_path(purl)
133+
write_file(base_path=base_path, file_path=ppath, data=package_urls)
103134

104-
for items in [package_files, vul_files]:
105-
for filepath, data in items.items():
106-
create_file(filepath, git_path, data)
135+
pvpath = hashid.get_package_vulnerabilities_yml_file_path(purl)
136+
write_file(base_path=base_path, file_path=pvpath, data=package_vulnerabilities)
107137

108-
self.stdout.write(f"Successfully exported {ecosystem} data")
138+
if i % 100 == 0:
139+
self.stdout.write(f"Processed {i} package. Last PURL: {purl_without_version}")
109140

141+
except Exception as e:
142+
raise Exception(f"Failed to process Package: {pkg_version}") from e
110143

111-
def create_file(filepath, git_path, data):
144+
self.stdout.write(f"Exported data for: {i} package and {len(seen_vcid)} vulnerabilities.")
145+
146+
147+
def by_purl_type_ns_name(package):
112148
"""
113-
Check if the directories exist if it doesn't exist create a new one then Create the file
114-
./aboutcode-vulnerabilities-12/34/VCID-1223-3434-34343/VCID-1223-3434-34343.yml
115-
./aboutcode-packages-ed5/maven/org.apache.log4j/log4j-core/versions/vulnerabilities.yml
116-
./aboutcode-packages-ed5/maven/org.apache.log4j/log4j-core/versions/1.2.3/vulnerabilities.yml
149+
Key function to sort packages by type, namespace and name
117150
"""
118-
filepath = git_path.joinpath(filepath)
119-
dirname = os.path.dirname(filepath)
120-
os.makedirs(dirname, exist_ok=True)
121-
data = saneyaml.dump(data)
122-
with open(filepath, encoding="utf-8", mode="w") as f:
123-
f.write(data)
151+
return package.type, package.namespace, package.name
124152

125153

126-
def get_purl_hash(purl: PackageURL, length: int = 3) -> str:
154+
def packages_by_type_ns_name():
155+
"""
156+
Return a two-level iterator over all Packages grouped by package, ignoring version.
157+
"""
158+
qs = (
159+
Package.objects.order_by("type", "namespace", "name", "version")
160+
.prefetch_related(
161+
"vulnerabilities",
162+
"vulnerabilities__references",
163+
"vulnerabilities__weaknesses",
164+
"vulnerabilities__references__vulnerabilityseverity_set",
165+
)
166+
.paginated()
167+
)
168+
169+
for tp_ns_name, packages in groupby(qs, key=by_purl_type_ns_name):
170+
yield PackageURL(*tp_ns_name), packages
171+
172+
173+
def write_file(base_path: Path, file_path: Path, data: dict):
127174
"""
128-
Return a short lower cased hash of a purl.
129-
https://github.com/nexB/purldb/pull/235/files#diff-a1fd023bd42d73f56019d540f38be711255403547add15108540d70f9948dd40R154
175+
Write the ``data`` as YAML to the ``file_path`` in the ``base_path`` root directory.
176+
Create directories in the path as needed.
130177
"""
131-
purl_bytes = str(purl).encode("utf-8")
132-
short_hash = sha512(purl_bytes).hexdigest()[:length]
133-
return short_hash.lower()
178+
write_to = base_path / file_path
179+
write_to.parent.mkdir(parents=True, exist_ok=True)
180+
with open(write_to, encoding="utf-8", mode="w") as f:
181+
f.write(saneyaml.dump(data))
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
- pkg:generic/nginx/test@2
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
- purl: pkg:generic/nginx/test@2
2+
affected_by_vulnerabilities:
3+
- VCID-pst6-b358-aaap
4+
fixing_vulnerabilities: []
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
vulnerability_id: VCID-pst6-b358-aaap
2+
aliases:
3+
- CVE-xxx-xxx-xx
4+
summary: test-vuln
5+
severities:
6+
- score: '7.0'
7+
scoring_system: cvssv3_vector
8+
scoring_elements: CVSS:3.0/AV:N/AC:L/PR:N/UI:N/S:U/C:N/I:N/A:H
9+
published_at:
10+
reference:
11+
url: https://..
12+
reference_type:
13+
reference_id: fake
14+
weaknesses:
15+
- CWE-15
16+
references:
17+
- url: https://..
18+
reference_type:
19+
reference_id: fake

0 commit comments

Comments
 (0)