Skip to content

Commit 5b982c6

Browse files
authored
Merge pull request #1559 from aboutcode-org/1509-pypa-importer-pipeline
Add base pipeline for importers and migrate PyPa importer to aboutcode pipeline
2 parents 1e3afdc + d73cfd4 commit 5b982c6

File tree

13 files changed

+523
-79
lines changed

13 files changed

+523
-79
lines changed

vulnerabilities/import_runner.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818

1919
from vulnerabilities.importer import AdvisoryData
2020
from vulnerabilities.importer import Importer
21-
from vulnerabilities.importers import IMPORTERS_REGISTRY
2221
from vulnerabilities.improver import Inference
2322
from vulnerabilities.improvers.default import DefaultImporter
2423
from vulnerabilities.models import Advisory

vulnerabilities/importers/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030
from vulnerabilities.importers import oss_fuzz
3131
from vulnerabilities.importers import postgresql
3232
from vulnerabilities.importers import project_kb_msr2019
33-
from vulnerabilities.importers import pypa
3433
from vulnerabilities.importers import pysec
3534
from vulnerabilities.importers import redhat
3635
from vulnerabilities.importers import retiredotnet
@@ -40,13 +39,13 @@
4039
from vulnerabilities.importers import ubuntu_usn
4140
from vulnerabilities.importers import vulnrichment
4241
from vulnerabilities.importers import xen
42+
from vulnerabilities.pipelines import pypa_importer
4343

4444
IMPORTERS_REGISTRY = [
4545
nvd.NVDImporter,
4646
github.GitHubAPIImporter,
4747
gitlab.GitLabAPIImporter,
4848
npm.NpmImporter,
49-
pypa.PyPaImporter,
5049
nginx.NginxImporter,
5150
pysec.PyPIImporter,
5251
alpine_linux.AlpineImporter,
@@ -75,6 +74,7 @@
7574
github_osv.GithubOSVImporter,
7675
epss.EPSSImporter,
7776
vulnrichment.VulnrichImporter,
77+
pypa_importer.PyPaImporterPipeline,
7878
]
7979

8080
IMPORTERS_REGISTRY = {x.qualified_name: x for x in IMPORTERS_REGISTRY}

vulnerabilities/importers/pypa.py

Lines changed: 0 additions & 66 deletions
This file was deleted.

vulnerabilities/management/commands/import.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
from vulnerabilities.import_runner import ImportRunner
1515
from vulnerabilities.importers import IMPORTERS_REGISTRY
16+
from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipeline
1617

1718

1819
class Command(BaseCommand):
@@ -57,6 +58,13 @@ def import_data(self, importers):
5758

5859
for importer in importers:
5960
self.stdout.write(f"Importing data using {importer.qualified_name}")
61+
if issubclass(importer, VulnerableCodeBaseImporterPipeline):
62+
status, error = importer().execute()
63+
if status != 0:
64+
self.stdout.write(error)
65+
failed_importers.append(importer.qualified_name)
66+
continue
67+
6068
try:
6169
ImportRunner(importer).run()
6270
self.stdout.write(

vulnerabilities/pipelines/__init__.py

Lines changed: 97 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,24 @@
33
# VulnerableCode is a trademark of nexB Inc.
44
# SPDX-License-Identifier: Apache-2.0
55
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6-
# See https://github.com/nexB/vulnerablecode for support or download.
6+
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
9+
910
import logging
1011
from datetime import datetime
1112
from datetime import timezone
13+
from traceback import format_exc as traceback_format_exc
14+
from typing import Iterable
1215

1316
from aboutcode.pipeline import BasePipeline
17+
from aboutcode.pipeline import LoopProgress
1418

19+
from vulnerabilities.importer import AdvisoryData
20+
from vulnerabilities.improver import MAX_CONFIDENCE
21+
from vulnerabilities.models import Advisory
22+
from vulnerabilities.pipes.advisory import import_advisory
23+
from vulnerabilities.pipes.advisory import insert_advisory
1524
from vulnerabilities.utils import classproperty
1625

1726
module_logger = logging.getLogger(__name__)
@@ -32,3 +41,90 @@ def qualified_name(cls):
3241
Fully qualified name prefixed with the module name of the pipeline used in logging.
3342
"""
3443
return f"{cls.__module__}.{cls.__qualname__}"
44+
45+
46+
class VulnerableCodeBaseImporterPipeline(VulnerableCodePipeline):
    """
    Base importer pipeline for importing advisories.

    Usage:
        Subclass this pipeline and implement the ``advisories_count`` and
        ``collect_advisories`` methods.
        Also override ``steps`` and ``advisory_confidence`` as needed.
    """

    # Source metadata; subclasses are expected to override these.
    license_url = None
    spdx_license_expression = None
    repo_url = None
    importer_name = None
    # Confidence passed to ``import_advisory`` for every advisory of this pipeline.
    advisory_confidence = MAX_CONFIDENCE

    @classmethod
    def steps(cls):
        """Return the ordered steps of this pipeline."""
        return (
            # Add step for downloading/cloning resource as required.
            cls.collect_and_store_advisories,
            cls.import_new_advisories,
            # Add step for removing downloaded/cloned resource as required.
        )

    def collect_advisories(self) -> Iterable[AdvisoryData]:
        """
        Yield AdvisoryData for importer pipeline.

        Must be implemented by subclasses.
        """
        raise NotImplementedError

    def advisories_count(self) -> int:
        """
        Return the estimated AdvisoryData to be yielded by ``collect_advisories``.

        Used by ``collect_and_store_advisories`` to log the progress of advisory collection.
        Must be implemented by subclasses.
        """
        raise NotImplementedError

    def collect_and_store_advisories(self):
        """
        Pass every AdvisoryData yielded by ``collect_advisories`` to
        ``insert_advisory`` and log how many calls returned a truthy result.
        """
        collected_advisory_count = 0
        progress = LoopProgress(total_iterations=self.advisories_count(), logger=self.log)
        for advisory in progress.iter(self.collect_advisories()):
            stored = insert_advisory(
                advisory=advisory,
                pipeline_name=self.qualified_name,
                logger=self.log,
            )
            # Only advisories for which ``insert_advisory`` returned something
            # truthy are counted as collected.
            if stored:
                collected_advisory_count += 1

        self.log(f"Successfully collected {collected_advisory_count:,d} advisories")

    def import_new_advisories(self):
        """
        Run ``import_advisory`` on every Advisory created by this pipeline
        whose ``date_imported`` is still unset, and log the outcome.
        """
        new_advisories = Advisory.objects.filter(
            created_by=self.qualified_name,
            date_imported__isnull=True,
        )

        new_advisories_count = new_advisories.count()

        self.log(f"Importing {new_advisories_count:,d} new advisories")

        imported_advisory_count = 0
        progress = LoopProgress(total_iterations=new_advisories_count, logger=self.log)
        for advisory in progress.iter(new_advisories.paginated()):
            self.import_advisory(advisory=advisory)
            # NOTE(review): presumably ``import_advisory`` sets ``date_imported``
            # on the passed instance when it succeeds -- confirm in
            # ``vulnerabilities.pipes.advisory``.
            if advisory.date_imported:
                imported_advisory_count += 1

        self.log(f"Successfully imported {imported_advisory_count:,d} new advisories")

    def import_advisory(self, advisory: Advisory) -> None:
        """
        Import a single ``advisory`` using this pipeline's confidence.

        Any exception is logged at ERROR level with its traceback and is not
        re-raised, so one failing advisory does not stop the enclosing loop.
        """
        try:
            import_advisory(
                advisory=advisory,
                pipeline_name=self.qualified_name,
                confidence=self.advisory_confidence,
                logger=self.log,
            )
        except Exception as e:
            self.log(
                f"Failed to import advisory: {advisory!r} with error {e!r}:\n{traceback_format_exc()}",
                level=logging.ERROR,
            )
vulnerabilities/pipelines/pypa_importer.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# VulnerableCode is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
import logging
10+
from pathlib import Path
11+
from typing import Iterable
12+
13+
import saneyaml
14+
from fetchcode.vcs import fetch_via_vcs
15+
16+
from vulnerabilities.importer import AdvisoryData
17+
from vulnerabilities.importers.osv import parse_advisory_data
18+
from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipeline
19+
from vulnerabilities.utils import get_advisory_url
20+
21+
module_logger = logging.getLogger(__name__)
22+
23+
24+
class PyPaImporterPipeline(VulnerableCodeBaseImporterPipeline):
    """Collect advisories from PyPA GitHub repository."""

    spdx_license_expression = "CC-BY-4.0"
    license_url = "https://github.com/pypa/advisory-database/blob/main/LICENSE"
    repo_url = "git+https://github.com/pypa/advisory-database"
    importer_name = "Pypa Importer"

    @classmethod
    def steps(cls):
        """Return the ordered steps: clone, collect, import, then clean up."""
        return (
            cls.clone,
            cls.collect_and_store_advisories,
            cls.import_new_advisories,
            cls.clean_downloads,
        )

    def clone(self):
        """Fetch ``repo_url`` and keep the response on ``self.vcs_response``."""
        self.log(f"Cloning `{self.repo_url}`")
        self.vcs_response = fetch_via_vcs(self.repo_url)

    def advisories_count(self) -> int:
        """Return the number of ``*.yaml`` files under the checkout's ``vulns`` directory."""
        vulns_directory = Path(self.vcs_response.dest_dir) / "vulns"
        return sum(1 for _ in vulns_directory.rglob("*.yaml"))

    def collect_advisories(self) -> Iterable[AdvisoryData]:
        """
        Yield the result of ``parse_advisory_data`` for each ``*.yaml`` file
        found under the ``vulns`` directory of the cloned repository.
        """
        base_directory = Path(self.vcs_response.dest_dir)
        vulns_directory = base_directory / "vulns"

        # Do not store the file count on ``self.advisories_count``: that would
        # shadow the ``advisories_count()`` method with an int on this instance
        # and make any later call to it raise ``TypeError``.
        for advisory in vulns_directory.rglob("*.yaml"):
            advisory_url = get_advisory_url(
                file=advisory,
                base_path=base_directory,
                url="https://github.com/pypa/advisory-database/blob/main/",
            )
            # Decode explicitly as UTF-8 instead of relying on the locale's
            # default encoding.
            advisory_dict = saneyaml.load(advisory.read_text(encoding="utf-8"))
            yield parse_advisory_data(
                raw_data=advisory_dict,
                supported_ecosystems=["pypi"],
                advisory_url=advisory_url,
            )

    def clean_downloads(self):
        """Delete the cloned repository, if ``clone`` stored one."""
        # ``vcs_response`` only exists once ``clone`` has run; fall back to
        # None so this step is a no-op instead of raising AttributeError.
        vcs_response = getattr(self, "vcs_response", None)
        if vcs_response:
            self.log("Removing cloned repository")
            vcs_response.delete()

0 commit comments

Comments
 (0)