Skip to content

Commit 87b4ebc

Browse files
authored
Merge pull request #1188 from TG1999/clean_import_data
Clean imported data after import process
2 parents 2646d7e + dcbf076 commit 87b4ebc

File tree

13 files changed

+125
-107
lines changed

13 files changed

+125
-107
lines changed

vulnerabilities/importer.py

Lines changed: 10 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424

2525
import pytz
2626
from dateutil import parser as dateparser
27+
from fetchcode.vcs import VCSResponse
2728
from fetchcode.vcs import fetch_via_vcs
2829
from license_expression import Licensing
2930
from packageurl import PackageURL
@@ -288,6 +289,10 @@ class InvalidSPDXLicense(Exception):
288289
pass
289290

290291

292+
class ForkError(Exception):
293+
pass
294+
295+
291296
class Importer:
292297
"""
293298
An Importer collects data from various upstreams and returns corresponding AdvisoryData objects
@@ -297,7 +302,7 @@ class Importer:
297302
spdx_license_expression = ""
298303
license_url = ""
299304
notice = ""
300-
vcs_response = None
305+
vcs_response: VCSResponse = None
301306

302307
def __init__(self):
303308
if not self.spdx_license_expression:
@@ -324,47 +329,18 @@ def advisory_data(self) -> Iterable[AdvisoryData]:
324329
raise NotImplementedError
325330

326331
def clone(self, repo_url):
332+
"""
333+
Clone the repo at repo_url and return the VCSResponse object
334+
"""
327335
try:
328336
self.vcs_response = fetch_via_vcs(repo_url)
337+
return self.vcs_response
329338
except Exception as e:
330339
msg = f"Failed to fetch {repo_url} via vcs: {e}"
331340
logger.error(msg)
332341
raise ForkError(msg) from e
333342

334343

335-
class ForkError(Exception):
336-
pass
337-
338-
339-
class GitImporter(Importer):
340-
def __init__(self, repo_url):
341-
super().__init__()
342-
self.repo_url = repo_url
343-
self.vcs_response = None
344-
345-
def __enter__(self):
346-
super().__enter__()
347-
self.clone()
348-
return self
349-
350-
def __exit__(self):
351-
self.vcs_response.delete()
352-
353-
def clone(self):
354-
try:
355-
self.vcs_response = fetch_via_vcs(self.repo_url)
356-
except Exception as e:
357-
msg = f"Failed to fetch {self.repo_url} via vcs: {e}"
358-
logger.error(msg)
359-
raise ForkError(msg) from e
360-
361-
def advisory_data(self) -> Iterable[AdvisoryData]:
362-
"""
363-
Return AdvisoryData objects corresponding to the data being imported
364-
"""
365-
raise NotImplementedError
366-
367-
368344
# TODO: Needs rewrite
369345
class OvalImporter(Importer):
370346
"""

vulnerabilities/importers/elixir_security.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ class ElixirSecurityImporter(Importer):
2929

3030
def advisory_data(self) -> Set[AdvisoryData]:
3131
try:
32-
self.clone(self.repo_url)
32+
self.clone(repo_url=self.repo_url)
3333
path = Path(self.vcs_response.dest_dir)
3434
vuln = path / "packages"
3535
for file in vuln.glob("**/*.yml"):

vulnerabilities/importers/fireeye.py

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,15 @@
1313
from typing import List
1414

1515
from vulnerabilities.importer import AdvisoryData
16-
from vulnerabilities.importer import GitImporter
16+
from vulnerabilities.importer import Importer
1717
from vulnerabilities.importer import Reference
1818
from vulnerabilities.utils import build_description
1919
from vulnerabilities.utils import dedupe
2020

2121
logger = logging.getLogger(__name__)
2222

2323

24-
class FireyeImporter(GitImporter):
24+
class FireyeImporter(Importer):
2525
spdx_license_expression = "CC-BY-SA-4.0 AND MIT"
2626
license_url = "https://github.com/mandiant/Vulnerability-Disclosures/blob/master/README.md"
2727
notice = """
@@ -30,23 +30,25 @@ class FireyeImporter(GitImporter):
3030
1. CC BY-SA 4.0 - For CVE related information not including source code (such as PoCs)
3131
2. MIT - For source code contained within provided CVE information
3232
"""
33-
34-
def __init__(self):
35-
super().__init__(repo_url="git+https://github.com/mandiant/Vulnerability-Disclosures")
33+
repo_url = "git+https://github.com/mandiant/Vulnerability-Disclosures"
3634

3735
def advisory_data(self) -> Iterable[AdvisoryData]:
38-
self.clone()
39-
files = filter(
40-
lambda p: p.suffix in [".md", ".MD"], Path(self.vcs_response.dest_dir).glob("**/*")
41-
)
42-
for file in files:
43-
if Path(file).stem == "README":
44-
continue
45-
try:
46-
with open(file) as f:
47-
yield parse_advisory_data(f.read())
48-
except UnicodeError:
49-
logger.error(f"Invalid file {file}")
36+
try:
37+
self.clone(repo_url=self.repo_url)
38+
files = filter(
39+
lambda p: p.suffix in [".md", ".MD"], Path(self.vcs_response.dest_dir).glob("**/*")
40+
)
41+
for file in files:
42+
if Path(file).stem == "README":
43+
continue
44+
try:
45+
with open(file) as f:
46+
yield parse_advisory_data(f.read())
47+
except UnicodeError:
48+
logger.error(f"Invalid file {file}")
49+
finally:
50+
if self.vcs_response:
51+
self.vcs_response.delete()
5052

5153

5254
def parse_advisory_data(raw_data) -> AdvisoryData:

vulnerabilities/importers/gitlab.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525

2626
from vulnerabilities.importer import AdvisoryData
2727
from vulnerabilities.importer import AffectedPackage
28-
from vulnerabilities.importer import GitImporter
28+
from vulnerabilities.importer import Importer
2929
from vulnerabilities.importer import Reference
3030
from vulnerabilities.utils import build_description
3131

@@ -48,16 +48,14 @@
4848
GITLAB_SCHEME_BY_PURL_TYPE = {v: k for k, v in PURL_TYPE_BY_GITLAB_SCHEME.items()}
4949

5050

51-
class GitLabAPIImporter(GitImporter):
51+
class GitLabAPIImporter(Importer):
5252
spdx_license_expression = "MIT"
5353
license_url = "https://gitlab.com/gitlab-org/advisories-community/-/blob/main/LICENSE"
54+
repo_url = "git+https://gitlab.com/gitlab-org/advisories-community/"
5455

55-
def __init__(self):
56-
super().__init__(repo_url="git+https://gitlab.com/gitlab-org/advisories-community/")
57-
58-
def advisory_data(self, _keep_clone=True) -> Iterable[AdvisoryData]:
56+
def advisory_data(self, _keep_clone=False) -> Iterable[AdvisoryData]:
5957
try:
60-
self.clone()
58+
self.clone(repo_url=self.repo_url)
6159
base_path = Path(self.vcs_response.dest_dir)
6260

6361
for file_path in base_path.glob("**/*.yml"):

vulnerabilities/importers/istio.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -53,17 +53,21 @@ class IstioImporter(Importer):
5353
repo_url = "git+https://github.com/istio/istio.io/"
5454

5555
def advisory_data(self) -> Set[AdvisoryData]:
56-
self.clone(self.repo_url)
57-
path = Path(self.vcs_response.dest_dir)
58-
vuln = path / "content/en/news/security/"
59-
for file in vuln.glob("**/*.md"):
60-
# Istio website has files with name starting with underscore, these contain metadata
61-
# required for rendering the website. We're not interested in these.
62-
# See also https://github.com/nexB/vulnerablecode/issues/563
63-
file = str(file)
64-
if file.endswith("_index.md"):
65-
continue
66-
yield from self.process_file(file)
56+
try:
57+
self.clone(repo_url=self.repo_url)
58+
path = Path(self.vcs_response.dest_dir)
59+
vuln = path / "content/en/news/security/"
60+
for file in vuln.glob("**/*.md"):
61+
# Istio website has files with name starting with underscore, these contain metadata
62+
# required for rendering the website. We're not interested in these.
63+
# See also https://github.com/nexB/vulnerablecode/issues/563
64+
file = str(file)
65+
if file.endswith("_index.md"):
66+
continue
67+
yield from self.process_file(file)
68+
finally:
69+
if self.vcs_response:
70+
self.vcs_response.delete()
6771

6872
def process_file(self, path):
6973

vulnerabilities/importers/kaybee.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,13 @@
1010
from packageurl import PackageURL
1111

1212
from vulnerabilities.importer import AdvisoryData
13-
from vulnerabilities.importer import GitImporter
13+
from vulnerabilities.importer import Importer
1414
from vulnerabilities.importer import Reference
1515
from vulnerabilities.utils import load_yaml
1616
from vulnerabilities.utils import nearest_patched_package
1717

1818

19-
class KaybeeImporter(GitImporter):
19+
class KaybeeImporter(Importer):
2020
def __enter__(self):
2121
super(KaybeeImporter, self).__enter__()
2222
self._added_files, self._updated_files = self.file_changes(

vulnerabilities/importers/mozilla.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ class MozillaImporter(Importer):
3939

4040
def advisory_data(self) -> Iterable[AdvisoryData]:
4141
try:
42-
self.clone(self.repo_url)
42+
self.clone(repo_url=self.repo_url)
4343
path = Path(self.vcs_response.dest_dir)
4444

4545
vuln = path / "announce"

vulnerabilities/importers/npm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ class NpmImporter(Importer):
3636

3737
def advisory_data(self) -> Iterable[AdvisoryData]:
3838
try:
39-
self.clone(self.repo_url)
39+
self.clone(repo_url=self.repo_url)
4040
path = Path(self.vcs_response.dest_dir)
4141

4242
vuln = path / "vuln"

vulnerabilities/importers/pypa.py

Lines changed: 15 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#
99
import logging
1010
import os
11+
from pathlib import Path
1112
from typing import Iterable
1213

1314
import saneyaml
@@ -23,33 +24,28 @@
2324
class PyPaImporter(Importer):
2425
license_url = "https://github.com/pypa/advisory-database/blob/main/LICENSE"
2526
spdx_license_expression = "CC-BY-4.0"
26-
url = "git+https://github.com/pypa/advisory-database"
27+
repo_url = "git+https://github.com/pypa/advisory-database"
2728

2829
def advisory_data(self) -> Iterable[AdvisoryData]:
29-
for raw_data in fork_and_get_files(self.url):
30-
yield parse_advisory_data(raw_data=raw_data, supported_ecosystem="pypi")
30+
try:
31+
self.clone(repo_url=self.repo_url)
32+
path = Path(self.vcs_response.dest_dir)
33+
for raw_data in fork_and_get_files(path=path):
34+
yield parse_advisory_data(raw_data=raw_data, supported_ecosystem="pypi")
35+
finally:
36+
if self.vcs_response:
37+
self.vcs_response.delete()
3138

3239

3340
class ForkError(Exception):
3441
pass
3542

3643

37-
def fork_and_get_files(url) -> dict:
44+
def fork_and_get_files(path) -> dict:
3845
"""
3946
Yield advisory data mappings loaded from the YAML files under the ``vulns`` directory of the PyPA repository checkout at ``path``.
4047
"""
41-
try:
42-
fork_directory = fetch_via_git(url=url)
43-
except Exception as e:
44-
logger.error(f"Failed to clone url {url}: {e}")
45-
raise ForkError(url) from e
46-
47-
advisory_dirs = os.path.join(fork_directory.dest_dir, "vulns")
48-
for root, _, files in os.walk(advisory_dirs):
49-
for file in files:
50-
path = os.path.join(root, file)
51-
if not file.endswith(".yaml"):
52-
logger.warning(f"Unsupported non-YAML PyPA advisory file: {path}")
53-
continue
54-
with open(path) as f:
55-
yield saneyaml.load(f.read())
48+
advisory_dirs = path / "vulns"
49+
for file in advisory_dirs.glob("**/*.yaml"):
50+
with open(file) as f:
51+
yield saneyaml.load(f.read())

vulnerabilities/importers/retiredotnet.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ class RetireDotnetImporter(Importer):
3030

3131
def advisory_data(self) -> Iterable[AdvisoryData]:
3232
try:
33-
self.clone(self.repo_url)
33+
self.clone(repo_url=self.repo_url)
3434
path = Path(self.vcs_response.dest_dir)
3535

3636
vuln = path / "Content"

0 commit comments

Comments
 (0)