
Commit 3632832

ziadhany authored and TG1999 committed
Refactor Gitimporter using fetchcode
Co-authored-by: Tushar Goel <[email protected]>
Signed-off-by: ziadhany <[email protected]>
1 parent ac9677b commit 3632832

File tree: 4 files changed (+117, -496 lines changed)


vulnerabilities/importer.py

Lines changed: 25 additions & 184 deletions
@@ -12,7 +12,6 @@
 import logging
 import os
 import shutil
-import tempfile
 import traceback
 import xml.etree.ElementTree as ET
 from pathlib import Path
@@ -23,9 +22,7 @@
 from typing import Set
 from typing import Tuple
 
-from binaryornot.helpers import is_binary_string
-from git import DiffIndex
-from git import Repo
+from fetchcode.vcs import fetch_via_vcs
 from license_expression import Licensing
 from packageurl import PackageURL
 from univers.version_range import VersionRange
@@ -312,193 +309,37 @@ def advisory_data(self) -> Iterable[AdvisoryData]:
         raise NotImplementedError
 
 
-# TODO: Needs rewrite
-class GitImporter(Importer):
-    def validate_configuration(self) -> None:
+class ForkError(Exception):
+    pass
 
-        if not self.config.create_working_directory and self.config.working_directory is None:
-            self.error(
-                '"create_working_directory" is not set but "working_directory" is set to '
-                "the default, which calls tempfile.mkdtemp()"
-            )
 
-        if not self.config.create_working_directory and not os.path.exists(
-            self.config.working_directory
-        ):
-            self.error(
-                '"working_directory" does not contain an existing directory and'
-                '"create_working_directory" is not set'
-            )
-
-        if not self.config.remove_working_directory and self.config.working_directory is None:
-            self.error(
-                '"remove_working_directory" is not set and "working_directory" is set to '
-                "the default, which calls tempfile.mkdtemp()"
-            )
+class GitImporter(Importer):
+    def __init__(self, repo_url):
+        super().__init__()
+        self.repo_url = repo_url
+        self.vcs_response = None
 
     def __enter__(self):
-        self._ensure_working_directory()
-        self._ensure_repository()
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if self.config.remove_working_directory:
-            shutil.rmtree(self.config.working_directory)
-
-    def file_changes(
-        self,
-        subdir: str = None,
-        recursive: bool = False,
-        file_ext: Optional[str] = None,
-    ) -> Tuple[Set[str], Set[str]]:
-        """
-        Returns all added and modified files since last_run_date or cutoff_date (whichever is more
-        recent).
-
-        :param subdir: filter by files in this directory
-        :param recursive: whether to include files in subdirectories
-        :param file_ext: filter files by this extension
-        :return: The first set contains (absolute paths to) added files, the second one modified
-        files
-        """
-        if subdir is None:
-            working_dir = self.config.working_directory
-        else:
-            working_dir = os.path.join(self.config.working_directory, subdir)
+        super().__enter__()
+        self.clone()
+        return self
 
-        path = Path(working_dir)
+    def __exit__(self):
+        self.vcs_response.delete()
 
-        if self.config.last_run_date is None and self.config.cutoff_date is None:
-            if recursive:
-                glob = "**/*"
-            else:
-                glob = "*"
-
-            if file_ext:
-                glob = f"{glob}.{file_ext}"
-
-            return {str(p) for p in path.glob(glob) if p.is_file()}, set()
-
-        return self._collect_file_changes(subdir=subdir, recursive=recursive, file_ext=file_ext)
-
-    def _collect_file_changes(
-        self,
-        subdir: Optional[str],
-        recursive: bool,
-        file_ext: Optional[str],
-    ) -> Tuple[Set[str], Set[str]]:
-
-        added_files, updated_files = set(), set()
-
-        # find the most ancient commit we need to diff with
-        cutoff_commit = None
-        for commit in self._repo.iter_commits(self._repo.head):
-            if commit.committed_date < self.cutoff_timestamp:
-                break
-            cutoff_commit = commit
-
-        if cutoff_commit is None:
-            return added_files, updated_files
-
-        def _is_binary(d: DiffIndex):
-            return is_binary_string(d.b_blob.data_stream.read(1024))
-
-        for d in cutoff_commit.diff(self._repo.head.commit):
-            if not _include_file(d.b_path, subdir, recursive, file_ext) or _is_binary(d):
-                continue
-
-            abspath = os.path.join(self.config.working_directory, d.b_path)
-            if d.new_file:
-                added_files.add(abspath)
-            elif d.a_blob and d.b_blob:
-                if d.a_path != d.b_path:
-                    # consider moved files as added
-                    added_files.add(abspath)
-                elif d.a_blob != d.b_blob:
-                    updated_files.add(abspath)
-
-        # Any file that has been added and then updated inside the window of the git history we
-        # looked at, should be considered "added", not "updated", since it does not exist in the
-        # database yet.
-        updated_files = updated_files - added_files
-
-        return added_files, updated_files
-
-    def _ensure_working_directory(self) -> None:
-        if self.config.working_directory is None:
-            self.config.working_directory = tempfile.mkdtemp()
-        elif self.config.create_working_directory and not os.path.exists(
-            self.config.working_directory
-        ):
-            os.mkdir(self.config.working_directory)
-
-    def _ensure_repository(self) -> None:
-        if not os.path.exists(os.path.join(self.config.working_directory, ".git")):
-            self._clone_repository()
-            return
-        self._repo = Repo(self.config.working_directory)
-
-        if self.config.branch is None:
-            self.config.branch = str(self._repo.active_branch)
-        branch = self.config.branch
-        self._repo.head.reference = self._repo.heads[branch]
-        self._repo.head.reset(index=True, working_tree=True)
-
-        remote = self._find_or_add_remote()
-        self._update_from_remote(remote, branch)
-
-    def _clone_repository(self) -> None:
-        kwargs = {}
-        if self.config.branch:
-            kwargs["branch"] = self.config.branch
-
-        self._repo = Repo.clone_from(
-            self.config.repository_url, self.config.working_directory, **kwargs
-        )
-
-    def _find_or_add_remote(self):
-        remote = None
-        for r in self._repo.remotes:
-            if r.url == self.config.repository_url:
-                remote = r
-                break
-
-        if remote is None:
-            remote = self._repo.create_remote(
-                "added_by_vulnerablecode", url=self.config.repository_url
-            )
-
-        return remote
-
-    def _update_from_remote(self, remote, branch) -> None:
-        fetch_info = remote.fetch()
-        if len(fetch_info) == 0:
-            return
-        branch = self._repo.branches[branch]
-        branch.set_reference(remote.refs[branch.name])
-        self._repo.head.reset(index=True, working_tree=True)
-
-
-def _include_file(
-    path: str,
-    subdir: Optional[str] = None,
-    recursive: bool = False,
-    file_ext: Optional[str] = None,
-) -> bool:
-    match = True
-
-    if subdir:
-        if not subdir.endswith(os.path.sep):
-            subdir = f"{subdir}{os.path.sep}"
-
-        match = match and path.startswith(subdir)
-
-    if not recursive:
-        match = match and (os.path.sep not in path[len(subdir or "") :])
-
-    if file_ext:
-        match = match and path.endswith(f".{file_ext}")
+    def clone(self):
+        try:
+            self.vcs_response = fetch_via_vcs(self.repo_url)
+        except Exception as e:
+            msg = f"Failed to fetch {self.repo_url} via vcs: {e}"
+            logger.error(msg)
+            raise ForkError(msg) from e
 
-    return match
+    def advisory_data(self) -> Iterable[AdvisoryData]:
+        """
+        Return AdvisoryData objects corresponding to the data being imported
+        """
+        raise NotImplementedError
 
 
 # TODO: Needs rewrite
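
For orientation, here is a minimal sketch (not part of the diff above) of how a concrete importer could build on the refactored GitImporter. The repository URL, the ExampleGitImporter name, and the parse_advisory_file() helper are hypothetical placeholders, not VulnerableCode code.

from pathlib import Path
from typing import Iterable

from vulnerabilities.importer import AdvisoryData
from vulnerabilities.importer import GitImporter


class ExampleGitImporter(GitImporter):
    spdx_license_expression = "MIT"  # assumed license of the hypothetical feed

    def __init__(self):
        # fetchcode expects a VCS URL; this "git+" repo URL is made up for the example
        super().__init__(repo_url="git+https://example.com/advisories/")

    def advisory_data(self) -> Iterable[AdvisoryData]:
        try:
            # clone() wraps fetch_via_vcs() and stores the result in self.vcs_response,
            # raising ForkError if the fetch fails
            self.clone()
            path = Path(self.vcs_response.dest_dir)
            for file in path.glob("**/*.yml"):
                yield parse_advisory_file(file)  # hypothetical per-file parser
        finally:
            if self.vcs_response:
                self.vcs_response.delete()  # drop the temporary checkout

This mirrors how GitLabAPIImporter is rewired in the next file: subclasses no longer manage working directories, branches, or git remotes themselves; they pass a repo_url and read files from vcs_response.dest_dir.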

vulnerabilities/importers/gitlab.py

Lines changed: 48 additions & 24 deletions
@@ -8,9 +8,9 @@
 #
 
 import logging
-import os
 import traceback
 from datetime import datetime
+from pathlib import Path
 from typing import Iterable
 from typing import List
 from typing import Mapping
@@ -29,7 +29,7 @@
 
 from vulnerabilities.importer import AdvisoryData
 from vulnerabilities.importer import AffectedPackage
-from vulnerabilities.importer import Importer
+from vulnerabilities.importer import GitImporter
 from vulnerabilities.importer import Reference
 from vulnerabilities.importer import UnMergeablePackageError
 from vulnerabilities.improver import Improver
@@ -71,31 +71,53 @@ def fork_and_get_dir(url):
     return fetch_via_vcs(url).dest_dir
 
 
-class ForkError(Exception):
-    pass
-
-
-class GitLabAPIImporter(Importer):
+class GitLabAPIImporter(GitImporter):
     spdx_license_expression = "MIT"
     license_url = "https://gitlab.com/gitlab-org/advisories-community/-/blob/main/LICENSE"
-    gitlab_url = "git+https://gitlab.com/gitlab-org/advisories-community/"
+
+    def __init__(self):
+        super().__init__(repo_url="git+https://gitlab.com/gitlab-org/advisories-community/")
 
     def advisory_data(self) -> Iterable[AdvisoryData]:
         try:
-            fork_directory = fork_and_get_dir(url=self.gitlab_url)
-        except Exception as e:
-            logger.error(f"Can't clone url {self.gitlab_url}")
-            raise ForkError(self.gitlab_url) from e
-        for root_dir in os.listdir(fork_directory):
-            # skip well known files and directories that contain no advisory data
-            if root_dir in ("ci", "CODEOWNERS", "README.md", "LICENSE", ".git"):
-                continue
-            if root_dir not in PURL_TYPE_BY_GITLAB_SCHEME:
-                logger.error(f"Unknown package type: {root_dir}")
-                continue
-            for root, _, files in os.walk(os.path.join(fork_directory, root_dir)):
-                for file in files:
-                    yield parse_gitlab_advisory(file=os.path.join(root, file))
+            self.clone()
+            path = Path(self.vcs_response.dest_dir)
+
+            glob = "**/*.yml"
+            files = (p for p in path.glob(glob) if p.is_file())
+            for file in files:
+                # split a path according to gitlab conventions where package type and name are a part of path
+                # For example with this path:
+                # /tmp/tmpi1klhpmd/pypi/gradio/CVE-2021-43831.yml
+                # the package type is pypi and the package name is gradio
+                # to ('/', 'tmp', 'tmpi1klhpmd', 'pypi', 'gradio', 'CVE-2021-43831.yml')
+                purl_type = get_gitlab_package_type(path=file)
+                if not purl_type:
+                    logger.error(f"Unknow gitlab directory structure {file!r}")
+                    continue
+
+                if purl_type in PURL_TYPE_BY_GITLAB_SCHEME:
+                    yield parse_gitlab_advisory(file)
+
+                else:
+                    logger.error(f"Unknow package type {purl_type!r}")
+                    continue
+        finally:
+            if self.vcs_response:
+                self.vcs_response.delete()
+
+
+def get_gitlab_package_type(path: Path):
+    """
+    Return a package type extracted from a gitlab advisory path or None
+    """
+    parts = path.parts[-3:]
+
+    if len(parts) < 3:
+        return
+
+    type, _name, _vid = parts
+    return type
 
 
 def get_purl(package_slug):
@@ -168,10 +190,12 @@ def parse_gitlab_advisory(file):
     identifiers:
     - "GMS-2018-26"
     """
-    with open(file, "r") as f:
+    with open(file) as f:
         gitlab_advisory = saneyaml.load(f)
     if not isinstance(gitlab_advisory, dict):
-        logger.error(f"parse_yaml_file: yaml_file is not of type `dict`: {gitlab_advisory!r}")
+        logger.error(
+            f"parse_gitlab_advisory: unknown gitlab advisory format in {file!r} with data: {gitlab_advisory!r}"
+        )
         return
 
     # refer to schema here https://gitlab.com/gitlab-org/advisories-community/-/blob/main/ci/schema/schema.json
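
For reference, a small usage sketch (not part of the commit) of the new get_gitlab_package_type() helper, using the example path from the inline comment in the diff above; the standalone script wrapper is illustrative only.

from pathlib import Path

from vulnerabilities.importers.gitlab import get_gitlab_package_type

# the last three path components of a checked-out advisory file carry the
# package type, package name, and advisory id
path = Path("/tmp/tmpi1klhpmd/pypi/gradio/CVE-2021-43831.yml")
print(path.parts[-3:])                      # ('pypi', 'gradio', 'CVE-2021-43831.yml')
print(get_gitlab_package_type(path=path))   # 'pypi'
# advisory_data() then checks the returned type against PURL_TYPE_BY_GITLAB_SCHEME
# and only yields parse_gitlab_advisory(file) for known package types.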

vulnerabilities/tests/conftest.py

Lines changed: 0 additions & 1 deletion
@@ -30,7 +30,6 @@ def no_rmtree(monkeypatch):
     "test_apache_tomcat.py",
     "test_api.py",
     "test_archlinux.py",
-    "test_data_source.py",
     "test_debian_oval.py",
     "test_elixir_security.py",
     "test_gentoo.py",

0 commit comments
