- 
          
- 
                Notifications
    You must be signed in to change notification settings 
- Fork 239
Add support for parsing Git commit messages #1992
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| @@ -0,0 +1,111 @@ | ||
| import re | ||
| import shutil | ||
| import tempfile | ||
| from collections import defaultdict | ||
|  | ||
| from git import Repo | ||
|  | ||
| from vulnerabilities.importer import AdvisoryData | ||
| from vulnerabilities.importer import ReferenceV2 | ||
| from vulnerabilities.pipelines import VulnerableCodeBaseImporterPipelineV2 | ||
|  | ||
# Regular expressions for the vulnerability identifier schemes that are
# searched for in commit messages. Each pattern is anchored on word
# boundaries so identifiers embedded in longer tokens are not matched.
SECURITY_PATTERNS = [
    # CVE-<4 digit year>-<4 to 19 digit sequence number>
    r"\bCVE-\d{4}-\d{4,19}\b",
    # GHSA-xxxx-xxxx-xxxx: three groups of four alphanumeric characters
    r"\bGHSA-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}\b",
    # PYSEC-<4 digit year>-<1 to 6 digit sequence number>
    r"\bPYSEC-\d{4}-\d{1,6}\b",
    # XSA-<1 to 4 digit number>
    r"\bXSA-\d{1,4}\b",
]
|  | ||
|  | ||
class CollectRepoFixCommitPipeline(VulnerableCodeBaseImporterPipelineV2):
    """
    Pipeline to collect fix commits from any git repository.

    Commit messages are scanned with ``SECURITY_PATTERNS``; every vulnerability
    identifier found is turned into one advisory that references all commits
    mentioning it.
    """

    pipeline_id = "repo_fix_commit"

    # Repository to scan. Kept as a class attribute (instead of being
    # hard-coded inside ``clone``) so that a subclass or an instance can point
    # the pipeline at another repository without touching the cloning logic.
    repo_url = "https://github.com/torvalds/linux"

    @classmethod
    def steps(cls):
        return (
            cls.clone,
            cls.collect_and_store_advisories,
            cls.clean_downloads,
        )

    def clone(self):
        """Clone ``self.repo_url`` as a bare, blob-less repository in a temp directory."""
        # Remember the directory *before* cloning: if the clone fails,
        # ``self.repo`` is never set and this path is the only handle left for
        # ``clean_downloads`` to remove the temporary directory.
        self.repo_path = tempfile.mkdtemp()
        self.repo = Repo.clone_from(
            url=self.repo_url,
            to_path=self.repo_path,
            bare=True,
            no_checkout=True,
            # Only commit messages are read, so file contents are not fetched.
            multi_options=["--filter=blob:none"],
        )

    def advisories_count(self) -> int:
        """
        Return the number of commits that ``collect_fix_commits`` will scan.

        NOTE: this is the count of commits reachable from any ref (matching the
        ``--all`` walk used for collection), not the number of advisories that
        will eventually be yielded.
        """
        return int(self.repo.git.rev_list("--count", "--all"))

    def classify_commit_type(self, commit) -> list[str]:
        """
        Extract vulnerability identifiers from a commit message.

        Return a list of the unique identifiers found in ``commit.message``,
        in first-seen order per pattern of ``SECURITY_PATTERNS``. Matching is
        case-insensitive and the result is normalized so that differently
        cased spellings of one identifier collapse to a single value: the
        scheme prefix is uppercased, and the remainder is lowercased for GHSA
        identifiers and uppercased for every other scheme.
        """
        message = commit.message or ""
        # A dict is used as an insertion-ordered set: an identifier repeated in
        # the subject and the body of a message must be reported only once,
        # otherwise the same commit is attached twice to the same advisory.
        unique_ids = {}
        for pattern in SECURITY_PATTERNS:
            for found in re.findall(pattern, message, flags=re.IGNORECASE):
                prefix, _, remainder = found.partition("-")
                prefix = prefix.upper()
                if prefix == "GHSA":
                    remainder = remainder.lower()
                else:
                    remainder = remainder.upper()
                unique_ids.setdefault(f"{prefix}-{remainder}", None)
        return list(unique_ids)

    def collect_fix_commits(self):
        """
        Iterate through repository commits and group them by vulnerability identifiers.

        Return a mapping of ``vuln_id`` to a list of
        ``(commit_id, commit_message)`` tuples, one per commit whose message
        mentions that identifier.
        """
        self.log("Processing git repository fix commits (grouped by vulnerability IDs).")

        grouped_commits = defaultdict(list)
        for commit in self.repo.iter_commits("--all"):
            matched_ids = self.classify_commit_type(commit)
            if not matched_ids:
                continue

            commit_id = commit.hexsha
            commit_message = commit.message.strip()

            for vuln_id in matched_ids:
                grouped_commits[vuln_id].append((commit_id, commit_message))

        self.log(f"Found {len(grouped_commits)} vulnerabilities with related commits.")
        self.log("Finished processing all commits.")
        return grouped_commits

    def collect_advisories(self):
        """
        Generate AdvisoryData objects for each vulnerability ID grouped with its related commits.

        Yield one ``AdvisoryData`` per vulnerability identifier; its references
        are the commit URLs under ``self.repo_url`` and its summary lists each
        commit id with its message.
        """
        self.log("Generating AdvisoryData objects from grouped commits.")
        grouped_commits = self.collect_fix_commits()
        for vuln_id, commits in grouped_commits.items():
            # NOTE(review): a reviewer asked where the proper fix commit is
            # stored and considered keeping it only as a reference to be
            # insufficient. The author replied that creating fix commits was
            # left to the separate CollectFixCommitsPipeline. Still open.
            references = [ReferenceV2(url=f"{self.repo_url}/commit/{cid}") for cid, _ in commits]

            summary_lines = [f"- {cid}: {msg}" for cid, msg in commits]
            summary = f"Commits fixing {vuln_id}:\n" + "\n".join(summary_lines)
            yield AdvisoryData(
                # NOTE(review): a reviewer pointed out that using the bare
                # vulnerability ID here is problematic once fix commits are
                # collected from several repositories. The author suggested
                # making pipeline_id dynamic per repository, e.g.
                # "django_fix_commit/PYSEC-2020-2233". Still open.
                advisory_id=vuln_id,
                aliases=[vuln_id],
                summary=summary,
                references_v2=references,
                url=self.repo_url,
            )

    def clean_downloads(self):
        """Cleanup any temporary repository data."""
        self.log("Cleaning up local repository resources.")
        repo = getattr(self, "repo", None)
        repo_path = getattr(self, "repo_path", None)
        if repo is not None:
            # Fall back to the repository's own directory when ``repo`` was
            # assigned without going through ``clone``.
            repo_path = repo_path or repo.working_dir
            # Release resources held by the Repo object before deleting its
            # directory.
            repo.close()
        if repo_path:
            # Best effort: this also runs from ``on_failure``, where raising
            # would mask the error that caused the failure, and it may run
            # after the directory has already been removed.
            shutil.rmtree(path=repo_path, ignore_errors=True)

    def on_failure(self):
        """Ensure cleanup is always performed on failure."""
        self.clean_downloads()
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| @@ -0,0 +1,124 @@ | ||
| # | ||
| # Copyright (c) nexB Inc. and others. All rights reserved. | ||
| # VulnerableCode is a trademark of nexB Inc. | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # See http://www.apache.org/licenses/LICENSE-2.0 for the license text. | ||
| # See https://github.com/aboutcode-org/vulnerablecode for support or download. | ||
| # See https://aboutcode.org for more information about nexB OSS projects. | ||
| # | ||
|  | ||
| import json | ||
| from pathlib import Path | ||
| from unittest import TestCase | ||
| from unittest.mock import MagicMock | ||
| from unittest.mock import patch | ||
|  | ||
| import pytest | ||
|  | ||
| from vulnerabilities.pipelines.v2_importers.collect_repo_fix_commits import ( | ||
| CollectRepoFixCommitPipeline, | ||
| ) | ||
| from vulnerabilities.tests import util_tests | ||
|  | ||
|  | ||
@pytest.fixture
def pipeline():
    """Return a pipeline bound to a dummy repository URL with its logger mocked."""
    fix_commit_pipeline = CollectRepoFixCommitPipeline()
    fix_commit_pipeline.log = MagicMock()
    fix_commit_pipeline.repo_url = "https://github.com/test/repo"
    return fix_commit_pipeline
|  | ||
|  | ||
def test_classify_commit_type_extracts_ids(pipeline):
    """A message naming a CVE and a GHSA yields both identifiers."""
    commit = MagicMock(message="Fix for CVE-2023-1234 and GHSA-2479-qvv7-47qq")

    extracted_ids = pipeline.classify_commit_type(commit)

    assert extracted_ids == ["CVE-2023-1234", "GHSA-2479-qvv7-47qq"]
|  | ||
|  | ||
@patch("vulnerabilities.pipelines.v2_importers.collect_repo_fix_commits.Repo")
def test_collect_fix_commits_groups_by_vuln(mock_repo, pipeline):
    """Commits are grouped under the IDs reported for them; unmatched commits are skipped."""

    def fake_classify(commit):
        # Stand-in for classify_commit_type keyed on the message text only.
        if "CVE" in commit.message:
            return ["CVE-2021-0001"]
        if "GHSA" in commit.message:
            return ["GHSA-dead-beef-baad"]
        return []

    fake_commits = [
        MagicMock(message="Fix CVE-2021-0001", hexsha="abc123"),
        MagicMock(message="Patch GHSA-dead-beef-baad", hexsha="def456"),
        MagicMock(message="Unrelated change", hexsha="ghi789"),
    ]

    pipeline.repo = MagicMock()
    pipeline.repo.iter_commits.return_value = fake_commits
    pipeline.classify_commit_type = MagicMock(side_effect=fake_classify)

    grouped = pipeline.collect_fix_commits()

    assert grouped == {
        "CVE-2021-0001": [("abc123", "Fix CVE-2021-0001")],
        "GHSA-dead-beef-baad": [("def456", "Patch GHSA-dead-beef-baad")],
    }
|  | ||
|  | ||
# Directory holding the JSON fixtures used by the fix-commit pipeline tests,
# located three directory levels above this file.
TEST_DATA = Path(__file__).parent.parent.parent.joinpath("test_data", "fix_commits")
|  | ||
|  | ||
class TestRepoFixCommitPipeline(TestCase):
    def test_collect_advisories_from_json(self):
        """Advisories built from pre-grouped commits match the expected JSON fixture."""
        expected_file = TEST_DATA / "expected_linux_advisory_output.json"
        grouped_commits = json.loads(
            (TEST_DATA / "grouped_commits_input.json").read_text(encoding="utf-8")
        )

        fix_commit_pipeline = CollectRepoFixCommitPipeline()
        fix_commit_pipeline.log = MagicMock()
        fix_commit_pipeline.repo_url = "https://github.com/test/repo"
        # Bypass the repository walk: feed the grouped commits straight in.
        fix_commit_pipeline.collect_fix_commits = MagicMock(return_value=grouped_commits)

        advisories = fix_commit_pipeline.collect_advisories()
        result = [advisory.to_dict() for advisory in advisories]

        util_tests.check_results_against_json(result, expected_file)
|  | ||
|  | ||
@pytest.mark.parametrize(
    "commit_message, expected_ids",
    [
        ("Fix CVE-2023-12345 buffer overflow", ["CVE-2023-12345"]),
        ("Address GHSA-abcd-1234-efgh report", ["GHSA-abcd-1234-efgh"]),
        ("Python security PYSEC-2021-12345 fix", ["PYSEC-2021-12345"]),
        ("Xen XSA-43 security update", ["XSA-43"]),
        (
            "Fix CVE-2023-1111 and GHSA-aaaa-bbbb-cccc in kernel",
            ["CVE-2023-1111", "GHSA-aaaa-bbbb-cccc"],
        ),
        ("Refactor logging system with no security ID", []),
    ],
)
def test_classify_commit_type_detects_vuln_ids(pipeline, commit_message, expected_ids):
    """Ensure classify_commit_type correctly extracts vulnerability IDs."""
    # Only the ``message`` attribute of the commit is needed here.
    commit = MagicMock(message=commit_message)

    detected_ids = pipeline.classify_commit_type(commit)

    assert detected_ids == expected_ids, f"Unexpected result for message: {commit_message}"
|  | ||
|  | ||
def test_classify_commit_type_case_insensitive(pipeline):
    """Ensure pattern matching is case-insensitive."""
    commit = MagicMock(message="fix cVe-2022-9999 and ghSa-dead-beef-baad")

    # Compare in uppercase so the check does not depend on the case returned.
    upper_ids = [found.upper() for found in pipeline.classify_commit_type(commit)]

    for expected in ("CVE-2022-9999", "GHSA-DEAD-BEEF-BAAD"):
        assert any(expected in candidate for candidate in upper_ids)
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| @@ -0,0 +1,40 @@ | ||
| [ | ||
| { | ||
| "advisory_id": "CVE-2021-0001", | ||
| "aliases": [ | ||
| "CVE-2021-0001" | ||
| ], | ||
| "summary": "Commits fixing CVE-2021-0001:\n- abc123: Fix CVE-2021-0001", | ||
| "affected_packages": [], | ||
| "references_v2": [ | ||
| { | ||
| "reference_id": "", | ||
| "reference_type": "", | ||
| "url": "https://github.com/test/repo/commit/abc123" | ||
| } | ||
| ], | ||
| "severities": [], | ||
| "date_published": null, | ||
| "weaknesses": [], | ||
| "url": "https://github.com/test/repo" | ||
| }, | ||
| { | ||
| "advisory_id": "GHSA-dead-beef-baad", | ||
| "aliases": [ | ||
| "GHSA-dead-beef-baad" | ||
| ], | ||
| "summary": "Commits fixing GHSA-dead-beef-baad:\n- def456: Patch GHSA-dead-beef-baad", | ||
| "affected_packages": [], | ||
| "references_v2": [ | ||
| { | ||
| "reference_id": "", | ||
| "reference_type": "", | ||
| "url": "https://github.com/test/repo/commit/def456" | ||
| } | ||
| ], | ||
| "severities": [], | ||
| "date_published": null, | ||
| "weaknesses": [], | ||
| "url": "https://github.com/test/repo" | ||
| } | ||
| ] | 
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| @@ -0,0 +1,8 @@ | ||
| { | ||
| "CVE-2021-0001": [ | ||
| ["abc123", "Fix CVE-2021-0001"] | ||
| ], | ||
| "GHSA-dead-beef-baad": [ | ||
| ["def456", "Patch GHSA-dead-beef-baad"] | ||
| ] | ||
| } | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This part should not be static