Skip to content

Commit 26bf2d9

Browse files
Add pipeline to mine NuGet PURLs (#721)
* Add pipeline to mine NuGet PURLs Signed-off-by: Keshav Priyadarshi <[email protected]> * Move scanpipe import inside function scope - Scope scanpipe import to avoid Django's ImproperlyConfigured error during tests. Signed-off-by: Keshav Priyadarshi <[email protected]> * Include batch count in commit message Signed-off-by: Keshav Priyadarshi <[email protected]> * Add test for mine_nuget pipeline Signed-off-by: Keshav Priyadarshi <[email protected]> * Apply code formatting Signed-off-by: Keshav Priyadarshi <[email protected]> * Bump minecode pipeline to 0.0.1b4 Signed-off-by: Keshav Priyadarshi <[email protected]> * Fix broken type hints Signed-off-by: Keshav Priyadarshi <[email protected]> * Assert known tags against fetched tag - Doing it the other way would cause the test to fail each time a new tag is released to PurlDB. Signed-off-by: Keshav Priyadarshi <[email protected]> * Add ftputil dependency for debian pipeline Signed-off-by: Keshav Priyadarshi <[email protected]> * Use minecode_pipelines in Maven pipeline entry point Signed-off-by: Keshav Priyadarshi <[email protected]> * Fix missing dependency for debian and maven pipeline Signed-off-by: Keshav Priyadarshi <[email protected]> --------- Signed-off-by: Keshav Priyadarshi <[email protected]> Co-authored-by: Ayan Sinha Mahapatra <[email protected]>
1 parent c4fbf73 commit 26bf2d9

File tree

9 files changed

+668
-7
lines changed

9 files changed

+668
-7
lines changed

minecode/tests/collectors/test_github.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,5 +41,5 @@ def test_github_get_all_versions(self):
4141
"minecode-pipelines/v0.0.1b7",
4242
"minecode-pipelines/v0.0.1b8",
4343
]
44-
for item in versions:
45-
self.assertIn(item, expected)
44+
for item in expected:
45+
self.assertIn(item, versions)

minecode_pipelines/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,5 @@
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
99

10+
1011
VERSION = "0.0.1b15"
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
22+
23+
24+
from pathlib import Path
25+
26+
from minecode_pipelines.pipes import nuget
27+
28+
from scanpipe.pipelines import Pipeline
29+
from scanpipe.pipes import federatedcode
30+
31+
32+
class MineNuGet(Pipeline):
    """
    Mine and Publish NuGet PackageURLs.

    Clone the AboutCode mirror of the NuGet catalog, collect the versions
    listed for each package and publish the resulting PackageURLs to a
    FederatedCode Git repository.
    """

    download_inputs = False
    CATALOG_REPO_URL = "https://github.com/aboutcode-org/aboutcode-mirror-nuget-catalog.git"

    @classmethod
    def steps(cls):
        """Return the pipeline steps as a tuple, cleanup step last."""
        pipeline_steps = (
            cls.check_federatedcode_eligibility,
            cls.fetch_nuget_catalog,
            cls.mine_nuget_package_versions,
            cls.mine_and_publish_nuget_packageurls,
            cls.delete_cloned_repos,
        )
        return pipeline_steps

    def check_federatedcode_eligibility(self):
        """
        Run the FederatedCode "configured and available" check before
        any mining work starts.
        """
        federatedcode.check_federatedcode_configured_and_available()

    def fetch_nuget_catalog(self):
        """Clone the NuGet catalog mirror and keep the clone as `catalog_repo`."""
        clone = federatedcode.clone_repository(
            repo_url=self.CATALOG_REPO_URL,
            logger=self.log,
        )
        self.catalog_repo = clone

    def mine_nuget_package_versions(self):
        """
        Collect packages and their versions from the cloned catalog and
        store them as `package_versions` and `skipped_packages`.
        """
        catalog_root = Path(self.catalog_repo.working_dir)
        mined = nuget.mine_nuget_package_versions(
            catalog_path=catalog_root,
            logger=self.log,
        )
        self.package_versions, self.skipped_packages = mined

    def mine_and_publish_nuget_packageurls(self):
        """Build PackageURLs from the collected versions and publish them."""
        nuget.mine_and_publish_nuget_packageurls(
            package_versions=self.package_versions,
            logger=self.log,
        )

    def delete_cloned_repos(self):
        """Delete the local clone of the catalog repository, if there is one."""
        if not self.catalog_repo:
            return

        self.log("Removing cloned repository")
        federatedcode.delete_local_clone(repo=self.catalog_repo)

minecode_pipelines/pipes/__init__.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,11 @@
1010
import json
1111
import os
1212
from pathlib import Path
13-
13+
from git import Repo
1414
import requests
1515
import saneyaml
1616

1717
from aboutcode.hashid import PURLS_FILENAME
18-
from git import Repo
19-
20-
from scanpipe.pipes.federatedcode import delete_local_clone
21-
from scanpipe.pipes.federatedcode import commit_and_push_changes
2218

2319
# states:
2420
# note: a state is null when mining starts
@@ -50,6 +46,8 @@ def get_checkpoint_from_file(cloned_repo, path):
5046

5147

5248
def update_checkpoints_in_github(checkpoint, cloned_repo, path):
49+
from scanpipe.pipes.federatedcode import commit_and_push_changes
50+
5351
checkpoint_path = os.path.join(cloned_repo.working_dir, path)
5452
write_data_to_json_file(path=checkpoint_path, data=checkpoint)
5553
commit_message = """Update federatedcode purl mining checkpoint"""
@@ -121,6 +119,8 @@ def write_data_to_json_file(path, data):
121119

122120

123121
def delete_cloned_repos(repos, logger=None):
122+
from scanpipe.pipes.federatedcode import delete_local_clone
123+
124124
if not repos:
125125
return
126126

minecode_pipelines/pipes/nuget.py

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an “AS IS” BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
22+
23+
24+
import json
25+
import re
26+
27+
from django.conf import settings
28+
29+
from minecode_pipelines.pipes import write_packageurls_to_file
30+
from packageurl import PackageURL
31+
32+
from aboutcode.hashid import get_package_base_dir
33+
from aboutcode.pipeline import LoopProgress
34+
from scancodeio import VERSION
35+
36+
NUGET_PURL_METADATA_REPO = "https://github.com/aboutcode-data/minecode-data-nuget-test"
37+
38+
39+
def get_catalog_page_count(catalog_index):
40+
if catalog_index.exists():
41+
with catalog_index.open("r", encoding="utf-8") as f:
42+
index = json.load(f)
43+
return index.get("count", 0)
44+
return 0
45+
46+
47+
def collect_package_versions(events, package_versions, skipped_packages):
48+
"""Collect package versions from events in the NuGet package catalog."""
49+
for event in events or []:
50+
if event["@type"] != "nuget:PackageDetails":
51+
continue
52+
pkg_name = event["nuget:id"]
53+
54+
# Skip package names that resemble NuGet API key and can't be pushed to GitHub.
55+
if bool(re.fullmatch(r"oy2[a-z0-9]{43}", pkg_name)):
56+
skipped_packages.add(pkg_name)
57+
continue
58+
59+
purl = PackageURL(type="nuget", name=pkg_name).to_string()
60+
if purl not in package_versions:
61+
package_versions[purl] = set()
62+
63+
package_versions[purl].add(event["nuget:version"])
64+
65+
66+
def mine_nuget_package_versions(catalog_path, logger):
    """
    Mine NuGet package and versions from NuGet catalog.

    Read every ``*.json`` file found recursively under ``catalog/pages`` in
    `catalog_path` and feed its "items" to ``collect_package_versions``.
    Progress and summary messages are sent to the `logger` callable.

    Return a ``(package_versions, skipped_packages)`` tuple: a mapping of
    base PURL string to a set of versions, and a set of skipped package names.
    """
    catalog_dir = catalog_path / "catalog"
    pages_dir = catalog_dir / "pages"
    page_count = get_catalog_page_count(catalog_dir / "index.json")

    package_versions, skipped_packages = {}, set()

    logger(f"Collecting versions from {page_count:,d} NuGet catalog.")
    # The count read from the index is only used to report progress.
    tracker = LoopProgress(total_iterations=page_count, logger=logger)
    for page_path in tracker.iter(pages_dir.rglob("*.json")):
        page_data = json.loads(page_path.read_text(encoding="utf-8"))
        collect_package_versions(
            events=page_data["items"],
            package_versions=package_versions,
            skipped_packages=skipped_packages,
        )

    logger(f"Collected versions for {len(package_versions):,d} NuGet package.")
    return package_versions, skipped_packages
87+
88+
89+
def commit_message(commit_batch, total_commit_batch="many"):
    """
    Return the commit message for a batch of mined NuGet PackageURLs.

    `commit_batch` and `total_commit_batch` are interpolated into the subject
    line as "(commit_batch/total_commit_batch)". The body names the tool and
    its version, references the first ``settings.ALLOWED_HOSTS`` entry and
    ends with a Signed-off-by trailer built from the FederatedCode Git
    service name and email settings.
    """
    author_name = settings.FEDERATEDCODE_GIT_SERVICE_NAME
    author_email = settings.FEDERATEDCODE_GIT_SERVICE_EMAIL
    tool_name = "pkg:github/aboutcode-org/scancode.io"

    # The literal below is returned verbatim: the backslash drops the first
    # newline, and any leading whitespace on a line becomes part of the message.
    return f"""\
Collect PackageURLs from NuGet catalog ({commit_batch}/{total_commit_batch})

Tool: {tool_name}@v{VERSION}
Reference: https://{settings.ALLOWED_HOSTS[0]}

Signed-off-by: {author_name} <{author_email}>
"""
102+
103+
104+
def get_nuget_purls_from_versions(base_purl, versions):
    """
    Return a list of versioned PURL strings for the NuGet `base_purl`.

    One PURL is built for each entry of `versions`, in iteration order,
    reusing every component of `base_purl` except its version.
    """
    components = PackageURL.from_string(base_purl).to_dict()
    components.pop("version")

    purls = []
    for version in versions:
        purls.append(PackageURL(**components, version=version).to_string())
    return purls
109+
110+
111+
def mine_and_publish_nuget_packageurls(package_versions, logger):
    """
    Mine and publish PackageURLs from NuGet package versions.

    `package_versions` maps a base NuGet PURL string to a collection of its
    versions. For each entry, the sorted versioned PURLs are written to a file
    in a fresh clone of ``NUGET_PURL_METADATA_REPO``. Written files are
    committed and pushed in batches of ``batch_size`` files, followed by one
    last commit for any remainder. Progress and summary messages are sent to
    the `logger` callable.

    The local clone is always deleted before returning, including when a step
    raises, so a failed run does not leave the working copy behind.
    """
    # Imported here so that importing this module does not pull in scanpipe.
    from scanpipe.pipes import federatedcode

    batch_size = 4000
    cloned_repo = federatedcode.clone_repository(
        repo_url=NUGET_PURL_METADATA_REPO,
        logger=logger,
    )

    try:
        file_to_commit = []
        file_processed = 0
        # Number of batches for which commit_and_push_changes reported success.
        commits_pushed = 0
        nuget_package_count = len(package_versions)
        progress = LoopProgress(
            total_iterations=nuget_package_count,
            logger=logger,
            progress_step=1,
        )

        logger(f"Mine packageURL for {nuget_package_count:,d} NuGet packages.")
        for base, versions in progress.iter(package_versions.items()):
            package_base_dir = get_package_base_dir(purl=base)
            packageurls = get_nuget_purls_from_versions(base_purl=base, versions=versions)

            purl_file = write_packageurls_to_file(
                repo=cloned_repo,
                base_dir=package_base_dir,
                packageurls=sorted(packageurls),
            )
            file_to_commit.append(purl_file)
            file_processed += 1

            # ">=" so that a batch holds exactly `batch_size` files.
            if len(file_to_commit) >= batch_size:
                if federatedcode.commit_and_push_changes(
                    commit_message=commit_message(commits_pushed + 1),
                    repo=cloned_repo,
                    files_to_commit=file_to_commit,
                    logger=logger,
                ):
                    commits_pushed += 1
                # The pending list is reset whether or not the push succeeded.
                file_to_commit.clear()

        # Only attempt the closing commit when some files are still pending.
        if file_to_commit:
            final_batch = commits_pushed + 1
            if federatedcode.commit_and_push_changes(
                commit_message=commit_message(
                    commit_batch=final_batch,
                    total_commit_batch=final_batch,
                ),
                repo=cloned_repo,
                files_to_commit=file_to_commit,
                logger=logger,
            ):
                commits_pushed += 1

        logger(f"Processed PackageURL for {file_processed:,d} NuGet packages.")
        logger(f"Pushed new PackageURL in {commits_pushed:,d} commits.")
    finally:
        federatedcode.delete_local_clone(repo=cloned_repo)

0 commit comments

Comments
 (0)