|
| 1 | +# SPDX-License-Identifier: Apache-2.0 |
| 2 | +# |
| 3 | +# http://nexb.com and https://github.com/aboutcode-org/scancode.io |
| 4 | +# The ScanCode.io software is licensed under the Apache License version 2.0. |
| 5 | +# Data generated with ScanCode.io is provided as-is without warranties. |
| 6 | +# ScanCode is a trademark of nexB Inc. |
| 7 | +# |
| 8 | +# You may not use this software except in compliance with the License. |
| 9 | +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 |
| 10 | +# Unless required by applicable law or agreed to in writing, software distributed |
| 11 | +# under the License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR |
| 12 | +# CONDITIONS OF ANY KIND, either express or implied. See the License for the |
| 13 | +# specific language governing permissions and limitations under the License. |
| 14 | +# |
| 15 | +# Data Generated with ScanCode.io is provided on an “AS IS” BASIS, WITHOUT WARRANTIES |
| 16 | +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from |
| 17 | +# ScanCode.io should be considered or used as legal advice. Consult an Attorney |
| 18 | +# for any legal advice. |
| 19 | +# |
| 20 | +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. |
| 21 | +# Visit https://github.com/aboutcode-org/scancode.io for support and download. |
| 22 | + |
| 23 | + |
| 24 | +import json |
| 25 | +import re |
| 26 | + |
| 27 | +from django.conf import settings |
| 28 | + |
| 29 | +from minecode_pipelines.pipes import write_packageurls_to_file |
| 30 | +from packageurl import PackageURL |
| 31 | + |
| 32 | +from aboutcode.hashid import get_package_base_dir |
| 33 | +from aboutcode.pipeline import LoopProgress |
| 34 | +from scancodeio import VERSION |
| 35 | + |
# Git repository cloned by `mine_and_publish_nuget_packageurls` to receive the
# NuGet PackageURL files.
NUGET_PURL_METADATA_REPO = "https://github.com/aboutcode-data/minecode-data-nuget-test"
| 37 | + |
| 38 | + |
def get_catalog_page_count(catalog_index):
    """
    Return the "count" value stored in the `catalog_index` JSON file.

    Return 0 when the file does not exist or when it has no "count" entry.
    """
    if not catalog_index.exists():
        return 0

    with catalog_index.open("r", encoding="utf-8") as index_file:
        index_data = json.load(index_file)
    return index_data.get("count", 0)
| 45 | + |
| 46 | + |
def collect_package_versions(events, package_versions, skipped_packages):
    """
    Record the versions found in NuGet catalog `events` into `package_versions`.

    `package_versions` maps a version-less NuGet PURL string to the set of its
    versions and is updated in place. Package names that look like a NuGet API
    key are added to `skipped_packages` instead of being recorded.
    """
    if not events:
        return

    for entry in events:
        # Only "PackageDetails" catalog entries are considered.
        if entry["@type"] != "nuget:PackageDetails":
            continue

        name = entry["nuget:id"]

        # Names that resemble a NuGet API key can't be pushed to GitHub.
        if re.fullmatch(r"oy2[a-z0-9]{43}", name) is not None:
            skipped_packages.add(name)
            continue

        base_purl = PackageURL(type="nuget", name=name).to_string()
        package_versions.setdefault(base_purl, set()).add(entry["nuget:version"])
| 64 | + |
| 65 | + |
def mine_nuget_package_versions(catalog_path, logger):
    """
    Return the package versions mined from the NuGet catalog at `catalog_path`.

    Every JSON page found under "<catalog_path>/catalog/pages" is loaded and its
    "items" are passed to `collect_package_versions`. Return a
    `(package_versions, skipped_packages)` tuple. Progress is reported through
    `logger`.
    """
    catalog_dir = catalog_path / "catalog"
    pages_dir = catalog_dir / "pages"
    page_count = get_catalog_page_count(catalog_dir / "index.json")

    versions_by_purl = {}
    skipped = set()

    logger(f"Collecting versions from {page_count:,d} NuGet catalog.")
    progress = LoopProgress(total_iterations=page_count, logger=logger)
    for page_path in progress.iter(pages_dir.rglob("*.json")):
        with page_path.open("r", encoding="utf-8") as page_file:
            page_data = json.load(page_file)

        collect_package_versions(
            events=page_data["items"],
            package_versions=versions_by_purl,
            skipped_packages=skipped,
        )

    logger(f"Collected versions for {len(versions_by_purl):,d} NuGet package.")
    return versions_by_purl, skipped
| 87 | + |
| 88 | + |
def commit_message(commit_batch, total_commit_batch="many"):
    """
    Return the commit message for batch `commit_batch` of `total_commit_batch`.

    The signer name and email are read from the FEDERATEDCODE_GIT_SERVICE_NAME
    and FEDERATEDCODE_GIT_SERVICE_EMAIL settings, and the reference URL is built
    from the first entry of the ALLOWED_HOSTS setting.
    """
    author_name = settings.FEDERATEDCODE_GIT_SERVICE_NAME
    author_email = settings.FEDERATEDCODE_GIT_SERVICE_EMAIL
    tool_name = "pkg:github/aboutcode-org/scancode.io"

    subject = (
        "Collect PackageURLs from NuGet catalog "
        f"({commit_batch}/{total_commit_batch})"
    )

    # Assemble the message line by line so that the subject and the
    # "Signed-off-by" trailer always start at column 0, independent of how this
    # function body is indented. The final empty entry keeps a trailing newline.
    message_lines = [
        subject,
        "",
        f"Tool: {tool_name}@v{VERSION}",
        f"Reference: https://{settings.ALLOWED_HOSTS[0]}",
        "",
        f"Signed-off-by: {author_name} <{author_email}>",
        "",
    ]
    return "\n".join(message_lines)
| 102 | + |
| 103 | + |
def get_nuget_purls_from_versions(base_purl, versions):
    """
    Return a list of PURL strings built from `base_purl`, one for each version
    in `versions`.
    """
    purl_fields = PackageURL.from_string(base_purl).to_dict()
    # Drop the version parsed from `base_purl`; each result gets its own.
    purl_fields.pop("version")

    purls = []
    for version in versions:
        purls.append(PackageURL(version=version, **purl_fields).to_string())
    return purls
| 109 | + |
| 110 | + |
def mine_and_publish_nuget_packageurls(package_versions, logger):
    """
    Write and publish the PackageURLs of `package_versions` to the NuGet data repo.

    `package_versions` maps a base NuGet PURL string to an iterable of versions.
    One PackageURL file is written per package in a local clone of
    NUGET_PURL_METADATA_REPO, and the written files are committed and pushed in
    batches. The local clone is deleted at the end, even when an error is raised
    while writing or pushing.
    """
    from scanpipe.pipes import federatedcode

    cloned_repo = federatedcode.clone_repository(
        repo_url=NUGET_PURL_METADATA_REPO,
        logger=logger,
    )

    # Everything after a successful clone runs under try/finally so that the
    # local clone is removed on failure too.
    try:
        file_to_commit = []
        batch_size = 4000
        file_processed = 0
        commit_count = 1
        nuget_package_count = len(package_versions)
        progress = LoopProgress(
            total_iterations=nuget_package_count,
            logger=logger,
            progress_step=1,
        )

        logger(f"Mine packageURL for {nuget_package_count:,d} NuGet packages.")
        for base, versions in progress.iter(package_versions.items()):
            package_base_dir = get_package_base_dir(purl=base)
            packageurls = get_nuget_purls_from_versions(
                base_purl=base,
                versions=versions,
            )

            purl_file = write_packageurls_to_file(
                repo=cloned_repo,
                base_dir=package_base_dir,
                packageurls=sorted(packageurls),
            )
            file_to_commit.append(purl_file)
            file_processed += 1

            if len(file_to_commit) > batch_size:
                # Only advance the batch number when the push is reported as
                # successful (truthy return value).
                if federatedcode.commit_and_push_changes(
                    commit_message=commit_message(commit_count),
                    repo=cloned_repo,
                    files_to_commit=file_to_commit,
                    logger=logger,
                ):
                    commit_count += 1
                # NOTE(review): the batch is reset whether or not the push
                # succeeded -- confirm this is the intended behavior.
                file_to_commit.clear()

        # Commit whatever is left over from the last, partially filled batch.
        federatedcode.commit_and_push_changes(
            commit_message=commit_message(
                commit_batch=commit_count,
                total_commit_batch=commit_count,
            ),
            repo=cloned_repo,
            files_to_commit=file_to_commit,
            logger=logger,
        )
        logger(f"Processed PackageURL for {file_processed:,d} NuGet packages.")
        logger(f"Pushed new PackageURL in {commit_count:,d} commits.")
    finally:
        federatedcode.delete_local_clone(repo=cloned_repo)
0 commit comments