Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 146 additions & 0 deletions minecode_pipelines/pipelines/mine_nuget.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

import json
from pathlib import Path

from minecode_pipelines.pipes import nuget
from minecode_pipelines.pipes import write_packageurls_to_file
from packageurl import PackageURL

from aboutcode.hashid import get_package_base_dir
from aboutcode.pipeline import LoopProgress
from scanpipe.pipelines import Pipeline
from scanpipe.pipes import federatedcode


class MineAndPublishNuGetPURLs(Pipeline):
    """
    Mine all packageURLs from NuGet catalog and publish them to
    a FederatedCode repo.
    """

    # No project inputs are downloaded: the data comes from the cloned
    # catalog repository instead.
    download_inputs = False
    # Git repository cloned by ``fetch_nuget_catalog`` and read by
    # ``mine_nuget_package_versions``.
    CATALOG_REPO_URL = (
        "https://github.com/aboutcode-org/aboutcode-mirror-nuget-catalog.git"
    )
    # Git repository the generated packageURL files are committed and pushed to.
    NUGET_PURL_METADATA_REPO = "https://github.com/aboutcode-data/minecode-data-nuget-test"

    @classmethod
    def steps(cls):
        """Return the ordered tuple of step methods that make up this pipeline."""
        return (
            cls.check_federatedcode_eligibility,
            cls.fetch_nuget_catalog,
            cls.mine_nuget_package_versions,
            cls.mine_and_publish_nuget_packageurls,
            cls.clean_downloads,
        )

    def check_federatedcode_eligibility(self):
        """
        Check if the project fulfills the following criteria for
        pushing the project result to FederatedCode.
        """
        federatedcode.check_federatedcode_configured_and_available()

    def fetch_nuget_catalog(self):
        """Clone ``CATALOG_REPO_URL`` and keep the clone as ``self.catalog_repo``."""
        self.catalog_repo = federatedcode.clone_repository(
            repo_url=self.CATALOG_REPO_URL,
            logger=self.log,
        )

    def mine_nuget_package_versions(self):
        """
        Mine NuGet package and versions from NuGet catalog.

        Read every ``*.json`` file below ``catalog/pages`` in the cloned
        catalog repository and feed its ``"items"`` list to
        ``nuget.collect_package_versions``. Results are accumulated in
        ``self.package_versions`` (a dict) and ``self.skipped_packages``
        (a set), both of which are reset at the start of this step.
        """
        catalog = Path(self.catalog_repo.working_dir) / "catalog"
        # The progress total comes from the index "count" field, while the
        # loop itself walks whatever page files exist on disk.
        catalog_count = nuget.get_catalog_page_count(catalog / "index.json")
        catalog_pages = catalog / "pages"

        self.package_versions = {}
        self.skipped_packages = set()
        self.log(f"Collecting versions from {catalog_count:,d} NuGet catalog.")
        progress = LoopProgress(total_iterations=catalog_count, logger=self.log)
        for page in progress.iter(catalog_pages.rglob("*.json")):
            with page.open("r", encoding="utf-8") as f:
                page_catalog = json.load(f)

            nuget.collect_package_versions(
                events=page_catalog["items"],
                package_versions=self.package_versions,
                skipped_packages=self.skipped_packages,
            )
        self.log(
            f"Collected versions for {len(self.package_versions):,d} NuGet package."
        )

    def mine_and_publish_nuget_packageurls(self):
        """
        Write one packageURL file per mined package and push them in batches.

        For each base purl in ``self.package_versions`` a sorted list of
        versioned packageURL strings is written with
        ``write_packageurls_to_file`` into a clone of
        ``NUGET_PURL_METADATA_REPO``. Written files are committed and pushed
        every ``batch_size`` files, with a final commit for the remainder.
        The local clone is deleted when the step ends, including when it
        ends with an exception.
        """
        cloned_repo = federatedcode.clone_repository(
            repo_url=self.NUGET_PURL_METADATA_REPO,
            logger=self.log,
        )
        try:
            file_to_commit = []
            batch_size = 4000
            file_processed = 0
            nuget_package_count = len(self.package_versions)
            progress = LoopProgress(
                total_iterations=nuget_package_count,
                logger=self.log,
                progress_step=1,
            )

            self.log(f"Mine packageURL for {nuget_package_count:,d} NuGet packages.")
            for base, versions in progress.iter(self.package_versions.items()):
                package_base_dir = get_package_base_dir(purl=base)
                # Rebuild the purl once per version: drop the "version" key of
                # the base purl so it can be supplied explicitly below.
                purl_dict = PackageURL.from_string(base).to_dict()
                del purl_dict["version"]
                packageurls = [
                    PackageURL(**purl_dict, version=v).to_string() for v in versions
                ]
                purl_file = write_packageurls_to_file(
                    repo=cloned_repo,
                    base_dir=package_base_dir,
                    packageurls=sorted(packageurls),
                )
                file_to_commit.append(purl_file)
                file_processed += 1

                # ">=" so that a batch holds exactly batch_size files; the
                # previous ">" comparison let each batch grow to batch_size + 1.
                if len(file_to_commit) >= batch_size:
                    federatedcode.commit_and_push_changes(
                        commit_message=nuget.commit_message(),
                        repo=cloned_repo,
                        files_to_commit=file_to_commit,
                        logger=self.log,
                    )
                    file_to_commit.clear()

            self.log(f"Processed packageURL for {file_processed:,d} NuGet packages.")
            # Push whatever is left over from the last, partially filled batch.
            federatedcode.commit_and_push_changes(
                commit_message=nuget.commit_message(),
                repo=cloned_repo,
                files_to_commit=file_to_commit,
                logger=self.log,
            )
        finally:
            # Always remove the local clone so that a failure while writing,
            # committing or pushing does not leave it behind on disk.
            federatedcode.delete_local_clone(repo=cloned_repo)

    def clean_downloads(self):
        """Delete the local clone of the catalog repository, if there is one."""
        if self.catalog_repo:
            self.log("Removing cloned repository")
            federatedcode.delete_local_clone(repo=self.catalog_repo)
70 changes: 70 additions & 0 deletions minecode_pipelines/pipes/nuget.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an “AS IS” BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.


import json
import re

from django.conf import settings

from packageurl import PackageURL

from scancodeio import VERSION


def get_catalog_page_count(catalog_index):
    """
    Return the value stored under the "count" key of the JSON file at
    ``catalog_index``, a path-like object providing ``exists()`` and ``open()``.

    Return 0 when the file does not exist or when it has no "count" key.
    """
    if not catalog_index.exists():
        return 0

    with catalog_index.open("r", encoding="utf-8") as index_file:
        return json.load(index_file).get("count", 0)


def collect_package_versions(events, package_versions, skipped_packages):
    """
    Record the versions found in catalog ``events`` into ``package_versions``.

    ``events`` is an iterable of mappings (``None`` is treated as empty).
    Only entries whose "@type" is "nuget:PackageDetails" are considered.
    For those, the "nuget:version" value is added to the set stored in
    ``package_versions`` under the unversioned nuget purl string built from
    "nuget:id". An id made of exactly 46 lowercase ASCII letters or digits is
    added to ``skipped_packages`` instead and contributes no version.

    Both ``package_versions`` and ``skipped_packages`` are updated in place;
    nothing is returned.
    """
    for entry in events or []:
        if entry["@type"] == "nuget:PackageDetails":
            name = entry["nuget:id"]
            if re.fullmatch(r"[a-z0-9]{46}", name) is not None:
                # NOTE(review): presumably machine-generated names - confirm.
                skipped_packages.add(name)
            else:
                key = PackageURL(type="nuget", name=name).to_string()
                package_versions.setdefault(key, set()).add(entry["nuget:version"])


def commit_message():
    """
    Return the commit message text used when publishing NuGet packageURLs.

    The message names the tool and its ``VERSION``, references the first
    entry of ``settings.ALLOWED_HOSTS`` and ends with a sign-off line built
    from the FederatedCode git service name and email settings.
    """
    signer_name = settings.FEDERATEDCODE_GIT_SERVICE_NAME
    signer_email = settings.FEDERATEDCODE_GIT_SERVICE_EMAIL
    tool_purl = "pkg:github/aboutcode-org/scancode.io"

    return f"""\
Collect PackageURLs from NuGet catalog
Tool: {tool_purl}@v{VERSION}
Reference: https://{settings.ALLOWED_HOSTS[0]}
Signed-off-by: {signer_name} <{signer_email}>
"""
3 changes: 2 additions & 1 deletion pyproject-minecode_pipeline.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "flot.buildapi"

[project]
name = "minecode_pipelines"
version = "0.0.1b2"
version = "0.0.1b4"
description = "A library for mining packageURLs and package metadata from ecosystem repositories."
readme = "minecode_pipelines/README.rst"
license = { text = "Apache-2.0" }
Expand Down Expand Up @@ -46,6 +46,7 @@ urls = { Homepage = "https://github.com/aboutcode-org/purldb" }

[project.entry-points."scancodeio_pipelines"]
mine_pypi = "minecode_pipelines.pipelines.mine_pypi:MineandPublishPypiPURLs"
mine_nuget = "minecode_pipelines.pipelines.mine_nuget:MineAndPublishNuGetPURLs"

[tool.bumpversion]
current_version = "0.0.1b1"
Expand Down
Loading