Skip to content

Commit e389935

Browse files
Merge pull request #679 from ziadhany/min-cargo
Add support to mine cargo Package-URLs
2 parents ea38f08 + 86090ac commit e389935

File tree

11 files changed

+500
-4
lines changed

11 files changed

+500
-4
lines changed

minecode_pipelines/miners/cargo.py

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
from datetime import datetime
10+
11+
from minecode_pipelines.pipes import get_checkpoint_from_file
12+
from minecode_pipelines.pipes import get_commit_at_distance_ahead
13+
from minecode_pipelines.pipes import update_checkpoints_in_github
14+
from minecode_pipelines.pipes import get_changed_files
15+
from minecode_pipelines.pipes.cargo import store_cargo_packages
16+
from scanpipe.pipes.federatedcode import commit_changes
17+
from scanpipe.pipes.federatedcode import push_changes
18+
from minecode_pipelines import VERSION
19+
20+
import json
21+
from pathlib import Path
22+
23+
24+
# Number of processed index files after which pending PackageURL files are
# committed and pushed to the data repository.
PACKAGE_BATCH_SIZE = 500
# Number of index commits mined per checkpoint window.
COMMIT_BATCH_SIZE = 10

# Path of the Cargo checkpoint file inside the config repository.
CARGO_CHECKPOINT_PATH = "cargo/checkpoints.json"

# File names that are skipped because they are not treated as package records.
_NON_PACKAGE_FILENAMES = frozenset({"config.json", "README.md", "update-dl-url.yml"})


def _load_index_entries(file_path, logger):
    """Return the JSON records of one index file, skipping blank and invalid lines."""
    entries = []
    with open(file_path, encoding="utf-8") as index_file:
        for line in index_file:
            if not line.strip():
                continue
            try:
                entries.append(json.loads(line))
            except json.JSONDecodeError as e:
                logger(f"Skipping invalid JSON in {file_path}: {e}")
    return entries


def _commit_and_push_purls(cloned_data_repo, purl_files, purls):
    """Commit the given PackageURL files to the data repository and push them."""
    commit_changes(
        repo=cloned_data_repo,
        files_to_commit=purl_files,
        purls=purls,
        mine_type="packageURL",
        tool_name="pkg:pypi/minecode-pipelines",
        tool_version=VERSION,
    )
    push_changes(repo=cloned_data_repo)


def process_cargo_packages(cargo_index_repo, cloned_data_repo, config_repo, logger):
    """
    Mine the Cargo index in windows of ``COMMIT_BATCH_SIZE`` commits and publish
    the PackageURLs found in the files changed by each window.

    For every window, the changed index files are read from the working tree of
    ``cargo_index_repo``, their PackageURLs are written in ``cloned_data_repo``,
    and the written files are committed and pushed every ``PACKAGE_BATCH_SIZE``
    processed files and once more at the end of the window. The checkpoint stored
    at ``CARGO_CHECKPOINT_PATH`` in ``config_repo`` is then moved to the last
    commit of the window.

    The loop stops when ``get_commit_at_distance_ahead`` raises ``ValueError``
    or when the next commit equals the checkpointed one.

    ``logger`` is called with a single message string and must be callable.
    """
    base_path = Path(cargo_index_repo.working_tree_dir)

    while True:
        cargo_checkpoints = get_checkpoint_from_file(
            cloned_repo=config_repo, path=CARGO_CHECKPOINT_PATH
        )
        checkpoints_last_commit = cargo_checkpoints.get("last_commit")

        try:
            next_commit = get_commit_at_distance_ahead(
                cargo_index_repo,
                checkpoints_last_commit,
                num_commits_ahead=COMMIT_BATCH_SIZE,
                branch_name="master",
            )
        except ValueError as e:
            logger(str(e))
            break

        if next_commit == checkpoints_last_commit:
            logger("No new commits to mine")
            break

        changed_files = get_changed_files(
            cargo_index_repo, commit_x=checkpoints_last_commit, commit_y=next_commit
        )
        logger(f"Found {len(changed_files)} changed files in Cargo index.")

        file_counter = 0
        purl_files = []
        purls = []
        for rel_path in changed_files:
            file_path = base_path / rel_path
            logger(f"Found {file_path}.")

            if not file_path.is_file() or file_path.name in _NON_PACKAGE_FILENAMES:
                continue

            packages = _load_index_entries(file_path, logger)
            file_counter += 1

            result_store = store_cargo_packages(packages, cloned_data_repo)
            if result_store:
                purl_file, base_purl = result_store
                logger(f"writing packageURLs for package: {base_purl} at: {purl_file}")

                purl_files.append(purl_file)
                purls.append(str(base_purl))

            # Commit and push after each full batch of processed files.
            if file_counter % PACKAGE_BATCH_SIZE == 0 and purl_files:
                _commit_and_push_purls(cloned_data_repo, purl_files, purls)
                purl_files = []
                purls = []

        # Flush whatever is still pending. Doing this after the loop (instead of
        # on the last index) guarantees the files are published even when the
        # last changed file was skipped.
        if purl_files:
            _commit_and_push_purls(cloned_data_repo, purl_files, purls)

        logger(
            f"Updating checkpoint at: {CARGO_CHECKPOINT_PATH} with last commit: {next_commit}"
        )
        update_checkpoints_in_github(
            checkpoint={
                "date": str(datetime.now()),
                "last_commit": next_commit,
            },
            cloned_repo=config_repo,
            path=CARGO_CHECKPOINT_PATH,
        )

        logger(f"Pushed batch for commit range {checkpoints_last_commit}:{next_commit}.")
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
22+
23+
import os
24+
from scanpipe.pipelines import Pipeline
25+
from scanpipe.pipes import federatedcode
26+
from minecode_pipelines.miners import cargo
27+
from minecode_pipelines import pipes
28+
from minecode_pipelines.pipes import MINECODE_PIPELINES_CONFIG_REPO
29+
30+
# Data repository receiving the mined PackageURLs; overridable through the
# MINECODE_DATA_CARGO_REPO environment variable.
MINECODE_DATA_CARGO_REPO = os.environ.get(
    "MINECODE_DATA_CARGO_REPO", "https://github.com/aboutcode-data/minecode-data-cargo-test"
)
# Index repository that is mined.
MINECODE_CARGO_INDEX_REPO = "https://github.com/rust-lang/crates.io-index"


class MineCargo(Pipeline):
    """Mine Cargo (crates.io) PackageURLs and publish them to FederatedCode."""

    @classmethod
    def steps(cls):
        """Return the pipeline steps in execution order."""
        ordered_steps = (
            cls.check_federatedcode_eligibility,
            cls.clone_cargo_repos,
            cls.mine_and_publish_cargo_packageurls,
            cls.delete_cloned_repos,
        )
        return ordered_steps

    def check_federatedcode_eligibility(self):
        """Verify that FederatedCode is configured and available."""
        federatedcode.check_federatedcode_configured_and_available(logger=self.log)

    def clone_cargo_repos(self):
        """
        Clone the index, data and pipelines-config repositories and keep the
        resulting Repo objects on the instance for the following steps.
        """
        self.cargo_index_repo = federatedcode.clone_repository(MINECODE_CARGO_INDEX_REPO)
        self.cloned_data_repo = federatedcode.clone_repository(MINECODE_DATA_CARGO_REPO)
        self.cloned_config_repo = federatedcode.clone_repository(MINECODE_PIPELINES_CONFIG_REPO)

        if not self.log:
            return

        cloned = (
            (MINECODE_CARGO_INDEX_REPO, self.cargo_index_repo),
            (MINECODE_DATA_CARGO_REPO, self.cloned_data_repo),
            (MINECODE_PIPELINES_CONFIG_REPO, self.cloned_config_repo),
        )
        for repo_url, cloned_repo in cloned:
            self.log(f"{repo_url} repo cloned at: {cloned_repo.working_dir}")

    def mine_and_publish_cargo_packageurls(self):
        """Run the Cargo miner over the cloned repositories."""
        cargo.process_cargo_packages(
            cargo_index_repo=self.cargo_index_repo,
            cloned_data_repo=self.cloned_data_repo,
            config_repo=self.cloned_config_repo,
            logger=self.log,
        )

    def delete_cloned_repos(self):
        """Delete the local clones created by ``clone_cargo_repos``."""
        local_clones = [self.cargo_index_repo, self.cloned_data_repo, self.cloned_config_repo]
        pipes.delete_cloned_repos(repos=local_clones, logger=self.log)

minecode_pipelines/pipes/__init__.py

Lines changed: 66 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
import saneyaml
1616

1717
from aboutcode.hashid import PURLS_FILENAME
18+
from git import Repo
19+
1820
from scanpipe.pipes.federatedcode import delete_local_clone
1921
from scanpipe.pipes.federatedcode import commit_and_push_changes
2022

@@ -34,7 +36,7 @@ def fetch_checkpoint_from_github(config_repo, checkpoint_path):
3436
)
3537
response = requests.get(checkpoints_file)
3638
if not response.ok:
37-
return
39+
return {}
3840

3941
checkpoint_data = json.loads(response.text)
4042
return checkpoint_data
@@ -112,3 +114,66 @@ def delete_cloned_repos(repos, logger=None):
112114
if logger:
113115
logger(f"Deleting local clone at: {repo.working_dir}")
114116
delete_local_clone(repo)
117+
118+
119+
def get_changed_files(repo: "Repo", commit_x: str = None, commit_y: str = None):
    """
    Return the sorted list of file paths changed between two commits using
    GitPython.

    Added, modified and deleted files are included, and both paths of a
    renamed file are reported.

    - commit_x: base commit; ``None`` or the empty tree hash means "diff
      against the empty tree" (first commit case).
    - commit_y: target commit (defaults to HEAD if not provided).
    """
    EMPTY_TREE_HASH = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"

    if commit_y is None:
        commit_y = repo.head.commit.hexsha
    commit_y_obj = repo.commit(commit_y)

    if commit_x is None or commit_x == EMPTY_TREE_HASH:
        # First commit case: diff against empty tree
        diff_index = commit_y_obj.diff(EMPTY_TREE_HASH, R=True)
    else:
        commit_x_obj = repo.commit(commit_x)
        diff_index = commit_x_obj.diff(commit_y_obj, R=True)

    changed_files = set()
    for item in diff_index:
        # Keep both sides: they differ for renames, and either may be unset.
        changed_files.update(path for path in (item.a_path, item.b_path) if path)

    # Sorted so that callers batching over the result see a reproducible order.
    return sorted(changed_files)
141+
142+
143+
def get_last_commit(repo, ecosystem):
    """
    Retrieve the last mined commit for a given ecosystem.

    This function reads the JSON checkpoint file
    ``minecode_checkpoints/<ecosystem>.json`` from the working tree of ``repo``,
    which stores mining progress. Each checkpoint contains the "last_commit"
    from the package index (e.g., PyPI) that was previously mined.

    Return ``None`` when the checkpoint file does not exist or has no
    "last_commit" entry.

    https://github.com/AyanSinhaMahapatra/minecode-test/blob/main/minecode_checkpoints/pypi.json
    https://github.com/ziadhany/cargo-test/blob/main/minecode_checkpoints/cargo.json
    """
    last_commit_file_path = (
        Path(repo.working_tree_dir) / "minecode_checkpoints" / f"{ecosystem}.json"
    )
    try:
        # JSON is UTF-8; do not depend on the locale default encoding.
        with open(last_commit_file_path, encoding="utf-8") as f:
            settings_data = json.load(f)
    except FileNotFoundError:
        return None
    return settings_data.get("last_commit")
162+
163+
164+
def get_commit_at_distance_ahead(
    repo: "Repo",
    current_commit: str,
    num_commits_ahead: int = 10,
    branch_name: str = "master",
) -> str:
    """
    Return the commit hash that is `num_commits_ahead` commits ahead of `current_commit`
    on the given branch.

    When `current_commit` is empty, the empty tree hash is used as the starting
    point.

    Raise ValueError when `num_commits_ahead` is smaller than 1 or when fewer
    than `num_commits_ahead` commits are available after `current_commit`.
    """
    if num_commits_ahead < 1:
        # A zero or negative distance would index the wrong end of the list below.
        raise ValueError(f"num_commits_ahead must be at least 1; got {num_commits_ahead}.")

    if not current_commit:
        current_commit = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"

    # Commits reachable from the branch but not from current_commit.
    revs = repo.git.rev_list(f"^{current_commit}", branch_name).splitlines()
    if len(revs) < num_commits_ahead:
        raise ValueError(f"Not enough commits ahead; only {len(revs)} available.")

    # The list is indexed from its end: the last entry is one commit ahead.
    return revs[-num_commits_ahead]

minecode_pipelines/pipes/cargo.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
from pathlib import Path
2+
3+
from aboutcode import hashid
4+
from packageurl import PackageURL
5+
from aboutcode.hashid import get_core_purl
6+
7+
from minecode_pipelines.pipes import write_data_to_yaml_file
8+
9+
10+
def store_cargo_packages(packages, repo):
    """
    Write the PackageURLs of the given Cargo package records to the repo.

    ``packages`` is a list of mappings read with the "name" and "vers" keys.
    The first record determines the base PackageURL and therefore the target
    YAML file. Return a ``(purl_file_path, base_purl)`` tuple, or ``None`` when
    ``packages`` is empty.
    """
    if not packages:
        return None

    head = packages[0]
    head_purl = PackageURL(type="cargo", name=head.get("name"), version=head.get("vers"))
    base_purl = get_core_purl(head_purl)

    # One versioned purl string per record, in input order.
    version_purls = [
        PackageURL(type="cargo", name=entry.get("name"), version=entry.get("vers")).to_string()
        for entry in packages
    ]

    relative_purls_path = hashid.get_package_purls_yml_file_path(base_purl)
    purls_file = Path(repo.working_dir) / relative_purls_path
    write_data_to_yaml_file(path=purls_file, data=version_purls)
    return purls_file, base_purl

minecode_pipelines/pipes/pypi.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
from aboutcode.hashid import get_package_base_dir
4646
from packageurl import PackageURL
4747
from scanpipe.pipes.federatedcode import clone_repository
48+
4849
from scanpipe.pipes.federatedcode import commit_changes
4950
from scanpipe.pipes.federatedcode import push_changes
5051

0 commit comments

Comments
 (0)