Skip to content

Commit fd990d2

Browse files
committed
Implement functionality to process bulk of commits
Signed-off-by: ziad hany <[email protected]>
1 parent 63b1864 commit fd990d2

File tree

4 files changed

+95
-40
lines changed

4 files changed

+95
-40
lines changed

minecode_pipelines/miners/cargo.py

Lines changed: 43 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -6,47 +6,61 @@
66
# See https://github.com/aboutcode-org/purldb for support or download.
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
9-
from minecode_pipelines.pipes import get_last_commit, get_changed_files, update_last_commit
9+
from minecode_pipelines.pipes import get_last_commit
10+
from minecode_pipelines.pipes import get_changed_files
11+
from minecode_pipelines.pipes import update_last_commit
1012
from minecode_pipelines.pipes.cargo import store_cargo_packages
1113
import json
1214
from pathlib import Path
1315

16+
from minecode_pipelines.utils import get_next_x_commit
17+
18+
19+
def process_cargo_packages(
    cargo_repo, fed_repo, fed_conf_repo, logger, commit_batch_size=1000, branch="master"
):
    """
    Mine the Cargo index ``cargo_repo`` in batches of commits and store the
    packages found in the changed index files in ``fed_repo``.

    Each pass of the loop:
    - reads the "cargo" checkpoint from ``fed_conf_repo`` (git's empty tree
      hash is used when there is no checkpoint yet),
    - asks ``get_next_x_commit`` for the commit ``commit_batch_size`` commits
      further on ``branch``,
    - calls ``store_cargo_packages`` for every index file changed in that
      commit range, with ``push_commit`` set to True only for the last index
      file of the batch,
    - records the end of the range as the new "cargo" checkpoint in
      ``fed_conf_repo``.

    The loop stops when ``get_next_x_commit`` returns the checkpoint itself,
    meaning there is nothing new to mine.

    ``logger`` is called with a single message string.
    """
    base_path = Path(cargo_repo.working_tree_dir)

    while True:
        setting_last_commit = get_last_commit(fed_conf_repo, "cargo")
        if setting_last_commit is None:
            # No checkpoint yet: start from git's empty tree.
            setting_last_commit = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"

        next_commit = get_next_x_commit(
            cargo_repo, setting_last_commit, x=commit_batch_size, branch=branch
        )
        if next_commit == setting_last_commit:
            logger("No new commits to mine")
            break

        changed_files = get_changed_files(
            cargo_repo, commit_x=setting_last_commit, commit_y=next_commit
        )
        logger(f"Found {len(changed_files)} changed files in Cargo index.")

        # Filter first so that "last file of the batch" refers to a file that
        # is really processed; otherwise a skipped trailing entry (deleted
        # file, config.json, ...) would prevent the push from ever happening.
        target_files = _select_cargo_index_files(base_path, changed_files, logger)
        total = len(target_files)

        # NOTE(review): files are read from the working tree checkout, not from
        # the tree of ``next_commit`` -- confirm this is intended.
        for idx, file_path in enumerate(target_files, start=1):
            packages = _load_cargo_index_file(file_path)
            push_commit = idx == total  # only True on the last file
            store_cargo_packages(packages, fed_repo, push_commit)

        update_last_commit(next_commit, fed_conf_repo, "cargo")
        logger(f"Pushed batch for commit range {setting_last_commit}:{next_commit}.")


def _select_cargo_index_files(base_path, changed_files, logger):
    """
    Return the paths under ``base_path`` of the ``changed_files`` that exist as
    regular files and are not one of the non-package files of the index.
    """
    ignored_names = {"config.json", "README.md", "update-dl-url.yml"}
    selected = []
    for rel_path in changed_files:
        file_path = base_path / rel_path
        logger(f"Found {file_path}.")
        # Paths that are not regular files (e.g. deleted in the range) are skipped.
        if file_path.is_file() and file_path.name not in ignored_names:
            selected.append(file_path)
    return selected


def _load_cargo_index_file(file_path):
    """Return the JSON documents of ``file_path``, one per non-blank line."""
    with open(file_path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

minecode_pipelines/pipelines/mine_cargo.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@
3232
"FEDERATEDCODE_CARGO_GIT_URL", "https://github.com/ziadhany/cargo-test"
3333
)
3434

35+
FEDERATEDCODE_CONFIG_GIT_URL = os.environ.get(
36+
"FEDERATEDCODE_CONFIG_GIT_URL", "https://github.com/ziadhany/federatedcode-config"
37+
)
38+
3539

3640
class MineandPublishCargoPURLs(Pipeline):
3741
"""Pipeline to mine Cargo (crates.io) packages and publish them to FederatedCode."""
@@ -50,7 +54,7 @@ def check_federatedcode_eligibility(self):
5054
Check if the project fulfills the following criteria for
5155
pushing the project result to FederatedCode.
5256
"""
53-
federatedcode.check_federatedcode_eligibility(project=self.project)
57+
federatedcode.check_federatedcode_configured_and_available(project=self.project)
5458

5559
def clone_cargo_repo(self):
5660
"""
@@ -59,6 +63,7 @@ def clone_cargo_repo(self):
5963
conan_repo_url = "https://github.com/rust-lang/crates.io-index"
6064

6165
self.fed_repo = federatedcode.clone_repository(FEDERATEDCODE_CARGO_GIT_URL)
66+
self.fed_conf_repo = federatedcode.clone_repository(FEDERATEDCODE_CONFIG_GIT_URL)
6267
self.cargo_repo = Repo.clone_from(conan_repo_url, get_temp_file())
6368

6469
def collect_packages_from_cargo(self):
@@ -73,3 +78,6 @@ def clean_cargo_repo(self):
7378

7479
if self.fed_repo:
7580
delete_local_clone(self.fed_repo)
81+
82+
if self.fed_conf_repo:
    # Delete the clone that was just checked for, i.e. the config repository
    # clone (the original passed self.fed_repo here by mistake).
    delete_local_clone(self.fed_conf_repo)

minecode_pipelines/pipes/__init__.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,12 @@ def write_data_to_file(path, data):
4141
f.write(saneyaml.dump(data))
4242

4343

44+
def write_json_file(path, data):
    """
    Write ``data`` as JSON to ``path``, creating missing parent directories.

    ``path`` may be a ``pathlib.Path`` or a plain path string. The file is
    written as UTF-8 with a 4-space indent and non-ASCII characters are kept
    as-is rather than escaped.
    """
    # Accept plain strings too: ``.parent`` only exists on Path objects.
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, mode="w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
48+
49+
4450
def write_purls_to_repo(repo, package, updated_purls, push_commits=False):
4551
"""Write or update package purls in the repo and optionally commit/push changes."""
4652
ppath = hashid.get_package_purls_yml_file_path(package)
@@ -84,23 +90,23 @@ def get_changed_files(repo: Repo, commit_x: str = None, commit_y: str = None):
8490
Return a list of files changed between two commits using GitPython.
8591
Includes added, modified, deleted, and renamed files.
8692
87-
- commit_x is the empty tree hash (repo root).
88-
- commit_y is the latest commit (HEAD).
93+
- commit_x: base commit (or the empty tree hash for the first commit)
94+
- commit_y: target commit (defaults to HEAD if not provided)
8995
"""
96+
EMPTY_TREE_HASH = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"
97+
9098
if commit_y is None:
9199
commit_y = repo.head.commit.hexsha
92-
93100
commit_y_obj = repo.commit(commit_y)
94-
if commit_x is None:
95-
commit_x = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"
96101

97-
diff_index = commit_y_obj.diff(commit_x, R=True)
102+
if commit_x is None or commit_x == EMPTY_TREE_HASH:
103+
# First commit case: diff against empty tree
104+
diff_index = commit_y_obj.diff(EMPTY_TREE_HASH, R=True)
98105
else:
99106
commit_x_obj = repo.commit(commit_x)
100-
diff_index = commit_x_obj.diff(commit_y_obj)
107+
diff_index = commit_x_obj.diff(commit_y_obj, R=True)
101108

102109
changed_files = {item.a_path or item.b_path for item in diff_index}
103-
104110
return list(changed_files)
105111

106112

@@ -136,7 +142,7 @@ def update_last_commit(last_commit, repo, ecosystem):
136142
}
137143

138144
settings_path = Path(repo.working_tree_dir) / "minecode_checkpoints" / f"{ecosystem}.json"
139-
write_data_to_file(path=settings_path, data=settings_data)
145+
write_json_file(path=settings_path, data=settings_data)
140146
repo.index.add([settings_path])
141147

142148
commit_message = f"""\

minecode_pipelines/utils.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@
66
# See https://github.com/aboutcode-org/purldb for support or download.
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
9-
109
import tempfile
1110
import os
1211
from commoncode.fileutils import create_dir
12+
from git.repo.base import Repo
1313

1414

1515
def system_temp_dir(temp_dir=os.getenv("MINECODE_TMP")):
@@ -48,3 +48,30 @@ def get_temp_file(file_name="data", extension=".file", dir_name=""):
4848
temp_dir = get_temp_dir(dir_name)
4949
location = os.path.join(temp_dir, file_name)
5050
return location
51+
52+
53+
EMPTY_TREE_HASH = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"


def get_next_x_commit(repo: "Repo", current_commit: str, x: int = 1, branch: str = "master") -> str:
    """
    Return the hexsha of the commit that comes ``x`` commits after
    ``current_commit`` on ``branch``, clamped to the newest commit.

    - When ``x`` is zero or negative, or ``branch`` has no commits,
      ``current_commit`` is returned unchanged.
    - When ``current_commit`` is empty or is the empty tree hash, counting
      starts before the oldest commit, so ``x=1`` returns the oldest commit.
    - When ``current_commit`` is already the newest commit it is returned
      as-is, which callers can use to detect that nothing is left to process.
    - When ``current_commit`` is not found on ``branch``, the newest commit is
      returned.
    """
    if x <= 0:
        return current_commit

    # iter_commits() yields the newest commit first; flip the list to
    # oldest-first so that "x commits later" is simply "index + x".
    hexshas = [commit.hexsha for commit in repo.iter_commits(branch)]
    if not hexshas:
        return current_commit  # no commits, return current_commit
    hexshas.reverse()
    newest_index = len(hexshas) - 1

    if not current_commit or current_commit == EMPTY_TREE_HASH:
        # Position just before the oldest commit.
        current_index = -1
    else:
        try:
            current_index = hexshas.index(current_commit)
        except ValueError:
            # Unknown checkpoint: fall back to the newest commit.
            return hexshas[newest_index]

    # Never step past the newest commit.
    return hexshas[min(current_index + x, newest_index)]

0 commit comments

Comments
 (0)