Skip to content

Commit 927a2f3

Browse files
committed
Implement functionality to process bulk of commits
Signed-off-by: ziad hany <[email protected]>
1 parent 233c4ce commit 927a2f3

File tree

3 files changed

+80
-31
lines changed

3 files changed

+80
-31
lines changed

minecode_pipelines/miners/cargo.py

Lines changed: 43 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -6,47 +6,61 @@
66
# See https://github.com/aboutcode-org/purldb for support or download.
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
9-
from minecode_pipelines.pipes import get_last_commit, get_changed_files, update_last_commit
9+
from minecode_pipelines.pipes import get_last_commit
10+
from minecode_pipelines.pipes import get_changed_files
11+
from minecode_pipelines.pipes import update_last_commit
1012
from minecode_pipelines.pipes.cargo import store_cargo_packages
1113
import json
1214
from pathlib import Path
1315

16+
from minecode_pipelines.utils import get_next_x_commit
17+
18+
19+
def process_cargo_packages(cargo_repo, fed_repo, fed_conf_repo, logger):
    """
    Mine the Cargo index clone in commit batches and store the packages found.

    Each pass of the loop:
    - reads the "cargo" checkpoint commit from ``fed_conf_repo`` (falling back
      to the git empty tree hash when no checkpoint exists yet),
    - asks ``get_next_x_commit`` for the commit 1000 commits further along
      ``master``,
    - stores the packages of every index file changed between the two commits
      in ``fed_repo``, requesting a push on the last file of the batch,
    - records the batch end as the new checkpoint in ``fed_conf_repo``.

    The loop stops once ``get_next_x_commit`` returns the checkpoint itself,
    i.e. there is nothing newer to mine.

    ``logger`` is called with a single message string.
    """
    # NOTE(review): files are read from the working tree of ``cargo_repo``,
    # i.e. from whatever is checked out, not from ``next_commit`` itself.
    base_path = Path(cargo_repo.working_tree_dir)

    # Repository housekeeping files that are not package index files.
    ignored_names = {"config.json", "README.md", "update-dl-url.yml"}

    while True:
        setting_last_commit = get_last_commit(fed_conf_repo, "cargo")

        if setting_last_commit is None:
            # No checkpoint yet: diff against the git empty tree.
            setting_last_commit = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"

        next_commit = get_next_x_commit(cargo_repo, setting_last_commit, x=1000, branch="master")

        if next_commit == setting_last_commit:
            logger("No new commits to mine")
            break

        changed_files = get_changed_files(
            cargo_repo, commit_x=setting_last_commit, commit_y=next_commit
        )
        logger(f"Found {len(changed_files)} changed files in Cargo index.")

        # Filter first so that "last file of the batch" refers to a file that
        # is really processed; otherwise a skipped trailing entry (deleted
        # path, config file) would mean the push is never requested.
        target_files = []
        for rel_path in changed_files:
            file_path = base_path / rel_path
            logger(f"Found {file_path}.")

            if not file_path.is_file():
                continue

            if file_path.name in ignored_names:
                continue

            target_files.append(file_path)

        # 1-based index so that ``idx == len(target_files)`` is true on the
        # last file (a 0-based index never reaches the length).
        for idx, file_path in enumerate(target_files, start=1):
            packages = []
            # Index files hold one JSON document per non-blank line.
            with open(file_path, encoding="utf-8") as f:
                for line in f:
                    if line.strip():
                        packages.append(json.loads(line))

            push_commit = idx == len(target_files)
            store_cargo_packages(packages, fed_repo, push_commit)

        update_last_commit(next_commit, fed_conf_repo, "cargo")
        logger(f"Pushed batch for commit range {setting_last_commit}:{next_commit}.")

minecode_pipelines/pipelines/mine_cargo.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@
3232
"FEDERATEDCODE_CARGO_GIT_URL", "https://github.com/ziadhany/cargo-test"
3333
)
3434

35+
FEDERATEDCODE_CONFIG_GIT_URL = os.environ.get(
36+
"FEDERATEDCODE_CONFIG_GIT_URL", "https://github.com/ziadhany/federatedcode-config"
37+
)
38+
3539

3640
class MineandPublishCargoPURLs(Pipeline):
3741
"""Pipeline to mine Cargo (crates.io) packages and publish them to FederatedCode."""
@@ -50,7 +54,7 @@ def check_federatedcode_eligibility(self):
5054
Check if the project fulfills the following criteria for
5155
pushing the project result to FederatedCode.
5256
"""
53-
federatedcode.check_federatedcode_eligibility(project=self.project)
57+
federatedcode.check_federatedcode_configured_and_available(project=self.project)
5458

5559
def clone_cargo_repo(self):
5660
"""
@@ -59,6 +63,7 @@ def clone_cargo_repo(self):
5963
conan_repo_url = "https://github.com/rust-lang/crates.io-index"
6064

6165
self.fed_repo = federatedcode.clone_repository(FEDERATEDCODE_CARGO_GIT_URL)
66+
self.fed_conf_repo = federatedcode.clone_repository(FEDERATEDCODE_CONFIG_GIT_URL)
6267
self.cargo_repo = Repo.clone_from(conan_repo_url, get_temp_file())
6368

6469
def collect_packages_from_cargo(self):
@@ -73,3 +78,6 @@ def clean_cargo_repo(self):
7378

7479
if self.fed_repo:
7580
delete_local_clone(self.fed_repo)
81+
82+
if self.fed_conf_repo:
    # Remove the config clone itself; deleting self.fed_repo here would
    # remove that clone twice and leave the config clone on disk.
    delete_local_clone(self.fed_conf_repo)

minecode_pipelines/utils.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@
66
# See https://github.com/aboutcode-org/purldb for support or download.
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
9-
109
import tempfile
1110
import os
1211
from commoncode.fileutils import create_dir
12+
from git.repo.base import Repo
1313

1414
from itertools import zip_longest
1515

@@ -60,3 +60,30 @@ def get_temp_file(file_name="data", extension=".file", dir_name=""):
6060
temp_dir = get_temp_dir(dir_name)
6161
location = os.path.join(temp_dir, file_name)
6262
return location
63+
64+
65+
# Hash of git's empty tree; used as the "before the first commit" marker.
EMPTY_TREE_HASH = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"


def get_next_x_commit(repo: "Repo", current_commit: str, x: int = 1, branch: str = "master") -> str:
    """
    Return the hexsha of the commit ``x`` commits after ``current_commit`` on ``branch``.

    - ``x <= 0`` or an empty history returns ``current_commit`` unchanged.
    - An empty ``current_commit`` or ``EMPTY_TREE_HASH`` means "before the
      first commit": the result is the ``x``-th commit of the branch.
    - When fewer than ``x`` commits remain, the newest commit is returned, so
      calling this with the newest commit returns that same commit.
    - A ``current_commit`` that is not found on ``branch`` yields the newest
      commit.
    """
    if x <= 0:
        return current_commit

    # The history is ordered newest first: index 0 is the branch tip and
    # index -1 is the first commit, so "next" means a smaller index.
    history = list(repo.iter_commits(branch))
    if not history:
        return current_commit  # no commits, return current_commit

    if not current_commit or current_commit == EMPTY_TREE_HASH:
        # Step x commits forward from before the first commit, clamped to the tip.
        return history[max(len(history) - x, 0)].hexsha

    for i, commit in enumerate(history):
        if commit.hexsha == current_commit:
            # Step x commits towards the tip, clamped to the tip.
            return history[max(i - x, 0)].hexsha

    return history[0].hexsha

0 commit comments

Comments
 (0)