Skip to content

Commit 0783717

Browse files
committed
Fix requested changes
Signed-off-by: ziad hany <[email protected]>
1 parent 3a3197c commit 0783717

File tree

5 files changed

+45
-45
lines changed

5 files changed

+45
-45
lines changed

minecode_pipelines/miners/cargo.py

Lines changed: 27 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
from datetime import datetime
1010

1111
from minecode_pipelines.pipes import fetch_checkpoint_from_github
12+
from minecode_pipelines.pipes import get_commit_at_distance_ahead
1213
from minecode_pipelines.pipes import update_checkpoints_in_github
1314
from minecode_pipelines.pipes import MINECODE_PIPELINES_CONFIG_REPO
1415
from minecode_pipelines.pipes import get_changed_files
@@ -20,9 +21,10 @@
2021
import json
2122
from pathlib import Path
2223

23-
from minecode_pipelines.utils import get_next_x_commit
2424

2525
PACKAGE_BATCH_SIZE = 500
26+
COMMIT_BATCH_SIZE = 10
27+
2628
CARGO_CHECKPOINT_PATH = "cargo/checkpoints.json"
2729

2830

@@ -36,14 +38,14 @@ def process_cargo_packages(cargo_index_repo, cloned_data_repo, config_repo, logg
3638
base_path = Path(cargo_index_repo.working_tree_dir)
3739

3840
while True:
39-
cargo_checkpoints = (
40-
fetch_checkpoint_from_github(MINECODE_PIPELINES_CONFIG_REPO, CARGO_CHECKPOINT_PATH)
41-
or {}
41+
cargo_checkpoints = fetch_checkpoint_from_github(
42+
config_repo=MINECODE_PIPELINES_CONFIG_REPO, checkpoint_path=CARGO_CHECKPOINT_PATH
4243
)
44+
4345
checkpoints_last_commit = cargo_checkpoints.get("last_commit")
4446

45-
next_commit = get_next_x_commit(
46-
cargo_index_repo, checkpoints_last_commit, x=10, branch="master"
47+
next_commit = get_commit_at_distance_ahead(
48+
cargo_index_repo, checkpoints_last_commit, num_commits_ahead=10, branch_name="master"
4749
)
4850

4951
if next_commit == checkpoints_last_commit:
@@ -62,10 +64,11 @@ def process_cargo_packages(cargo_index_repo, cloned_data_repo, config_repo, logg
6264
file_path = base_path / rel_path
6365
logger(f"Found {file_path}.")
6466

65-
if not file_path.is_file():
66-
continue
67-
68-
if file_path.name in {"config.json", "README.md", "update-dl-url.yml"}:
67+
if not file_path.is_file() or file_path.name in {
68+
"config.json",
69+
"README.md",
70+
"update-dl-url.yml",
71+
}:
6972
continue
7073

7174
packages = []
@@ -75,6 +78,8 @@ def process_cargo_packages(cargo_index_repo, cloned_data_repo, config_repo, logg
7578
packages.append(json.loads(line))
7679

7780
file_counter += 1
81+
82+
# Commit and push after each full batch or when processing the last file
7883
commit_and_push = (file_counter % PACKAGE_BATCH_SIZE == 0) or (
7984
idx == len(changed_files)
8085
)
@@ -83,6 +88,7 @@ def process_cargo_packages(cargo_index_repo, cloned_data_repo, config_repo, logg
8388

8489
purl_files.append(purl_file)
8590
purls.append(str(base_purl))
91+
8692
if not commit_and_push:
8793
continue
8894

@@ -91,11 +97,10 @@ def process_cargo_packages(cargo_index_repo, cloned_data_repo, config_repo, logg
9197
files_to_commit=purl_files,
9298
purls=purls,
9399
mine_type="packageURL",
94-
tool_name="pkg:cargo/minecode-pipelines",
100+
tool_name="pkg:pypi/minecode-pipelines",
95101
tool_version=VERSION,
96102
)
97103

98-
# Push changes to remote repository
99104
push_changes(repo=cloned_data_repo)
100105
purl_files = []
101106
purls = []
@@ -105,15 +110,16 @@ def process_cargo_packages(cargo_index_repo, cloned_data_repo, config_repo, logg
105110
f"Updating checkpoint at: {CARGO_CHECKPOINT_PATH} with last commit: {checkpoints_last_commit}"
106111
)
107112

108-
settings_data = {
109-
"date": str(datetime.now()),
110-
"last_commit": next_commit,
111-
}
113+
if next_commit != checkpoints_last_commit:
114+
settings_data = {
115+
"date": str(datetime.now()),
116+
"last_commit": next_commit,
117+
}
112118

113-
update_checkpoints_in_github(
114-
checkpoint=settings_data,
115-
cloned_repo=config_repo,
116-
path=CARGO_CHECKPOINT_PATH,
117-
)
119+
update_checkpoints_in_github(
120+
checkpoint=settings_data,
121+
cloned_repo=config_repo,
122+
path=CARGO_CHECKPOINT_PATH,
123+
)
118124

119125
logger(f"Pushed batch for commit range {checkpoints_last_commit}:{next_commit}.")

minecode_pipelines/pipelines/mine_cargo.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -40,7 +40,7 @@ class MineandPublishCargoPURLs(Pipeline):
4040
def steps(cls):
4141
return (
4242
cls.check_federatedcode_eligibility,
43-
cls.clone_cargo_repo,
43+
cls.clone_cargo_repos,
4444
cls.mine_and_publish_cargo_packageurls,
4545
cls.delete_cloned_repos,
4646
)
@@ -52,9 +52,10 @@ def check_federatedcode_eligibility(self):
5252
"""
5353
federatedcode.check_federatedcode_configured_and_available(logger=self.log)
5454

55-
def clone_cargo_repo(self):
55+
def clone_cargo_repos(self):
5656
"""
57-
Clone the repo at repo_url and return the Repo object
57+
Clone the Cargo-related repositories (index, data, and pipelines config)
58+
and store their Repo objects in the corresponding instance variables.
5859
"""
5960
self.cargo_index_repo = federatedcode.clone_repository(MINECODE_CARGO_INDEX_REPO)
6061
self.cloned_data_repo = federatedcode.clone_repository(MINECODE_DATA_CARGO_REPO)

minecode_pipelines/pipes/__init__.py

Lines changed: 12 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,7 @@ def fetch_checkpoint_from_github(config_repo, checkpoint_path):
3636
)
3737
response = requests.get(checkpoints_file)
3838
if not response.ok:
39-
return
39+
return {}
4040

4141
checkpoint_data = json.loads(response.text)
4242
return checkpoint_data
@@ -161,13 +161,19 @@ def get_last_commit(repo, ecosystem):
161161
return settings_data.get("last_commit")
162162

163163

164-
def get_next_x_commit(repo: Repo, current_commit: str, x: int = 10, branch: str = "master") -> str:
164+
def get_commit_at_distance_ahead(
165+
repo: Repo,
166+
current_commit: str,
167+
num_commits_ahead: int = 10,
168+
branch_name: str = "master",
169+
) -> str:
165170
"""
166-
Get the x-th next commit after the current commit in the specified branch.
171+
Return the commit hash that is `num_commits_ahead` commits ahead of `current_commit`
172+
on the given branch.
167173
"""
168174
if not current_commit:
169175
current_commit = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"
170-
revs = repo.git.rev_list(f"^{current_commit}", branch).splitlines()
171-
if len(revs) < x:
176+
revs = repo.git.rev_list(f"^{current_commit}", branch_name).splitlines()
177+
if len(revs) < num_commits_ahead:
172178
raise ValueError(f"Not enough commits ahead; only {len(revs)} available.")
173-
return revs[-x]
179+
return revs[-num_commits_ahead]

minecode_pipelines/tests/pipes/test_cargo_pipes.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,7 @@ def test_collect_packages_from_cargo_calls_write(self, mock_write):
3636

3737
self.assertEqual(called_repo, repo)
3838

39-
expected_base_purl = 'aboutcode-packages-cargo-0/cargo/c5store/purls.yml'
39+
expected_base_purl = "aboutcode-packages-cargo-0/cargo/c5store/purls.yml"
4040
self.assertEqual(str(base_purl), str(expected_base_purl))
4141
self.assertEqual(written_packages, expected)
4242

@@ -59,4 +59,4 @@ def test_add_purl_result_with_mock_repo(self):
5959

6060
with open(written_file, encoding="utf-8") as f:
6161
content = saneyaml.load(f)
62-
self.assertEqual(content, purls)
62+
self.assertEqual(content, purls)

minecode_pipelines/utils.py

Lines changed: 0 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,6 @@
99
import tempfile
1010
import os
1111
from commoncode.fileutils import create_dir
12-
from git.repo.base import Repo
1312

1413
from itertools import zip_longest
1514

@@ -60,15 +59,3 @@ def get_temp_file(file_name="data", extension=".file", dir_name=""):
6059
temp_dir = get_temp_dir(dir_name)
6160
location = os.path.join(temp_dir, file_name)
6261
return location
63-
64-
65-
def get_next_x_commit(repo: Repo, current_commit: str, x: int = 10, branch: str = "master") -> str:
66-
"""
67-
Get the x-th next commit after the current commit in the specified branch.
68-
"""
69-
if not current_commit:
70-
current_commit = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"
71-
revs = repo.git.rev_list(f"^{current_commit}", branch).splitlines()
72-
if len(revs) < x:
73-
raise ValueError(f"Not enough commits ahead; only {len(revs)} available.")
74-
return revs[-x]

0 commit comments

Comments
 (0)