Skip to content

Commit d617c82

Browse files
committed
Mine only the unprocessed data.
Add logging and fix a bug in process_cargo_packages.
Signed-off-by: ziad hany <[email protected]>
1 parent cb1632a commit d617c82

File tree

3 files changed

+39
-49
lines changed

3 files changed

+39
-49
lines changed

minecode_pipelines/miners/cargo.py

Lines changed: 29 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -6,37 +6,47 @@
66
# See https://github.com/aboutcode-org/purldb for support or download.
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
9+
from minecode_pipelines.pipes import get_last_commit, get_changed_files, update_last_commit
10+
from minecode_pipelines.pipes.cargo import store_cargo_packages
911
import json
1012
from pathlib import Path
1113

12-
from minecode_pipelines.pipes.cargo import store_cargo_packages
13-
from minecode_pipelines.utils import get_changed_files
14-
1514

16-
def process_cargo_packages(cargo_repo, fed_repo):
15+
def process_cargo_packages(cargo_repo, fed_repo, logger):
1716
base_path = Path(cargo_repo.working_tree_dir)
18-
valid_files = get_changed_files(cargo_repo) # start from empty tree hash
17+
setting_last_commit = get_last_commit(fed_repo, "cargo")
18+
valid_files = get_changed_files(cargo_repo, setting_last_commit) # diff from the last processed commit checkpoint
1919

20-
json_files = []
20+
logger(f"Found {len(valid_files)} changed files in Cargo index.")
21+
targets_files = []
2122
for file_path in base_path.glob("**/*"):
22-
if not file_path.is_file() or file_path not in valid_files:
23+
if not file_path.is_file():
24+
continue
25+
26+
rel_path = str(file_path.relative_to(base_path))
27+
if rel_path not in valid_files:
2328
continue
2429

2530
if file_path.name in {"config.json", "README.md", "update-dl-url.yml"}:
2631
continue
27-
json_files.append(file_path)
2832

29-
for idx, file_path in enumerate(json_files, start=1):
30-
try:
31-
with open(file_path, encoding="utf-8") as f:
32-
packages = []
33-
for line in f:
34-
if line.strip():
35-
packages.append(json.loads(line))
33+
targets_files.append(file_path)
34+
35+
logger(f"Collected {len(targets_files)} target package files to process.")
3636

37-
except (json.JSONDecodeError, UnicodeDecodeError):
37+
for idx, file_path in enumerate(targets_files, start=1):
38+
packages = []
39+
with open(file_path, encoding="utf-8") as f:
40+
for line in f:
41+
if line.strip():
42+
packages.append(json.loads(line))
43+
44+
if not packages:
3845
continue
3946

40-
if packages:
41-
push_commit = idx == len(json_files) # only True on last
42-
store_cargo_packages(packages, fed_repo, push_commit)
47+
push_commit = idx == len(targets_files) # only True on last
48+
store_cargo_packages(packages, fed_repo, push_commit)
49+
logger(f"Processed {len(packages)} packages from {file_path} ({idx}/{len(targets_files)}).")
50+
51+
update_last_commit(setting_last_commit, fed_repo, "cargo")
52+
logger("Updated last commit checkpoint for Cargo.")

minecode_pipelines/pipelines/mine_cargo.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,19 @@
1919
#
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
22+
import os
2223

2324
from git.repo.base import Repo
2425
from scanpipe.pipes.federatedcode import delete_local_clone
25-
from minecode.utils import get_temp_file
26+
from minecode_pipelines.utils import get_temp_file
2627
from scanpipe.pipelines import Pipeline
2728
from scanpipe.pipes import federatedcode
2829
from minecode_pipelines.miners import cargo
2930

31+
FEDERATEDCODE_CARGO_GIT_URL = os.environ.get(
32+
"FEDERATEDCODE_CARGO_GIT_URL", "https://github.com/ziadhany/cargo-test"
33+
)
34+
3035

3136
class MineandPublishCargoPURLs(Pipeline):
3237
"""Pipeline to mine Cargo (crates.io) packages and publish them to FederatedCode."""
@@ -49,16 +54,15 @@ def check_federatedcode_eligibility(self):
4954

5055
def clone_cargo_repo(self):
5156
"""
52-
Clone the repo at repo_url and return the VCSResponse object
57+
Clone the FederatedCode repo and the crates.io index repo, storing them as self.fed_repo and self.cargo_repo
5358
"""
54-
conan_repo_url = "git+https://github.com/rust-lang/crates.io-index"
55-
fed_repo_url = "git+https://github.com/ziadhany/cargo-test"
59+
conan_repo_url = "https://github.com/rust-lang/crates.io-index"
5660

57-
self.fed_repo = federatedcode.clone_repository(fed_repo_url)
61+
self.fed_repo = federatedcode.clone_repository(FEDERATEDCODE_CARGO_GIT_URL)
5862
self.cargo_repo = Repo.clone_from(conan_repo_url, get_temp_file())
5963

6064
def collect_packages_from_cargo(self):
61-
cargo.process_cargo_packages(self.cargo_repo, self.fed_repo)
65+
cargo.process_cargo_packages(self.cargo_repo, self.fed_repo, self.log)
6266

6367
def clean_cargo_repo(self):
6468
"""

minecode_pipelines/utils.py

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
import tempfile
1111
import os
1212
from commoncode.fileutils import create_dir
13-
from git import Repo
1413

1514
from itertools import zip_longest
1615

@@ -61,26 +60,3 @@ def get_temp_file(file_name="data", extension=".file", dir_name=""):
6160
temp_dir = get_temp_dir(dir_name)
6261
location = os.path.join(temp_dir, file_name)
6362
return location
64-
65-
66-
def get_changed_files(
67-
repo: Repo, commit_x: str = "4b825dc642cb6eb9a060e54bf8d69288fbee4904", commit_y: str = None
68-
):
69-
"""
70-
Return a list of files changed between two commits using GitPython.
71-
Includes added, modified, deleted, and renamed files.
72-
73-
- commit_x is the empty tree hash (repo root).
74-
- commit_y is the latest commit (HEAD).
75-
"""
76-
77-
if commit_y is None:
78-
commit_y = repo.head.commit.hexsha
79-
80-
commit_x_obj = repo.commit(commit_x)
81-
commit_y_obj = repo.commit(commit_y)
82-
83-
diff_index = commit_x_obj.diff(commit_y_obj)
84-
changed_files = {item.a_path or item.b_path for item in diff_index}
85-
86-
return list(changed_files)

0 commit comments

Comments
 (0)