Skip to content

Commit 63b1864

Browse files
committed
Mine only the unprocessed data.
Add logging and fix bug in process_cargo_packages Signed-off-by: ziad hany <[email protected]>
1 parent ae60ff2 commit 63b1864

File tree

4 files changed

+121
-51
lines changed

4 files changed

+121
-51
lines changed

minecode_pipelines/miners/cargo.py

Lines changed: 29 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -6,37 +6,47 @@
66
# See https://github.com/aboutcode-org/purldb for support or download.
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
9+
from minecode_pipelines.pipes import get_last_commit, get_changed_files, update_last_commit
10+
from minecode_pipelines.pipes.cargo import store_cargo_packages
911
import json
1012
from pathlib import Path
1113

12-
from minecode_pipelines.pipes.cargo import store_cargo_packages
13-
from minecode_pipelines.utils import get_changed_files
14-
1514

16-
def process_cargo_packages(cargo_repo, fed_repo):
15+
def process_cargo_packages(cargo_repo, fed_repo, logger):
    """
    Mine the Cargo index files changed since the last checkpoint and store
    their packages in the FederatedCode repository.

    - cargo_repo is the local clone of the Cargo (crates.io) index.
    - fed_repo is the local clone of the FederatedCode data repository; it
      also holds the "cargo" mining checkpoint.
    - logger is a callable that accepts a single message string.

    Once all files are processed, the checkpoint is advanced to the index
    commit that was mined, so the next run only sees newer changes.
    """
    base_path = Path(cargo_repo.working_tree_dir)

    # The commit being mined now is what the checkpoint must advance to.
    # Writing the previous checkpoint back would re-mine the same range on
    # every run.
    current_commit = cargo_repo.head.commit.hexsha

    # With no checkpoint yet (None), get_changed_files diffs from the empty
    # tree, which reports every file in the index.
    setting_last_commit = get_last_commit(fed_repo, "cargo")
    valid_files = get_changed_files(cargo_repo, setting_last_commit)
    logger(f"Found {len(valid_files)} changed files in Cargo index.")

    # Repository housekeeping files that do not describe a crate.
    skipped_names = {"config.json", "README.md", "update-dl-url.yml"}

    # Visit only the changed paths instead of walking the whole working tree;
    # sorting keeps the processing order deterministic.
    targets_files = []
    for rel_path in sorted(set(valid_files)):
        file_path = base_path / rel_path
        # The diff also reports deleted paths, which no longer exist on disk.
        if not file_path.is_file():
            continue
        if file_path.name in skipped_names:
            continue
        targets_files.append(file_path)

    total = len(targets_files)
    logger(f"Collected {total} target package files to process.")

    for idx, file_path in enumerate(targets_files, start=1):
        # Each index file holds one JSON document per non-blank line.
        with open(file_path, encoding="utf-8") as f:
            packages = [json.loads(line) for line in f if line.strip()]

        if not packages:
            continue

        push_commit = idx == total  # only True on last
        store_cargo_packages(packages, fed_repo, push_commit)
        logger(f"Processed {len(packages)} packages from {file_path} ({idx}/{total}).")

    update_last_commit(current_commit, fed_repo, "cargo")
    logger("Updated last commit checkpoint for Cargo.")

minecode_pipelines/pipelines/mine_cargo.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,19 @@
1919
#
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
22+
import os
2223

2324
from git.repo.base import Repo
2425
from scanpipe.pipes.federatedcode import delete_local_clone
25-
from minecode.utils import get_temp_file
26+
from minecode_pipelines.utils import get_temp_file
2627
from scanpipe.pipelines import Pipeline
2728
from scanpipe.pipes import federatedcode
2829
from minecode_pipelines.miners import cargo
2930

31+
FEDERATEDCODE_CARGO_GIT_URL = os.environ.get(
32+
"FEDERATEDCODE_CARGO_GIT_URL", "https://github.com/ziadhany/cargo-test"
33+
)
34+
3035

3136
class MineandPublishCargoPURLs(Pipeline):
3237
"""Pipeline to mine Cargo (crates.io) packages and publish them to FederatedCode."""
@@ -49,16 +54,15 @@ def check_federatedcode_eligibility(self):
4954

5055
def clone_cargo_repo(self):
    """
    Clone the FederatedCode data repository and the crates.io index.

    The clones are kept on ``self.fed_repo`` and ``self.cargo_repo`` for the
    following pipeline steps; nothing is returned.
    """
    self.fed_repo = federatedcode.clone_repository(FEDERATEDCODE_CARGO_GIT_URL)

    crates_index_url = "https://github.com/rust-lang/crates.io-index"
    clone_location = get_temp_file()
    self.cargo_repo = Repo.clone_from(crates_index_url, clone_location)

6064
def collect_packages_from_cargo(self):
    """
    Run the Cargo miner on the cloned repositories, passing ``self.log`` as
    its logger.
    """
    cargo.process_cargo_packages(
        self.cargo_repo,
        self.fed_repo,
        self.log,
    )
6266

6367
def clean_cargo_repo(self):
6468
"""

minecode_pipelines/pipes/__init__.py

Lines changed: 82 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,18 @@
88
#
99

1010
from aboutcode.hashid import PURLS_FILENAME
11-
import os
1211
import textwrap
13-
from pathlib import Path
1412
import saneyaml
1513
from aboutcode import hashid
14+
import os
15+
from datetime import datetime
16+
from pathlib import Path
17+
18+
from git import Repo
19+
import json
20+
21+
from minecode_pipelines.miners import write_data_to_file
22+
1623

1724
VERSION = os.environ.get("VERSION", "")
1825
PURLDB_ALLOWED_HOST = os.environ.get("FEDERATEDCODE_GIT_ALLOWED_HOST", "")
@@ -70,3 +77,76 @@ def commit_and_push_changes(repo):
7077
default_branch = repo.active_branch.name
7178
repo.index.commit(textwrap.dedent(commit_message))
7279
repo.git.push(remote_name, default_branch, "--no-verify")
80+
81+
82+
def get_changed_files(repo: Repo, commit_x: str = None, commit_y: str = None):
    """
    Return a sorted list of file paths changed between two commits using
    GitPython.

    Both sides of every diff entry are collected, so a renamed file
    contributes its old and its new path; added, modified and deleted files
    contribute their single path.

    - commit_x is the older revision. When None, the diff starts from git's
      empty tree, so every file present at commit_y is reported.
    - commit_y is the newer revision. When None, it is the current HEAD.
    """
    # Id of git's empty tree object (SHA-1 object format).
    empty_tree = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"

    if commit_y is None:
        commit_y = repo.head.commit.hexsha
    commit_y_obj = repo.commit(commit_y)

    if commit_x is None:
        # The diff is taken from commit_y towards the empty tree; R=True swaps
        # the sides so entries read as "empty tree -> commit_y".
        diff_index = commit_y_obj.diff(empty_tree, R=True)
    else:
        diff_index = repo.commit(commit_x).diff(commit_y_obj)

    changed_files = set()
    for item in diff_index:
        # Keep both paths: for a rename a_path is the old name and b_path the
        # new one, and either may be unset depending on the change type.
        changed_files.update(path for path in (item.a_path, item.b_path) if path)

    return sorted(changed_files)
105+
106+
107+
def get_last_commit(repo, ecosystem):
    """
    Retrieve the last mined commit for a given ecosystem.

    This function reads the JSON checkpoint file
    ``minecode_checkpoints/<ecosystem>.json`` from the working tree of
    ``repo``. The checkpoint stores mining progress: its "last_commit" value
    is the commit of the package index (e.g., the Cargo or PyPI index) that
    was previously mined.

    Return the "last_commit" value, or None when the checkpoint file does not
    exist or has no "last_commit" key. A checkpoint file that is not valid
    JSON raises ``json.JSONDecodeError``.

    https://github.com/AyanSinhaMahapatra/minecode-test/blob/main/minecode_checkpoints/pypi.json
    https://github.com/ziadhany/cargo-test/blob/main/minecode_checkpoints/cargo.json
    """
    last_commit_file_path = (
        Path(repo.working_tree_dir) / "minecode_checkpoints" / f"{ecosystem}.json"
    )
    try:
        # JSON is UTF-8; do not depend on the platform's default encoding.
        with open(last_commit_file_path, encoding="utf-8") as f:
            settings_data = json.load(f)
    except FileNotFoundError:
        # No checkpoint yet: nothing has been mined for this ecosystem.
        return None
    return settings_data.get("last_commit")
129+
130+
def update_last_commit(last_commit, repo, ecosystem):
    """
    Record ``last_commit`` as the mining checkpoint for ``ecosystem``, then
    commit the checkpoint file and push it.

    The checkpoint is written to ``minecode_checkpoints/<ecosystem>.json`` in
    the working tree of ``repo``, together with the current date.
    """
    checkpoint_path = Path(repo.working_tree_dir) / "minecode_checkpoints" / f"{ecosystem}.json"
    checkpoint = {"date": str(datetime.now()), "last_commit": last_commit}

    write_data_to_file(path=checkpoint_path, data=checkpoint)
    repo.index.add([checkpoint_path])

    message = textwrap.dedent(
        f"""\
        Update last mined commit for {ecosystem}

        Tool: pkg:github/aboutcode-org/purldb@v{VERSION}
        Reference: https://{PURLDB_ALLOWED_HOST}/
        Signed-off-by: {author_name} <{author_email}>
        """
    )

    # Push to whichever branch is currently checked out in the clone.
    branch = repo.active_branch.name
    repo.index.commit(message)
    repo.git.push(remote_name, branch, "--no-verify")

minecode_pipelines/utils.py

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
import tempfile
1111
import os
1212
from commoncode.fileutils import create_dir
13-
from git import Repo
1413

1514

1615
def system_temp_dir(temp_dir=os.getenv("MINECODE_TMP")):
@@ -49,26 +48,3 @@ def get_temp_file(file_name="data", extension=".file", dir_name=""):
4948
temp_dir = get_temp_dir(dir_name)
5049
location = os.path.join(temp_dir, file_name)
5150
return location
52-
53-
54-
def get_changed_files(
55-
repo: Repo, commit_x: str = "4b825dc642cb6eb9a060e54bf8d69288fbee4904", commit_y: str = None
56-
):
57-
"""
58-
Return a list of files changed between two commits using GitPython.
59-
Includes added, modified, deleted, and renamed files.
60-
61-
- commit_x is the empty tree hash (repo root).
62-
- commit_y is the latest commit (HEAD).
63-
"""
64-
65-
if commit_y is None:
66-
commit_y = repo.head.commit.hexsha
67-
68-
commit_x_obj = repo.commit(commit_x)
69-
commit_y_obj = repo.commit(commit_y)
70-
71-
diff_index = commit_x_obj.diff(commit_y_obj)
72-
changed_files = {item.a_path or item.b_path for item in diff_index}
73-
74-
return list(changed_files)

0 commit comments

Comments
 (0)