Skip to content

Commit fba0889

Browse files
Support mining pypi purls with checkpoints (#708)
Reference: #662 Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
1 parent d61be1a commit fba0889

File tree

7 files changed

+369
-90
lines changed

7 files changed

+369
-90
lines changed

minecode_pipelines/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,3 +6,5 @@
66
# See https://github.com/aboutcode-org/purldb for support or download.
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
9+
10+
VERSION = "0.0.1b2"

minecode_pipelines/miners/pypi.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from packageurl import PackageURL
1515

1616
from minecode_pipelines.utils import get_temp_file
17+
from minecode_pipelines.pipes import write_data_to_json_file
1718

1819
"""
1920
Visitors for Pypi and Pypi-like Python package repositories.
@@ -48,11 +49,12 @@ def get_pypi_packages(pypi_repo, logger=None):
4849
if not response.ok:
4950
return
5051

51-
packages = response.json()
52-
temp_file = get_temp_file("PypiPackagesJSON")
53-
with open(temp_file, "w", encoding="utf-8") as f:
54-
json.dump(packages, f, indent=4)
52+
return response.json()
5553

54+
55+
def write_packages_json(packages, name):
    """
    Write ``packages`` as JSON to a temp file obtained from
    ``get_temp_file(name)`` and return the path of that file.
    """
    packages_path = get_temp_file(name)
    write_data_to_json_file(data=packages, path=packages_path)
    return packages_path
5759

5860

@@ -76,11 +78,19 @@ def get_pypi_packageurls(name):
7678
return packageurls
7779

7880

79-
def load_pypi_packages(packages):
80-
with open(packages) as f:
81+
def load_pypi_packages(packages_file):
    """
    Return a two-tuple of (last_serial, packages) loaded from the JSON file
    at ``packages_file``.

    ``last_serial`` is the "_last-serial" value found in the top-level "meta"
    mapping and ``packages`` is the value of the top-level "projects" key.
    Either one is None when its key is absent from the file.
    """
    # Read explicitly as UTF-8 instead of relying on the platform default
    # encoding, which may not decode non-ASCII package data correctly.
    with open(packages_file, encoding="utf-8") as f:
        packages_data = json.load(f)

    # Tolerate a missing or null "meta" entry rather than failing with an
    # AttributeError on None.
    meta = packages_data.get("meta") or {}
    last_serial = meta.get("_last-serial")
    packages = packages_data.get("projects")

    return last_serial, packages
89+
90+
91+
def get_last_serial_from_packages(packages_file):
    """
    Return the "_last-serial" value stored in the top-level "meta" mapping of
    the JSON file at ``packages_file``, or None when it is absent.
    """
    # Read explicitly as UTF-8 instead of relying on the platform default
    # encoding, which may not decode non-ASCII package data correctly.
    with open(packages_file, encoding="utf-8") as f:
        packages_data = json.load(f)

    # Tolerate a missing or null "meta" entry rather than failing with an
    # AttributeError on None.
    meta = packages_data.get("meta") or {}
    return meta.get("_last-serial")

minecode_pipelines/pipelines/mine_pypi.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from scanpipe.pipes import federatedcode
2525

2626
from minecode_pipelines.pipes import pypi
27+
from minecode_pipelines import pipes
2728

2829

2930
class MineandPublishPypiPURLs(Pipeline):
@@ -38,19 +39,27 @@ def steps(cls):
3839
cls.check_federatedcode_eligibility,
3940
cls.mine_pypi_packages,
4041
cls.mine_and_publish_pypi_packageurls,
42+
cls.delete_cloned_repos,
4143
)
4244

4345
def check_federatedcode_eligibility(self):
    """
    Check that FederatedCode is configured and available before the
    project result is pushed to it.
    """
    log = self.log
    federatedcode.check_federatedcode_configured_and_available(logger=log)
4951

5052
def mine_pypi_packages(self):
    """Mine pypi package names from pypi indexes or from a checkpoint."""
    # The pipe returns a pair; unpack it first, then store both parts on
    # the pipeline for the following steps.
    mined_packages, mining_state = pypi.mine_pypi_packages(logger=self.log)
    self.pypi_packages = mined_packages
    self.state = mining_state
5355

5456
def mine_and_publish_pypi_packageurls(self):
    """
    Get pypi packageURLs for all mined pypi package names and keep the
    value returned by the pipe in ``self.repos``.
    """
    publish_options = {
        "packages_file": self.pypi_packages,
        "state": self.state,
        "logger": self.log,
    }
    self.repos = pypi.mine_and_publish_pypi_packageurls(**publish_options)
63+
64+
def delete_cloned_repos(self):
    """Delete the local clones of the repos stored in ``self.repos``."""
    pipes.delete_cloned_repos(logger=self.log, repos=self.repos)

minecode_pipelines/pipes/__init__.py

Lines changed: 82 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,21 +8,101 @@
88
#
99

1010
import os
11+
import json
12+
import requests
1113
import saneyaml
1214

1315
from pathlib import Path
1416

1517
from aboutcode.hashid import PURLS_FILENAME
1618

19+
from scanpipe.pipes.federatedcode import delete_local_clone
20+
from scanpipe.pipes.federatedcode import commit_and_push_changes
21+
22+
# states:
23+
# note: a state is null when mining starts
24+
INITIAL_SYNC_STATE = "initial-sync"
25+
PERIODIC_SYNC_STATE = "periodic-sync"
26+
27+
28+
MINECODE_PIPELINES_CONFIG_REPO = "https://github.com/aboutcode-data/minecode-pipelines-config/"
29+
30+
31+
def fetch_checkpoint_from_github(config_repo, checkpoint_path):
    """
    Return the checkpoint data parsed from the JSON file at
    ``checkpoint_path`` on the main branch of the GitHub repository at the
    ``config_repo`` URL.

    Return None when the HTTP request does not succeed.
    """
    # Strip surrounding slashes so the raw URL is well-formed whether or not
    # ``config_repo`` ends with a slash, and without a doubled slash after
    # the host name.
    repo_name = config_repo.split("github.com")[-1].strip("/")
    checkpoints_file = (
        f"https://raw.githubusercontent.com/{repo_name}"
        f"/refs/heads/main/{checkpoint_path.lstrip('/')}"
    )
    # Bound the request: without a timeout an unresponsive server would
    # stall the caller indefinitely.
    response = requests.get(checkpoints_file, timeout=30)
    if not response.ok:
        return None

    return json.loads(response.text)
42+
43+
44+
def update_checkpoints_in_github(checkpoint, cloned_repo, path):
    """
    Write ``checkpoint`` as JSON at ``path`` inside the working directory of
    ``cloned_repo`` and pass that file to ``commit_and_push_changes``.
    """
    checkpoint_file = os.path.join(cloned_repo.working_dir, path)
    write_data_to_json_file(data=checkpoint, path=checkpoint_file)
    commit_and_push_changes(
        repo=cloned_repo,
        files_to_commit=[checkpoint_file],
        commit_message="Update federatedcode purl mining checkpoint",
    )
53+
54+
55+
def get_mined_packages_from_checkpoint(config_repo, checkpoint_path):
    """
    Return the "packages_mined" list of the checkpoint fetched from
    ``checkpoint_path`` in ``config_repo``.

    Return an empty list when the checkpoint could not be fetched or has no
    "packages_mined" entry.
    """
    checkpoint = fetch_checkpoint_from_github(
        config_repo=config_repo,
        checkpoint_path=checkpoint_path,
    )
    # The fetch returns None when the request fails (for instance when no
    # checkpoint exists yet); treat that as "nothing mined so far" instead
    # of failing with an AttributeError on None.
    if not checkpoint:
        return []

    return checkpoint.get("packages_mined", [])
61+
62+
63+
def update_mined_packages_in_checkpoint(packages, config_repo, cloned_repo, checkpoint_path):
    """
    Prepend ``packages`` to the "packages_mined" list already recorded in the
    checkpoint at ``checkpoint_path`` and write the combined list back
    through ``update_checkpoints_in_github``.
    """
    previously_mined = get_mined_packages_from_checkpoint(
        config_repo=config_repo,
        checkpoint_path=checkpoint_path,
    )
    # Use a separate name for the checkpoint payload so the ``packages``
    # argument is not rebound.
    checkpoint = {"packages_mined": packages + previously_mined}
    update_checkpoints_in_github(
        checkpoint=checkpoint,
        cloned_repo=cloned_repo,
        path=checkpoint_path,
    )
74+
1775

1876
def write_packageurls_to_file(repo, base_dir, packageurls):
    """
    Write ``packageurls`` as YAML to the ``PURLS_FILENAME`` file under
    ``base_dir`` in the working directory of ``repo``.

    Return the path of that file relative to the working directory.
    """
    relative_path = os.path.join(base_dir, PURLS_FILENAME)
    write_data_to_yaml_file(
        path=Path(repo.working_dir) / relative_path,
        data=packageurls,
    )
    return relative_path
2381

2482

25-
def write_data_to_file(path, data):
83+
def write_data_to_yaml_file(path, data):
    """
    Dump ``data`` with saneyaml to the UTF-8 text file at ``path``, creating
    any missing parent directories first.

    ``path`` is either a string or a ``Path``.
    """
    target = Path(path) if isinstance(path, str) else path
    target.parent.mkdir(parents=True, exist_ok=True)

    with open(target, mode="w", encoding="utf-8") as yaml_file:
        yaml_file.write(saneyaml.dump(data))
90+
91+
92+
def write_data_to_json_file(path, data):
    """
    Serialize ``data`` as indented JSON to the UTF-8 text file at ``path``,
    creating any missing parent directories first.

    ``path`` may be a string, a ``Path`` or any other ``os.PathLike`` object.
    """
    # Path() accepts str, Path and any os.PathLike object, so convert
    # unconditionally instead of special-casing str only.
    path = Path(path)

    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)
99+
100+
101+
def delete_cloned_repos(repos, logger=None):
    """
    Call ``delete_local_clone`` on every repo in ``repos``.

    When a ``logger`` callable is given, it is called with a message naming
    the working directory of each repo before that repo is deleted. Do
    nothing when ``repos`` is empty or None.
    """
    for cloned_repo in repos or []:
        if logger:
            logger(f"Deleting local clone at: {cloned_repo.working_dir}")
        delete_local_clone(cloned_repo)

0 commit comments

Comments
 (0)