Skip to content

Commit eb4a4fb

Browse files
committed
Use new common functions for writing and committing purls #660
* Create function to get checkpoint from checked-out settings repo
Signed-off-by: Jono Yang <[email protected]>
1 parent e5ed41a commit eb4a4fb

File tree

3 files changed

+82
-34
lines changed

3 files changed

+82
-34
lines changed

minecode_pipelines/pipes/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import requests
1515
import saneyaml
1616

17+
from aboutcode.hashid import PURLS_FILENAME
1718
from scanpipe.pipes.federatedcode import delete_local_clone
1819
from scanpipe.pipes.federatedcode import commit_and_push_changes
1920

@@ -39,6 +40,13 @@ def fetch_checkpoint_from_github(config_repo, checkpoint_path):
3940
return checkpoint_data
4041

4142

43+
def get_checkpoint_from_file(cloned_repo, path):
    """
    Return the checkpoint data stored as JSON at ``path`` inside the working
    directory of ``cloned_repo``.

    ``cloned_repo`` is any object exposing a ``working_dir`` attribute and
    ``path`` is joined onto that directory. An empty mapping is returned when
    the file holds a falsy JSON value (such as ``null`` or ``{}``).

    Raise FileNotFoundError if the checkpoint file does not exist and
    json.JSONDecodeError if its content is not valid JSON.
    """
    checkpoint_path = os.path.join(cloned_repo.working_dir, path)
    # JSON is UTF-8 by specification: do not depend on the locale default
    # encoding, which differs across platforms.
    with open(checkpoint_path, encoding="utf-8") as f:
        checkpoint_data = json.load(f)
    return checkpoint_data or {}
48+
49+
4250
def update_checkpoints_in_github(checkpoint, cloned_repo, path):
4351
checkpoint_path = os.path.join(cloned_repo.working_dir, path)
4452
write_data_to_json_file(path=checkpoint_path, data=checkpoint)

minecode_pipelines/pipes/maven.py

Lines changed: 73 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import arrow
2020
import javaproperties
2121

22+
from aboutcode import hashid
2223
from packagedcode.maven import build_filename
2324
from packagedcode.maven import build_url
2425
from packagedcode.maven import get_urls
@@ -27,8 +28,8 @@
2728
from scanpipe.pipes.fetch import fetch_http
2829
from scanpipe.pipes import federatedcode
2930

30-
from minecode_pipelines import miners
3131
from minecode_pipelines import pipes
32+
from minecode_pipelines import VERSION
3233
from minecode_pipelines.pipes import java_stream
3334

3435

@@ -49,7 +50,10 @@
4950
MAVEN_INDEX_URL = "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz"
5051
MAVEN_INDEX_INCREMENT_BASE_URL = "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.{index}.gz"
5152
MAVEN_INDEX_PROPERTIES_URL = "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.properties"
52-
MAVEN_SETTINGS_PATH = "minecode_checkpoints/maven.json"
53+
MAVEN_CHECKPOINT_PATH = "maven/checkpoints.json"
54+
55+
# We are testing and storing mined packageURLs in one single repo per ecosystem for now
56+
MINECODE_DATA_MAVEN_REPO = "https://github.com/aboutcode-data/minecode-data-maven-test"
5357

5458

5559
def is_worthy_artifact(artifact):
@@ -589,26 +593,24 @@ def __init__(self, index_properties_location=None):
589593
with open(content) as config_file:
590594
self.index_properties = javaproperties.load(config_file) or {}
591595

592-
def fetch_index(self, uri=MAVEN_INDEX_URL):
596+
def _fetch_index(self, uri=MAVEN_INDEX_URL):
593597
"""
594598
Return a temporary location where the maven index was saved.
595599
"""
596600
index = fetch_http(uri)
597601
return index.path
598602

599-
def fetch_index_properties(self, uri=MAVEN_INDEX_PROPERTIES_URL):
603+
def _fetch_index_properties(self, uri=MAVEN_INDEX_PROPERTIES_URL):
600604
"""
601605
Return a temporary location where the maven index properties file was saved.
602606
"""
603607
index_properties = fetch_http(uri)
604608
return index_properties.path
605609

606-
def fetch_index_increments(self):
610+
def _fetch_index_increments(self, last_incremental):
607611
"""
608612
Yield maven index increments
609613
"""
610-
# in this context, last serial means last incremental
611-
last_incremental = pipes.fetch_last_serial_mined(settings_path=MAVEN_SETTINGS_PATH)
612614
for key, increment_index in self.index_properties.items():
613615
if increment_index <= last_incremental:
614616
continue
@@ -684,15 +686,14 @@ def _get_packages(self, content=None):
684686
)
685687
yield current_purl, package
686688

687-
def _get_packages_from_index_increments(self):
688-
for index_increment in self.fetch_index_increments():
689+
def _get_packages_from_index_increments(self, last_incremental):
690+
for index_increment in self._fetch_index_increments(last_incremental=last_incremental):
689691
return self._get_packages(content=index_increment)
690692

691-
def get_packages(self, content=None):
693+
def get_packages(self, content=None, last_incremental=None):
692694
"""Yield Package objects from maven index"""
693-
last_incremental = pipes.fetch_last_serial_mined(settings_path=MAVEN_SETTINGS_PATH)
694695
if last_incremental:
695-
packages = chain(self._get_packages_from_index_increments())
696+
packages = chain(self._get_packages_from_index_increments(last_incremental=last_incremental))
696697
else:
697698
if content:
698699
index_location = content
@@ -703,37 +704,76 @@ def get_packages(self, content=None):
703704

704705

705706
def collect_packages_from_maven(commits_per_push=10, logger=None):
707+
# Clone data and config repo
708+
data_repo = federatedcode.clone_repository(
709+
repo_url=MINECODE_DATA_MAVEN_REPO,
710+
logger=logger,
711+
)
712+
config_repo = federatedcode.clone_repository(
713+
repo_url=pipes.MINECODE_PIPELINES_CONFIG_REPO,
714+
logger=logger,
715+
)
716+
if logger:
717+
logger(f"{MINECODE_DATA_MAVEN_REPO} repo cloned at: {data_repo.working_dir}")
718+
logger(f"{pipes.MINECODE_PIPELINES_CONFIG_REPO} repo cloned at: {config_repo.working_dir}")
719+
720+
# get last_incremental to see if we can start from incrementals
721+
checkpoint = pipes.get_checkpoint_from_file(
722+
cloned_repo=config_repo,
723+
path=MAVEN_CHECKPOINT_PATH
724+
)
725+
last_incremental = checkpoint.get("last_incremental")
726+
if logger:
727+
logger(f"last_incremental: {last_incremental}")
728+
706729
# download and iterate through maven nexus index
707730
maven_nexus_collector = MavenNexusCollector()
708731
prev_purl = None
709-
current_packages = []
710-
for i, (current_purl, package) in enumerate(maven_nexus_collector.get_packages(), start=1):
732+
current_purls = []
733+
for i, (current_purl, package) in enumerate(
734+
maven_nexus_collector.get_packages(last_incremental=last_incremental),
735+
start=1
736+
):
711737
if not prev_purl:
712738
prev_purl = current_purl
713739
elif prev_purl != current_purl:
714-
repo_url, _ = federatedcode.get_package_repository(
715-
project_purl=prev_purl,
716-
logger=logger
717-
)
718-
repo = federatedcode.clone_repository(
719-
repo_url=repo_url,
720-
logger=logger,
740+
# write packageURLs to file
741+
package_base_dir = hashid.get_package_base_dir(purl=prev_purl)
742+
purl_file = pipes.write_packageurls_to_file(
743+
repo=data_repo,
744+
base_dir=package_base_dir,
745+
packageurls=current_purls,
721746
)
722747

723-
push_commit = not bool(i % commits_per_push)
724-
# save purls to yaml
725-
miners.write_purls_to_repo(
726-
repo=repo,
727-
package=prev_purl,
728-
packages=current_packages,
729-
push_commit=push_commit
748+
# commit changes
749+
pipes.commit_changes(
750+
repo=data_repo,
751+
files_to_commit=[purl_file],
752+
purls=current_purls,
753+
mine_type="packageURL",
754+
tool_name="pkg:pypi/minecode-pipelines",
755+
tool_version=VERSION,
730756
)
731757

732-
federatedcode.delete_local_clone(repo)
758+
# Push changes to remote repository
759+
push_commit = not bool(i % commits_per_push)
760+
if push_commit:
761+
federatedcode.push_changes(repo=data_repo)
733762

734-
current_packages = []
763+
current_purls = []
735764
prev_purl = current_purl
736-
current_packages.append(package)
765+
current_purls.append(package.to_string())
766+
767+
# update last_incremental so we can pick up from the proper place next time
768+
last_incremental = maven_nexus_collector.index_properties.get("nexus.index.last-incremental")
769+
checkpoint = {"last_incremental": last_incremental}
770+
if logger:
771+
logger(f"checkpoint: {checkpoint}")
772+
pipes.update_checkpoints_in_github(
773+
checkpoint=checkpoint,
774+
cloned_repo=config_repo,
775+
path=MAVEN_CHECKPOINT_PATH
776+
)
737777

738-
last_incremental = maven_nexus_collector.index_properties.get("nexus.index.last-incremental")
739-
pipes.update_last_serial_mined(last_serial=last_incremental, settings_path=MAVEN_SETTINGS_PATH)
778+
# clean up cloned repos
779+
pipes.delete_cloned_repos(repos=[data_repo, config_repo], logger=logger)

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ install_requires =
6262
matchcode-toolkit >= 7.2.2
6363
purl2vcs >= 2.0.0
6464
univers >= 31.0.0
65-
scancodeio @ git+https://github.com/aboutcode-org/scancode.io@f3a581171e201e5fdb20962115a661b4eb4a7850
65+
scancodeio @ git+https://github.com/aboutcode-org/scancode.io@c69e8a0492b4e3abfde2f66dcd3434c883cf2ba2
6666
GitPython >= 3.1.44
6767
samecode >= 0.5.1
6868
# FederatedCode integration

0 commit comments

Comments
 (0)