Skip to content

Commit 810956f

Browse files
committed
Use last incremental from index properties #660
Signed-off-by: Jono Yang <[email protected]>
1 parent 74bd21c commit 810956f

File tree

3 files changed

+86
-76
lines changed

3 files changed

+86
-76
lines changed

minecode_pipelines/pipes/__init__.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,20 @@
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
99

10+
import json
1011
import os
12+
import requests
1113
import saneyaml
1214

15+
from datetime import datetime
1316
from pathlib import Path
1417

1518
from aboutcode.hashid import PURLS_FILENAME
19+
from scanpipe.pipes.federatedcode import clone_repository
20+
from scanpipe.pipes.federatedcode import commit_and_push_changes
21+
22+
23+
# Default GitHub repository URL used by fetch_last_serial_mined() and
# update_last_serial_mined() to read and store mining checkpoints.
MINECODE_SETTINGS_REPO = "https://github.com/AyanSinhaMahapatra/minecode-test/"
1624

1725

1826
def write_packageurls_to_file(repo, base_dir, packageurls):
@@ -26,3 +34,41 @@ def write_data_to_file(path, data):
2634
path.parent.mkdir(parents=True, exist_ok=True)
2735
with open(path, encoding="utf-8", mode="w") as f:
2836
f.write(saneyaml.dump(data))
37+
38+
39+
def fetch_last_serial_mined(
    settings_repo=MINECODE_SETTINGS_REPO,
    settings_path=None,
):
    """
    Return the "last_serial" for the last mined packages, or None if the
    checkpoint file could not be fetched.

    The checkpoint is a simple JSON file in a github repo containing mining
    checkpoints with the "last_serial" from the index which was mined. Example:
    https://github.com/AyanSinhaMahapatra/minecode-test/blob/main/minecode_checkpoints/pypi.json

    ``settings_repo`` is the URL of the GitHub repository holding the
    checkpoints and ``settings_path`` is the path of the checkpoint file
    relative to the root of that repository.

    Raise a ValueError if ``settings_path`` is not provided.
    """
    if not settings_path:
        # Without this guard the URL concatenation fails with an opaque TypeError.
        raise ValueError("settings_path is required to fetch the last serial mined")

    # Strip the slashes around the "owner/repo" part so the raw URL is
    # well-formed whether or not settings_repo ends with a trailing slash.
    repo_name = settings_repo.split("github.com")[-1].strip("/")
    checkpoint_path = settings_path.lstrip("/")
    minecode_checkpoint_url = (
        f"https://raw.githubusercontent.com/{repo_name}/refs/heads/main/{checkpoint_path}"
    )

    # requests has no default timeout: always set one to avoid hanging forever.
    response = requests.get(minecode_checkpoint_url, timeout=30)
    if not response.ok:
        return

    settings_data = json.loads(response.text)
    return settings_data.get("last_serial")
60+
61+
62+
def update_last_serial_mined(
    last_serial,
    settings_repo=MINECODE_SETTINGS_REPO,
    settings_path=None,
):
    """
    Save ``last_serial`` and the current date as a JSON mining checkpoint at
    ``settings_path`` in a fresh clone of ``settings_repo``, then commit and
    push this file.

    ``settings_path`` is the path of the checkpoint file relative to the root
    of the repository.

    Raise a ValueError if ``settings_path`` is not provided.
    """
    if not settings_path:
        # Fail before cloning: os.path.join() below would otherwise raise an
        # opaque TypeError only after the repository has been cloned.
        raise ValueError("settings_path is required to update the last serial mined")

    settings_data = {
        "date": str(datetime.now()),
        "last_serial": last_serial,
    }
    cloned_repo = clone_repository(repo_url=settings_repo)
    checkpoint_path = os.path.join(cloned_repo.working_dir, settings_path)

    # The checkpoint is read back with json.loads() in fetch_last_serial_mined():
    # write it as JSON here (and not as YAML) so that it round-trips.
    Path(checkpoint_path).parent.mkdir(parents=True, exist_ok=True)
    with open(checkpoint_path, encoding="utf-8", mode="w") as checkpoint_file:
        json.dump(settings_data, checkpoint_file, indent=2)

    commit_and_push_changes(repo=cloned_repo, file_to_commit=checkpoint_path)

minecode_pipelines/pipes/maven.py

Lines changed: 34 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -7,26 +7,29 @@
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
99

10+
from collections import namedtuple
1011
from itertools import chain
1112
import os
1213
import gzip
1314
import io
1415
import logging
15-
import arrow
16-
import javaproperties
17-
from packageurl import PackageURL
1816

1917
from dateutil import tz
20-
from minecode_pipelines.pipes import java_stream
21-
from collections import namedtuple
22-
from scanpipe.pipes.fetch import fetch_http
23-
from scanpipe.pipes import federatedcode
24-
from minecode_pipeline.pipes import write_purls_to_repo
2518
from jawa.util.utf import decode_modified_utf8
26-
from packagedcode.maven import get_urls
19+
import arrow
20+
import javaproperties
21+
2722
from packagedcode.maven import build_filename
2823
from packagedcode.maven import build_url
24+
from packagedcode.maven import get_urls
2925
from packagedcode.models import PackageData
26+
from packageurl import PackageURL
27+
from scanpipe.pipes.fetch import fetch_http
28+
from scanpipe.pipes import federatedcode
29+
30+
from minecode_pipelines import miners
31+
from minecode_pipelines import pipes
32+
from minecode_pipelines.pipes import java_stream
3033

3134

3235
logger = logging.getLogger(__name__)
@@ -44,8 +47,9 @@
4447

4548
MAVEN_BASE_URL = "https://repo1.maven.org/maven2"
4649
MAVEN_INDEX_URL = "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz"
47-
MAVEN_INDEX_INCREMENT_BASE_URL = f"https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.{index}.gz"
50+
MAVEN_INDEX_INCREMENT_BASE_URL = "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.{index}.gz"
4851
MAVEN_INDEX_PROPERTIES_URL = "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.properties"
52+
MAVEN_SETTINGS_PATH = "minecode_checkpoints/maven.json"
4953

5054

5155
def is_worthy_artifact(artifact):
@@ -577,6 +581,14 @@ class MavenNexusCollector:
577581
WARNING: Processing is rather long: a full index is ~600MB.
578582
"""
579583

584+
def __init__(self, index_properties_location=None):
    """
    Load the maven index properties into ``self.index_properties``.

    Read the properties from the file at ``index_properties_location`` when
    provided, otherwise from the location returned by
    ``self.fetch_index_properties()``.
    """
    properties_location = index_properties_location or self.fetch_index_properties()
    with open(properties_location) as properties_file:
        loaded_properties = javaproperties.load(properties_file)
    # Fall back to an empty mapping when nothing was loaded.
    self.index_properties = loaded_properties or {}
591+
580592
def fetch_index(self, uri=MAVEN_INDEX_URL):
581593
"""
582594
Return a temporary location where the maven index was saved.
@@ -595,11 +607,11 @@ def fetch_index_increments(self):
595607
"""
596608
Yield maven index increments
597609
"""
598-
content = self.fetch_index_properties()
599-
with open(content) as config_file:
600-
properties = javaproperties.load(config_file) or {}
601-
602-
for key, increment_index in properties.items():
610+
# in this context, last serial means last incremental
611+
last_incremental = pipes.fetch_last_serial_mined(settings_path=MAVEN_SETTINGS_PATH)
612+
for key, increment_index in self.index_properties.items():
613+
if increment_index <= last_incremental:
614+
continue
603615
if key.startswith("nexus.index.incremental"):
604616
index_increment_url = MAVEN_INDEX_INCREMENT_BASE_URL.format(index=increment_index)
605617
index_increment = fetch_http(index_increment_url)
@@ -676,9 +688,10 @@ def _get_packages_from_index_increments(self):
676688
for index_increment in self.fetch_index_increments():
677689
return self._get_packages(content=index_increment)
678690

679-
def get_packages(self, content=None, increments=False):
691+
def get_packages(self, content=None):
680692
"""Yield Package objects from maven index"""
681-
if increments:
693+
last_incremental = pipes.fetch_last_serial_mined(settings_path=MAVEN_SETTINGS_PATH)
694+
if last_incremental:
682695
packages = chain(self._get_packages_from_index_increments())
683696
else:
684697
if content:
@@ -698,7 +711,6 @@ def collect_packages_from_maven(commits_per_push=10, logger=None):
698711
if not prev_purl:
699712
prev_purl = current_purl
700713
elif prev_purl != current_purl:
701-
# check out repo
702714
repo_url, _ = federatedcode.get_package_repository(
703715
project_purl=prev_purl,
704716
logger=logger
@@ -710,16 +722,18 @@ def collect_packages_from_maven(commits_per_push=10, logger=None):
710722

711723
push_commit = not bool(i % commits_per_push)
712724
# save purls to yaml
713-
write_purls_to_repo(
725+
miners.write_purls_to_repo(
714726
repo=repo,
715727
package=prev_purl,
716728
packages=current_packages,
717729
push_commit=push_commit
718730
)
719731

720-
# delete local clone
721732
federatedcode.delete_local_clone(repo)
722733

723734
current_packages = []
724735
prev_purl = current_purl
725736
current_packages.append(package)
737+
738+
last_incremental = maven_nexus_collector.index_properties.get("nexus.index.last-incremental")
739+
pipes.update_last_serial_mined(last_serial=last_incremental, settings_path=MAVEN_SETTINGS_PATH)

minecode_pipelines/pipes/pypi.py

Lines changed: 6 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -20,80 +20,30 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
2222

23-
import os
24-
import json
25-
import requests
26-
27-
from datetime import datetime
28-
2923
from minecode_pipelines import pipes
3024
from minecode_pipelines.miners.pypi import get_pypi_packages
31-
from minecode_pipelines.miners.pypi import load_pypi_packages
3225
from minecode_pipelines.miners.pypi import get_pypi_packageurls
26+
from minecode_pipelines.miners.pypi import load_pypi_packages
3327
from minecode_pipelines.miners.pypi import PYPI_REPO
34-
3528
from minecode_pipelines.miners.pypi import PYPI_TYPE
3629

37-
from packageurl import PackageURL
38-
3930
from aboutcode.hashid import get_package_base_dir
40-
41-
31+
from packageurl import PackageURL
4232
from scanpipe.pipes.federatedcode import clone_repository
4333
from scanpipe.pipes.federatedcode import commit_changes
4434
from scanpipe.pipes.federatedcode import push_changes
45-
from scanpipe.pipes.federatedcode import commit_and_push_changes
4635

4736

48-
MINECODE_SETTINGS_REPO = "https://github.com/AyanSinhaMahapatra/minecode-test/"
4937
PYPI_SETTINGS_PATH = "minecode_checkpoints/pypi.json"
5038

5139

5240
def mine_pypi_packages(logger=None):
    """
    Return the result of ``get_pypi_packages`` for the ``PYPI_REPO`` index,
    passing ``logger`` through.
    """
    pypi_packages = get_pypi_packages(pypi_repo=PYPI_REPO, logger=logger)
    return pypi_packages
5442

5543

56-
def fetch_last_serial_mined(
57-
settings_repo=MINECODE_SETTINGS_REPO,
58-
settings_path=PYPI_SETTINGS_PATH,
59-
):
60-
"""
61-
Fetch "last_serial" for the last mined packages.
62-
63-
This is a simple JSON in a github repo containing mining checkpoints
64-
with the "last_serial" from the pypi index which was mined. Example:
65-
https://github.com/AyanSinhaMahapatra/minecode-test/blob/main/minecode_checkpoints/pypi.json
66-
"""
67-
repo_name = settings_repo.split("github.com")[-1]
68-
minecode_checkpoint_pypi = (
69-
"https://raw.githubusercontent.com/" + repo_name + "refs/heads/main/" + settings_path
70-
)
71-
response = requests.get(minecode_checkpoint_pypi)
72-
if not response.ok:
73-
return
74-
75-
settings_data = json.loads(response.text)
76-
return settings_data.get("last_serial")
77-
78-
79-
def update_last_serial_mined(
80-
last_serial,
81-
settings_repo=MINECODE_SETTINGS_REPO,
82-
settings_path=PYPI_SETTINGS_PATH,
83-
):
84-
settings_data = {
85-
"date": str(datetime.now()),
86-
"last_serial": last_serial,
87-
}
88-
cloned_repo = clone_repository(repo_url=settings_repo)
89-
settings_path = os.path.join(cloned_repo.working_dir, settings_path)
90-
pipes.write_data_to_file(path=settings_path, data=settings_data)
91-
commit_and_push_changes(repo=cloned_repo, file_to_commit=settings_path)
92-
93-
9444
def mine_and_publish_pypi_packageurls(packages, use_last_serial=False, logger=None):
9545
if use_last_serial:
96-
last_serial_fetched = fetch_last_serial_mined()
46+
last_serial_fetched = pipes.fetch_last_serial_mined(settings_path=PYPI_SETTINGS_PATH)
9747
if logger:
9848
logger(f"Last serial number mined: {last_serial_fetched}")
9949

@@ -108,9 +58,9 @@ def mine_and_publish_pypi_packageurls(packages, use_last_serial=False, logger=No
10858

10959
if packages:
11060
# clone repo
111-
cloned_repo = clone_repository(repo_url=MINECODE_SETTINGS_REPO)
61+
cloned_repo = clone_repository(repo_url=pipes.MINECODE_SETTINGS_REPO)
11262
if logger:
113-
logger(f"{MINECODE_SETTINGS_REPO} repo cloned at: {cloned_repo.working_dir}")
63+
logger(f"{pipes.MINECODE_SETTINGS_REPO} repo cloned at: {cloned_repo.working_dir}")
11464

11565
purl_files_updated = []
11666
for package in packages:
@@ -155,4 +105,4 @@ def mine_and_publish_pypi_packageurls(packages, use_last_serial=False, logger=No
155105

156106
# update last_serial to minecode checkpoints
157107
if use_last_serial:
158-
update_last_serial_mined(last_serial=last_serial)
108+
pipes.update_last_serial_mined(last_serial=last_serial)

0 commit comments

Comments
 (0)