Skip to content

Commit d9c8b8d

Browse files
committed
Add methods to fetch maven index properties and increments #660
Signed-off-by: Jono Yang <[email protected]>
1 parent b0a3d6e commit d9c8b8d

File tree

1 file changed

+45
-15
lines changed

1 file changed

+45
-15
lines changed

minecode_pipeline/pipes/maven.py

Lines changed: 45 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,15 @@
77
# See https://aboutcode.org for more information about nexB OSS projects.
88
#
99

10-
import saneyaml
10+
from itertools import chain
1111
import os
1212
import gzip
1313
import io
1414
import logging
1515
import arrow
16-
from aboutcode import hashid
16+
import javaproperties
1717
from packageurl import PackageURL
18-
from urllib.parse import urlparse
18+
1919
from dateutil import tz
2020
from minecode_pipeline.pipes import java_stream
2121
from collections import namedtuple
@@ -44,6 +44,8 @@
4444

4545
# Base URL of the Maven repository that the index URLs below live under.
MAVEN_BASE_URL = "https://repo1.maven.org/maven2"
# Location of the gzipped Nexus maven repository index.
MAVEN_INDEX_URL = "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz"
# Template for the URL of a single index increment. This must stay a plain
# string (not an f-string): the ``{index}`` placeholder is filled in later with
# ``MAVEN_INDEX_INCREMENT_BASE_URL.format(index=...)``. An f-string would try to
# evaluate ``index`` at import time and raise NameError.
MAVEN_INDEX_INCREMENT_BASE_URL = "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.{index}.gz"
# Location of the properties file that accompanies the index.
MAVEN_INDEX_PROPERTIES_URL = "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.properties"
4749

4850

4951
def is_worthy_artifact(artifact):
@@ -575,24 +577,36 @@ class MavenNexusCollector:
575577
WARNING: Processing is rather long: a full index is ~600MB.
576578
"""
577579

578-
def fetch_index(self, uri=MAVEN_INDEX_URL, timeout=10):
580+
def fetch_index(self, uri=MAVEN_INDEX_URL):
    """
    Fetch the maven index at `uri` and return the temporary location
    where it was saved (the path, not the content itself).
    """
    return fetch_http(uri).path
587586

588-
def get_packages(self, content=None):
589-
"""Yield Package objects from maven index"""
590-
if content:
591-
index_location = content
592-
else:
593-
index_location = self.fetch_index()
587+
def fetch_index_properties(self, uri=MAVEN_INDEX_PROPERTIES_URL):
    """
    Fetch the maven index properties file at `uri` and return the temporary
    location where it was saved.
    """
    return fetch_http(uri).path
594593

595-
artifacts = get_artifacts(index_location, worthyness=is_worthy_artifact)
594+
def fetch_index_increments(self):
    """
    Yield the temporary location of each fetched maven index increment.

    The increments to fetch are read from the maven index properties file:
    every property whose key starts with "nexus.index.incremental" supplies
    the value substituted into MAVEN_INDEX_INCREMENT_BASE_URL.
    """
    properties_location = self.fetch_index_properties()
    with open(properties_location) as properties_file:
        index_properties = javaproperties.load(properties_file) or {}

    for name, value in index_properties.items():
        # Only the incremental entries name a downloadable index chunk.
        if not name.startswith("nexus.index.incremental"):
            continue
        increment_url = MAVEN_INDEX_INCREMENT_BASE_URL.format(index=value)
        yield fetch_http(increment_url).path
607+
608+
def _get_packages(self, content=None):
609+
artifacts = get_artifacts(content, worthyness=is_worthy_artifact)
596610

597611
for artifact in artifacts:
598612
# we cannot do much without these
@@ -658,6 +672,22 @@ def get_packages(self, content=None):
658672
)
659673
yield current_purl, package
660674

675+
def _get_packages_from_index_increments(self):
    """
    Yield the (purl, package) tuples produced by `_get_packages` for every
    maven index increment, in the order the increments are fetched.

    Yields nothing when there are no increments.
    """
    for index_increment in self.fetch_index_increments():
        # `yield from` (rather than `return`) so that every increment is
        # processed, not just the first one.
        yield from self._get_packages(content=index_increment)
678+
679+
def get_packages(self, content=None, increments=False):
    """
    Return an iterator of (purl, package) tuples from a maven index.

    If `increments` is true, fetch every maven index increment and return
    the packages of all of them, chained in fetch order; `content` is
    ignored in that case.

    Otherwise use the index at the `content` location when provided, or
    fetch the full maven index when it is not.
    """
    if increments:
        # Lazily walk all increments: each one is fetched and parsed only
        # when the previous one has been exhausted.
        return chain.from_iterable(
            self._get_packages(content=index_increment)
            for index_increment in self.fetch_index_increments()
        )

    index_location = content or self.fetch_index()
    return self._get_packages(content=index_location)
690+
661691

662692
def collect_packages_from_maven(commits_per_push=10, logger=None):
663693
# download and iterate through maven nexus index

0 commit comments

Comments
 (0)