1919import arrow
2020import javaproperties
2121
22+ from aboutcode import hashid
2223from packagedcode .maven import build_filename
2324from packagedcode .maven import build_url
2425from packagedcode .maven import get_urls
2728from scanpipe .pipes .fetch import fetch_http
2829from scanpipe .pipes import federatedcode
2930
30- from minecode_pipelines import miners
3131from minecode_pipelines import pipes
32+ from minecode_pipelines import VERSION
3233from minecode_pipelines .pipes import java_stream
3334
3435
# Location of the full nexus index published for Maven Central.
MAVEN_INDEX_URL = "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.gz"

# URL template for a single incremental index file; it carries an ``{index}``
# placeholder that must be substituted before the URL can be fetched.
MAVEN_INDEX_INCREMENT_BASE_URL = "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.{index}.gz"

# Location of the properties file published alongside the nexus index.
MAVEN_INDEX_PROPERTIES_URL = "https://repo1.maven.org/maven2/.index/nexus-maven-repository-index.properties"

# Path of the maven checkpoint file, relative to the cloned config repository.
MAVEN_CHECKPOINT_PATH = "maven/checkpoints.json"

# For now, mined packageURLs are tested and stored in one single repository
# per ecosystem.
MINECODE_DATA_MAVEN_REPO = "https://github.com/aboutcode-data/minecode-data-maven-test"
5357
5458
5559def is_worthy_artifact (artifact ):
@@ -589,26 +593,24 @@ def __init__(self, index_properties_location=None):
589593 with open (content ) as config_file :
590594 self .index_properties = javaproperties .load (config_file ) or {}
591595
def _fetch_index(self, uri=MAVEN_INDEX_URL):
    """
    Download the maven index found at `uri` and return the path of the
    location where the downloaded file was saved.
    """
    return fetch_http(uri).path
598602
def _fetch_index_properties(self, uri=MAVEN_INDEX_PROPERTIES_URL):
    """
    Download the maven index properties file found at `uri` and return the
    path of the location where the downloaded file was saved.
    """
    return fetch_http(uri).path
605609
606- def fetch_index_increments (self ):
610+ def _fetch_index_increments (self , last_incremental ):
607611 """
608612 Yield maven index increments
609613 """
610- # in this context, last serial means last incremental
611- last_incremental = pipes .fetch_last_serial_mined (settings_path = MAVEN_SETTINGS_PATH )
612614 for key , increment_index in self .index_properties .items ():
613615 if increment_index <= last_incremental :
614616 continue
@@ -684,15 +686,14 @@ def _get_packages(self, content=None):
684686 )
685687 yield current_purl , package
686688
687- def _get_packages_from_index_increments (self ):
688- for index_increment in self .fetch_index_increments ( ):
689+ def _get_packages_from_index_increments (self , last_incremental ):
690+ for index_increment in self ._fetch_index_increments ( last_incremental = last_incremental ):
689691 return self ._get_packages (content = index_increment )
690692
691- def get_packages (self , content = None ):
693+ def get_packages (self , content = None , last_incremental = None ):
692694 """Yield Package objects from maven index"""
693- last_incremental = pipes .fetch_last_serial_mined (settings_path = MAVEN_SETTINGS_PATH )
694695 if last_incremental :
695- packages = chain (self ._get_packages_from_index_increments ())
696+ packages = chain (self ._get_packages_from_index_increments (last_incremental = last_incremental ))
696697 else :
697698 if content :
698699 index_location = content
@@ -703,37 +704,76 @@ def get_packages(self, content=None):
703704
704705
def _write_and_commit_purls(data_repo, purl, packageurls):
    """
    Write `packageurls` to the purl file of `purl` inside `data_repo` and
    commit that file.
    """
    package_base_dir = hashid.get_package_base_dir(purl=purl)
    purl_file = pipes.write_packageurls_to_file(
        repo=data_repo,
        base_dir=package_base_dir,
        packageurls=packageurls,
    )
    pipes.commit_changes(
        repo=data_repo,
        files_to_commit=[purl_file],
        purls=packageurls,
        mine_type="packageURL",
        tool_name="pkg:pypi/minecode-pipelines",
        tool_version=VERSION,
    )


def collect_packages_from_maven(commits_per_push=10, logger=None):
    """
    Mine packageURLs from the maven nexus index and store them in the
    MINECODE_DATA_MAVEN_REPO data repository.

    Consecutive packages sharing the same purl are grouped together; each
    group is written to its own purl file and committed. Commits are pushed
    every `commits_per_push` commits, and any remaining commits are pushed
    once the index has been fully processed.

    The ``last_incremental`` checkpoint is read from the config repository
    before mining and updated there after mining completes. `logger`, when
    provided, is called with progress messages.

    The cloned repositories are deleted when the function exits, including
    when an exception is raised.
    """
    cloned_repos = []
    try:
        # Clone data and config repo
        data_repo = federatedcode.clone_repository(
            repo_url=MINECODE_DATA_MAVEN_REPO,
            logger=logger,
        )
        cloned_repos.append(data_repo)
        config_repo = federatedcode.clone_repository(
            repo_url=pipes.MINECODE_PIPELINES_CONFIG_REPO,
            logger=logger,
        )
        cloned_repos.append(config_repo)
        if logger:
            logger(f"{MINECODE_DATA_MAVEN_REPO} repo cloned at: {data_repo.working_dir}")
            logger(f"{pipes.MINECODE_PIPELINES_CONFIG_REPO} repo cloned at: {config_repo.working_dir}")

        # get last_incremental to see if we can start from incrementals
        checkpoint = pipes.get_checkpoint_from_file(
            cloned_repo=config_repo,
            path=MAVEN_CHECKPOINT_PATH,
        )
        last_incremental = checkpoint.get("last_incremental")
        if logger:
            logger(f"last_incremental: {last_incremental}")

        # download and iterate through maven nexus index
        maven_nexus_collector = MavenNexusCollector()
        prev_purl = None
        current_purls = []
        # Number of local commits that have not been pushed yet.
        pending_commits = 0
        packages = maven_nexus_collector.get_packages(last_incremental=last_incremental)
        for current_purl, package in packages:
            if prev_purl and prev_purl != current_purl:
                # The purl changed: the previous group is complete.
                _write_and_commit_purls(data_repo, prev_purl, current_purls)
                pending_commits += 1

                # Push changes to remote repository
                if pending_commits >= commits_per_push:
                    federatedcode.push_changes(repo=data_repo)
                    pending_commits = 0

                current_purls = []
            prev_purl = current_purl
            current_purls.append(package.to_string())

        # The final group never sees a purl change, so flush it explicitly.
        if current_purls:
            _write_and_commit_purls(data_repo, prev_purl, current_purls)
            pending_commits += 1

        # Push whatever is left so no commit stays local only.
        if pending_commits:
            federatedcode.push_changes(repo=data_repo)

        # update last_incremental so we can pick up from the proper place next time
        last_incremental = maven_nexus_collector.index_properties.get("nexus.index.last-incremental")
        checkpoint = {"last_incremental": last_incremental}
        if logger:
            logger(f"checkpoint: {checkpoint}")
        pipes.update_checkpoints_in_github(
            checkpoint=checkpoint,
            cloned_repo=config_repo,
            path=MAVEN_CHECKPOINT_PATH,
        )
    finally:
        # clean up cloned repos
        if cloned_repos:
            pipes.delete_cloned_repos(repos=cloned_repos, logger=logger)
0 commit comments