diff --git a/minecode_pipelines/miners/npm.py b/minecode_pipelines/miners/npm.py new file mode 100644 index 00000000..4031a2e4 --- /dev/null +++ b/minecode_pipelines/miners/npm.py @@ -0,0 +1,153 @@ +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# purldb is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/aboutcode-org/purldb for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# + + +import json +import requests + +from packageurl import PackageURL + + +""" +Visitors for Npmjs and npmjs-like javascript package repositories. + +We have this hierarchy in npm replicate and registry index: + npm projects replicate.npmjs.com (paginated JSON) -> versions at registry.npmjs.org (JSON) -> download urls + +See https://github.com/orgs/community/discussions/152515 for information on +the latest replicate.npmjs.com API. + +https://replicate.npmjs.com/_all_docs +This NPMJS replicate API serves as an index to get all npm packages and their revision IDs +in paginated queries. + +https://replicate.npmjs.com/_changes +This NPMJS replicate API serves as a CHANGELOG of npm packages with update sequences which +can be fetched in paginated queries. + +https://registry.npmjs.org/{namespace/name} +For each npm package, a JSON containing details including the list of all releases +and archives, their URLs, and some metadata for each release. + +https://registry.npmjs.org/{namespace/name}/{version} +For each release, a JSON contains details for the released version and all the +downloads available for this release. 
+""" + + +NPM_REPLICATE_REPO = "https://replicate.npmjs.com/" +NPM_REGISTRY_REPO = "https://registry.npmjs.org/" +NPM_TYPE = "npm" +NPM_REPLICATE_BATCH_SIZE = 10000 + + +def get_package_names_last_key(package_data): + names = [package.get("id") for package in package_data.get("rows")] + last_key = package_data.get("rows")[-1].get("key") + return names, last_key + + +def get_package_names_last_seq(package_data): + names = [package.get("id") for package in package_data.get("results")] + last_seq = package_data.get("last_seq") + return names, last_seq + + +def get_current_last_seq(replicate_url=NPM_REPLICATE_REPO): + npm_replicate_latest_changes = replicate_url + "_changes?descending=true&limit=1" + response = requests.get(npm_replicate_latest_changes) + if not response.ok: + return + + package_data = response.json() + _package_names, last_seq = get_package_names_last_seq(package_data) + return last_seq + + +def get_updated_npm_packages(last_seq, replicate_url=NPM_REPLICATE_REPO): + all_package_names = [] + i = 0 + + while True: + print(f"Processing iteration: {i}: changes after seq: {last_seq}") + npm_replicate_changes = ( + replicate_url + "_changes?" + f"limit={NPM_REPLICATE_BATCH_SIZE}" + f"&since={last_seq}" + ) + response = requests.get(npm_replicate_changes) + if not response.ok: + return {"packages": all_package_names}, last_seq + + package_data = response.json() + package_names, last_seq = get_package_names_last_seq(package_data) + all_package_names.extend(package_names) + + # We have fetched the last set of changes if True + if len(package_names) < NPM_REPLICATE_BATCH_SIZE: + break + + i += 1 + + return {"packages": all_package_names}, last_seq + + +def get_npm_packages(replicate_url=NPM_REPLICATE_REPO): + all_package_names = [] + + npm_replicate_all = replicate_url + "_all_docs?" 
+ npm_replicate_all += f"limit={NPM_REPLICATE_BATCH_SIZE}" + response = requests.get(npm_replicate_all) + if not response.ok: + return all_package_names + + package_data = response.json() + package_names, last_key = get_package_names_last_key(package_data) + all_package_names.extend(package_names) + + total_rows = package_data.get("total_rows") + iterations = int(total_rows / NPM_REPLICATE_BATCH_SIZE) + 1 + + for i in range(iterations): + npm_replicate_from_id = npm_replicate_all + f'&start_key="{last_key}"' + print(f"Processing iteration: {i}: {npm_replicate_from_id}") + + response = requests.get(npm_replicate_from_id) + if not response.ok: + raise Exception(npm_replicate_from_id, response.text) + + package_data = response.json() + package_names, last_key = get_package_names_last_key(package_data) + all_package_names.extend(package_names) + + return {"packages": all_package_names} + + +def get_npm_packageurls(name, npm_repo=NPM_REGISTRY_REPO): + packageurls = [] + + project_index_api_url = npm_repo + name + response = requests.get(project_index_api_url) + if not response.ok: + return packageurls + + project_data = response.json() + for version in project_data.get("versions") or []: + purl = PackageURL( + type=NPM_TYPE, + name=name, + version=version, + ) + packageurls.append(purl.to_string()) + + return packageurls + + +def load_npm_packages(packages_file): + with open(packages_file) as f: + packages_data = json.load(f) + + return packages_data.get("packages", []) diff --git a/minecode_pipelines/miners/pypi.py b/minecode_pipelines/miners/pypi.py index be81a515..680cfa1e 100644 --- a/minecode_pipelines/miners/pypi.py +++ b/minecode_pipelines/miners/pypi.py @@ -13,9 +13,6 @@ from packageurl import PackageURL -from minecode_pipelines.utils import get_temp_file -from minecode_pipelines.pipes import write_data_to_json_file - """ Visitors for Pypi and Pypi-like Python package repositories. 
@@ -52,12 +49,6 @@ def get_pypi_packages(pypi_repo, logger=None): return response.json() -def write_packages_json(packages, name): - temp_file = get_temp_file(name) - write_data_to_json_file(path=temp_file, data=packages) - return temp_file - - def get_pypi_packageurls(name): packageurls = [] diff --git a/minecode_pipelines/pipelines/mine_npm.py b/minecode_pipelines/pipelines/mine_npm.py new file mode 100644 index 00000000..6736c4f1 --- /dev/null +++ b/minecode_pipelines/pipelines/mine_npm.py @@ -0,0 +1,66 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. 
+ +from scanpipe.pipelines import Pipeline +from scanpipe.pipes import federatedcode + +from minecode_pipelines.pipes import npm +from minecode_pipelines import pipes + + +class MineNPM(Pipeline): + """ + Mine all packageURLs from a npm index and publish them to + a FederatedCode repo. + """ + + @classmethod + def steps(cls): + return ( + cls.check_federatedcode_eligibility, + cls.mine_npm_packages, + cls.mine_and_publish_npm_packageurls, + cls.delete_cloned_repos, + ) + + def check_federatedcode_eligibility(self): + """ + Check if the project fulfills the following criteria for + pushing the project result to FederatedCode. + """ + federatedcode.check_federatedcode_configured_and_available(logger=self.log) + + def mine_npm_packages(self): + """Mine npm package names from npm indexes or checkpoint.""" + self.npm_packages, self.state, self.last_seq = npm.mine_npm_packages(logger=self.log) + + def mine_and_publish_npm_packageurls(self): + """Get npm packageURLs for all mined npm package names.""" + self.repos = npm.mine_and_publish_npm_packageurls( + packages_file=self.npm_packages, + state=self.state, + last_seq=self.last_seq, + logger=self.log, + ) + + def delete_cloned_repos(self): + pipes.delete_cloned_repos(repos=self.repos, logger=self.log) diff --git a/minecode_pipelines/pipes/__init__.py b/minecode_pipelines/pipes/__init__.py index 43f32f68..d2be0d0e 100644 --- a/minecode_pipelines/pipes/__init__.py +++ b/minecode_pipelines/pipes/__init__.py @@ -7,8 +7,10 @@ # See https://aboutcode.org for more information about nexB OSS projects. 
# +import gzip import json import os +import shutil from pathlib import Path from git import Repo import requests @@ -16,6 +18,10 @@ from aboutcode.hashid import PURLS_FILENAME +from scanpipe.pipes.federatedcode import commit_and_push_changes + +from minecode_pipelines.utils import get_temp_file + # states: # note: a state is null when mining starts INITIAL_SYNC_STATE = "initial-sync" @@ -25,6 +31,27 @@ MINECODE_PIPELINES_CONFIG_REPO = "https://github.com/aboutcode-data/minecode-pipelines-config/" +def compress_packages_file(packages_file, compressed_packages_file): + with open(packages_file, "rb") as f_in: + with gzip.open(compressed_packages_file, "wb") as f_out: + f_out.writelines(f_in) + + +def decompress_packages_file(compressed_packages_file, name): + packages_file = get_temp_file(name) + with gzip.open(compressed_packages_file, "rb") as f_in: + with open(packages_file, "wb") as f_out: + f_out.writelines(f_in) + + return packages_file + + +def write_packages_json(packages, name): + temp_file = get_temp_file(name) + write_data_to_json_file(path=temp_file, data=packages) + return temp_file + + def fetch_checkpoint_from_github(config_repo, checkpoint_path): repo_name = config_repo.split("github.com")[-1] checkpoints_file = ( @@ -46,8 +73,6 @@ def get_checkpoint_from_file(cloned_repo, path): def update_checkpoints_in_github(checkpoint, cloned_repo, path): - from scanpipe.pipes.federatedcode import commit_and_push_changes - checkpoint_path = os.path.join(cloned_repo.working_dir, path) write_data_to_json_file(path=checkpoint_path, data=checkpoint) commit_message = """Update federatedcode purl mining checkpoint""" @@ -58,6 +83,17 @@ def update_checkpoints_in_github(checkpoint, cloned_repo, path): ) +def update_checkpoints_file_in_github(checkpoints_file, cloned_repo, path): + checkpoint_path = os.path.join(cloned_repo.working_dir, path) + shutil.move(checkpoints_file, checkpoint_path) + commit_message = """Update federatedcode purl mining checkpoint""" + 
commit_and_push_changes( + repo=cloned_repo, + files_to_commit=[checkpoint_path], + commit_message=commit_message, + ) + + def get_mined_packages_from_checkpoint(config_repo, checkpoint_path): checkpoint = fetch_checkpoint_from_github( config_repo=config_repo, @@ -79,6 +115,37 @@ def update_mined_packages_in_checkpoint(packages, config_repo, cloned_repo, chec ) +def update_checkpoint_state( + cloned_repo, + state, + checkpoint_path, + config_repo=MINECODE_PIPELINES_CONFIG_REPO, +): + checkpoint = fetch_checkpoint_from_github( + config_repo=config_repo, + checkpoint_path=checkpoint_path, + ) + checkpoint["state"] = state + update_checkpoints_in_github( + checkpoint=checkpoint, + cloned_repo=cloned_repo, + path=checkpoint_path, + ) + + +def get_packages_file_from_checkpoint(config_repo, checkpoint_path, name): + packages = fetch_checkpoint_from_github( + config_repo=config_repo, + checkpoint_path=checkpoint_path, + ) + return write_packages_json(packages, name=name) + + +def fetch_checkpoint_by_git(cloned_repo, checkpoint_path): + cloned_repo.remotes.origin.pull() + return os.path.join(cloned_repo.working_dir, checkpoint_path) + + def write_packageurls_to_file(repo, base_dir, packageurls, append=False): if not isinstance(packageurls, list): raise Exception("`packageurls` needs to be a list") diff --git a/minecode_pipelines/pipes/npm.py b/minecode_pipelines/pipes/npm.py new file mode 100644 index 00000000..43b77a66 --- /dev/null +++ b/minecode_pipelines/pipes/npm.py @@ -0,0 +1,354 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. 
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. + +from datetime import datetime + +from minecode_pipelines import VERSION +from minecode_pipelines.pipes import write_packageurls_to_file +from minecode_pipelines.pipes import fetch_checkpoint_from_github +from minecode_pipelines.pipes import update_checkpoints_in_github +from minecode_pipelines.pipes import update_checkpoints_file_in_github +from minecode_pipelines.pipes import get_mined_packages_from_checkpoint +from minecode_pipelines.pipes import update_mined_packages_in_checkpoint +from minecode_pipelines.pipes import update_checkpoint_state +from minecode_pipelines.pipes import MINECODE_PIPELINES_CONFIG_REPO +from minecode_pipelines.pipes import INITIAL_SYNC_STATE +from minecode_pipelines.pipes import PERIODIC_SYNC_STATE +from minecode_pipelines.pipes import write_packages_json +from minecode_pipelines.pipes import compress_packages_file +from minecode_pipelines.pipes import decompress_packages_file +from minecode_pipelines.pipes import fetch_checkpoint_by_git + + +from minecode_pipelines.miners.npm import get_npm_packages +from minecode_pipelines.miners.npm import get_updated_npm_packages +from minecode_pipelines.miners.npm import 
get_current_last_seq +from minecode_pipelines.miners.npm import load_npm_packages +from minecode_pipelines.miners.npm import get_npm_packageurls +from minecode_pipelines.miners.npm import NPM_REPLICATE_REPO + +from minecode_pipelines.miners.npm import NPM_TYPE +from minecode_pipelines.utils import grouper + +from packageurl import PackageURL + +from aboutcode.hashid import get_package_base_dir + + +from scanpipe.pipes.federatedcode import clone_repository +from scanpipe.pipes.federatedcode import commit_changes +from scanpipe.pipes.federatedcode import push_changes + + +PACKAGE_FILE_NAME = "NPMPackages.json" +COMPRESSED_PACKAGE_FILE_NAME = "NPMPackages.json.gz" +NPM_REPLICATE_CHECKPOINT_PATH = "npm/" + PACKAGE_FILE_NAME +COMPRESSED_NPM_REPLICATE_CHECKPOINT_PATH = "npm/" + COMPRESSED_PACKAGE_FILE_NAME +NPM_CHECKPOINT_PATH = "npm/checkpoints.json" +NPM_PACKAGES_CHECKPOINT_PATH = "npm/packages_checkpoint.json" + +# We are testing and storing mined packageURLs in one single repo per ecosystem for now +MINECODE_DATA_NPM_REPO = "https://github.com/aboutcode-data/minecode-data-npm-test" + + +PACKAGE_BATCH_SIZE = 700 + + +def mine_npm_packages(logger=None): + """ + Mine npm package names from npm replicate index and save to checkpoints, + or get packages from saved checkpoints. We have 3 cases: + + 1. first sync: we get latest set of packages from the "_all_docs" API endpoint + of npm replicate and save this and last sequence of the package to checkpoints. + 2. intial sync: we get packages from checkpoint which we're trying to sync upto + 3. periodic sync: we get latest packages newly released in npm through the + "_changes" API, for a period, from our last mined sequence of package. 
+ """ + + npm_checkpoints = fetch_checkpoint_from_github( + config_repo=MINECODE_PIPELINES_CONFIG_REPO, + checkpoint_path=NPM_CHECKPOINT_PATH, + ) + state = npm_checkpoints.get("state") + if logger: + logger(f"Mining state from checkpoint: {state}") + + cloned_repo = clone_repository(repo_url=MINECODE_PIPELINES_CONFIG_REPO) + + # This is the first time we are syncing from npm replicate + if not state: + last_seq = get_current_last_seq(replicate_url=NPM_REPLICATE_REPO) + if logger: + logger( + f"Starting initial checkpointing of packages from npm replicate till seq: {last_seq}" + ) + + packages = get_npm_packages(replicate_url=NPM_REPLICATE_REPO) + packages_file = write_packages_json( + packages=packages, + name=PACKAGE_FILE_NAME, + ) + compressed_packages_file = packages_file + ".gz" + compress_packages_file( + packages_file=packages_file, + compressed_packages_file=compressed_packages_file, + ) + update_checkpoints_file_in_github( + checkpoints_file=compressed_packages_file, + cloned_repo=cloned_repo, + path=COMPRESSED_NPM_REPLICATE_CHECKPOINT_PATH, + ) + + if logger: + logger(f"Updating checkpoint mining state to: {INITIAL_SYNC_STATE}") + logger(f"Updating checkpoint mining last_seq to: {last_seq}") + + update_npm_checkpoints( + cloned_repo=cloned_repo, + state=INITIAL_SYNC_STATE, + last_seq=last_seq, + checkpoint_path=NPM_CHECKPOINT_PATH, + ) + + elif state == INITIAL_SYNC_STATE: + if logger: + logger("Getting packages to sync from npm checkpoint") + + last_seq = fetch_last_seq_mined( + config_repo=MINECODE_PIPELINES_CONFIG_REPO, + settings_path=NPM_CHECKPOINT_PATH, + ) + + compressed_packages_file = fetch_checkpoint_by_git( + cloned_repo=cloned_repo, + checkpoint_path=COMPRESSED_NPM_REPLICATE_CHECKPOINT_PATH, + ) + packages_file = decompress_packages_file( + compressed_packages_file=compressed_packages_file, + name=PACKAGE_FILE_NAME, + ) + + elif state == PERIODIC_SYNC_STATE: + last_seq = fetch_last_seq_mined( + config_repo=MINECODE_PIPELINES_CONFIG_REPO, + 
settings_path=NPM_CHECKPOINT_PATH, + ) + if logger: + logger( + f"Getting latest packages from npm replicate index changes after seq: {last_seq}" + ) + + packages, last_seq = get_updated_npm_packages( + last_seq=last_seq, + replicate_url=NPM_REPLICATE_REPO, + ) + packages_file = write_packages_json( + packages=packages, + name=PACKAGE_FILE_NAME, + ) + + return packages_file, state, last_seq + + +def update_npm_checkpoints( + cloned_repo, + checkpoint_path, + state=None, + last_seq=None, + config_repo=MINECODE_PIPELINES_CONFIG_REPO, +): + checkpoint = fetch_checkpoint_from_github( + config_repo=config_repo, + checkpoint_path=checkpoint_path, + ) + if state: + checkpoint["state"] = state + if last_seq: + checkpoint["last_seq"] = last_seq + + checkpoint["date"] = str(datetime.now()) + update_checkpoints_in_github( + checkpoint=checkpoint, + cloned_repo=cloned_repo, + path=checkpoint_path, + ) + + +def fetch_last_seq_mined(config_repo, settings_path): + """ + Fetch "last_seq" for the last mined packages. + + This is a simple JSON in a github repo containing mining checkpoints + with the "last_seq" from the npm replicate index which was mined. 
Example: + https://github.com/aboutcode-data/minecode-pipelines-config/blob/main/npm/checkpoints.json + """ + checkpoints = fetch_checkpoint_from_github( + config_repo=config_repo, + checkpoint_path=settings_path, + ) + return checkpoints.get("last_seq") + + +def mine_and_publish_npm_packageurls(packages_file, state, last_seq, logger=None): + if logger: + logger(f"Last serial number mined: {last_seq}") + logger(f"Mining state: {state}") + + # this is either from npm replicate or from checkpoints + packages = load_npm_packages(packages_file) + if logger: + logger(f"# of package names fetched from index/checkpoint: {len(packages)}") + + if not packages: + return [] + + if not state: + packages_to_sync, synced_packages = packages, [] + if logger: + logger(f"Starting package mining for {len(packages_to_sync)} packages") + + elif state == INITIAL_SYNC_STATE or state == PERIODIC_SYNC_STATE: + synced_packages = get_mined_packages_from_checkpoint( + config_repo=MINECODE_PIPELINES_CONFIG_REPO, + checkpoint_path=NPM_PACKAGES_CHECKPOINT_PATH, + ) + packages_to_sync = [package for package in packages if package not in synced_packages] + if logger: + logger( + f"Starting initial package mining for {len(packages_to_sync)} packages from checkpoint" + ) + + # clone repo + cloned_data_repo = clone_repository(repo_url=MINECODE_DATA_NPM_REPO) + cloned_config_repo = clone_repository(repo_url=MINECODE_PIPELINES_CONFIG_REPO) + if logger: + logger(f"{MINECODE_DATA_NPM_REPO} repo cloned at: {cloned_data_repo.working_dir}") + logger(f"{MINECODE_PIPELINES_CONFIG_REPO} repo cloned at: {cloned_config_repo.working_dir}") + + for package_batch in grouper(n=PACKAGE_BATCH_SIZE, iterable=packages_to_sync): + packages_mined = [] + purls = [] + purl_files = [] + + if logger: + logger("Starting package mining for a batch of packages") + + for package_name in package_batch: + if not package_name: + continue + + # fetch packageURLs for package + if logger: + logger(f"getting packageURLs for package: {package_name}") + + 
packageurls = get_npm_packageurls(package_name) + if not packageurls: + if logger: + logger(f"Could not fetch package versions for package: {package_name}") + continue + + # get repo and path for package + base_purl = PackageURL(type=NPM_TYPE, name=package_name).to_string() + package_base_dir = get_package_base_dir(purl=base_purl) + + if logger: + logger(f"writing packageURLs for package: {base_purl} at: {package_base_dir}") + purls_string = " ".join(packageurls) + logger(f"packageURLs: {purls_string}") + + # write packageURLs to file + purl_file = write_packageurls_to_file( + repo=cloned_data_repo, + base_dir=package_base_dir, + packageurls=packageurls, + ) + purl_files.append(purl_file) + purls.append(base_purl) + + packages_mined.append(package_name) + + if logger: + purls_string = " ".join(purls) + logger("Committing and pushing changes for a batch of packages: ") + logger(f"{purls_string}") + + # commit changes + commit_changes( + repo=cloned_data_repo, + files_to_commit=purl_files, + purls=purls, + mine_type="packageURL", + tool_name="pkg:pypi/minecode-pipelines", + tool_version=VERSION, + ) + + # Push changes to remote repository + push_changes(repo=cloned_data_repo) + + # As we are mining the packages to sync with the index, + # we need to update mined packages checkpoint for every batch + # so we can continue mining the other packages after restarting + if logger: + logger(f"Checkpointing processed packages to: {NPM_PACKAGES_CHECKPOINT_PATH}") + + packages_checkpoint = packages_mined + synced_packages + update_mined_packages_in_checkpoint( + packages=packages_checkpoint, + config_repo=MINECODE_PIPELINES_CONFIG_REPO, + cloned_repo=cloned_config_repo, + checkpoint_path=NPM_PACKAGES_CHECKPOINT_PATH, + ) + + # If we are finished mining all the packages in the intial sync, we can now + # periodically sync the packages from latest + if state == INITIAL_SYNC_STATE: + if logger: + logger(f"{INITIAL_SYNC_STATE} completed. 
starting: {PERIODIC_SYNC_STATE}") + update_checkpoint_state( + cloned_repo=cloned_config_repo, + state=PERIODIC_SYNC_STATE, + checkpoint_path=NPM_CHECKPOINT_PATH, + ) + + # If we are finished mining all the packages in the periodic sync, we can now update + # the last sequence updated + if state == PERIODIC_SYNC_STATE: + if logger: + logger(f"{PERIODIC_SYNC_STATE} completed. Updating last seq to: {last_seq}") + + update_npm_checkpoints( + cloned_repo=cloned_config_repo, + checkpoint_path=NPM_CHECKPOINT_PATH, + state=PERIODIC_SYNC_STATE, + last_seq=last_seq, + ) + + # Refresh mined packages checkpoint + update_checkpoints_in_github( + checkpoint={"packages_mined": []}, + cloned_repo=cloned_config_repo, + path=NPM_PACKAGES_CHECKPOINT_PATH, + ) + + repos_to_clean = [cloned_data_repo, cloned_config_repo] + return repos_to_clean diff --git a/minecode_pipelines/pipes/pypi.py b/minecode_pipelines/pipes/pypi.py index 457a1ab6..e3577782 100644 --- a/minecode_pipelines/pipes/pypi.py +++ b/minecode_pipelines/pipes/pypi.py @@ -28,6 +28,9 @@ from minecode_pipelines.pipes import update_checkpoints_in_github from minecode_pipelines.pipes import get_mined_packages_from_checkpoint from minecode_pipelines.pipes import update_mined_packages_in_checkpoint +from minecode_pipelines.pipes import get_packages_file_from_checkpoint +from minecode_pipelines.pipes import update_checkpoint_state +from minecode_pipelines.pipes import write_packages_json from minecode_pipelines.pipes import MINECODE_PIPELINES_CONFIG_REPO from minecode_pipelines.pipes import INITIAL_SYNC_STATE from minecode_pipelines.pipes import PERIODIC_SYNC_STATE @@ -37,7 +40,7 @@ from minecode_pipelines.miners.pypi import get_pypi_packageurls from minecode_pipelines.miners.pypi import load_pypi_packages from minecode_pipelines.miners.pypi import PYPI_REPO -from minecode_pipelines.miners.pypi import write_packages_json + from minecode_pipelines.miners.pypi import PYPI_TYPE from minecode_pipelines.utils import grouper @@ -114,7 
+117,11 @@ def mine_pypi_packages(logger=None): ) if logger: logger(f"Updating checkpoint mining state to: {INITIAL_SYNC_STATE}") - update_checkpoint_state(cloned_repo=cloned_repo, state=INITIAL_SYNC_STATE) + update_checkpoint_state( + cloned_repo=cloned_repo, + state=INITIAL_SYNC_STATE, + checkpoint_path=PYPI_CHECKPOINT_PATH, + ) return packages_file, state @@ -134,25 +141,6 @@ def fetch_last_serial_mined(config_repo, settings_path): return checkpoints.get("last_serial") -def update_checkpoint_state( - cloned_repo, - state, - config_repo=MINECODE_PIPELINES_CONFIG_REPO, - checkpoint_path=PYPI_CHECKPOINT_PATH, -): - checkpoint = fetch_checkpoint_from_github( - config_repo=config_repo, - checkpoint_path=checkpoint_path, - ) - checkpoint["state"] = state - checkpoint["last_updated"] = str(datetime.now()) - update_checkpoints_in_github( - checkpoint=checkpoint, - cloned_repo=cloned_repo, - path=checkpoint_path, - ) - - def update_pypi_checkpoints( last_serial, state, @@ -171,14 +159,6 @@ def update_pypi_checkpoints( ) -def get_packages_file_from_checkpoint(config_repo, checkpoint_path, name): - packages = fetch_checkpoint_from_github( - config_repo=config_repo, - checkpoint_path=checkpoint_path, - ) - return write_packages_json(packages, name=name) - - def mine_and_publish_pypi_packageurls(packages_file, state, logger=None): last_serial_fetched = fetch_last_serial_mined( config_repo=MINECODE_PIPELINES_CONFIG_REPO, @@ -314,7 +294,7 @@ def mine_and_publish_pypi_packageurls(packages_file, state, logger=None): checkpoint_path=PYPI_PACKAGES_CHECKPOINT_PATH, ) - # If we are finshed mining all the packages in the intial sync, we can now + # If we are finished mining all the packages in the intial sync, we can now # periodically sync the packages from latest if state == INITIAL_SYNC_STATE: if logger: @@ -325,12 +305,13 @@ def mine_and_publish_pypi_packageurls(packages_file, state, logger=None): cloned_repo=cloned_config_repo, state=state, ) - # refresh packages checkpoint once 
to only checkpoint new packages - update_checkpoints_in_github( - checkpoint={"packages_mined": []}, - cloned_repo=cloned_config_repo, - path=PYPI_PACKAGES_CHECKPOINT_PATH, - ) + + # refresh packages checkpoint once to only checkpoint new packages + update_checkpoints_in_github( + checkpoint={"packages_mined": []}, + cloned_repo=cloned_config_repo, + path=PYPI_PACKAGES_CHECKPOINT_PATH, + ) # update last_serial to minecode checkpoints whenever we finish mining # either from checkpoints or from the latest pypi diff --git a/pyproject-minecode_pipelines.toml b/pyproject-minecode_pipelines.toml index 1db10685..18b99f66 100644 --- a/pyproject-minecode_pipelines.toml +++ b/pyproject-minecode_pipelines.toml @@ -49,6 +49,7 @@ urls = { Homepage = "https://github.com/aboutcode-org/purldb" } [project.entry-points."scancodeio_pipelines"] mine_pypi = "minecode_pipelines.pipelines.mine_pypi:MinePypi" +mine_npm = "minecode_pipelines.pipelines.mine_npm:MineNPM" mine_maven = "minecode_pipelines.pipelines.mine_maven:MineMaven" mine_cargo = "minecode_pipelines.pipelines.mine_cargo:MineCargo" mine_debian = "minecode_pipelines.pipelines.mine_debian:MineDebian" diff --git a/requirements.txt b/requirements.txt index b6ca857f..1b962ccc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -34,7 +34,7 @@ crispy-bootstrap3==2024.1 crontab==1.0.4 cryptography==45.0.4 cyclonedx-python-lib==10.2.0 -debian_inspector==31.1.0 +debian_inspector==31.1.1 defusedxml==0.7.1 Deprecated==1.2.18 Django==5.1.11