Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
153 changes: 153 additions & 0 deletions minecode_pipelines/miners/npm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# purldb is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/purldb for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#


import json
import requests

from packageurl import PackageURL


"""
Visitors for Npmjs and npmjs-like javascript package repositories.

We have this hierarchy in npm replicate and registry index:
npm projects replicate.npmjs.com (paginated JSON) -> versions at registry.npmjs.org (JSON) -> download urls

See https://github.com/orgs/community/discussions/152515 for information on
the latest replicate.npmjs.com API.

https://replicate.npmjs.com/_all_docs
This NPMJS replicate API serves as an index to get all npm packages and their revision IDs
in paginated queries.

https://replicate.npmjs.com/_changes
This NPMJS replicate API serves as a CHANGELOG of npm packages with update sequences which
can be fetched in paginated queries.

https://registry.npmjs.org/{namespace/name}
For each npm package, a JSON containing details including the list of all releases
and archives, their URLs, and some metadata for each release.

https://registry.npmjs.org/{namespace/name}/{version}
For each release, a JSON contains details for the released version and all the
downloads available for this release.
"""


NPM_REPLICATE_REPO = "https://replicate.npmjs.com/"
NPM_REGISTRY_REPO = "https://registry.npmjs.org/"
# Package URL "type" components are lowercase per the purl spec; "NPM" was wrong.
NPM_TYPE = "npm"
# Number of documents requested per page from the replicate API.
NPM_REPLICATE_BATCH_SIZE = 10000


def get_package_names_last_key(package_data):
    """
    Return a two-tuple of (package names, last key) extracted from an
    ``_all_docs`` response mapping.

    ``package_data`` is the parsed JSON of a CouchDB ``_all_docs`` page; names
    come from each row's "id" and the last key from the final row's "key".
    Return ``([], None)`` when there are no rows, instead of raising
    IndexError as the previous implementation did.
    """
    rows = package_data.get("rows") or []
    if not rows:
        return [], None
    names = [package.get("id") for package in rows]
    last_key = rows[-1].get("key")
    return names, last_key


def get_package_names_last_seq(package_data):
    """
    Return a two-tuple of (package names, last sequence) extracted from a
    CouchDB ``_changes`` response mapping.

    Names come from each entry's "id" in "results"; the sequence is the
    top-level "last_seq" value, used to resume paginated change queries.
    """
    names = []
    for entry in package_data.get("results"):
        names.append(entry.get("id"))
    return names, package_data.get("last_seq")


def get_current_last_seq(replicate_url=NPM_REPLICATE_REPO):
    """
    Return the most recent update sequence from the npm replicate ``_changes``
    feed at ``replicate_url``, or None when the request fails.
    """
    # CouchDB query parameters are JSON-encoded: the boolean must be "true",
    # not Python's "True" (which CouchDB rejects as a query_parse_error).
    # limit=1 avoids paging through the whole changes feed just to read the
    # latest sequence number.
    npm_replicate_latest_changes = replicate_url + "_changes?descending=true&limit=1"
    response = requests.get(npm_replicate_latest_changes)
    if not response.ok:
        return

    package_data = response.json()
    _package_names, last_seq = get_package_names_last_seq(package_data)
    return last_seq


def get_updated_npm_packages(last_seq, replicate_url=NPM_REPLICATE_REPO):
    """
    Collect the names of npm packages changed after ``last_seq`` by paging
    through the replicate ``_changes`` feed in batches of
    ``NPM_REPLICATE_BATCH_SIZE``.

    Return a two-tuple of ``({"packages": [names...]}, last_seq)`` where
    ``last_seq`` is the sequence to resume from on the next run.
    """
    all_package_names = []
    iteration = 0

    while True:
        print(f"Processing iteration: {iteration}: changes after seq: {last_seq}")
        npm_replicate_changes = (
            replicate_url + "_changes?" + f"limit={NPM_REPLICATE_BATCH_SIZE}" + f"&since={last_seq}"
        )
        response = requests.get(npm_replicate_changes)
        if not response.ok:
            # Bug fix: this previously returned a bare list, while the success
            # path returns a (dict, last_seq) tuple; keep the shape consistent
            # so callers can unpack unconditionally.
            return {"packages": all_package_names}, last_seq

        package_data = response.json()
        package_names, last_seq = get_package_names_last_seq(package_data)
        all_package_names.extend(package_names)

        # A short page means we have consumed the last batch of changes.
        if len(package_names) < NPM_REPLICATE_BATCH_SIZE:
            break

        iteration += 1

    return {"packages": all_package_names}, last_seq


def get_npm_packages(replicate_url=NPM_REPLICATE_REPO):
    """
    Collect all npm package names by paging through the replicate
    ``_all_docs`` index in batches of ``NPM_REPLICATE_BATCH_SIZE``.

    Return ``{"packages": [names...]}``. Raise Exception on a failed
    paginated request after the first page.
    """
    all_package_names = []

    npm_replicate_all = replicate_url + "_all_docs?" + f"limit={NPM_REPLICATE_BATCH_SIZE}"
    response = requests.get(npm_replicate_all)
    if not response.ok:
        # Bug fix: this previously returned a bare list, while the success
        # path returns a dict; keep the {"packages": [...]} shape consistent.
        return {"packages": all_package_names}

    package_data = response.json()
    package_names, last_key = get_package_names_last_key(package_data)
    all_package_names.extend(package_names)

    total_rows = package_data.get("total_rows") or 0
    iterations = int(total_rows / NPM_REPLICATE_BATCH_SIZE) + 1

    for i in range(iterations):
        npm_replicate_from_id = npm_replicate_all + f'&start_key="{last_key}"'
        print(f"Processing iteration: {i}: {npm_replicate_from_id}")

        response = requests.get(npm_replicate_from_id)
        if not response.ok:
            raise Exception(npm_replicate_from_id, response.text)

        package_data = response.json()
        package_names, last_key = get_package_names_last_key(package_data)
        all_package_names.extend(package_names)

        # A short (or empty) page means the index is exhausted; stop early
        # instead of issuing redundant requests for the remaining iterations.
        if len(package_names) < NPM_REPLICATE_BATCH_SIZE:
            break

    # NOTE(review): CouchDB start_key is inclusive, so each page repeats the
    # previous page's last row; consider deduplicating names downstream.
    return {"packages": all_package_names}


def get_npm_packageurls(name, npm_repo=NPM_REGISTRY_REPO):
    """
    Return a list of packageURL strings for every version of the npm package
    ``name``, fetched from the registry metadata at ``npm_repo``.

    Return an empty list when the registry request fails.
    """
    packageurls = []

    project_index_api_url = npm_repo + name
    response = requests.get(project_index_api_url)
    if not response.ok:
        return packageurls

    project_data = response.json()
    # "versions" can be absent for unpublished or placeholder packages;
    # guard against None to avoid a TypeError when iterating.
    # NOTE(review): scoped names ("@scope/pkg") are passed whole as the purl
    # name; confirm whether the scope should map to the purl namespace.
    for version in project_data.get("versions") or {}:
        purl = PackageURL(
            type=NPM_TYPE,
            name=name,
            version=version,
        )
        packageurls.append(purl.to_string())

    return packageurls


def load_npm_packages(packages_file):
    """
    Read the JSON file at ``packages_file`` and return the list stored under
    its "packages" key, or an empty list when the key is missing.
    """
    with open(packages_file) as stream:
        packages_data = json.load(stream)
    return packages_data.get("packages", [])
9 changes: 0 additions & 9 deletions minecode_pipelines/miners/pypi.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,6 @@

from packageurl import PackageURL

from minecode_pipelines.utils import get_temp_file
from minecode_pipelines.pipes import write_data_to_json_file

"""
Visitors for Pypi and Pypi-like Python package repositories.

Expand Down Expand Up @@ -52,12 +49,6 @@ def get_pypi_packages(pypi_repo, logger=None):
return response.json()


def write_packages_json(packages, name):
temp_file = get_temp_file(name)
write_data_to_json_file(path=temp_file, data=packages)
return temp_file


def get_pypi_packageurls(name):
packageurls = []

Expand Down
66 changes: 66 additions & 0 deletions minecode_pipelines/pipelines/mine_npm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

from scanpipe.pipelines import Pipeline
from scanpipe.pipes import federatedcode

from minecode_pipelines.pipes import npm
from minecode_pipelines import pipes


class MineNPM(Pipeline):
    """
    Mine all packageURLs from a npm index and publish them to
    a FederatedCode repo.
    """

    @classmethod
    def steps(cls):
        # Ordered pipeline steps executed by the scanpipe Pipeline runner.
        return (
            cls.check_federatedcode_eligibility,
            cls.mine_npm_packages,
            cls.mine_and_publish_npm_packageurls,
            cls.delete_cloned_repos,
        )

    def check_federatedcode_eligibility(self):
        """
        Check if the project fulfills the following criteria for
        pushing the project result to FederatedCode.
        """
        federatedcode.check_federatedcode_configured_and_available(logger=self.log)

    def mine_npm_packages(self):
        """Mine npm package names from npm indexes or checkpoint."""
        # Stores the packages file path, mining state, and resume sequence
        # on the pipeline instance for the next step.
        self.npm_packages, self.state, self.last_seq = npm.mine_npm_packages(logger=self.log)

    def mine_and_publish_npm_packageurls(self):
        """Get npm packageURLs for all mined npm package names."""
        # self.repos collects the repos cloned while publishing, so the
        # final step can clean them up.
        self.repos = npm.mine_and_publish_npm_packageurls(
            packages_file=self.npm_packages,
            state=self.state,
            last_seq=self.last_seq,
            logger=self.log,
        )

    def delete_cloned_repos(self):
        """Delete the local clones created during publishing."""
        pipes.delete_cloned_repos(repos=self.repos, logger=self.log)
71 changes: 69 additions & 2 deletions minecode_pipelines/pipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,21 @@
# See https://aboutcode.org for more information about nexB OSS projects.
#

import gzip
import json
import os
import shutil
from pathlib import Path
from git import Repo
import requests
import saneyaml

from aboutcode.hashid import PURLS_FILENAME

from scanpipe.pipes.federatedcode import commit_and_push_changes

from minecode_pipelines.utils import get_temp_file

# states:
# note: a state is null when mining starts
INITIAL_SYNC_STATE = "initial-sync"
Expand All @@ -25,6 +31,27 @@
MINECODE_PIPELINES_CONFIG_REPO = "https://github.com/aboutcode-data/minecode-pipelines-config/"


def compress_packages_file(packages_file, compressed_packages_file):
    """
    Gzip-compress the file at ``packages_file`` into
    ``compressed_packages_file``.
    """
    with open(packages_file, "rb") as source, gzip.open(compressed_packages_file, "wb") as target:
        shutil.copyfileobj(source, target)


def decompress_packages_file(compressed_packages_file, name):
    """
    Decompress the gzipped file at ``compressed_packages_file`` into a new
    temporary file named after ``name`` and return the temporary file path.
    """
    destination = get_temp_file(name)
    with gzip.open(compressed_packages_file, "rb") as compressed, open(destination, "wb") as plain:
        shutil.copyfileobj(compressed, plain)
    return destination


def write_packages_json(packages, name):
    """
    Serialize ``packages`` to a new temporary JSON file named after ``name``
    and return the temporary file path.
    """
    destination = get_temp_file(name)
    write_data_to_json_file(path=destination, data=packages)
    return destination


def fetch_checkpoint_from_github(config_repo, checkpoint_path):
repo_name = config_repo.split("github.com")[-1]
checkpoints_file = (
Expand All @@ -46,8 +73,6 @@ def get_checkpoint_from_file(cloned_repo, path):


def update_checkpoints_in_github(checkpoint, cloned_repo, path):
from scanpipe.pipes.federatedcode import commit_and_push_changes

checkpoint_path = os.path.join(cloned_repo.working_dir, path)
write_data_to_json_file(path=checkpoint_path, data=checkpoint)
commit_message = """Update federatedcode purl mining checkpoint"""
Expand All @@ -58,6 +83,17 @@ def update_checkpoints_in_github(checkpoint, cloned_repo, path):
)


def update_checkpoints_file_in_github(checkpoints_file, cloned_repo, path):
    """
    Move the local ``checkpoints_file`` to ``path`` inside the working tree
    of ``cloned_repo``, then commit and push the change.
    """
    destination = os.path.join(cloned_repo.working_dir, path)
    shutil.move(checkpoints_file, destination)
    commit_and_push_changes(
        repo=cloned_repo,
        files_to_commit=[destination],
        commit_message="Update federatedcode purl mining checkpoint",
    )


def get_mined_packages_from_checkpoint(config_repo, checkpoint_path):
checkpoint = fetch_checkpoint_from_github(
config_repo=config_repo,
Expand All @@ -79,6 +115,37 @@ def update_mined_packages_in_checkpoint(packages, config_repo, cloned_repo, chec
)


def update_checkpoint_state(
    cloned_repo,
    state,
    checkpoint_path,
    config_repo=MINECODE_PIPELINES_CONFIG_REPO,
):
    """
    Fetch the checkpoint at ``checkpoint_path`` from ``config_repo``, set its
    "state" field to ``state``, then commit and push the updated checkpoint
    through ``cloned_repo``.
    """
    checkpoint = fetch_checkpoint_from_github(
        config_repo=config_repo,
        checkpoint_path=checkpoint_path,
    )
    checkpoint["state"] = state
    update_checkpoints_in_github(
        checkpoint=checkpoint,
        cloned_repo=cloned_repo,
        path=checkpoint_path,
    )


def get_packages_file_from_checkpoint(config_repo, checkpoint_path, name):
    """
    Fetch the packages checkpoint at ``checkpoint_path`` from ``config_repo``
    and write it to a temporary JSON file named after ``name``; return the
    temporary file path.
    """
    checkpoint_data = fetch_checkpoint_from_github(
        config_repo=config_repo,
        checkpoint_path=checkpoint_path,
    )
    return write_packages_json(checkpoint_data, name=name)


def fetch_checkpoint_by_git(cloned_repo, checkpoint_path):
    """
    Pull the latest changes from the origin remote of ``cloned_repo`` and
    return the local filesystem path of ``checkpoint_path`` inside its
    working tree.
    """
    origin = cloned_repo.remotes.origin
    origin.pull()
    return os.path.join(cloned_repo.working_dir, checkpoint_path)


def write_packageurls_to_file(repo, base_dir, packageurls, append=False):
if not isinstance(packageurls, list):
raise Exception("`packageurls` needs to be a list")
Expand Down
Loading
Loading