Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
153 changes: 153 additions & 0 deletions minecode_pipelines/miners/npm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# purldb is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/purldb for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#


import json
import requests

from packageurl import PackageURL


"""
Visitors for Npmjs and npmjs-like javascript package repositories.

We have this hierarchy in npm replicate and registry index:
npm projects replicate.npmjs.com (paginated JSON) -> versions at registry.npmjs.org (JSON) -> download urls

See https://github.com/orgs/community/discussions/152515 for information on
the latest replicate.npmjs.com API.

https://replicate.npmjs.com/_all_docs
This NPMJS replicate API serves as an index to get all npm packages and their revision IDs
in paginated queries.

https://replicate.npmjs.com/_changes
This NPMJS replicate API serves as a CHANGELOG of npm packages with update sequences which
can be fetched in paginated queries.

https://registry.npmjs.org/{namespace/name}
For each npm package, a JSON containing details including the list of all releases
and archives, their URLs, and some metadata for each release.

https://registry.npmjs.org/{namespace/name}/{version}
For each release, a JSON contains details for the released version and all the
downloads available for this release.
"""


NPM_REPLICATE_REPO = "https://replicate.npmjs.com/"
NPM_REGISTRY_REPO = "https://registry.npmjs.org/"
# Package URL "type" components are lowercase per the purl spec; "NPM" was wrong.
NPM_TYPE = "npm"
# Number of documents requested per page from the replicate API.
NPM_REPLICATE_BATCH_SIZE = 10000


def get_package_names_last_key(package_data):
    """
    Return a two-tuple of (package names, last key) extracted from an
    ``_all_docs`` response mapping.

    ``package_data`` is the parsed JSON of a CouchDB ``_all_docs`` page; names
    come from each row's "id" and the last key from the final row's "key".
    Return ``([], None)`` when there are no rows, instead of raising
    IndexError as the previous implementation did.
    """
    rows = package_data.get("rows") or []
    if not rows:
        return [], None
    names = [package.get("id") for package in rows]
    last_key = rows[-1].get("key")
    return names, last_key


def get_package_names_last_seq(package_data):
    """
    Return a two-tuple of (package names, last sequence) extracted from a
    CouchDB ``_changes`` response mapping.

    Names come from each entry's "id" in "results"; the sequence is the
    top-level "last_seq" value, used to resume paginated change queries.
    """
    names = []
    for entry in package_data.get("results"):
        names.append(entry.get("id"))
    return names, package_data.get("last_seq")


def get_current_last_seq(replicate_url=NPM_REPLICATE_REPO):
    """
    Return the most recent update sequence from the npm replicate ``_changes``
    feed at ``replicate_url``, or None when the request fails.
    """
    # CouchDB query parameters are JSON-encoded: the boolean must be "true",
    # not Python's "True" (which CouchDB rejects as a query_parse_error).
    # limit=1 avoids paging through the whole changes feed just to read the
    # latest sequence number.
    npm_replicate_latest_changes = replicate_url + "_changes?descending=true&limit=1"
    response = requests.get(npm_replicate_latest_changes)
    if not response.ok:
        return

    package_data = response.json()
    _package_names, last_seq = get_package_names_last_seq(package_data)
    return last_seq


def get_updated_npm_packages(last_seq, replicate_url=NPM_REPLICATE_REPO):
    """
    Collect the names of npm packages changed after ``last_seq`` by paging
    through the replicate ``_changes`` feed in batches of
    ``NPM_REPLICATE_BATCH_SIZE``.

    Return a two-tuple of ``({"packages": [names...]}, last_seq)`` where
    ``last_seq`` is the sequence to resume from on the next run.
    """
    all_package_names = []
    iteration = 0

    while True:
        print(f"Processing iteration: {iteration}: changes after seq: {last_seq}")
        npm_replicate_changes = (
            replicate_url + "_changes?" + f"limit={NPM_REPLICATE_BATCH_SIZE}" + f"&since={last_seq}"
        )
        response = requests.get(npm_replicate_changes)
        if not response.ok:
            # Bug fix: this previously returned a bare list, while the success
            # path returns a (dict, last_seq) tuple; keep the shape consistent
            # so callers can unpack unconditionally.
            return {"packages": all_package_names}, last_seq

        package_data = response.json()
        package_names, last_seq = get_package_names_last_seq(package_data)
        all_package_names.extend(package_names)

        # A short page means we have consumed the last batch of changes.
        if len(package_names) < NPM_REPLICATE_BATCH_SIZE:
            break

        iteration += 1

    return {"packages": all_package_names}, last_seq


def get_npm_packages(replicate_url=NPM_REPLICATE_REPO):
    """
    Collect all npm package names by paging through the replicate
    ``_all_docs`` index in batches of ``NPM_REPLICATE_BATCH_SIZE``.

    Return ``{"packages": [names...]}``. Raise Exception on a failed
    paginated request after the first page.
    """
    all_package_names = []

    npm_replicate_all = replicate_url + "_all_docs?" + f"limit={NPM_REPLICATE_BATCH_SIZE}"
    response = requests.get(npm_replicate_all)
    if not response.ok:
        # Bug fix: this previously returned a bare list, while the success
        # path returns a dict; keep the {"packages": [...]} shape consistent.
        return {"packages": all_package_names}

    package_data = response.json()
    package_names, last_key = get_package_names_last_key(package_data)
    all_package_names.extend(package_names)

    total_rows = package_data.get("total_rows") or 0
    iterations = int(total_rows / NPM_REPLICATE_BATCH_SIZE) + 1

    for i in range(iterations):
        npm_replicate_from_id = npm_replicate_all + f'&start_key="{last_key}"'
        print(f"Processing iteration: {i}: {npm_replicate_from_id}")

        response = requests.get(npm_replicate_from_id)
        if not response.ok:
            raise Exception(npm_replicate_from_id, response.text)

        package_data = response.json()
        package_names, last_key = get_package_names_last_key(package_data)
        all_package_names.extend(package_names)

        # A short (or empty) page means the index is exhausted; stop early
        # instead of issuing redundant requests for the remaining iterations.
        if len(package_names) < NPM_REPLICATE_BATCH_SIZE:
            break

    # NOTE(review): CouchDB start_key is inclusive, so each page repeats the
    # previous page's last row; consider deduplicating names downstream.
    return {"packages": all_package_names}


def get_npm_packageurls(name, npm_repo=NPM_REGISTRY_REPO):
    """
    Return a list of packageURL strings for every version of the npm package
    ``name``, fetched from the registry metadata at ``npm_repo``.

    Return an empty list when the registry request fails.
    """
    packageurls = []

    project_index_api_url = npm_repo + name
    response = requests.get(project_index_api_url)
    if not response.ok:
        return packageurls

    project_data = response.json()
    # "versions" can be absent for unpublished or placeholder packages;
    # guard against None to avoid a TypeError when iterating.
    # NOTE(review): scoped names ("@scope/pkg") are passed whole as the purl
    # name; confirm whether the scope should map to the purl namespace.
    for version in project_data.get("versions") or {}:
        purl = PackageURL(
            type=NPM_TYPE,
            name=name,
            version=version,
        )
        packageurls.append(purl.to_string())

    return packageurls


def load_npm_packages(packages_file):
    """
    Read the JSON file at ``packages_file`` and return the list stored under
    its "packages" key, or an empty list when the key is missing.
    """
    with open(packages_file) as stream:
        packages_data = json.load(stream)
    return packages_data.get("packages", [])
9 changes: 0 additions & 9 deletions minecode_pipelines/miners/pypi.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,6 @@

from packageurl import PackageURL

from minecode_pipelines.utils import get_temp_file
from minecode_pipelines.pipes import write_data_to_json_file

"""
Visitors for Pypi and Pypi-like Python package repositories.

Expand Down Expand Up @@ -52,12 +49,6 @@ def get_pypi_packages(pypi_repo, logger=None):
return response.json()


def write_packages_json(packages, name):
temp_file = get_temp_file(name)
write_data_to_json_file(path=temp_file, data=packages)
return temp_file


def get_pypi_packageurls(name):
packageurls = []

Expand Down
66 changes: 66 additions & 0 deletions minecode_pipelines/pipelines/mine_npm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

from scanpipe.pipelines import Pipeline
from scanpipe.pipes import federatedcode

from minecode_pipelines.pipes import npm
from minecode_pipelines import pipes


class MineNPM(Pipeline):
    """
    Mine all packageURLs from a npm index and publish them to
    a FederatedCode repo.
    """

    @classmethod
    def steps(cls):
        # Ordered pipeline steps executed by the scanpipe Pipeline runner.
        return (
            cls.check_federatedcode_eligibility,
            cls.mine_npm_packages,
            cls.mine_and_publish_npm_packageurls,
            cls.delete_cloned_repos,
        )

    def check_federatedcode_eligibility(self):
        """
        Check if the project fulfills the following criteria for
        pushing the project result to FederatedCode.
        """
        federatedcode.check_federatedcode_configured_and_available(logger=self.log)

    def mine_npm_packages(self):
        """Mine npm package names from npm indexes or checkpoint."""
        # Stores the packages file path, mining state, and resume sequence
        # on the pipeline instance for the next step.
        self.npm_packages, self.state, self.last_seq = npm.mine_npm_packages(logger=self.log)

    def mine_and_publish_npm_packageurls(self):
        """Get npm packageURLs for all mined npm package names."""
        # self.repos collects the repos cloned while publishing, so the
        # final step can clean them up.
        self.repos = npm.mine_and_publish_npm_packageurls(
            packages_file=self.npm_packages,
            state=self.state,
            last_seq=self.last_seq,
            logger=self.log,
        )

    def delete_cloned_repos(self):
        """Delete the local clones created during publishing."""
        pipes.delete_cloned_repos(repos=self.repos, logger=self.log)
71 changes: 69 additions & 2 deletions minecode_pipelines/pipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,21 @@
# See https://aboutcode.org for more information about nexB OSS projects.
#

import gzip
import json
import os
import shutil
from pathlib import Path
from git import Repo
import requests
import saneyaml

from aboutcode.hashid import PURLS_FILENAME

from scanpipe.pipes.federatedcode import commit_and_push_changes

from minecode_pipelines.utils import get_temp_file

# states:
# note: a state is null when mining starts
INITIAL_SYNC_STATE = "initial-sync"
Expand All @@ -25,6 +31,27 @@
MINECODE_PIPELINES_CONFIG_REPO = "https://github.com/aboutcode-data/minecode-pipelines-config/"


def compress_packages_file(packages_file, compressed_packages_file):
    """
    Gzip-compress the file at ``packages_file`` into
    ``compressed_packages_file``.
    """
    with open(packages_file, "rb") as source, gzip.open(compressed_packages_file, "wb") as target:
        shutil.copyfileobj(source, target)


def decompress_packages_file(compressed_packages_file, name):
    """
    Decompress the gzipped file at ``compressed_packages_file`` into a new
    temporary file named after ``name`` and return the temporary file path.
    """
    destination = get_temp_file(name)
    with gzip.open(compressed_packages_file, "rb") as compressed, open(destination, "wb") as plain:
        shutil.copyfileobj(compressed, plain)
    return destination


def write_packages_json(packages, name):
    """
    Serialize ``packages`` to a new temporary JSON file named after ``name``
    and return the temporary file path.
    """
    destination = get_temp_file(name)
    write_data_to_json_file(path=destination, data=packages)
    return destination


def fetch_checkpoint_from_github(config_repo, checkpoint_path):
repo_name = config_repo.split("github.com")[-1]
checkpoints_file = (
Expand All @@ -46,8 +73,6 @@ def get_checkpoint_from_file(cloned_repo, path):


def update_checkpoints_in_github(checkpoint, cloned_repo, path):
from scanpipe.pipes.federatedcode import commit_and_push_changes

checkpoint_path = os.path.join(cloned_repo.working_dir, path)
write_data_to_json_file(path=checkpoint_path, data=checkpoint)
commit_message = """Update federatedcode purl mining checkpoint"""
Expand All @@ -58,6 +83,17 @@ def update_checkpoints_in_github(checkpoint, cloned_repo, path):
)


def update_checkpoints_file_in_github(checkpoints_file, cloned_repo, path):
    """
    Move the local ``checkpoints_file`` to ``path`` inside the working tree
    of ``cloned_repo``, then commit and push the change.
    """
    destination = os.path.join(cloned_repo.working_dir, path)
    shutil.move(checkpoints_file, destination)
    commit_and_push_changes(
        repo=cloned_repo,
        files_to_commit=[destination],
        commit_message="Update federatedcode purl mining checkpoint",
    )


def get_mined_packages_from_checkpoint(config_repo, checkpoint_path):
checkpoint = fetch_checkpoint_from_github(
config_repo=config_repo,
Expand All @@ -79,6 +115,37 @@ def update_mined_packages_in_checkpoint(packages, config_repo, cloned_repo, chec
)


def update_checkpoint_state(
    cloned_repo,
    state,
    checkpoint_path,
    config_repo=MINECODE_PIPELINES_CONFIG_REPO,
):
    """
    Fetch the checkpoint at ``checkpoint_path`` from ``config_repo``, set its
    "state" field to ``state``, then commit and push the updated checkpoint
    through ``cloned_repo``.
    """
    checkpoint = fetch_checkpoint_from_github(
        config_repo=config_repo,
        checkpoint_path=checkpoint_path,
    )
    checkpoint["state"] = state
    update_checkpoints_in_github(
        checkpoint=checkpoint,
        cloned_repo=cloned_repo,
        path=checkpoint_path,
    )


def get_packages_file_from_checkpoint(config_repo, checkpoint_path, name):
    """
    Fetch the packages checkpoint at ``checkpoint_path`` from ``config_repo``
    and write it to a temporary JSON file named after ``name``; return the
    temporary file path.
    """
    checkpoint_data = fetch_checkpoint_from_github(
        config_repo=config_repo,
        checkpoint_path=checkpoint_path,
    )
    return write_packages_json(checkpoint_data, name=name)


def fetch_checkpoint_by_git(cloned_repo, checkpoint_path):
    """
    Pull the latest changes from the origin remote of ``cloned_repo`` and
    return the local filesystem path of ``checkpoint_path`` inside its
    working tree.
    """
    origin = cloned_repo.remotes.origin
    origin.pull()
    return os.path.join(cloned_repo.working_dir, checkpoint_path)


def write_packageurls_to_file(repo, base_dir, packageurls, append=False):
if not isinstance(packageurls, list):
raise Exception("`packageurls` needs to be a list")
Expand Down
Loading
Loading