From 9e933173b6a36937514cda5fe6a6495630092c80 Mon Sep 17 00:00:00 2001 From: Chin Yeung Li Date: Thu, 13 Nov 2025 07:52:50 +0800 Subject: [PATCH 1/7] Add "scan_maven_package" pipeline (Working-in-progress) #1763 Signed-off-by: Chin Yeung Li --- pyproject.toml | 1 + scanpipe/pipelines/scan_maven_package.py | 162 ++++++++++++++++++++++ scanpipe/pipes/resolve.py | 168 +++++++++++++++++++++++ 3 files changed, 331 insertions(+) create mode 100644 scanpipe/pipelines/scan_maven_package.py diff --git a/pyproject.toml b/pyproject.toml index f0ae21f332..3fea1404fb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -161,6 +161,7 @@ resolve_dependencies = "scanpipe.pipelines.resolve_dependencies:ResolveDependenc scan_codebase = "scanpipe.pipelines.scan_codebase:ScanCodebase" scan_for_virus = "scanpipe.pipelines.scan_for_virus:ScanForVirus" scan_single_package = "scanpipe.pipelines.scan_single_package:ScanSinglePackage" +scan_maven_package = "scanpipe.pipelines.scan_maven_package:ScanMavenPackage" [tool.setuptools.packages.find] where = ["."] diff --git a/scanpipe/pipelines/scan_maven_package.py b/scanpipe/pipelines/scan_maven_package.py new file mode 100644 index 0000000000..841899ff88 --- /dev/null +++ b/scanpipe/pipelines/scan_maven_package.py @@ -0,0 +1,162 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/aboutcode-org/scancode.io for support and download. + +import json + +from django.core.serializers.json import DjangoJSONEncoder + +from commoncode.hash import multi_checksums + +from scanpipe.pipelines import Pipeline +from scanpipe.pipes import input +from scanpipe.pipes import scancode +from scanpipe.pipes.input import copy_input +from scanpipe.pipes.input import is_archive + +from scanpipe.pipes.resolve import get_pom_url_list +from scanpipe.pipes.resolve import download_and_scan_pom_file + + +class ScanMavenPackage(Pipeline): + """ + Scan a single package archive (or package manifest file). + + This pipeline scans a single package for package metadata, + declared dependencies, licenses, license clarity score and copyrights. + + The output is a summary of the scan results in JSON format. 
+ """ + + @classmethod + def steps(cls): + return ( + cls.get_package_input, + cls.collect_input_information, + cls.extract_input_to_codebase_directory, + cls.extract_archives, + cls.run_scan, + cls.fetch_and_scan_remote_pom, + cls.load_inventory_from_toolkit_scan, + cls.make_summary_from_scan_results, + ) + + scancode_run_scan_args = { + "copyright": True, + "email": True, + "info": True, + "license": True, + "license_text": True, + "license_diagnostics": True, + "license_text_diagnostics": True, + "license_references": True, + "package": True, + "url": True, + "classify": True, + "summary": True, + "todo": True, + } + + def get_package_input(self): + """Locate the package input in the project's input/ directory.""" + # Using the input_sources model property as it includes input sources instances + # as well as any files manually copied into the input/ directory. + input_sources = self.project.input_sources + inputs = list(self.project.inputs("*")) + + if len(inputs) != 1 or len(input_sources) != 1: + raise Exception("Only 1 input file supported") + + self.input_path = inputs[0] + + def collect_input_information(self): + """Collect and store information about the project input.""" + self.project.update_extra_data( + { + "filename": self.input_path.name, + "size": self.input_path.stat().st_size, + **multi_checksums(self.input_path), + } + ) + + def extract_input_to_codebase_directory(self): + """Copy or extract input to project codebase/ directory.""" + if not is_archive(self.input_path): + copy_input(self.input_path, self.project.codebase_path) + return + + self.extract_archive(self.input_path, self.project.codebase_path) + + # Reload the project env post-extraction as the scancode-config.yml file + # may be located in one of the extracted archives. 
+ self.env = self.project.get_env() + + def run_scan(self): + """Scan extracted codebase/ content.""" + scan_output_path = self.project.get_output_file_path("scancode", "json") + self.scan_output_location = str(scan_output_path.absolute()) + + scanning_errors = scancode.run_scan( + location=str(self.project.codebase_path), + output_file=self.scan_output_location, + run_scan_args=self.scancode_run_scan_args.copy(), + ) + + for resource_path, errors in scanning_errors.items(): + self.project.add_error( + description="\n".join(errors), + model=self.pipeline_name, + details={"resource_path": resource_path.removeprefix("codebase/")}, + ) + + if not scan_output_path.exists(): + raise FileNotFoundError("ScanCode output not available.") + + def fetch_and_scan_remote_pom(self): + """Fetch the pom.xml file from from maven.org if not present in codebase.""" + # TODO Verify if the following filter actually work + if not self.project.codebaseresources.files().filter(name="pom.xml").exists(): + with open(self.scan_output_location, 'r') as file: + data = json.load(file) + packages = data.get("packages", []) + + pom_url_list = get_pom_url_list(self.project.input_sources[0], packages) + scanned_pom_packages, scanned_dependencies = download_and_scan_pom_file(pom_url_list) + + updated_pacakges = packages + scanned_pom_packages + # Replace/Update the package and dependencies section + data['packages'] = updated_pacakges + # Need to update the dependencies + # data['dependencies'] = scanned_dependencies + with open(self.scan_output_location, 'w') as file: + json.dump(data, file, indent=2) + + def load_inventory_from_toolkit_scan(self): + """Process a JSON Scan results to populate codebase resources and packages.""" + input.load_inventory_from_toolkit_scan(self.project, self.scan_output_location) + + def make_summary_from_scan_results(self): + """Build a summary in JSON format from the generated scan results.""" + summary = scancode.make_results_summary(self.project, 
self.scan_output_location) + output_file = self.project.get_output_file_path("summary", "json") + + with output_file.open("w") as summary_file: + summary_file.write(json.dumps(summary, indent=2, cls=DjangoJSONEncoder)) diff --git a/scanpipe/pipes/resolve.py b/scanpipe/pipes/resolve.py index 0a409dd88c..f9ea444301 100644 --- a/scanpipe/pipes/resolve.py +++ b/scanpipe/pipes/resolve.py @@ -22,6 +22,8 @@ import json import logging +import re +import requests import sys import uuid from pathlib import Path @@ -44,6 +46,9 @@ from scanpipe.pipes import update_or_create_dependency from scanpipe.pipes import update_or_create_package +from scanpipe.pipes import fetch +from scanpipe.pipes import scancode + """ Resolve packages from manifest, lockfile, and SBOM. """ @@ -521,3 +526,166 @@ def extract_headers(input_location, extract_fields): return extracted_headers return {} + + +def parse_maven_filename(filename): + """Parse a Maven's jar filename to extract artifactId and version.""" + # Remove the .jar extension + base = filename.rsplit('.', 1)[0] + + # Common classifiers pattern + common_classifiers = { + 'sources', 'javadoc', 'tests', 'test', 'test-sources', + 'src', 'bin', 'docs', 'javadocs', 'client', 'server', + 'linux', 'windows', 'macos', 'linux-x86_64', 'windows-x86_64' + } + + # Remove known classifier if present + for classifier in common_classifiers: + if base.endswith(f"-{classifier}"): + base = base[:-(len(classifier) + 1)] + break + + # Match artifactId and version + match = re.match(r'^(.*)-(\d[\w.\-]+)$', base) + if match: + artifact_id = match.group(1) + version = match.group(2) + return artifact_id, version + else: + return None, None + + +def get_pom_url_list(input_source, packages): + pom_url_list = [] + if packages: + for package in packages: + package_ns = package.get("namespace", "") + package_name = package.get("name", "") + package_version = package.get("version", "") + pom_url = f"https://repo1.maven.org/maven2/{package_ns.replace('.', 
'/')}/{package_name}/{package_version}/{package_name}-{package_version}.pom".lower() + pom_url_list.append(pom_url) + else: + # Check what's the input source + input_source_url = input_source.get("download_url", "") + + if input_source_url and "maven.org/" in input_source_url: + base_url = input_source_url.rsplit('/', 1)[0] + pom_url = base_url + "/" + "-".join(base_url.rstrip("/").split("/")[-2:]) + ".pom" + pom_url_list.append(pom_url) + else: + # Construct a pom_url from filename + input_filename = input_source.get("filename", "") + if input_filename.endswith(".jar"): + artifact_id, version = parse_maven_filename(input_filename) + if not artifact_id or not version: + return [] + pom_url_list = construct_pom_url_from_filename(artifact_id, version) + else: + # Only work with input that's a .jar file + return [] + + return pom_url_list + + +def construct_pom_url_from_filename(artifact_id, version): + """Construct a pom.xml URL from the given Maven filename.""" + # Search Maven Central for the artifact to get its groupId + url = f"https://search.maven.org/solrsearch/select?q=a:{artifact_id}&wt=json" + pom_url_list = [] + group_ids = [] + try: + response = requests.get(url) + response.raise_for_status() + data = response.json() + # Extract all 'g' fields from the docs array that represent + # groupIds + group_ids = [doc['g'] for doc in data['response']['docs']] + except requests.RequestException as e: + print(f"Error fetching data: {e}") + return [] + except KeyError as e: + print(f"Error parsing JSON: {e}") + return [] + + for group_id in group_ids: + pom_url = f"https://repo1.maven.org/maven2/{group_id.replace('.', '/')}/{artifact_id}/{version}/{artifact_id}-{version}.pom".lower() + if is_maven_pom_url(pom_url): + pom_url_list.append(pom_url) + if len(pom_url_list) > 1: + # If multiple valid POM URLs are found, it means the same + # artifactId and version exist under different groupIds. 
Since we + # can't confidently determine the correct groupId, we return an + # empty list to avoid fetching the wrong POM. + return [] + + return pom_url_list + + +def is_maven_pom_url(url): + """ + Return True if the url is a accessible, False otherwise + Maven Central has a fallback mechanism that serves a generic/error page + instead of returning a proper 404. + """ + try: + response = requests.get(url, timeout=5) + if response.status_code != 200: + return False + # Check content-type + content_type = response.headers.get('content-type', '').lower() + is_xml = 'xml' in content_type or 'text/xml' in content_type + + # Check content + content = response.text.strip() + is_pom = content.startswith(' Date: Thu, 13 Nov 2025 09:58:45 +0800 Subject: [PATCH 2/7] Use pom_url as the datafile_path for the dependenies #1763 Signed-off-by: Chin Yeung Li --- scanpipe/pipes/resolve.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scanpipe/pipes/resolve.py b/scanpipe/pipes/resolve.py index f9ea444301..431108a7a6 100644 --- a/scanpipe/pipes/resolve.py +++ b/scanpipe/pipes/resolve.py @@ -685,7 +685,7 @@ def download_and_scan_pom_file(pom_url_list): scanned_pom_packages.append(scanned_package) if scanned_dependencies: for scanned_dep in scanned_dependencies: - # Replace the 'datafile_path' with the empty list - scanned_dep['datafile_path'] = scanned_pom_output_path + # Replace the 'datafile_path' with the pom_url + scanned_dep['datafile_path'] = pom_url scanned_pom_deps.append(scanned_dep) return scanned_pom_packages, scanned_pom_deps From d63a1e539094239244280e85e4ec7ba3de6de82f Mon Sep 17 00:00:00 2001 From: Chin Yeung Li Date: Thu, 13 Nov 2025 17:36:50 +0800 Subject: [PATCH 3/7] Use empty string for datafile_path in dependencies #1763 Signed-off-by: Chin Yeung Li --- scanpipe/pipes/resolve.py | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/scanpipe/pipes/resolve.py b/scanpipe/pipes/resolve.py index 
431108a7a6..c5a32918ef 100644 --- a/scanpipe/pipes/resolve.py +++ b/scanpipe/pipes/resolve.py @@ -557,6 +557,7 @@ def parse_maven_filename(filename): def get_pom_url_list(input_source, packages): + """Generate Maven POM URLs from package metadata or input source.""" pom_url_list = [] if packages: for package in packages: @@ -623,11 +624,9 @@ def construct_pom_url_from_filename(artifact_id, version): def is_maven_pom_url(url): - """ - Return True if the url is a accessible, False otherwise - Maven Central has a fallback mechanism that serves a generic/error page - instead of returning a proper 404. - """ + """Return True if the url is a accessible, False otherwise""" + # Maven Central has a fallback mechanism that serves a generic/error + # page instead of returning a proper 404. try: response = requests.get(url, timeout=5) if response.status_code != 200: @@ -650,6 +649,7 @@ def is_maven_pom_url(url): def download_and_scan_pom_file(pom_url_list): + """Fetch and scan the pom file from the input pom_url_list""" scanned_pom_packages = [] scanned_pom_deps = [] for pom_url in pom_url_list: @@ -661,16 +661,7 @@ def download_and_scan_pom_file(pom_url_list): location=str(downloaded_pom.path), output_file=scanned_pom_output_path, run_scan_args={ - "copyright": True, - "email": True, - "info": True, - "license": True, - "license_text": True, - "license_diagnostics": True, - "license_text_diagnostics": True, - "license_references": True, "package": True, - "url": True, }, ) @@ -685,7 +676,8 @@ def download_and_scan_pom_file(pom_url_list): scanned_pom_packages.append(scanned_package) if scanned_dependencies: for scanned_dep in scanned_dependencies: - # Replace the 'datafile_path' with the pom_url - scanned_dep['datafile_path'] = pom_url + # Replace the 'datafile_path' with empty string + # See https://github.com/aboutcode-org/scancode.io/issues/1763#issuecomment-3525165830 + scanned_dep['datafile_path'] = "" scanned_pom_deps.append(scanned_dep) return scanned_pom_packages, 
scanned_pom_deps From 9812129f0f1a63d496d514d38710a62a93bedb65 Mon Sep 17 00:00:00 2001 From: Chin Yeung Li Date: Thu, 13 Nov 2025 17:37:18 +0800 Subject: [PATCH 4/7] Removed dup code that's already present in ScanSinglePackage #1763 Signed-off-by: Chin Yeung Li --- scanpipe/pipelines/scan_maven_package.py | 128 ++++------------------- 1 file changed, 19 insertions(+), 109 deletions(-) diff --git a/scanpipe/pipelines/scan_maven_package.py b/scanpipe/pipelines/scan_maven_package.py index 841899ff88..a06697ba6f 100644 --- a/scanpipe/pipelines/scan_maven_package.py +++ b/scanpipe/pipelines/scan_maven_package.py @@ -22,21 +22,13 @@ import json -from django.core.serializers.json import DjangoJSONEncoder - -from commoncode.hash import multi_checksums - -from scanpipe.pipelines import Pipeline -from scanpipe.pipes import input -from scanpipe.pipes import scancode -from scanpipe.pipes.input import copy_input -from scanpipe.pipes.input import is_archive +from scanpipe.pipelines.scan_single_package import ScanSinglePackage from scanpipe.pipes.resolve import get_pom_url_list from scanpipe.pipes.resolve import download_and_scan_pom_file -class ScanMavenPackage(Pipeline): +class ScanMavenPackage(ScanSinglePackage): """ Scan a single package archive (or package manifest file). @@ -59,104 +51,22 @@ def steps(cls): cls.make_summary_from_scan_results, ) - scancode_run_scan_args = { - "copyright": True, - "email": True, - "info": True, - "license": True, - "license_text": True, - "license_diagnostics": True, - "license_text_diagnostics": True, - "license_references": True, - "package": True, - "url": True, - "classify": True, - "summary": True, - "todo": True, - } - - def get_package_input(self): - """Locate the package input in the project's input/ directory.""" - # Using the input_sources model property as it includes input sources instances - # as well as any files manually copied into the input/ directory. 
- input_sources = self.project.input_sources - inputs = list(self.project.inputs("*")) - - if len(inputs) != 1 or len(input_sources) != 1: - raise Exception("Only 1 input file supported") - - self.input_path = inputs[0] - - def collect_input_information(self): - """Collect and store information about the project input.""" - self.project.update_extra_data( - { - "filename": self.input_path.name, - "size": self.input_path.stat().st_size, - **multi_checksums(self.input_path), - } - ) - - def extract_input_to_codebase_directory(self): - """Copy or extract input to project codebase/ directory.""" - if not is_archive(self.input_path): - copy_input(self.input_path, self.project.codebase_path) - return - - self.extract_archive(self.input_path, self.project.codebase_path) - - # Reload the project env post-extraction as the scancode-config.yml file - # may be located in one of the extracted archives. - self.env = self.project.get_env() - - def run_scan(self): - """Scan extracted codebase/ content.""" - scan_output_path = self.project.get_output_file_path("scancode", "json") - self.scan_output_location = str(scan_output_path.absolute()) - - scanning_errors = scancode.run_scan( - location=str(self.project.codebase_path), - output_file=self.scan_output_location, - run_scan_args=self.scancode_run_scan_args.copy(), - ) - - for resource_path, errors in scanning_errors.items(): - self.project.add_error( - description="\n".join(errors), - model=self.pipeline_name, - details={"resource_path": resource_path.removeprefix("codebase/")}, - ) - - if not scan_output_path.exists(): - raise FileNotFoundError("ScanCode output not available.") - def fetch_and_scan_remote_pom(self): """Fetch the pom.xml file from from maven.org if not present in codebase.""" - # TODO Verify if the following filter actually work - if not self.project.codebaseresources.files().filter(name="pom.xml").exists(): - with open(self.scan_output_location, 'r') as file: - data = json.load(file) - packages = 
data.get("packages", []) - - pom_url_list = get_pom_url_list(self.project.input_sources[0], packages) - scanned_pom_packages, scanned_dependencies = download_and_scan_pom_file(pom_url_list) - - updated_pacakges = packages + scanned_pom_packages - # Replace/Update the package and dependencies section - data['packages'] = updated_pacakges - # Need to update the dependencies - # data['dependencies'] = scanned_dependencies - with open(self.scan_output_location, 'w') as file: - json.dump(data, file, indent=2) - - def load_inventory_from_toolkit_scan(self): - """Process a JSON Scan results to populate codebase resources and packages.""" - input.load_inventory_from_toolkit_scan(self.project, self.scan_output_location) - - def make_summary_from_scan_results(self): - """Build a summary in JSON format from the generated scan results.""" - summary = scancode.make_results_summary(self.project, self.scan_output_location) - output_file = self.project.get_output_file_path("summary", "json") - - with output_file.open("w") as summary_file: - summary_file.write(json.dumps(summary, indent=2, cls=DjangoJSONEncoder)) + with open(self.scan_output_location, 'r') as file: + data = json.load(file) + # Return and do nothing if data has pom.xml + for file in data['files']: + if 'pom.xml' in file['path']: + return + packages = data.get("packages", []) + + pom_url_list = get_pom_url_list(self.project.input_sources[0], packages) + scanned_pom_packages, scanned_dependencies = download_and_scan_pom_file(pom_url_list) + + updated_pacakges = packages + scanned_pom_packages + # Replace/Update the package and dependencies section + data['packages'] = updated_pacakges + data['dependencies'] = scanned_dependencies + with open(self.scan_output_location, 'w') as file: + json.dump(data, file, indent=2) From 114eb75f8bf77d5cf5212facb0e01d726428d93e Mon Sep 17 00:00:00 2001 From: Chin Yeung Li Date: Thu, 13 Nov 2025 18:38:51 +0800 Subject: [PATCH 5/7] Update the matching regex for parse_maven_filename and 
added test #1763 - Update format Signed-off-by: Chin Yeung Li --- scanpipe/pipelines/scan_maven_package.py | 19 +++---- scanpipe/pipes/resolve.py | 68 ++++++++++++++++-------- scanpipe/tests/pipes/test_resolve.py | 27 ++++++++++ 3 files changed, 83 insertions(+), 31 deletions(-) diff --git a/scanpipe/pipelines/scan_maven_package.py b/scanpipe/pipelines/scan_maven_package.py index a06697ba6f..00ddc5e500 100644 --- a/scanpipe/pipelines/scan_maven_package.py +++ b/scanpipe/pipelines/scan_maven_package.py @@ -23,9 +23,8 @@ import json from scanpipe.pipelines.scan_single_package import ScanSinglePackage - -from scanpipe.pipes.resolve import get_pom_url_list from scanpipe.pipes.resolve import download_and_scan_pom_file +from scanpipe.pipes.resolve import get_pom_url_list class ScanMavenPackage(ScanSinglePackage): @@ -53,20 +52,22 @@ def steps(cls): def fetch_and_scan_remote_pom(self): """Fetch the pom.xml file from from maven.org if not present in codebase.""" - with open(self.scan_output_location, 'r') as file: + with open(self.scan_output_location) as file: data = json.load(file) # Return and do nothing if data has pom.xml - for file in data['files']: - if 'pom.xml' in file['path']: + for file in data["files"]: + if "pom.xml" in file["path"]: return packages = data.get("packages", []) pom_url_list = get_pom_url_list(self.project.input_sources[0], packages) - scanned_pom_packages, scanned_dependencies = download_and_scan_pom_file(pom_url_list) + scanned_pom_packages, scanned_dependencies = download_and_scan_pom_file( + pom_url_list + ) updated_pacakges = packages + scanned_pom_packages # Replace/Update the package and dependencies section - data['packages'] = updated_pacakges - data['dependencies'] = scanned_dependencies - with open(self.scan_output_location, 'w') as file: + data["packages"] = updated_pacakges + data["dependencies"] = scanned_dependencies + with open(self.scan_output_location, "w") as file: json.dump(data, file, indent=2) diff --git 
a/scanpipe/pipes/resolve.py b/scanpipe/pipes/resolve.py index c5a32918ef..e94de32e41 100644 --- a/scanpipe/pipes/resolve.py +++ b/scanpipe/pipes/resolve.py @@ -23,7 +23,6 @@ import json import logging import re -import requests import sys import uuid from pathlib import Path @@ -32,6 +31,7 @@ from django.core.exceptions import ObjectDoesNotExist import python_inspector.api as python_inspector +import requests import saneyaml from attributecode.model import About from packagedcode import APPLICATION_PACKAGE_DATAFILE_HANDLERS @@ -41,14 +41,13 @@ from scanpipe.models import DiscoveredDependency from scanpipe.models import DiscoveredPackage from scanpipe.pipes import cyclonedx +from scanpipe.pipes import fetch from scanpipe.pipes import flag +from scanpipe.pipes import scancode from scanpipe.pipes import spdx from scanpipe.pipes import update_or_create_dependency from scanpipe.pipes import update_or_create_package -from scanpipe.pipes import fetch -from scanpipe.pipes import scancode - """ Resolve packages from manifest, lockfile, and SBOM. 
""" @@ -531,26 +530,42 @@ def extract_headers(input_location, extract_fields): def parse_maven_filename(filename): """Parse a Maven's jar filename to extract artifactId and version.""" # Remove the .jar extension - base = filename.rsplit('.', 1)[0] + base = filename.rsplit(".", 1)[0] # Common classifiers pattern common_classifiers = { - 'sources', 'javadoc', 'tests', 'test', 'test-sources', - 'src', 'bin', 'docs', 'javadocs', 'client', 'server', - 'linux', 'windows', 'macos', 'linux-x86_64', 'windows-x86_64' + "sources", + "javadoc", + "tests", + "test", + "test-sources", + "src", + "bin", + "docs", + "javadocs", + "client", + "server", + "linux", + "windows", + "macos", + "linux-x86_64", + "windows-x86_64", } # Remove known classifier if present for classifier in common_classifiers: if base.endswith(f"-{classifier}"): - base = base[:-(len(classifier) + 1)] + base = base[: -(len(classifier) + 1)] break # Match artifactId and version - match = re.match(r'^(.*)-(\d[\w.\-]+)$', base) + match = re.match(r"^(.*?)-((\d[\w.\-]*))$", base) + if match: artifact_id = match.group(1) version = match.group(2) + print("artifact_id", artifact_id) + print("version", version) return artifact_id, version else: return None, None @@ -564,15 +579,21 @@ def get_pom_url_list(input_source, packages): package_ns = package.get("namespace", "") package_name = package.get("name", "") package_version = package.get("version", "") - pom_url = f"https://repo1.maven.org/maven2/{package_ns.replace('.', '/')}/{package_name}/{package_version}/{package_name}-{package_version}.pom".lower() + pom_url = ( + f"https://repo1.maven.org/maven2/{package_ns.replace('.', '/')}/" + f"{package_name}/{package_version}/" + f"{package_name}-{package_version}.pom".lower() + ) pom_url_list.append(pom_url) else: # Check what's the input source input_source_url = input_source.get("download_url", "") if input_source_url and "maven.org/" in input_source_url: - base_url = input_source_url.rsplit('/', 1)[0] - pom_url = 
base_url + "/" + "-".join(base_url.rstrip("/").split("/")[-2:]) + ".pom" + base_url = input_source_url.rsplit("/", 1)[0] + pom_url = ( + base_url + "/" + "-".join(base_url.rstrip("/").split("/")[-2:]) + ".pom" + ) pom_url_list.append(pom_url) else: # Construct a pom_url from filename @@ -596,12 +617,12 @@ def construct_pom_url_from_filename(artifact_id, version): pom_url_list = [] group_ids = [] try: - response = requests.get(url) + response = requests.get(url, timeout=5) response.raise_for_status() data = response.json() # Extract all 'g' fields from the docs array that represent # groupIds - group_ids = [doc['g'] for doc in data['response']['docs']] + group_ids = [doc["g"] for doc in data["response"]["docs"]] except requests.RequestException as e: print(f"Error fetching data: {e}") return [] @@ -610,7 +631,10 @@ def construct_pom_url_from_filename(artifact_id, version): return [] for group_id in group_ids: - pom_url = f"https://repo1.maven.org/maven2/{group_id.replace('.', '/')}/{artifact_id}/{version}/{artifact_id}-{version}.pom".lower() + pom_url = ( + f"https://repo1.maven.org/maven2/{group_id.replace('.', '/')}/" + f"{artifact_id}/{version}/{artifact_id}-{version}.pom".lower() + ) if is_maven_pom_url(pom_url): pom_url_list.append(pom_url) if len(pom_url_list) > 1: @@ -632,12 +656,12 @@ def is_maven_pom_url(url): if response.status_code != 200: return False # Check content-type - content_type = response.headers.get('content-type', '').lower() - is_xml = 'xml' in content_type or 'text/xml' in content_type + content_type = response.headers.get("content-type", "").lower() + is_xml = "xml" in content_type or "text/xml" in content_type # Check content content = response.text.strip() - is_pom = content.startswith(' Date: Fri, 14 Nov 2025 15:21:15 +0800 Subject: [PATCH 6/7] Refactor code and add tests #1763 Signed-off-by: Chin Yeung Li --- scanpipe/pipelines/scan_maven_package.py | 10 +- scanpipe/pipes/resolve.py | 30 +++- scanpipe/tests/pipes/test_resolve.py | 186 
+++++++++++++++++++++++ 3 files changed, 215 insertions(+), 11 deletions(-) diff --git a/scanpipe/pipelines/scan_maven_package.py b/scanpipe/pipelines/scan_maven_package.py index 00ddc5e500..7438d781c0 100644 --- a/scanpipe/pipelines/scan_maven_package.py +++ b/scanpipe/pipelines/scan_maven_package.py @@ -23,8 +23,9 @@ import json from scanpipe.pipelines.scan_single_package import ScanSinglePackage -from scanpipe.pipes.resolve import download_and_scan_pom_file +from scanpipe.pipes.resolve import download_pom_files from scanpipe.pipes.resolve import get_pom_url_list +from scanpipe.pipes.resolve import scan_pom_files class ScanMavenPackage(ScanSinglePackage): @@ -51,7 +52,7 @@ def steps(cls): ) def fetch_and_scan_remote_pom(self): - """Fetch the pom.xml file from from maven.org if not present in codebase.""" + """Fetch the .pom file from from maven.org if not present in codebase.""" with open(self.scan_output_location) as file: data = json.load(file) # Return and do nothing if data has pom.xml @@ -61,9 +62,8 @@ def fetch_and_scan_remote_pom(self): packages = data.get("packages", []) pom_url_list = get_pom_url_list(self.project.input_sources[0], packages) - scanned_pom_packages, scanned_dependencies = download_and_scan_pom_file( - pom_url_list - ) + pom_file_list = download_pom_files(pom_url_list) + scanned_pom_packages, scanned_dependencies = scan_pom_files(pom_file_list) updated_pacakges = packages + scanned_pom_packages # Replace/Update the package and dependencies section diff --git a/scanpipe/pipes/resolve.py b/scanpipe/pipes/resolve.py index e94de32e41..b672df13bc 100644 --- a/scanpipe/pipes/resolve.py +++ b/scanpipe/pipes/resolve.py @@ -586,10 +586,13 @@ def get_pom_url_list(input_source, packages): ) pom_url_list.append(pom_url) else: + from urllib.parse import urlparse + # Check what's the input source input_source_url = input_source.get("download_url", "") - if input_source_url and "maven.org/" in input_source_url: + parsed_url = urlparse(input_source_url) + 
if input_source_url and parsed_url.netloc.endswith("maven.org"): base_url = input_source_url.rsplit("/", 1)[0] pom_url = ( base_url + "/" + "-".join(base_url.rstrip("/").split("/")[-2:]) + ".pom" @@ -672,17 +675,32 @@ def is_maven_pom_url(url): return False -def download_and_scan_pom_file(pom_url_list): +def download_pom_files(pom_url_list): + """Fetch the pom file from the input pom_url_list""" + pom_file_list = [] + for pom_url in pom_url_list: + pom_file_dict = {} + downloaded_pom = fetch.fetch_http(pom_url) + print("download_pom.path", str(downloaded_pom.path)) + pom_file_dict["pom_file_path"] = str(downloaded_pom.path) + pom_file_dict["output_path"] = str(downloaded_pom.path) + "-output.json" + pom_file_dict["pom_url"] = pom_url + pom_file_list.append(pom_file_dict) + return pom_file_list + + +def scan_pom_files(pom_file_list): """Fetch and scan the pom file from the input pom_url_list""" scanned_pom_packages = [] scanned_pom_deps = [] - for pom_url in pom_url_list: - downloaded_pom = fetch.fetch_http(pom_url) - scanned_pom_output_path = str(downloaded_pom.path) + "-output.json" + for pom_file_dict in pom_file_list: + pom_file_path = pom_file_dict.get("pom_file_path", "") + scanned_pom_output_path = pom_file_dict.get("output_path", "") + pom_url = pom_file_dict.get("pom_url", "") # Run a package scan on the fetched pom.xml _scanning_errors = scancode.run_scan( - location=str(downloaded_pom.path), + location=pom_file_path, output_file=scanned_pom_output_path, run_scan_args={ "package": True, diff --git a/scanpipe/tests/pipes/test_resolve.py b/scanpipe/tests/pipes/test_resolve.py index f60608ee04..016d00bd7c 100644 --- a/scanpipe/tests/pipes/test_resolve.py +++ b/scanpipe/tests/pipes/test_resolve.py @@ -400,3 +400,189 @@ def test_scanpipe_resolve_parse_maven_filename(self): self.assertEqual(result3_version, expected3_version) self.assertEqual(result4_name, expected2_name) self.assertEqual(result4_version, expected2_version) + + @mock.patch("requests.get") + def 
test_scanpipe_resolve_is_maven_pom_url_valid(self, mock_get): + mock_response = mock.Mock() + mock_response.status_code = 200 + mock_response.headers = {"content-type": "application/xml"} + mock_response.text = '' + mock_get.return_value = mock_response + + result = resolve.is_maven_pom_url( + "https://repo1.maven.org/maven2/example/example.pom" + ) + self.assertTrue(result) + + @mock.patch("requests.get") + def test_scanpipe_resolve_is_maven_pom_url_404(self, mock_get): + mock_response = mock.Mock() + mock_response.status_code = 404 + mock_get.return_value = mock_response + + result = resolve.is_maven_pom_url( + "https://repo.maven.apache.org/maven2/example/404.pom" + ) + self.assertFalse(result) + + @mock.patch("requests.get") + def test_scanpipe_resolve_is_maven_pom_url_error(self, mock_get): + mock_response = mock.Mock() + mock_response.status_code = 200 + mock_response.headers = {"content-type": "text/html"} + mock_response.text = "Error page" + mock_get.return_value = mock_response + + result = resolve.is_maven_pom_url( + "https://repo.maven.apache.org/maven2/example/error.pom" + ) + self.assertFalse(result) + + @mock.patch("scanpipe.pipes.resolve.fetch.fetch_http") + def test_scanpipe_resolve_download_pom_files(self, mock_fetch_http): + mock_response = mock.Mock() + mock_response.path = "/safe/example1.pom" + mock_fetch_http.return_value = mock_response + + pom_urls = ["https://repo1.maven.org/maven2/example/example1.pom"] + + expected = [ + { + "pom_file_path": "/safe/example1.pom", + "output_path": "/safe/example1.pom-output.json", + "pom_url": "https://repo1.maven.org/maven2/example/example1.pom", + } + ] + + result = resolve.download_pom_files(pom_urls) + self.assertEqual(result, expected) + + @mock.patch("scanpipe.pipes.resolve.scancode.run_scan") + @mock.patch("builtins.open", new_callable=mock.mock_open) + @mock.patch("json.load") + def test_scanpipe_resolve_scan_pom_files( + self, mock_json_load, mock_open, mock_run_scan + ): + 
mock_json_load.return_value = { + "packages": [ + { + "name": "example-package", + "version": "1.0.0", + "datafile_paths": ["/safe/mock_pom.xml"], + } + ], + "dependencies": [ + { + "name": "example-dep", + "version": "2.0.0", + "datafile_path": "/safe/mock_pom.xml", + } + ], + } + + pom_file_list = [ + { + "pom_file_path": "/safe/mock.pom", + "output_path": "/safe/mock.pom-output.json", + "pom_url": "https://repo1.maven.org/maven2/example/example.pom", + } + ] + + expected_packages = [ + { + "name": "example-package", + "version": "1.0.0", + "datafile_paths": [ + "https://repo1.maven.org/maven2/example/example.pom" + ], + } + ] + expected_deps = [ + {"name": "example-dep", "version": "2.0.0", "datafile_path": ""} + ] + + packages, deps = resolve.scan_pom_files(pom_file_list) + + self.assertEqual(packages, expected_packages) + self.assertEqual(deps, expected_deps) + + mock_run_scan.assert_called_once_with( + location="/safe/mock.pom", + output_file="/safe/mock.pom-output.json", + run_scan_args={"package": True}, + ) + mock_open.assert_called_once_with("/safe/mock.pom-output.json") + mock_json_load.assert_called_once() + + @mock.patch("scanpipe.pipes.resolve.is_maven_pom_url") + @mock.patch("scanpipe.pipes.resolve.requests.get") + def test_scanpipe_resolve_construct_pom_url_from_filename( + self, mock_get, mock_is_maven_pom_url + ): + # Setup mock response from Maven Central + mock_response = mock.Mock() + mock_response.raise_for_status.return_value = None + mock_response.json.return_value = { + "response": {"docs": [{"g": "org.apache.commons"}]} + } + mock_get.return_value = mock_response + mock_is_maven_pom_url.return_value = True + + # Inputs + artifact_id = "commons-lang3" + version = "3.12.0" + + expected_url = [ + "https://repo1.maven.org/maven2/org/apache/commons/commons-lang3/3.12.0/commons-lang3-3.12.0.pom" + ] + + result = resolve.construct_pom_url_from_filename(artifact_id, version) + + self.assertEqual(result, expected_url) + 
mock_get.assert_called_once_with( + "https://search.maven.org/solrsearch/select?q=a:commons-lang3&wt=json", + timeout=5, + ) + mock_is_maven_pom_url.assert_called_once_with(expected_url[0]) + + def test_scanpipe_resolve_get_pom_url_list_with_packages(self): + packages = [ + { + "namespace": "org.apache.commons", + "name": "commons-lang3", + "version": "3.12.0", + } + ] + result = resolve.get_pom_url_list({}, packages) + expected = [ + "https://repo1.maven.org/maven2/org/apache/commons/commons-lang3/3.12.0/commons-lang3-3.12.0.pom" + ] + self.assertEqual(result, expected) + + def test_scanpipe_resolve_get_pom_url_list_with_maven_download_url(self): + input_source = { + "download_url": "https://repo1.maven.org/maven2/org/apache/commons/commons-lang3/3.12.0/commons-lang3-3.12.0.jar" + } + result = resolve.get_pom_url_list(input_source, []) + expected = [ + "https://repo1.maven.org/maven2/org/apache/commons/commons-lang3/3.12.0/commons-lang3-3.12.0.pom" + ] + self.assertEqual(result, expected) + + @mock.patch("scanpipe.pipes.resolve.construct_pom_url_from_filename") + @mock.patch("scanpipe.pipes.resolve.parse_maven_filename") + def test_scanpipe_resolve_get_pom_url_list_with_jar_filename( + self, mock_parse, mock_construct + ): + input_source = {"filename": "commons-lang3-3.12.0.jar"} + mock_parse.return_value = ("commons-lang3", "3.12.0") + mock_construct.return_value = [ + "https://repo1.maven.org/maven2/org/apache/commons/commons-lang3/3.12.0/commons-lang3-3.12.0.pom" + ] + result = resolve.get_pom_url_list(input_source, []) + self.assertEqual(result, mock_construct.return_value) + + def test_scanpipe_resolve_get_pom_url_list_with_invalid_filename(self): + input_source = {"filename": "not-a-jar.txt"} + result = resolve.get_pom_url_list(input_source, []) + self.assertEqual(result, []) From cb623c1c43aea551f6442dfe01b561affd6621cf Mon Sep 17 00:00:00 2001 From: Chin Yeung Li Date: Mon, 17 Nov 2025 18:08:24 +0800 Subject: [PATCH 7/7] Implement 
"update_package_license_from_resource_if_missing" function #1763

 - Update package's license if missing while the same package has license
   detected in RESOURCES
 - Parenthesize each detected expression before combining them with AND so
   that compound expressions keep their meaning
 - Ignore resources that have no "for_packages" entry instead of failing

Signed-off-by: Chin Yeung Li
---
 scanpipe/pipelines/scan_maven_package.py  |  5 ++-
 scanpipe/pipelines/scan_single_package.py | 19 +++++
 scanpipe/pipes/resolve.py                 | 38 ++++++++++
 scanpipe/tests/pipes/test_resolve.py      | 84 +++++++++++++++++++++++
 4 files changed, 144 insertions(+), 2 deletions(-)

diff --git a/scanpipe/pipelines/scan_maven_package.py b/scanpipe/pipelines/scan_maven_package.py
index 7438d781c0..6b86b06791 100644
--- a/scanpipe/pipelines/scan_maven_package.py
+++ b/scanpipe/pipelines/scan_maven_package.py
@@ -47,6 +47,7 @@ def steps(cls):
             cls.extract_archives,
             cls.run_scan,
             cls.fetch_and_scan_remote_pom,
+            cls.update_package_license_from_resource_if_missing,
             cls.load_inventory_from_toolkit_scan,
             cls.make_summary_from_scan_results,
         )
@@ -65,9 +66,9 @@ def fetch_and_scan_remote_pom(self):
         pom_file_list = download_pom_files(pom_url_list)
         scanned_pom_packages, scanned_dependencies = scan_pom_files(pom_file_list)
 
-        updated_pacakges = packages + scanned_pom_packages
+        updated_packages = packages + scanned_pom_packages
         # Replace/Update the package and dependencies section
-        data["packages"] = updated_pacakges
+        data["packages"] = updated_packages
         data["dependencies"] = scanned_dependencies
         with open(self.scan_output_location, "w") as file:
             json.dump(data, file, indent=2)
diff --git a/scanpipe/pipelines/scan_single_package.py b/scanpipe/pipelines/scan_single_package.py
index 605ef0ea5d..7f0bf8b909 100644
--- a/scanpipe/pipelines/scan_single_package.py
+++ b/scanpipe/pipelines/scan_single_package.py
@@ -31,6 +31,7 @@
 from scanpipe.pipes import scancode
 from scanpipe.pipes.input import copy_input
 from scanpipe.pipes.input import is_archive
+from scanpipe.pipes.resolve import update_package_license_from_resource_if_missing
 
 
 class ScanSinglePackage(Pipeline):
@@ -51,6 +52,7 @@ def steps(cls):
             cls.extract_input_to_codebase_directory,
             cls.extract_archives,
             cls.run_scan,
+            cls.update_package_license_from_resource_if_missing,
             cls.load_inventory_from_toolkit_scan,
             cls.make_summary_from_scan_results,
         )
@@ -126,6 +128,23 @@ def run_scan(self):
         if not scan_output_path.exists():
             raise FileNotFoundError("ScanCode output not available.")
 
+    def update_package_license_from_resource_if_missing(self):
+        """Update PACKAGE license from the license detected in RESOURCES if missing."""
+        with open(self.scan_output_location) as file:
+            data = json.load(file)
+        packages = data.get("packages", [])
+        resources = data.get("files", [])
+        if not packages or not resources:
+            return
+
+        updated_packages = update_package_license_from_resource_if_missing(
+            packages, resources
+        )
+        # Update the package section
+        data["packages"] = updated_packages
+        with open(self.scan_output_location, "w") as file:
+            json.dump(data, file, indent=2)
+
     def load_inventory_from_toolkit_scan(self):
         """Process a JSON Scan results to populate codebase resources and packages."""
         input.load_inventory_from_toolkit_scan(self.project, self.scan_output_location)
diff --git a/scanpipe/pipes/resolve.py b/scanpipe/pipes/resolve.py
index b672df13bc..4f7ed272c5 100644
--- a/scanpipe/pipes/resolve.py
+++ b/scanpipe/pipes/resolve.py
@@ -723,3 +723,41 @@ def scan_pom_files(pom_file_list):
                 scanned_dep["datafile_path"] = ""
                 scanned_pom_deps.append(scanned_dep)
     return scanned_pom_packages, scanned_pom_deps
+
+
+def update_package_license_from_resource_if_missing(packages, resources):
+    """
+    Populate missing package licenses from the licenses detected in resources.
+
+    For each package without a ``declared_license_expression``, combine with
+    ``AND`` the distinct ``detected_license_expression`` of the resources
+    assigned to that package through ``for_packages``, and set the result as
+    the package ``declared_license_expression``.
+    Packages are updated in place and returned in a new list, in input order.
+    """
+    from license_expression import Licensing
+
+    licensing = Licensing()
+    updated_packages = []
+    for package in packages:
+        if not package.get("declared_license_expression"):
+            package_uid = package.get("package_uid")
+            # A dict drops duplicates while keeping the first-seen order.
+            detected_expressions = {}
+            for resource in resources:
+                expression = resource.get("detected_license_expression")
+                # "for_packages" may be missing or null on a resource entry.
+                for_packages = resource.get("for_packages") or []
+                if expression and package_uid in for_packages:
+                    detected_expressions[expression] = None
+            if detected_expressions:
+                # Parenthesize each expression so that a compound one such as
+                # "mit OR apache-2.0" keeps its meaning once joined with AND.
+                license_expression = " AND ".join(
+                    f"({expression})" for expression in detected_expressions
+                )
+                package["declared_license_expression"] = str(
+                    licensing.dedup(license_expression)
+                )
+        updated_packages.append(package)
+    return updated_packages
diff --git a/scanpipe/tests/pipes/test_resolve.py b/scanpipe/tests/pipes/test_resolve.py
index 016d00bd7c..89be9dd4f4 100644
--- a/scanpipe/tests/pipes/test_resolve.py
+++ b/scanpipe/tests/pipes/test_resolve.py
@@ -586,3 +586,87 @@ def test_scanpipe_resolve_get_pom_url_list_with_invalid_filename(self):
         input_source = {"filename": "not-a-jar.txt"}
         result = resolve.get_pom_url_list(input_source, [])
         self.assertEqual(result, [])
+
+    def test_scanpipe_resolve_update_package_license_from_resource_if_missing(self):
+        packages = [
+            {"package_uid": "pkg1", "declared_license_expression": ""},
+            {"package_uid": "pkg2", "declared_license_expression": None},
+            {"package_uid": "pkg3", "declared_license_expression": "MIT"},
+        ]
+        resources = [
+            {
+                "for_packages": ["pkg1", "pkg2"],
+                "detected_license_expression": "GPL-2.0",
+            },
+            {"for_packages": ["pkg1"], "detected_license_expression": "MIT"},
+        ]
+
+        expected_pkg1_expr = "GPL-2.0 AND MIT"
+        expected_pkg2_expr = "GPL-2.0"
+
+        updated = resolve.update_package_license_from_resource_if_missing(
+            packages, resources
+        )
+
+        self.assertEqual(updated[0]["declared_license_expression"], expected_pkg1_expr)
+        self.assertEqual(updated[1]["declared_license_expression"], expected_pkg2_expr)
+        self.assertEqual(updated[2]["declared_license_expression"], "MIT")
+
+    def test_scanpipe_resolve_update_package_license_from_resource_if_missing_no_match(
+        self,
+    ):
+        packages = [{"package_uid": "pkgX", "declared_license_expression": None}]
+        resources = [{"for_packages": ["pkgY"], "detected_license_expression": "MIT"}]
+
+        updated = resolve.update_package_license_from_resource_if_missing(
+            packages, resources
+        )
+        self.assertEqual(updated[0]["declared_license_expression"], None)
+
+    def test_scanpipe_resolve_update_package_license_from_resource_if_missing_no_change(
+        self,
+    ):
+        packages = [
+            {"package_uid": "pkg1", "declared_license_expression": "GPL-2.0"},
+            {"package_uid": "pkg2", "declared_license_expression": "Apache-2.0"},
+        ]
+        resources = [
+            {"for_packages": ["pkg1", "pkg2"], "detected_license_expression": "MIT"},
+        ]
+
+        updated = resolve.update_package_license_from_resource_if_missing(
+            packages, resources
+        )
+        self.assertEqual(updated[0]["declared_license_expression"], "GPL-2.0")
+        self.assertEqual(updated[1]["declared_license_expression"], "Apache-2.0")
+
+    def test_scanpipe_resolve_update_package_license_from_resource_if_missing_compound(
+        self,
+    ):
+        packages = [{"package_uid": "pkg1", "declared_license_expression": None}]
+        resources = [
+            {
+                "for_packages": ["pkg1"],
+                "detected_license_expression": "MIT OR Apache-2.0",
+            },
+            {"for_packages": ["pkg1"], "detected_license_expression": "GPL-2.0"},
+        ]
+
+        updated = resolve.update_package_license_from_resource_if_missing(
+            packages, resources
+        )
+        self.assertEqual(
+            updated[0]["declared_license_expression"],
+            "(MIT OR Apache-2.0) AND GPL-2.0",
+        )
+
+    def test_scanpipe_resolve_update_package_license_from_resource_if_missing_no_key(
+        self,
+    ):
+        packages = [{"package_uid": "pkg1", "declared_license_expression": None}]
+        resources = [{"detected_license_expression": "MIT"}]
+
+        updated = resolve.update_package_license_from_resource_if_missing(
+            packages, resources
+        )
+        self.assertEqual(updated[0]["declared_license_expression"], None)