diff --git a/pyproject.toml b/pyproject.toml index f0ae21f332..3fea1404fb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -161,6 +161,7 @@ resolve_dependencies = "scanpipe.pipelines.resolve_dependencies:ResolveDependenc scan_codebase = "scanpipe.pipelines.scan_codebase:ScanCodebase" scan_for_virus = "scanpipe.pipelines.scan_for_virus:ScanForVirus" scan_single_package = "scanpipe.pipelines.scan_single_package:ScanSinglePackage" +scan_maven_package = "scanpipe.pipelines.scan_maven_package:ScanMavenPackage" [tool.setuptools.packages.find] where = ["."] diff --git a/scanpipe/pipelines/scan_maven_package.py b/scanpipe/pipelines/scan_maven_package.py new file mode 100644 index 0000000000..7438d781c0 --- /dev/null +++ b/scanpipe/pipelines/scan_maven_package.py @@ -0,0 +1,73 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/aboutcode-org/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. 
# Visit https://github.com/aboutcode-org/scancode.io for support and download.

import json

from scanpipe.pipelines.scan_single_package import ScanSinglePackage
from scanpipe.pipes.resolve import download_pom_files
from scanpipe.pipes.resolve import get_pom_url_list
from scanpipe.pipes.resolve import scan_pom_files


class ScanMavenPackage(ScanSinglePackage):
    """
    Scan a single Maven package archive (or package manifest file).

    This pipeline runs the same steps as ``ScanSinglePackage`` plus one extra
    step, ``fetch_and_scan_remote_pom``, placed right after ``run_scan``: when
    the scan results list no ``pom.xml`` path, candidate ``.pom`` URLs are
    computed, downloaded and scanned, and the resulting packages and
    dependencies are written into the scan output before the inventory is
    loaded from it.

    The output is a summary of the scan results in JSON format.
    """

    @classmethod
    def steps(cls):
        """Return the ordered pipeline steps."""
        return (
            cls.get_package_input,
            cls.collect_input_information,
            cls.extract_input_to_codebase_directory,
            cls.extract_archives,
            cls.run_scan,
            cls.fetch_and_scan_remote_pom,
            cls.load_inventory_from_toolkit_scan,
            cls.make_summary_from_scan_results,
        )

    def fetch_and_scan_remote_pom(self):
        """
        Fetch and scan the remote .pom file when the codebase has no pom.xml.

        The JSON scan output at ``self.scan_output_location`` is read first.
        If any scanned file path contains "pom.xml", nothing is done.
        Otherwise the POM URLs computed from the first project input source
        and the detected packages are downloaded and scanned. The scanned
        packages are appended to the "packages" section, the scanned
        dependencies replace the "dependencies" section, and the file is
        written back in place.

        The scan output is left untouched when the POM scan yields nothing,
        so that existing dependencies are not overwritten with an empty list.
        """
        with open(self.scan_output_location) as scan_file:
            data = json.load(scan_file)

        # Return and do nothing if the scan already covers a pom.xml.
        scanned_files = data.get("files") or []
        if any("pom.xml" in resource.get("path", "") for resource in scanned_files):
            return

        packages = data.get("packages") or []
        pom_url_list = get_pom_url_list(self.project.input_sources[0], packages)
        pom_file_list = download_pom_files(pom_url_list)
        scanned_pom_packages, scanned_dependencies = scan_pom_files(pom_file_list)

        # Nothing was resolved remotely: keep the original scan results as-is
        # instead of rewriting the file with an emptied "dependencies" section.
        if not scanned_pom_packages and not scanned_dependencies:
            return

        # Replace/Update the package and dependencies section
        data["packages"] = packages + scanned_pom_packages
        data["dependencies"] = scanned_dependencies
        with open(self.scan_output_location, "w") as scan_file:
            json.dump(data, scan_file, indent=2)
def _maven_central_pom_url(group_id, artifact_id, version):
    """Return the Maven Central URL of the .pom for the given coordinates."""
    # Repository paths are case-sensitive: the coordinates are used verbatim.
    return (
        f"https://repo1.maven.org/maven2/{group_id.replace('.', '/')}/"
        f"{artifact_id}/{version}/{artifact_id}-{version}.pom"
    )


def parse_maven_filename(filename):
    """
    Parse a Maven's jar filename to extract artifactId and version.

    Return an ``(artifact_id, version)`` tuple, or ``(None, None)`` when the
    name does not look like ``<artifactId>-<version>[-<classifier>].<ext>``
    with a version that starts with a digit.
    """
    # Remove the .jar extension
    base = filename.rsplit(".", 1)[0]

    # Common classifiers pattern
    common_classifiers = (
        "sources",
        "javadoc",
        "tests",
        "test",
        "test-sources",
        "src",
        "bin",
        "docs",
        "javadocs",
        "client",
        "server",
        "linux",
        "windows",
        "macos",
        "linux-x86_64",
        "windows-x86_64",
    )

    # Remove a known classifier if present. The longest candidates are tried
    # first so that "-test-sources" is not mistaken for "-sources", which
    # would leave a stray "-test" in the version.
    for classifier in sorted(common_classifiers, key=len, reverse=True):
        suffix = f"-{classifier}"
        if base.endswith(suffix):
            base = base[: -len(suffix)]
            break

    # Match artifactId and version: the version is the first dash-separated
    # part starting with a digit, up to the end of the name.
    match = re.match(r"^(.*?)-((\d[\w.\-]*))$", base)
    if not match:
        return None, None
    return match.group(1), match.group(2)


def get_pom_url_list(input_source, packages):
    """
    Generate Maven POM URLs from package metadata or input source.

    ``packages`` is a list of package mappings with "namespace", "name" and
    "version" keys; when it is not empty, one URL is built per package that
    has all three values and ``input_source`` is ignored.

    Otherwise ``input_source`` (a mapping with optional "download_url" and
    "filename" keys) is used: a download URL hosted on maven.org is turned
    into the URL of the sibling .pom file; failing that, a ".jar" filename is
    parsed and resolved with ``construct_pom_url_from_filename``.

    Return a list of URLs, empty when nothing could be derived.
    """
    from urllib.parse import urlparse

    if packages:
        pom_url_list = []
        for package in packages:
            # Values may be present but set to None in scan results.
            namespace = package.get("namespace") or ""
            name = package.get("name") or ""
            version = package.get("version") or ""
            if not (namespace and name and version):
                # Incomplete coordinates cannot map to a repository path.
                continue
            pom_url_list.append(_maven_central_pom_url(namespace, name, version))
        return pom_url_list

    # Check what's the input source
    input_source_url = input_source.get("download_url") or ""
    hostname = (urlparse(input_source_url).hostname or "").lower()
    # Require an exact host or a real subdomain, not just a shared suffix.
    if hostname == "maven.org" or hostname.endswith(".maven.org"):
        # <base>/<artifactId>/<version>/<file> -> <base>/.../<artifactId>-<version>.pom
        base_url = input_source_url.rsplit("/", 1)[0]
        pom_name = "-".join(base_url.rstrip("/").split("/")[-2:])
        return [f"{base_url}/{pom_name}.pom"]

    # Construct a pom_url from filename; only work with a .jar input.
    input_filename = input_source.get("filename") or ""
    if not input_filename.endswith(".jar"):
        return []

    artifact_id, version = parse_maven_filename(input_filename)
    if not artifact_id or not version:
        return []
    return construct_pom_url_from_filename(artifact_id, version)


def construct_pom_url_from_filename(artifact_id, version):
    """
    Construct a pom.xml URL from the given Maven artifactId and version.

    Maven Central is searched for the ``artifact_id`` to collect candidate
    groupIds, and each candidate POM URL is checked with
    ``is_maven_pom_url``. Return a single-item list with the only valid URL,
    or an empty list when the search fails, nothing is valid, or more than
    one groupId provides this artifactId and version.
    """
    from urllib.parse import quote

    log = logging.getLogger(__name__)

    # Search Maven Central for the artifact to get its groupId
    url = (
        "https://search.maven.org/solrsearch/select"
        f"?q=a:{quote(artifact_id, safe='')}&wt=json"
    )
    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()
        docs = response.json()["response"]["docs"]
        # The 'g' field of each doc is a groupId; drop repeats, keep order.
        group_ids = list(dict.fromkeys(doc["g"] for doc in docs))
    except requests.RequestException as error:
        log.warning("Error fetching data: %s", error)
        return []
    except (KeyError, TypeError, ValueError) as error:
        log.warning("Error parsing JSON: %s", error)
        return []

    pom_url_list = []
    for group_id in group_ids:
        pom_url = _maven_central_pom_url(group_id, artifact_id, version)
        if not is_maven_pom_url(pom_url):
            continue
        if pom_url_list:
            # A second valid POM URL means the same artifactId and version
            # exist under different groupIds. Since we can't confidently
            # determine the correct groupId, we return an empty list to
            # avoid fetching the wrong POM.
            return []
        pom_url_list.append(pom_url)

    return pom_url_list
pom_url_list.append(pom_url) + if len(pom_url_list) > 1: + # If multiple valid POM URLs are found, it means the same + # artifactId and version exist under different groupIds. Since we + # can't confidently determine the correct groupId, we return an + # empty list to avoid fetching the wrong POM. + return [] + + return pom_url_list + + +def is_maven_pom_url(url): + """Return True if the url is a accessible, False otherwise""" + # Maven Central has a fallback mechanism that serves a generic/error + # page instead of returning a proper 404. + try: + response = requests.get(url, timeout=5) + if response.status_code != 200: + return False + # Check content-type + content_type = response.headers.get("content-type", "").lower() + is_xml = "xml" in content_type or "text/xml" in content_type + + # Check content + content = response.text.strip() + is_pom = content.startswith("' + mock_get.return_value = mock_response + + result = resolve.is_maven_pom_url( + "https://repo1.maven.org/maven2/example/example.pom" + ) + self.assertTrue(result) + + @mock.patch("requests.get") + def test_scanpipe_resolve_is_maven_pom_url_404(self, mock_get): + mock_response = mock.Mock() + mock_response.status_code = 404 + mock_get.return_value = mock_response + + result = resolve.is_maven_pom_url( + "https://repo.maven.apache.org/maven2/example/404.pom" + ) + self.assertFalse(result) + + @mock.patch("requests.get") + def test_scanpipe_resolve_is_maven_pom_url_error(self, mock_get): + mock_response = mock.Mock() + mock_response.status_code = 200 + mock_response.headers = {"content-type": "text/html"} + mock_response.text = "Error page" + mock_get.return_value = mock_response + + result = resolve.is_maven_pom_url( + "https://repo.maven.apache.org/maven2/example/error.pom" + ) + self.assertFalse(result) + + @mock.patch("scanpipe.pipes.resolve.fetch.fetch_http") + def test_scanpipe_resolve_download_pom_files(self, mock_fetch_http): + mock_response = mock.Mock() + mock_response.path = 
"/safe/example1.pom" + mock_fetch_http.return_value = mock_response + + pom_urls = ["https://repo1.maven.org/maven2/example/example1.pom"] + + expected = [ + { + "pom_file_path": "/safe/example1.pom", + "output_path": "/safe/example1.pom-output.json", + "pom_url": "https://repo1.maven.org/maven2/example/example1.pom", + } + ] + + result = resolve.download_pom_files(pom_urls) + self.assertEqual(result, expected) + + @mock.patch("scanpipe.pipes.resolve.scancode.run_scan") + @mock.patch("builtins.open", new_callable=mock.mock_open) + @mock.patch("json.load") + def test_scanpipe_resolve_scan_pom_files( + self, mock_json_load, mock_open, mock_run_scan + ): + mock_json_load.return_value = { + "packages": [ + { + "name": "example-package", + "version": "1.0.0", + "datafile_paths": ["/safe/mock_pom.xml"], + } + ], + "dependencies": [ + { + "name": "example-dep", + "version": "2.0.0", + "datafile_path": "/safe/mock_pom.xml", + } + ], + } + + pom_file_list = [ + { + "pom_file_path": "/safe/mock.pom", + "output_path": "/safe/mock.pom-output.json", + "pom_url": "https://repo1.maven.org/maven2/example/example.pom", + } + ] + + expected_packages = [ + { + "name": "example-package", + "version": "1.0.0", + "datafile_paths": [ + "https://repo1.maven.org/maven2/example/example.pom" + ], + } + ] + expected_deps = [ + {"name": "example-dep", "version": "2.0.0", "datafile_path": ""} + ] + + packages, deps = resolve.scan_pom_files(pom_file_list) + + self.assertEqual(packages, expected_packages) + self.assertEqual(deps, expected_deps) + + mock_run_scan.assert_called_once_with( + location="/safe/mock.pom", + output_file="/safe/mock.pom-output.json", + run_scan_args={"package": True}, + ) + mock_open.assert_called_once_with("/safe/mock.pom-output.json") + mock_json_load.assert_called_once() + + @mock.patch("scanpipe.pipes.resolve.is_maven_pom_url") + @mock.patch("scanpipe.pipes.resolve.requests.get") + def test_scanpipe_resolve_construct_pom_url_from_filename( + self, mock_get, 
mock_is_maven_pom_url + ): + # Setup mock response from Maven Central + mock_response = mock.Mock() + mock_response.raise_for_status.return_value = None + mock_response.json.return_value = { + "response": {"docs": [{"g": "org.apache.commons"}]} + } + mock_get.return_value = mock_response + mock_is_maven_pom_url.return_value = True + + # Inputs + artifact_id = "commons-lang3" + version = "3.12.0" + + expected_url = [ + "https://repo1.maven.org/maven2/org/apache/commons/commons-lang3/3.12.0/commons-lang3-3.12.0.pom" + ] + + result = resolve.construct_pom_url_from_filename(artifact_id, version) + + self.assertEqual(result, expected_url) + mock_get.assert_called_once_with( + "https://search.maven.org/solrsearch/select?q=a:commons-lang3&wt=json", + timeout=5, + ) + mock_is_maven_pom_url.assert_called_once_with(expected_url[0]) + + def test_scanpipe_resolve_get_pom_url_list_with_packages(self): + packages = [ + { + "namespace": "org.apache.commons", + "name": "commons-lang3", + "version": "3.12.0", + } + ] + result = resolve.get_pom_url_list({}, packages) + expected = [ + "https://repo1.maven.org/maven2/org/apache/commons/commons-lang3/3.12.0/commons-lang3-3.12.0.pom" + ] + self.assertEqual(result, expected) + + def test_scanpipe_resolve_get_pom_url_list_with_maven_download_url(self): + input_source = { + "download_url": "https://repo1.maven.org/maven2/org/apache/commons/commons-lang3/3.12.0/commons-lang3-3.12.0.jar" + } + result = resolve.get_pom_url_list(input_source, []) + expected = [ + "https://repo1.maven.org/maven2/org/apache/commons/commons-lang3/3.12.0/commons-lang3-3.12.0.pom" + ] + self.assertEqual(result, expected) + + @mock.patch("scanpipe.pipes.resolve.construct_pom_url_from_filename") + @mock.patch("scanpipe.pipes.resolve.parse_maven_filename") + def test_scanpipe_resolve_get_pom_url_list_with_jar_filename( + self, mock_parse, mock_construct + ): + input_source = {"filename": "commons-lang3-3.12.0.jar"} + mock_parse.return_value = ("commons-lang3", 
"3.12.0") + mock_construct.return_value = [ + "https://repo1.maven.org/maven2/org/apache/commons/commons-lang3/3.12.0/commons-lang3-3.12.0.pom" + ] + result = resolve.get_pom_url_list(input_source, []) + self.assertEqual(result, mock_construct.return_value) + + def test_scanpipe_resolve_get_pom_url_list_with_invalid_filename(self): + input_source = {"filename": "not-a-jar.txt"} + result = resolve.get_pom_url_list(input_source, []) + self.assertEqual(result, [])