aboutcode-org · chinyeungli · Nov 12, 2025 · Nov 13, 2025 · Nov 13, 2025 · Nov 13, 2025
diff --git a/pyproject.toml b/pyproject.toml
@@ -161,6 +161,7 @@ resolve_dependencies = "scanpipe.pipelines.resolve_dependencies:ResolveDependenc
 scan_codebase = "scanpipe.pipelines.scan_codebase:ScanCodebase"
 scan_for_virus = "scanpipe.pipelines.scan_for_virus:ScanForVirus"
 scan_single_package = "scanpipe.pipelines.scan_single_package:ScanSinglePackage"
+scan_maven_package = "scanpipe.pipelines.scan_maven_package:ScanMavenPackage"
 
 [tool.setuptools.packages.find]
 where = ["."]

diff --git a/scanpipe/pipelines/scan_maven_package.py b/scanpipe/pipelines/scan_maven_package.py
@@ -0,0 +1,73 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# http://nexb.com and https://github.com/aboutcode-org/scancode.io
+# The ScanCode.io software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode.io is provided as-is without warranties.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode.io should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+#
+# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/aboutcode-org/scancode.io for support and download.
+
+import json
+
+from scanpipe.pipelines.scan_single_package import ScanSinglePackage
+from scanpipe.pipes.resolve import download_and_scan_pom_file
+from scanpipe.pipes.resolve import get_pom_url_list
+
+
+class ScanMavenPackage(ScanSinglePackage):
+    """
+    Scan a single Maven package archive (or package manifest file).
+
+    This pipeline scans a single package for package metadata,
+    declared dependencies, licenses, license clarity score and copyrights.
+    The POM is fetched from maven.org and scanned when the codebase has none.
+
+    The output is a summary of the scan results in JSON format.
+    """
+
+    @classmethod
+    def steps(cls):
+        return (
+            cls.get_package_input,
+            cls.collect_input_information,
+            cls.extract_input_to_codebase_directory,
+            cls.extract_archives,
+            cls.run_scan,
+            cls.fetch_and_scan_remote_pom,
+            cls.load_inventory_from_toolkit_scan,
+            cls.make_summary_from_scan_results,
+        )
+
+    def fetch_and_scan_remote_pom(self):
+        """Fetch and scan the remote POM when the codebase has no pom.xml file."""
+        with open(self.scan_output_location) as scan_file:
+            data = json.load(scan_file)
+        # Nothing to do when the scan already covered a pom.xml file.
+        if any("pom.xml" in entry["path"] for entry in data.get("files", [])):
+            return
+
+        packages = data.get("packages", [])
+        sources = self.project.input_sources
+        pom_urls = get_pom_url_list(sources[0] if sources else {}, packages)
+        pom_packages, pom_dependencies = download_and_scan_pom_file(pom_urls)
+        if not (pom_packages or pom_dependencies):
+            # Keep the scan output untouched when no POM data was collected.
+            return
+        # Replace/Update the package and dependencies section
+        data["packages"] = packages + pom_packages
+        data["dependencies"] = pom_dependencies
+        with open(self.scan_output_location, "w") as scan_file:
+            json.dump(data, scan_file, indent=2)
diff --git a/scanpipe/pipes/resolve.py b/scanpipe/pipes/resolve.py
@@ -22,6 +22,7 @@
 
 import json
 import logging
+import re
 import sys
 import uuid
 from pathlib import Path
@@ -30,6 +31,7 @@
 from django.core.exceptions import ObjectDoesNotExist
 
 import python_inspector.api as python_inspector
+import requests
 import saneyaml
 from attributecode.model import About
 from packagedcode import APPLICATION_PACKAGE_DATAFILE_HANDLERS
@@ -39,7 +41,9 @@
 from scanpipe.models import DiscoveredDependency
 from scanpipe.models import DiscoveredPackage
 from scanpipe.pipes import cyclonedx
+from scanpipe.pipes import fetch
 from scanpipe.pipes import flag
+from scanpipe.pipes import scancode
 from scanpipe.pipes import spdx
 from scanpipe.pipes import update_or_create_dependency
 from scanpipe.pipes import update_or_create_package
@@ -521,3 +525,183 @@
         return extracted_headers
 
     return {}
+
+
+def parse_maven_filename(filename):
+    """
+    Parse a Maven jar ``filename`` and return an ``(artifact_id, version)`` tuple.
+
+    The extension and at most one trailing known classifier are dropped first.
+    Return ``(None, None)`` when the remaining name has no ``-<digit>`` version.
+    """
+    # Remove the .jar extension
+    base = filename.rsplit(".", 1)[0]
+
+    # Common classifiers pattern
+    common_classifiers = {
+        "sources",
+        "javadoc",
+        "tests",
+        "test",
+        "test-sources",
+        "src",
+        "bin",
+        "docs",
+        "javadocs",
+        "client",
+        "server",
+        "linux",
+        "windows",
+        "macos",
+        "linux-x86_64",
+        "windows-x86_64",
+    }
+
+    # Remove one known classifier, longest first, so that "-test-sources" is
+    # never mistaken for "-sources" whatever the set iteration order.
+    for classifier in sorted(common_classifiers, key=len, reverse=True):
+        if base.endswith(f"-{classifier}"):
+            base = base[: -(len(classifier) + 1)]
+            break
+
+    # Match artifactId and version
+    match = re.match(r"^(.*?)-(\d[\w.\-]*)$", base)
+    if not match:
+        return None, None
+    return match.group(1), match.group(2)
+
+
+def get_pom_url_list(input_source, packages):
+    """
+    Return Maven POM URLs built from ``packages`` or from the ``input_source``.
+
+    The ``input_source`` download URL or filename is used without ``packages``.
+    """
+    if packages:
+        pom_url_list = []
+        for package in packages:
+            # Scan results may hold null values: skip incomplete coordinates.
+            namespace = package.get("namespace") or ""
+            name = package.get("name") or ""
+            version = package.get("version") or ""
+            if not (namespace and name and version):
+                continue
+            # Maven coordinates are case-sensitive: keep them as provided.
+            pom_url_list.append(
+                f"https://repo1.maven.org/maven2/{namespace.replace('.', '/')}/"
+                f"{name}/{version}/{name}-{version}.pom"
+            )
+        return pom_url_list
+
+    input_source_url = input_source.get("download_url") or ""
+    if "maven.org/" in input_source_url:
+        # Swap the downloaded file name for "<artifactId>-<version>.pom".
+        base_url = input_source_url.rsplit("/", 1)[0]
+        artifact_and_version = "-".join(base_url.rstrip("/").split("/")[-2:])
+        return [f"{base_url}/{artifact_and_version}.pom"]
+
+    # Construct a pom_url from filename: only work with a .jar file.
+    input_filename = input_source.get("filename") or ""
+    if not input_filename.endswith(".jar"):
+        return []
+    artifact_id, version = parse_maven_filename(input_filename)
+    if not artifact_id or not version:
+        return []
+    return construct_pom_url_from_filename(artifact_id, version)
+
+
+def construct_pom_url_from_filename(artifact_id, version):
+    """
+    Return a list holding the POM URL for ``artifact_id`` and ``version``.
+
+    The groupId candidates come from the Maven Central search API. An empty
+    list is returned on errors, or when zero or several candidates are valid.
+    """
+    logger = logging.getLogger(__name__)
+    search_url = "https://search.maven.org/solrsearch/select"
+    params = {"q": f"a:{artifact_id}", "wt": "json"}
+    try:
+        # Let requests encode the query string.
+        response = requests.get(search_url, params=params, timeout=5)
+        response.raise_for_status()
+        # The 'g' field of each doc is a groupId.
+        group_ids = [doc["g"] for doc in response.json()["response"]["docs"]]
+    except requests.RequestException as e:
+        logger.error("Error fetching data: %s", e)
+        return []
+    except (KeyError, TypeError, ValueError) as e:
+        logger.error("Error parsing JSON: %s", e)
+        return []
+
+    pom_url_list = []
+    for group_id in group_ids:
+        # Maven coordinates are case-sensitive: keep them as provided.
+        pom_url = (
+            f"https://repo1.maven.org/maven2/{group_id.replace('.', '/')}/"
+            f"{artifact_id}/{version}/{artifact_id}-{version}.pom"
+        )
+        if is_maven_pom_url(pom_url):
+            pom_url_list.append(pom_url)
+    # The same artifactId and version under several groupIds is ambiguous:
+    # return nothing rather than risk fetching the wrong POM.
+    return pom_url_list if len(pom_url_list) == 1 else []
+
+
+def is_maven_pom_url(url):
+    """
+    Return True if the ``url`` is accessible and serves a Maven POM.
+
+    Return False on request errors, non-200 responses or non-POM content.
+    """
+    # Maven Central has a fallback mechanism that serves a generic/error
+    # page instead of returning a proper 404.
+    try:
+        response = requests.get(url, timeout=5)
+    except requests.RequestException:
+        return False
+    if response.status_code != 200:
+        return False
+
+    # Check content-type ("text/xml", "application/xml", ...)
+    content_type = response.headers.get("content-type", "").lower()
+    if "xml" not in content_type:
+        # It's probably the Maven Central error page
+        return False
+    # Check content: a POM holds a <project> root element. The "<?xml"
+    # declaration is optional in XML, so it is not required here.
+    return "<project" in response.text
+
+
+def download_and_scan_pom_file(pom_url_list):
+    """Return the packages and dependencies scanned from the ``pom_url_list`` POMs."""
+    scanned_pom_packages = []
+    scanned_pom_deps = []
+    for pom_url in pom_url_list:
+        try:
+            downloaded_pom = fetch.fetch_http(pom_url)
+        except requests.RequestException as e:
+            # Skip POMs that cannot be fetched (assumes a RequestException is raised).
+            logging.getLogger(__name__).warning("Cannot fetch %s: %s", pom_url, e)
+            continue
+
+        # Run a package scan on the fetched pom.xml
+        pom_location = str(downloaded_pom.path)
+        scanned_pom_output_path = pom_location + "-output.json"
+        _scanning_errors = scancode.run_scan(
+            location=pom_location,
+            output_file=scanned_pom_output_path,
+            run_scan_args={"package": True},
+        )
+        with open(scanned_pom_output_path) as scanned_pom_file:
+            scanned_pom_data = json.load(scanned_pom_file)
+
+        for scanned_package in scanned_pom_data.get("packages") or []:
+            # Replace the 'datafile_paths' with the pom_url
+            scanned_package["datafile_paths"] = [pom_url]
+            scanned_pom_packages.append(scanned_package)
+        for scanned_dep in scanned_pom_data.get("dependencies") or []:
+            # Replace the 'datafile_path' with empty string
+            # See https://github.com/aboutcode-org/scancode.io/issues/1763#issuecomment-3525165830
+            scanned_dep["datafile_path"] = ""
+            scanned_pom_deps.append(scanned_dep)
+    return scanned_pom_packages, scanned_pom_deps
diff --git a/scanpipe/tests/pipes/test_resolve.py b/scanpipe/tests/pipes/test_resolve.py
@@ -373,3 +373,30 @@ def test_scanpipe_resolve_get_manifest_headers(self):
         ]
         headers = resolve.get_manifest_headers(resource)
         self.assertEqual(expected, list(headers.keys()))
+
+    def test_scanpipe_resolve_parse_maven_filename(self):
+        """Check artifactId and version parsing with and without a classifier."""
+        # Each case is a (jar filename, expected artifactId, expected version)
+        # tuple; the last case shares the expected values of the second one.
+        cases = [
+            (
+                "wisp-logging-2025.11.11.195957-97a44b0-sources.jar",
+                "wisp-logging",
+                "2025.11.11.195957-97a44b0",
+            ),
+            ("guava-33.5.0-jre-javadoc.jar", "guava", "33.5.0-jre"),
+            ("junit-4.13.2.jar", "junit", "4.13.2"),
+            ("guava-33.5.0-jre.jar", "guava", "33.5.0-jre"),
+        ]
+
+        # Parse every filename before asserting anything, then compare the
+        # results in the order of the cases.
+        results = [
+            resolve.parse_maven_filename(filename) for filename, _, _ in cases
+        ]
+
+        for case, result in zip(cases, results):
+            _filename, expected_name, expected_version = case
+            result_name, result_version = result
+            self.assertEqual(result_name, expected_name)
+            self.assertEqual(result_version, expected_version)