From 9e933173b6a36937514cda5fe6a6495630092c80 Mon Sep 17 00:00:00 2001
From: Chin Yeung Li <tli@nexb.com>
Date: Thu, 13 Nov 2025 07:52:50 +0800
Subject: [PATCH 1/7] Add "scan_maven_package" pipeline (Work-in-progress)
 #1763

Signed-off-by: Chin Yeung Li <tli@nexb.com>
---
 pyproject.toml                           |   1 +
 scanpipe/pipelines/scan_maven_package.py | 162 ++++++++++++++++++++++
 scanpipe/pipes/resolve.py                | 168 +++++++++++++++++++++++
 3 files changed, 331 insertions(+)
 create mode 100644 scanpipe/pipelines/scan_maven_package.py

diff --git a/pyproject.toml b/pyproject.toml
index f0ae21f332..3fea1404fb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -161,6 +161,7 @@ resolve_dependencies = "scanpipe.pipelines.resolve_dependencies:ResolveDependenc
 scan_codebase = "scanpipe.pipelines.scan_codebase:ScanCodebase"
 scan_for_virus = "scanpipe.pipelines.scan_for_virus:ScanForVirus"
 scan_single_package = "scanpipe.pipelines.scan_single_package:ScanSinglePackage"
+scan_maven_package = "scanpipe.pipelines.scan_maven_package:ScanMavenPackage"
 
 [tool.setuptools.packages.find]
 where = ["."]
diff --git a/scanpipe/pipelines/scan_maven_package.py b/scanpipe/pipelines/scan_maven_package.py
new file mode 100644
index 0000000000..841899ff88
--- /dev/null
+++ b/scanpipe/pipelines/scan_maven_package.py
@@ -0,0 +1,162 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# http://nexb.com and https://github.com/aboutcode-org/scancode.io
+# The ScanCode.io software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode.io is provided as-is without warranties.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode.io should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+#
+# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/aboutcode-org/scancode.io for support and download.
+
+import json
+
+from django.core.serializers.json import DjangoJSONEncoder
+
+from commoncode.hash import multi_checksums
+
+from scanpipe.pipelines import Pipeline
+from scanpipe.pipes import input
+from scanpipe.pipes import scancode
+from scanpipe.pipes.input import copy_input
+from scanpipe.pipes.input import is_archive
+
+from scanpipe.pipes.resolve import get_pom_url_list
+from scanpipe.pipes.resolve import download_and_scan_pom_file
+
+
+class ScanMavenPackage(Pipeline):
+    """
+    Scan a single package archive (or package manifest file).
+
+    This pipeline scans a single package for package metadata,
+    declared dependencies, licenses, license clarity score and copyrights.
+
+    The output is a summary of the scan results in JSON format.
+    """
+
+    @classmethod
+    def steps(cls):
+        return (
+            cls.get_package_input,
+            cls.collect_input_information,
+            cls.extract_input_to_codebase_directory,
+            cls.extract_archives,
+            cls.run_scan,
+            cls.fetch_and_scan_remote_pom,
+            cls.load_inventory_from_toolkit_scan,
+            cls.make_summary_from_scan_results,
+        )
+
+    scancode_run_scan_args = {
+        "copyright": True,
+        "email": True,
+        "info": True,
+        "license": True,
+        "license_text": True,
+        "license_diagnostics": True,
+        "license_text_diagnostics": True,
+        "license_references": True,
+        "package": True,
+        "url": True,
+        "classify": True,
+        "summary": True,
+        "todo": True,
+    }
+
+    def get_package_input(self):
+        """Locate the package input in the project's input/ directory."""
+        # Using the input_sources model property as it includes input sources instances
+        # as well as any files manually copied into the input/ directory.
+        input_sources = self.project.input_sources
+        inputs = list(self.project.inputs("*"))
+
+        if len(inputs) != 1 or len(input_sources) != 1:
+            raise Exception("Only 1 input file supported")
+
+        self.input_path = inputs[0]
+
+    def collect_input_information(self):
+        """Collect and store information about the project input."""
+        self.project.update_extra_data(
+            {
+                "filename": self.input_path.name,
+                "size": self.input_path.stat().st_size,
+                **multi_checksums(self.input_path),
+            }
+        )
+
+    def extract_input_to_codebase_directory(self):
+        """Copy or extract input to project codebase/ directory."""
+        if not is_archive(self.input_path):
+            copy_input(self.input_path, self.project.codebase_path)
+            return
+
+        self.extract_archive(self.input_path, self.project.codebase_path)
+
+        # Reload the project env post-extraction as the scancode-config.yml file
+        # may be located in one of the extracted archives.
+        self.env = self.project.get_env()
+
+    def run_scan(self):
+        """Scan extracted codebase/ content."""
+        scan_output_path = self.project.get_output_file_path("scancode", "json")
+        self.scan_output_location = str(scan_output_path.absolute())
+
+        scanning_errors = scancode.run_scan(
+            location=str(self.project.codebase_path),
+            output_file=self.scan_output_location,
+            run_scan_args=self.scancode_run_scan_args.copy(),
+        )
+
+        for resource_path, errors in scanning_errors.items():
+            self.project.add_error(
+                description="\n".join(errors),
+                model=self.pipeline_name,
+                details={"resource_path": resource_path.removeprefix("codebase/")},
+            )
+
+        if not scan_output_path.exists():
+            raise FileNotFoundError("ScanCode output not available.")
+
+    def fetch_and_scan_remote_pom(self):
+        """Fetch the pom.xml file from from maven.org if not present in codebase."""
+        # TODO Verify if the following filter actually work
+        if not self.project.codebaseresources.files().filter(name="pom.xml").exists():
+            with open(self.scan_output_location, 'r') as file:
+                data = json.load(file)
+                packages = data.get("packages", [])
+
+            pom_url_list = get_pom_url_list(self.project.input_sources[0], packages)
+            scanned_pom_packages, scanned_dependencies = download_and_scan_pom_file(pom_url_list)
+
+            updated_pacakges = packages + scanned_pom_packages
+            # Replace/Update the package and dependencies section
+            data['packages'] = updated_pacakges
+            # Need to update the dependencies
+            # data['dependencies'] = scanned_dependencies
+            with open(self.scan_output_location, 'w') as file:
+                json.dump(data, file, indent=2)
+
+    def load_inventory_from_toolkit_scan(self):
+        """Process a JSON Scan results to populate codebase resources and packages."""
+        input.load_inventory_from_toolkit_scan(self.project, self.scan_output_location)
+
+    def make_summary_from_scan_results(self):
+        """Build a summary in JSON format from the generated scan results."""
+        summary = scancode.make_results_summary(self.project, self.scan_output_location)
+        output_file = self.project.get_output_file_path("summary", "json")
+
+        with output_file.open("w") as summary_file:
+            summary_file.write(json.dumps(summary, indent=2, cls=DjangoJSONEncoder))
diff --git a/scanpipe/pipes/resolve.py b/scanpipe/pipes/resolve.py
index 0a409dd88c..f9ea444301 100644
--- a/scanpipe/pipes/resolve.py
+++ b/scanpipe/pipes/resolve.py
@@ -22,6 +22,8 @@
 
 import json
 import logging
+import re
+import requests
 import sys
 import uuid
 from pathlib import Path
@@ -44,6 +46,9 @@
 from scanpipe.pipes import update_or_create_dependency
 from scanpipe.pipes import update_or_create_package
 
+from scanpipe.pipes import fetch
+from scanpipe.pipes import scancode
+
 """
 Resolve packages from manifest, lockfile, and SBOM.
 """
@@ -521,3 +526,166 @@ def extract_headers(input_location, extract_fields):
         return extracted_headers
 
     return {}
+
+
+def parse_maven_filename(filename):
+    """Parse a Maven's jar filename to extract artifactId and version."""
+    # Remove the .jar extension
+    base = filename.rsplit('.', 1)[0]
+
+    # Common classifiers pattern
+    common_classifiers = {
+        'sources', 'javadoc', 'tests', 'test', 'test-sources',
+        'src', 'bin', 'docs', 'javadocs', 'client', 'server',
+        'linux', 'windows', 'macos', 'linux-x86_64', 'windows-x86_64'
+    }
+
+    # Remove known classifier if present
+    for classifier in common_classifiers:
+        if base.endswith(f"-{classifier}"):
+            base = base[:-(len(classifier) + 1)]
+            break
+
+    # Match artifactId and version
+    match = re.match(r'^(.*)-(\d[\w.\-]+)$', base)
+    if match:
+        artifact_id = match.group(1)
+        version = match.group(2)
+        return artifact_id, version
+    else:
+        return None, None
+
+
+def get_pom_url_list(input_source, packages):
+    pom_url_list = []
+    if packages:
+        for package in packages:
+            package_ns = package.get("namespace", "")
+            package_name = package.get("name", "")
+            package_version = package.get("version", "")
+            pom_url = f"https://repo1.maven.org/maven2/{package_ns.replace('.', '/')}/{package_name}/{package_version}/{package_name}-{package_version}.pom".lower()
+            pom_url_list.append(pom_url)
+    else:
+        # Check what's the input source
+        input_source_url = input_source.get("download_url", "")
+
+        if input_source_url and "maven.org/" in input_source_url:
+            base_url = input_source_url.rsplit('/', 1)[0]
+            pom_url = base_url + "/" + "-".join(base_url.rstrip("/").split("/")[-2:]) + ".pom"
+            pom_url_list.append(pom_url)
+        else:
+            # Construct a pom_url from filename
+            input_filename = input_source.get("filename", "")
+            if input_filename.endswith(".jar"):
+                artifact_id, version = parse_maven_filename(input_filename)
+                if not artifact_id or not version:
+                    return []
+                pom_url_list = construct_pom_url_from_filename(artifact_id, version)
+            else:
+                # Only work with input that's a .jar file
+                return []
+
+    return pom_url_list
+
+
+def construct_pom_url_from_filename(artifact_id, version):
+    """Construct a pom.xml URL from the given Maven filename."""
+    # Search Maven Central for the artifact to get its groupId
+    url = f"https://search.maven.org/solrsearch/select?q=a:{artifact_id}&wt=json"
+    pom_url_list = []
+    group_ids = []
+    try:
+        response = requests.get(url)
+        response.raise_for_status()
+        data = response.json()
+        # Extract all 'g' fields from the docs array that represent
+        # groupIds
+        group_ids = [doc['g'] for doc in data['response']['docs']]
+    except requests.RequestException as e:
+        print(f"Error fetching data: {e}")
+        return []
+    except KeyError as e:
+        print(f"Error parsing JSON: {e}")
+        return []
+
+    for group_id in group_ids:
+        pom_url = f"https://repo1.maven.org/maven2/{group_id.replace('.', '/')}/{artifact_id}/{version}/{artifact_id}-{version}.pom".lower()
+        if is_maven_pom_url(pom_url):
+            pom_url_list.append(pom_url)
+    if len(pom_url_list) > 1:
+        # If multiple valid POM URLs are found, it means the same
+        # artifactId and version exist under different groupIds. Since we
+        # can't confidently determine the correct groupId, we return an
+        # empty list to avoid fetching the wrong POM.
+        return []
+
+    return pom_url_list
+
+
+def is_maven_pom_url(url):
+    """
+    Return True if the url is a accessible, False otherwise
+    Maven Central has a fallback mechanism that serves a generic/error page
+    instead of returning a proper 404.
+    """
+    try:
+        response = requests.get(url, timeout=5)
+        if response.status_code != 200:
+            return False
+        # Check content-type
+        content_type = response.headers.get('content-type', '').lower()
+        is_xml = 'xml' in content_type or 'text/xml' in content_type
+
+        # Check content
+        content = response.text.strip()
+        is_pom = content.startswith('<?xml') and '<project' in content
+
+        if is_xml and is_pom:
+            return True
+        else:
+            # It's probably the Maven Central error page
+            return False
+    except requests.RequestException:
+        return False
+
+
+def download_and_scan_pom_file(pom_url_list):
+    scanned_pom_packages = []
+    scanned_pom_deps = []
+    for pom_url in pom_url_list:
+        downloaded_pom = fetch.fetch_http(pom_url)
+        scanned_pom_output_path = str(downloaded_pom.path) + "-output.json"
+
+        # Run a package scan on the fetched pom.xml
+        _scanning_errors = scancode.run_scan(
+            location=str(downloaded_pom.path),
+            output_file=scanned_pom_output_path,
+            run_scan_args={
+                "copyright": True,
+                "email": True,
+                "info": True,
+                "license": True,
+                "license_text": True,
+                "license_diagnostics": True,
+                "license_text_diagnostics": True,
+                "license_references": True,
+                "package": True,
+                "url": True,
+            },
+        )
+
+        with open(scanned_pom_output_path, 'r') as scanned_pom_file:
+            scanned_pom_data = json.load(scanned_pom_file)
+            scanned_packages = scanned_pom_data.get("packages", [])
+            scanned_dependencies = scanned_pom_data.get("dependencies", [])
+            if scanned_packages:
+                for scanned_package in scanned_packages:
+                    # Replace the 'datafile_path' with the pom_url
+                    scanned_package['datafile_paths'] = [pom_url]
+                    scanned_pom_packages.append(scanned_package)
+            if scanned_dependencies:
+                for scanned_dep in scanned_dependencies:
+                    # Replace the 'datafile_path' with the empty list
+                    scanned_dep['datafile_path'] = scanned_pom_output_path
+                    scanned_pom_deps.append(scanned_dep)
+    return scanned_pom_packages, scanned_pom_deps

From 928f742fa0844bdd26d748f8825f3f91296494a1 Mon Sep 17 00:00:00 2001
From: Chin Yeung Li <tli@nexb.com>
Date: Thu, 13 Nov 2025 09:58:45 +0800
Subject: [PATCH 2/7] Use pom_url as the datafile_path for the dependencies
 #1763

Signed-off-by: Chin Yeung Li <tli@nexb.com>
---
 scanpipe/pipes/resolve.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scanpipe/pipes/resolve.py b/scanpipe/pipes/resolve.py
index f9ea444301..431108a7a6 100644
--- a/scanpipe/pipes/resolve.py
+++ b/scanpipe/pipes/resolve.py
@@ -685,7 +685,7 @@ def download_and_scan_pom_file(pom_url_list):
                     scanned_pom_packages.append(scanned_package)
             if scanned_dependencies:
                 for scanned_dep in scanned_dependencies:
-                    # Replace the 'datafile_path' with the empty list
-                    scanned_dep['datafile_path'] = scanned_pom_output_path
+                    # Replace the 'datafile_path' with the pom_url
+                    scanned_dep['datafile_path'] = pom_url
                     scanned_pom_deps.append(scanned_dep)
     return scanned_pom_packages, scanned_pom_deps

From d63a1e539094239244280e85e4ec7ba3de6de82f Mon Sep 17 00:00:00 2001
From: Chin Yeung Li <tli@nexb.com>
Date: Thu, 13 Nov 2025 17:36:50 +0800
Subject: [PATCH 3/7] Use empty string for datafile_path in dependencies #1763

Signed-off-by: Chin Yeung Li <tli@nexb.com>
---
 scanpipe/pipes/resolve.py | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/scanpipe/pipes/resolve.py b/scanpipe/pipes/resolve.py
index 431108a7a6..c5a32918ef 100644
--- a/scanpipe/pipes/resolve.py
+++ b/scanpipe/pipes/resolve.py
@@ -557,6 +557,7 @@ def parse_maven_filename(filename):
 
 
 def get_pom_url_list(input_source, packages):
+    """Generate Maven POM URLs from package metadata or input source."""
     pom_url_list = []
     if packages:
         for package in packages:
@@ -623,11 +624,9 @@ def construct_pom_url_from_filename(artifact_id, version):
 
 
 def is_maven_pom_url(url):
-    """
-    Return True if the url is a accessible, False otherwise
-    Maven Central has a fallback mechanism that serves a generic/error page
-    instead of returning a proper 404.
-    """
+    """Return True if the url is a accessible, False otherwise"""
+    # Maven Central has a fallback mechanism that serves a generic/error
+    # page instead of returning a proper 404.
     try:
         response = requests.get(url, timeout=5)
         if response.status_code != 200:
@@ -650,6 +649,7 @@ def is_maven_pom_url(url):
 
 
 def download_and_scan_pom_file(pom_url_list):
+    """Fetch and scan the pom file from the input pom_url_list"""
     scanned_pom_packages = []
     scanned_pom_deps = []
     for pom_url in pom_url_list:
@@ -661,16 +661,7 @@ def download_and_scan_pom_file(pom_url_list):
             location=str(downloaded_pom.path),
             output_file=scanned_pom_output_path,
             run_scan_args={
-                "copyright": True,
-                "email": True,
-                "info": True,
-                "license": True,
-                "license_text": True,
-                "license_diagnostics": True,
-                "license_text_diagnostics": True,
-                "license_references": True,
                 "package": True,
-                "url": True,
             },
         )
 
@@ -685,7 +676,8 @@ def download_and_scan_pom_file(pom_url_list):
                     scanned_pom_packages.append(scanned_package)
             if scanned_dependencies:
                 for scanned_dep in scanned_dependencies:
-                    # Replace the 'datafile_path' with the pom_url
-                    scanned_dep['datafile_path'] = pom_url
+                    # Replace the 'datafile_path' with empty string
+                    # See https://github.com/aboutcode-org/scancode.io/issues/1763#issuecomment-3525165830
+                    scanned_dep['datafile_path'] = ""
                     scanned_pom_deps.append(scanned_dep)
     return scanned_pom_packages, scanned_pom_deps

From 9812129f0f1a63d496d514d38710a62a93bedb65 Mon Sep 17 00:00:00 2001
From: Chin Yeung Li <tli@nexb.com>
Date: Thu, 13 Nov 2025 17:37:18 +0800
Subject: [PATCH 4/7] Remove duplicate code that's already present in
 ScanSinglePackage #1763

Signed-off-by: Chin Yeung Li <tli@nexb.com>
---
 scanpipe/pipelines/scan_maven_package.py | 128 ++++-------------------
 1 file changed, 19 insertions(+), 109 deletions(-)

diff --git a/scanpipe/pipelines/scan_maven_package.py b/scanpipe/pipelines/scan_maven_package.py
index 841899ff88..a06697ba6f 100644
--- a/scanpipe/pipelines/scan_maven_package.py
+++ b/scanpipe/pipelines/scan_maven_package.py
@@ -22,21 +22,13 @@
 
 import json
 
-from django.core.serializers.json import DjangoJSONEncoder
-
-from commoncode.hash import multi_checksums
-
-from scanpipe.pipelines import Pipeline
-from scanpipe.pipes import input
-from scanpipe.pipes import scancode
-from scanpipe.pipes.input import copy_input
-from scanpipe.pipes.input import is_archive
+from scanpipe.pipelines.scan_single_package import ScanSinglePackage
 
 from scanpipe.pipes.resolve import get_pom_url_list
 from scanpipe.pipes.resolve import download_and_scan_pom_file
 
 
-class ScanMavenPackage(Pipeline):
+class ScanMavenPackage(ScanSinglePackage):
     """
     Scan a single package archive (or package manifest file).
 
@@ -59,104 +51,22 @@ def steps(cls):
             cls.make_summary_from_scan_results,
         )
 
-    scancode_run_scan_args = {
-        "copyright": True,
-        "email": True,
-        "info": True,
-        "license": True,
-        "license_text": True,
-        "license_diagnostics": True,
-        "license_text_diagnostics": True,
-        "license_references": True,
-        "package": True,
-        "url": True,
-        "classify": True,
-        "summary": True,
-        "todo": True,
-    }
-
-    def get_package_input(self):
-        """Locate the package input in the project's input/ directory."""
-        # Using the input_sources model property as it includes input sources instances
-        # as well as any files manually copied into the input/ directory.
-        input_sources = self.project.input_sources
-        inputs = list(self.project.inputs("*"))
-
-        if len(inputs) != 1 or len(input_sources) != 1:
-            raise Exception("Only 1 input file supported")
-
-        self.input_path = inputs[0]
-
-    def collect_input_information(self):
-        """Collect and store information about the project input."""
-        self.project.update_extra_data(
-            {
-                "filename": self.input_path.name,
-                "size": self.input_path.stat().st_size,
-                **multi_checksums(self.input_path),
-            }
-        )
-
-    def extract_input_to_codebase_directory(self):
-        """Copy or extract input to project codebase/ directory."""
-        if not is_archive(self.input_path):
-            copy_input(self.input_path, self.project.codebase_path)
-            return
-
-        self.extract_archive(self.input_path, self.project.codebase_path)
-
-        # Reload the project env post-extraction as the scancode-config.yml file
-        # may be located in one of the extracted archives.
-        self.env = self.project.get_env()
-
-    def run_scan(self):
-        """Scan extracted codebase/ content."""
-        scan_output_path = self.project.get_output_file_path("scancode", "json")
-        self.scan_output_location = str(scan_output_path.absolute())
-
-        scanning_errors = scancode.run_scan(
-            location=str(self.project.codebase_path),
-            output_file=self.scan_output_location,
-            run_scan_args=self.scancode_run_scan_args.copy(),
-        )
-
-        for resource_path, errors in scanning_errors.items():
-            self.project.add_error(
-                description="\n".join(errors),
-                model=self.pipeline_name,
-                details={"resource_path": resource_path.removeprefix("codebase/")},
-            )
-
-        if not scan_output_path.exists():
-            raise FileNotFoundError("ScanCode output not available.")
-
     def fetch_and_scan_remote_pom(self):
         """Fetch the pom.xml file from from maven.org if not present in codebase."""
-        # TODO Verify if the following filter actually work
-        if not self.project.codebaseresources.files().filter(name="pom.xml").exists():
-            with open(self.scan_output_location, 'r') as file:
-                data = json.load(file)
-                packages = data.get("packages", [])
-
-            pom_url_list = get_pom_url_list(self.project.input_sources[0], packages)
-            scanned_pom_packages, scanned_dependencies = download_and_scan_pom_file(pom_url_list)
-
-            updated_pacakges = packages + scanned_pom_packages
-            # Replace/Update the package and dependencies section
-            data['packages'] = updated_pacakges
-            # Need to update the dependencies
-            # data['dependencies'] = scanned_dependencies
-            with open(self.scan_output_location, 'w') as file:
-                json.dump(data, file, indent=2)
-
-    def load_inventory_from_toolkit_scan(self):
-        """Process a JSON Scan results to populate codebase resources and packages."""
-        input.load_inventory_from_toolkit_scan(self.project, self.scan_output_location)
-
-    def make_summary_from_scan_results(self):
-        """Build a summary in JSON format from the generated scan results."""
-        summary = scancode.make_results_summary(self.project, self.scan_output_location)
-        output_file = self.project.get_output_file_path("summary", "json")
-
-        with output_file.open("w") as summary_file:
-            summary_file.write(json.dumps(summary, indent=2, cls=DjangoJSONEncoder))
+        with open(self.scan_output_location, 'r') as file:
+            data = json.load(file)
+            # Return and do nothing if data has pom.xml
+            for file in data['files']:
+                if 'pom.xml' in file['path']:
+                    return
+            packages = data.get("packages", [])
+
+        pom_url_list = get_pom_url_list(self.project.input_sources[0], packages)
+        scanned_pom_packages, scanned_dependencies = download_and_scan_pom_file(pom_url_list)
+
+        updated_pacakges = packages + scanned_pom_packages
+        # Replace/Update the package and dependencies section
+        data['packages'] = updated_pacakges
+        data['dependencies'] = scanned_dependencies
+        with open(self.scan_output_location, 'w') as file:
+            json.dump(data, file, indent=2)

From 114eb75f8bf77d5cf5212facb0e01d726428d93e Mon Sep 17 00:00:00 2001
From: Chin Yeung Li <tli@nexb.com>
Date: Thu, 13 Nov 2025 18:38:51 +0800
Subject: [PATCH 5/7] Update the matching regex for parse_maven_filename and
 added test #1763

- Update format

Signed-off-by: Chin Yeung Li <tli@nexb.com>
---
 scanpipe/pipelines/scan_maven_package.py | 19 +++----
 scanpipe/pipes/resolve.py                | 68 ++++++++++++++++--------
 scanpipe/tests/pipes/test_resolve.py     | 27 ++++++++++
 3 files changed, 83 insertions(+), 31 deletions(-)

diff --git a/scanpipe/pipelines/scan_maven_package.py b/scanpipe/pipelines/scan_maven_package.py
index a06697ba6f..00ddc5e500 100644
--- a/scanpipe/pipelines/scan_maven_package.py
+++ b/scanpipe/pipelines/scan_maven_package.py
@@ -23,9 +23,8 @@
 import json
 
 from scanpipe.pipelines.scan_single_package import ScanSinglePackage
-
-from scanpipe.pipes.resolve import get_pom_url_list
 from scanpipe.pipes.resolve import download_and_scan_pom_file
+from scanpipe.pipes.resolve import get_pom_url_list
 
 
 class ScanMavenPackage(ScanSinglePackage):
@@ -53,20 +52,22 @@ def steps(cls):
 
     def fetch_and_scan_remote_pom(self):
         """Fetch the pom.xml file from from maven.org if not present in codebase."""
-        with open(self.scan_output_location, 'r') as file:
+        with open(self.scan_output_location) as file:
             data = json.load(file)
             # Return and do nothing if data has pom.xml
-            for file in data['files']:
-                if 'pom.xml' in file['path']:
+            for file in data["files"]:
+                if "pom.xml" in file["path"]:
                     return
             packages = data.get("packages", [])
 
         pom_url_list = get_pom_url_list(self.project.input_sources[0], packages)
-        scanned_pom_packages, scanned_dependencies = download_and_scan_pom_file(pom_url_list)
+        scanned_pom_packages, scanned_dependencies = download_and_scan_pom_file(
+            pom_url_list
+        )
 
         updated_pacakges = packages + scanned_pom_packages
         # Replace/Update the package and dependencies section
-        data['packages'] = updated_pacakges
-        data['dependencies'] = scanned_dependencies
-        with open(self.scan_output_location, 'w') as file:
+        data["packages"] = updated_pacakges
+        data["dependencies"] = scanned_dependencies
+        with open(self.scan_output_location, "w") as file:
             json.dump(data, file, indent=2)
diff --git a/scanpipe/pipes/resolve.py b/scanpipe/pipes/resolve.py
index c5a32918ef..e94de32e41 100644
--- a/scanpipe/pipes/resolve.py
+++ b/scanpipe/pipes/resolve.py
@@ -23,7 +23,6 @@
 import json
 import logging
 import re
-import requests
 import sys
 import uuid
 from pathlib import Path
@@ -32,6 +31,7 @@
 from django.core.exceptions import ObjectDoesNotExist
 
 import python_inspector.api as python_inspector
+import requests
 import saneyaml
 from attributecode.model import About
 from packagedcode import APPLICATION_PACKAGE_DATAFILE_HANDLERS
@@ -41,14 +41,13 @@
 from scanpipe.models import DiscoveredDependency
 from scanpipe.models import DiscoveredPackage
 from scanpipe.pipes import cyclonedx
+from scanpipe.pipes import fetch
 from scanpipe.pipes import flag
+from scanpipe.pipes import scancode
 from scanpipe.pipes import spdx
 from scanpipe.pipes import update_or_create_dependency
 from scanpipe.pipes import update_or_create_package
 
-from scanpipe.pipes import fetch
-from scanpipe.pipes import scancode
-
 """
 Resolve packages from manifest, lockfile, and SBOM.
 """
@@ -531,26 +530,42 @@ def extract_headers(input_location, extract_fields):
 def parse_maven_filename(filename):
     """Parse a Maven's jar filename to extract artifactId and version."""
     # Remove the .jar extension
-    base = filename.rsplit('.', 1)[0]
+    base = filename.rsplit(".", 1)[0]
 
     # Common classifiers pattern
     common_classifiers = {
-        'sources', 'javadoc', 'tests', 'test', 'test-sources',
-        'src', 'bin', 'docs', 'javadocs', 'client', 'server',
-        'linux', 'windows', 'macos', 'linux-x86_64', 'windows-x86_64'
+        "sources",
+        "javadoc",
+        "tests",
+        "test",
+        "test-sources",
+        "src",
+        "bin",
+        "docs",
+        "javadocs",
+        "client",
+        "server",
+        "linux",
+        "windows",
+        "macos",
+        "linux-x86_64",
+        "windows-x86_64",
     }
 
     # Remove known classifier if present
     for classifier in common_classifiers:
         if base.endswith(f"-{classifier}"):
-            base = base[:-(len(classifier) + 1)]
+            base = base[: -(len(classifier) + 1)]
             break
 
     # Match artifactId and version
-    match = re.match(r'^(.*)-(\d[\w.\-]+)$', base)
+    match = re.match(r"^(.*?)-((\d[\w.\-]*))$", base)
+
     if match:
         artifact_id = match.group(1)
         version = match.group(2)
+        print("artifact_id", artifact_id)
+        print("version", version)
         return artifact_id, version
     else:
         return None, None
@@ -564,15 +579,21 @@ def get_pom_url_list(input_source, packages):
             package_ns = package.get("namespace", "")
             package_name = package.get("name", "")
             package_version = package.get("version", "")
-            pom_url = f"https://repo1.maven.org/maven2/{package_ns.replace('.', '/')}/{package_name}/{package_version}/{package_name}-{package_version}.pom".lower()
+            pom_url = (
+                f"https://repo1.maven.org/maven2/{package_ns.replace('.', '/')}/"
+                f"{package_name}/{package_version}/"
+                f"{package_name}-{package_version}.pom".lower()
+            )
             pom_url_list.append(pom_url)
     else:
         # Check what's the input source
         input_source_url = input_source.get("download_url", "")
 
         if input_source_url and "maven.org/" in input_source_url:
-            base_url = input_source_url.rsplit('/', 1)[0]
-            pom_url = base_url + "/" + "-".join(base_url.rstrip("/").split("/")[-2:]) + ".pom"
+            base_url = input_source_url.rsplit("/", 1)[0]
+            pom_url = (
+                base_url + "/" + "-".join(base_url.rstrip("/").split("/")[-2:]) + ".pom"
+            )
             pom_url_list.append(pom_url)
         else:
             # Construct a pom_url from filename
@@ -596,12 +617,12 @@ def construct_pom_url_from_filename(artifact_id, version):
     pom_url_list = []
     group_ids = []
     try:
-        response = requests.get(url)
+        response = requests.get(url, timeout=5)
         response.raise_for_status()
         data = response.json()
         # Extract all 'g' fields from the docs array that represent
         # groupIds
-        group_ids = [doc['g'] for doc in data['response']['docs']]
+        group_ids = [doc["g"] for doc in data["response"]["docs"]]
     except requests.RequestException as e:
         print(f"Error fetching data: {e}")
         return []
@@ -610,7 +631,10 @@ def construct_pom_url_from_filename(artifact_id, version):
         return []
 
     for group_id in group_ids:
-        pom_url = f"https://repo1.maven.org/maven2/{group_id.replace('.', '/')}/{artifact_id}/{version}/{artifact_id}-{version}.pom".lower()
+        pom_url = (
+            f"https://repo1.maven.org/maven2/{group_id.replace('.', '/')}/"
+            f"{artifact_id}/{version}/{artifact_id}-{version}.pom".lower()
+        )
         if is_maven_pom_url(pom_url):
             pom_url_list.append(pom_url)
     if len(pom_url_list) > 1:
@@ -632,12 +656,12 @@ def is_maven_pom_url(url):
         if response.status_code != 200:
             return False
         # Check content-type
-        content_type = response.headers.get('content-type', '').lower()
-        is_xml = 'xml' in content_type or 'text/xml' in content_type
+        content_type = response.headers.get("content-type", "").lower()
+        is_xml = "xml" in content_type or "text/xml" in content_type
 
         # Check content
         content = response.text.strip()
-        is_pom = content.startswith('<?xml') and '<project' in content
+        is_pom = content.startswith("<?xml") and "<project" in content
 
         if is_xml and is_pom:
             return True
@@ -665,19 +689,19 @@ def download_and_scan_pom_file(pom_url_list):
             },
         )
 
-        with open(scanned_pom_output_path, 'r') as scanned_pom_file:
+        with open(scanned_pom_output_path) as scanned_pom_file:
             scanned_pom_data = json.load(scanned_pom_file)
             scanned_packages = scanned_pom_data.get("packages", [])
             scanned_dependencies = scanned_pom_data.get("dependencies", [])
             if scanned_packages:
                 for scanned_package in scanned_packages:
                     # Replace the 'datafile_path' with the pom_url
-                    scanned_package['datafile_paths'] = [pom_url]
+                    scanned_package["datafile_paths"] = [pom_url]
                     scanned_pom_packages.append(scanned_package)
             if scanned_dependencies:
                 for scanned_dep in scanned_dependencies:
                     # Replace the 'datafile_path' with empty string
                     # See https://github.com/aboutcode-org/scancode.io/issues/1763#issuecomment-3525165830
-                    scanned_dep['datafile_path'] = ""
+                    scanned_dep["datafile_path"] = ""
                     scanned_pom_deps.append(scanned_dep)
     return scanned_pom_packages, scanned_pom_deps
diff --git a/scanpipe/tests/pipes/test_resolve.py b/scanpipe/tests/pipes/test_resolve.py
index aa16edaafc..f60608ee04 100644
--- a/scanpipe/tests/pipes/test_resolve.py
+++ b/scanpipe/tests/pipes/test_resolve.py
@@ -373,3 +373,30 @@ def test_scanpipe_resolve_get_manifest_headers(self):
         ]
         headers = resolve.get_manifest_headers(resource)
         self.assertEqual(expected, list(headers.keys()))
+
+    def test_scanpipe_resolve_parse_maven_filename(self):
+        test1 = "wisp-logging-2025.11.11.195957-97a44b0-sources.jar"
+        test2 = "guava-33.5.0-jre-javadoc.jar"
+        test3 = "junit-4.13.2.jar"
+        test4 = "guava-33.5.0-jre.jar"
+
+        expected1_name = "wisp-logging"
+        expected1_version = "2025.11.11.195957-97a44b0"
+        expected2_name = "guava"
+        expected2_version = "33.5.0-jre"
+        expected3_name = "junit"
+        expected3_version = "4.13.2"
+
+        result1_name, result1_version = resolve.parse_maven_filename(test1)
+        result2_name, result2_version = resolve.parse_maven_filename(test2)
+        result3_name, result3_version = resolve.parse_maven_filename(test3)
+        result4_name, result4_version = resolve.parse_maven_filename(test4)
+
+        self.assertEqual(result1_name, expected1_name)
+        self.assertEqual(result1_version, expected1_version)
+        self.assertEqual(result2_name, expected2_name)
+        self.assertEqual(result2_version, expected2_version)
+        self.assertEqual(result3_name, expected3_name)
+        self.assertEqual(result3_version, expected3_version)
+        self.assertEqual(result4_name, expected2_name)
+        self.assertEqual(result4_version, expected2_version)

From 13e8e88a429c2493c51378ad30440a00460b7fab Mon Sep 17 00:00:00 2001
From: Chin Yeung Li <tli@nexb.com>
Date: Fri, 14 Nov 2025 15:21:15 +0800
Subject: [PATCH 6/7] Refactor code and add tests #1763

Signed-off-by: Chin Yeung Li <tli@nexb.com>
---
 scanpipe/pipelines/scan_maven_package.py |  10 +-
 scanpipe/pipes/resolve.py                |  30 +++-
 scanpipe/tests/pipes/test_resolve.py     | 186 +++++++++++++++++++++++
 3 files changed, 215 insertions(+), 11 deletions(-)

diff --git a/scanpipe/pipelines/scan_maven_package.py b/scanpipe/pipelines/scan_maven_package.py
index 00ddc5e500..7438d781c0 100644
--- a/scanpipe/pipelines/scan_maven_package.py
+++ b/scanpipe/pipelines/scan_maven_package.py
@@ -23,8 +23,9 @@
 import json
 
 from scanpipe.pipelines.scan_single_package import ScanSinglePackage
-from scanpipe.pipes.resolve import download_and_scan_pom_file
+from scanpipe.pipes.resolve import download_pom_files
 from scanpipe.pipes.resolve import get_pom_url_list
+from scanpipe.pipes.resolve import scan_pom_files
 
 
 class ScanMavenPackage(ScanSinglePackage):
@@ -51,7 +52,7 @@ def steps(cls):
         )
 
     def fetch_and_scan_remote_pom(self):
-        """Fetch the pom.xml file from from maven.org if not present in codebase."""
+        """Fetch the .pom file from maven.org if not present in codebase."""
         with open(self.scan_output_location) as file:
             data = json.load(file)
             # Return and do nothing if data has pom.xml
@@ -61,9 +62,8 @@ def fetch_and_scan_remote_pom(self):
             packages = data.get("packages", [])
 
         pom_url_list = get_pom_url_list(self.project.input_sources[0], packages)
-        scanned_pom_packages, scanned_dependencies = download_and_scan_pom_file(
-            pom_url_list
-        )
+        pom_file_list = download_pom_files(pom_url_list)
+        scanned_pom_packages, scanned_dependencies = scan_pom_files(pom_file_list)
 
         updated_pacakges = packages + scanned_pom_packages
         # Replace/Update the package and dependencies section
diff --git a/scanpipe/pipes/resolve.py b/scanpipe/pipes/resolve.py
index e94de32e41..b672df13bc 100644
--- a/scanpipe/pipes/resolve.py
+++ b/scanpipe/pipes/resolve.py
@@ -586,10 +586,13 @@ def get_pom_url_list(input_source, packages):
             )
             pom_url_list.append(pom_url)
     else:
+        from urllib.parse import urlparse
+
         # Check what's the input source
         input_source_url = input_source.get("download_url", "")
 
-        if input_source_url and "maven.org/" in input_source_url:
+        parsed_url = urlparse(input_source_url)
+        if input_source_url and parsed_url.netloc.endswith("maven.org"):
             base_url = input_source_url.rsplit("/", 1)[0]
             pom_url = (
                 base_url + "/" + "-".join(base_url.rstrip("/").split("/")[-2:]) + ".pom"
@@ -672,17 +675,32 @@ def is_maven_pom_url(url):
         return False
 
 
-def download_and_scan_pom_file(pom_url_list):
+def download_pom_files(pom_url_list):
+    """Fetch every pom file in pom_url_list and return their path details."""
+    pom_file_list = []
+    for pom_url in pom_url_list:
+        pom_file_dict = {}
+        downloaded_pom = fetch.fetch_http(pom_url)
+        print("download_pom.path", str(downloaded_pom.path))
+        pom_file_dict["pom_file_path"] = str(downloaded_pom.path)
+        pom_file_dict["output_path"] = str(downloaded_pom.path) + "-output.json"
+        pom_file_dict["pom_url"] = pom_url
+        pom_file_list.append(pom_file_dict)
+    return pom_file_list
+
+
+def scan_pom_files(pom_file_list):
     """Fetch and scan the pom file from the input pom_url_list"""
     scanned_pom_packages = []
     scanned_pom_deps = []
-    for pom_url in pom_url_list:
-        downloaded_pom = fetch.fetch_http(pom_url)
-        scanned_pom_output_path = str(downloaded_pom.path) + "-output.json"
+    for pom_file_dict in pom_file_list:
+        pom_file_path = pom_file_dict.get("pom_file_path", "")
+        scanned_pom_output_path = pom_file_dict.get("output_path", "")
+        pom_url = pom_file_dict.get("pom_url", "")
 
         # Run a package scan on the fetched pom.xml
         _scanning_errors = scancode.run_scan(
-            location=str(downloaded_pom.path),
+            location=pom_file_path,
             output_file=scanned_pom_output_path,
             run_scan_args={
                 "package": True,
diff --git a/scanpipe/tests/pipes/test_resolve.py b/scanpipe/tests/pipes/test_resolve.py
index f60608ee04..016d00bd7c 100644
--- a/scanpipe/tests/pipes/test_resolve.py
+++ b/scanpipe/tests/pipes/test_resolve.py
@@ -400,3 +400,189 @@ def test_scanpipe_resolve_parse_maven_filename(self):
         self.assertEqual(result3_version, expected3_version)
         self.assertEqual(result4_name, expected2_name)
         self.assertEqual(result4_version, expected2_version)
+
+    @mock.patch("requests.get")
+    def test_scanpipe_resolve_is_maven_pom_url_valid(self, mock_get):
+        mock_response = mock.Mock()
+        mock_response.status_code = 200
+        mock_response.headers = {"content-type": "application/xml"}
+        mock_response.text = '<?xml version="1.0"?><project></project>'
+        mock_get.return_value = mock_response
+
+        result = resolve.is_maven_pom_url(
+            "https://repo1.maven.org/maven2/example/example.pom"
+        )
+        self.assertTrue(result)
+
+    @mock.patch("requests.get")
+    def test_scanpipe_resolve_is_maven_pom_url_404(self, mock_get):
+        mock_response = mock.Mock()
+        mock_response.status_code = 404
+        mock_get.return_value = mock_response
+
+        result = resolve.is_maven_pom_url(
+            "https://repo.maven.apache.org/maven2/example/404.pom"
+        )
+        self.assertFalse(result)
+
+    @mock.patch("requests.get")
+    def test_scanpipe_resolve_is_maven_pom_url_error(self, mock_get):
+        mock_response = mock.Mock()
+        mock_response.status_code = 200
+        mock_response.headers = {"content-type": "text/html"}
+        mock_response.text = "<html>Error page</html>"
+        mock_get.return_value = mock_response
+
+        result = resolve.is_maven_pom_url(
+            "https://repo.maven.apache.org/maven2/example/error.pom"
+        )
+        self.assertFalse(result)
+
+    @mock.patch("scanpipe.pipes.resolve.fetch.fetch_http")
+    def test_scanpipe_resolve_download_pom_files(self, mock_fetch_http):
+        mock_response = mock.Mock()
+        mock_response.path = "/safe/example1.pom"
+        mock_fetch_http.return_value = mock_response
+
+        pom_urls = ["https://repo1.maven.org/maven2/example/example1.pom"]
+
+        expected = [
+            {
+                "pom_file_path": "/safe/example1.pom",
+                "output_path": "/safe/example1.pom-output.json",
+                "pom_url": "https://repo1.maven.org/maven2/example/example1.pom",
+            }
+        ]
+
+        result = resolve.download_pom_files(pom_urls)
+        self.assertEqual(result, expected)
+
+    @mock.patch("scanpipe.pipes.resolve.scancode.run_scan")
+    @mock.patch("builtins.open", new_callable=mock.mock_open)
+    @mock.patch("json.load")
+    def test_scanpipe_resolve_scan_pom_files(
+        self, mock_json_load, mock_open, mock_run_scan
+    ):
+        mock_json_load.return_value = {
+            "packages": [
+                {
+                    "name": "example-package",
+                    "version": "1.0.0",
+                    "datafile_paths": ["/safe/mock_pom.xml"],
+                }
+            ],
+            "dependencies": [
+                {
+                    "name": "example-dep",
+                    "version": "2.0.0",
+                    "datafile_path": "/safe/mock_pom.xml",
+                }
+            ],
+        }
+
+        pom_file_list = [
+            {
+                "pom_file_path": "/safe/mock.pom",
+                "output_path": "/safe/mock.pom-output.json",
+                "pom_url": "https://repo1.maven.org/maven2/example/example.pom",
+            }
+        ]
+
+        expected_packages = [
+            {
+                "name": "example-package",
+                "version": "1.0.0",
+                "datafile_paths": [
+                    "https://repo1.maven.org/maven2/example/example.pom"
+                ],
+            }
+        ]
+        expected_deps = [
+            {"name": "example-dep", "version": "2.0.0", "datafile_path": ""}
+        ]
+
+        packages, deps = resolve.scan_pom_files(pom_file_list)
+
+        self.assertEqual(packages, expected_packages)
+        self.assertEqual(deps, expected_deps)
+
+        mock_run_scan.assert_called_once_with(
+            location="/safe/mock.pom",
+            output_file="/safe/mock.pom-output.json",
+            run_scan_args={"package": True},
+        )
+        mock_open.assert_called_once_with("/safe/mock.pom-output.json")
+        mock_json_load.assert_called_once()
+
+    @mock.patch("scanpipe.pipes.resolve.is_maven_pom_url")
+    @mock.patch("scanpipe.pipes.resolve.requests.get")
+    def test_scanpipe_resolve_construct_pom_url_from_filename(
+        self, mock_get, mock_is_maven_pom_url
+    ):
+        # Setup mock response from Maven Central
+        mock_response = mock.Mock()
+        mock_response.raise_for_status.return_value = None
+        mock_response.json.return_value = {
+            "response": {"docs": [{"g": "org.apache.commons"}]}
+        }
+        mock_get.return_value = mock_response
+        mock_is_maven_pom_url.return_value = True
+
+        # Inputs
+        artifact_id = "commons-lang3"
+        version = "3.12.0"
+
+        expected_url = [
+            "https://repo1.maven.org/maven2/org/apache/commons/commons-lang3/3.12.0/commons-lang3-3.12.0.pom"
+        ]
+
+        result = resolve.construct_pom_url_from_filename(artifact_id, version)
+
+        self.assertEqual(result, expected_url)
+        mock_get.assert_called_once_with(
+            "https://search.maven.org/solrsearch/select?q=a:commons-lang3&wt=json",
+            timeout=5,
+        )
+        mock_is_maven_pom_url.assert_called_once_with(expected_url[0])
+
+    def test_scanpipe_resolve_get_pom_url_list_with_packages(self):
+        packages = [
+            {
+                "namespace": "org.apache.commons",
+                "name": "commons-lang3",
+                "version": "3.12.0",
+            }
+        ]
+        result = resolve.get_pom_url_list({}, packages)
+        expected = [
+            "https://repo1.maven.org/maven2/org/apache/commons/commons-lang3/3.12.0/commons-lang3-3.12.0.pom"
+        ]
+        self.assertEqual(result, expected)
+
+    def test_scanpipe_resolve_get_pom_url_list_with_maven_download_url(self):
+        input_source = {
+            "download_url": "https://repo1.maven.org/maven2/org/apache/commons/commons-lang3/3.12.0/commons-lang3-3.12.0.jar"
+        }
+        result = resolve.get_pom_url_list(input_source, [])
+        expected = [
+            "https://repo1.maven.org/maven2/org/apache/commons/commons-lang3/3.12.0/commons-lang3-3.12.0.pom"
+        ]
+        self.assertEqual(result, expected)
+
+    @mock.patch("scanpipe.pipes.resolve.construct_pom_url_from_filename")
+    @mock.patch("scanpipe.pipes.resolve.parse_maven_filename")
+    def test_scanpipe_resolve_get_pom_url_list_with_jar_filename(
+        self, mock_parse, mock_construct
+    ):
+        input_source = {"filename": "commons-lang3-3.12.0.jar"}
+        mock_parse.return_value = ("commons-lang3", "3.12.0")
+        mock_construct.return_value = [
+            "https://repo1.maven.org/maven2/org/apache/commons/commons-lang3/3.12.0/commons-lang3-3.12.0.pom"
+        ]
+        result = resolve.get_pom_url_list(input_source, [])
+        self.assertEqual(result, mock_construct.return_value)
+
+    def test_scanpipe_resolve_get_pom_url_list_with_invalid_filename(self):
+        input_source = {"filename": "not-a-jar.txt"}
+        result = resolve.get_pom_url_list(input_source, [])
+        self.assertEqual(result, [])

From cb623c1c43aea551f6442dfe01b561affd6621cf Mon Sep 17 00:00:00 2001
From: Chin Yeung Li <tli@nexb.com>
Date: Mon, 17 Nov 2025 18:08:24 +0800
Subject: [PATCH 7/7] Implement
 "update_package_license_from_resource_if_missing" function #1763

- Update a package's license, when it is missing, from the licenses detected in the RESOURCES of that same package

Signed-off-by: Chin Yeung Li <tli@nexb.com>
---
 scanpipe/pipelines/scan_maven_package.py  |  5 ++-
 scanpipe/pipelines/scan_single_package.py | 19 ++++++++
 scanpipe/pipes/resolve.py                 | 29 +++++++++++++
 scanpipe/tests/pipes/test_resolve.py      | 53 +++++++++++++++++++++++
 4 files changed, 104 insertions(+), 2 deletions(-)

diff --git a/scanpipe/pipelines/scan_maven_package.py b/scanpipe/pipelines/scan_maven_package.py
index 7438d781c0..6b86b06791 100644
--- a/scanpipe/pipelines/scan_maven_package.py
+++ b/scanpipe/pipelines/scan_maven_package.py
@@ -47,6 +47,7 @@ def steps(cls):
             cls.extract_archives,
             cls.run_scan,
             cls.fetch_and_scan_remote_pom,
+            cls.update_package_license_from_resource_if_missing,
             cls.load_inventory_from_toolkit_scan,
             cls.make_summary_from_scan_results,
         )
@@ -65,9 +66,9 @@ def fetch_and_scan_remote_pom(self):
         pom_file_list = download_pom_files(pom_url_list)
         scanned_pom_packages, scanned_dependencies = scan_pom_files(pom_file_list)
 
-        updated_pacakges = packages + scanned_pom_packages
+        updated_packages = packages + scanned_pom_packages
         # Replace/Update the package and dependencies section
-        data["packages"] = updated_pacakges
+        data["packages"] = updated_packages
         data["dependencies"] = scanned_dependencies
         with open(self.scan_output_location, "w") as file:
             json.dump(data, file, indent=2)
diff --git a/scanpipe/pipelines/scan_single_package.py b/scanpipe/pipelines/scan_single_package.py
index 605ef0ea5d..7f0bf8b909 100644
--- a/scanpipe/pipelines/scan_single_package.py
+++ b/scanpipe/pipelines/scan_single_package.py
@@ -31,6 +31,7 @@
 from scanpipe.pipes import scancode
 from scanpipe.pipes.input import copy_input
 from scanpipe.pipes.input import is_archive
+from scanpipe.pipes.resolve import update_package_license_from_resource_if_missing
 
 
 class ScanSinglePackage(Pipeline):
@@ -51,6 +52,7 @@ def steps(cls):
             cls.extract_input_to_codebase_directory,
             cls.extract_archives,
             cls.run_scan,
+            cls.update_package_license_from_resource_if_missing,
             cls.load_inventory_from_toolkit_scan,
             cls.make_summary_from_scan_results,
         )
@@ -126,6 +128,23 @@ def run_scan(self):
         if not scan_output_path.exists():
             raise FileNotFoundError("ScanCode output not available.")
 
+    def update_package_license_from_resource_if_missing(self):
+        """Update PACKAGE license from the license detected in RESOURCES if missing."""
+        with open(self.scan_output_location) as file:
+            data = json.load(file)
+            packages = data.get("packages", [])
+            resources = data.get("files", [])
+            if not packages or not resources:
+                return
+
+        updated_packages = update_package_license_from_resource_if_missing(
+            packages, resources
+        )
+        # Update the package section
+        data["packages"] = updated_packages
+        with open(self.scan_output_location, "w") as file:
+            json.dump(data, file, indent=2)
+
     def load_inventory_from_toolkit_scan(self):
         """Process a JSON Scan results to populate codebase resources and packages."""
         input.load_inventory_from_toolkit_scan(self.project, self.scan_output_location)
diff --git a/scanpipe/pipes/resolve.py b/scanpipe/pipes/resolve.py
index b672df13bc..4f7ed272c5 100644
--- a/scanpipe/pipes/resolve.py
+++ b/scanpipe/pipes/resolve.py
@@ -723,3 +723,32 @@ def scan_pom_files(pom_file_list):
                     scanned_dep["datafile_path"] = ""
                     scanned_pom_deps.append(scanned_dep)
     return scanned_pom_packages, scanned_pom_deps
+
+
+def update_package_license_from_resource_if_missing(packages, resources):
+    """Populate missing licenses to packages based on resource data."""
+    from license_expression import Licensing
+
+    updated_packages = []
+    for package in packages:
+        if not package.get("declared_license_expression"):
+            package_uid = package.get("package_uid")
+            detected_lic_list = []
+            for resource in resources:
+                if (
+                    resource.get("detected_license_expression")
+                    and package_uid in resource["for_packages"]
+                ):
+                    if (
+                        resource.get("detected_license_expression")
+                        not in detected_lic_list
+                    ):
+                        detected_lic_list.append(
+                            resource.get("detected_license_expression")
+                        )
+            license_expression = " AND ".join(detected_lic_list)
+            if license_expression:
+                declared_license_expression = str(Licensing().dedup(license_expression))
+                package["declared_license_expression"] = declared_license_expression
+        updated_packages.append(package)
+    return updated_packages
diff --git a/scanpipe/tests/pipes/test_resolve.py b/scanpipe/tests/pipes/test_resolve.py
index 016d00bd7c..89be9dd4f4 100644
--- a/scanpipe/tests/pipes/test_resolve.py
+++ b/scanpipe/tests/pipes/test_resolve.py
@@ -586,3 +586,56 @@ def test_scanpipe_resolve_get_pom_url_list_with_invalid_filename(self):
         input_source = {"filename": "not-a-jar.txt"}
         result = resolve.get_pom_url_list(input_source, [])
         self.assertEqual(result, [])
+
+    def test_scanpipe_resolve_update_package_license_from_resource_if_missing(self):
+        packages = [
+            {"package_uid": "pkg1", "declared_license_expression": ""},
+            {"package_uid": "pkg2", "declared_license_expression": None},
+            {"package_uid": "pkg3", "declared_license_expression": "MIT"},
+        ]
+        resources = [
+            {
+                "for_packages": ["pkg1", "pkg2"],
+                "detected_license_expression": "GPL-2.0",
+            },
+            {"for_packages": ["pkg1"], "detected_license_expression": "MIT"},
+        ]
+
+        expected_pkg1_expr = "GPL-2.0 AND MIT"
+        expected_pkg2_expr = "GPL-2.0"
+
+        updated = resolve.update_package_license_from_resource_if_missing(
+            packages, resources
+        )
+
+        self.assertEqual(updated[0]["declared_license_expression"], expected_pkg1_expr)
+        self.assertEqual(updated[1]["declared_license_expression"], expected_pkg2_expr)
+        self.assertEqual(updated[2]["declared_license_expression"], "MIT")
+
+    def test_scanpipe_resolve_update_package_license_from_resource_if_missing_no_match(
+        self,
+    ):
+        packages = [{"package_uid": "pkgX", "declared_license_expression": None}]
+        resources = [{"for_packages": ["pkgY"], "detected_license_expression": "MIT"}]
+
+        updated = resolve.update_package_license_from_resource_if_missing(
+            packages, resources
+        )
+        self.assertEqual(updated[0]["declared_license_expression"], None)
+
+    def test_scanpipe_resolve_update_package_license_from_resource_if_missing_no_change(
+        self,
+    ):
+        packages = [
+            {"package_uid": "pkg1", "declared_license_expression": "GPL-2.0"},
+            {"package_uid": "pkg2", "declared_license_expression": "Apache-2.0"},
+        ]
+        resources = [
+            {"for_packages": ["pkg1", "pkg2"], "detected_license_expression": "MIT"},
+        ]
+
+        updated = resolve.update_package_license_from_resource_if_missing(
+            packages, resources
+        )
+        self.assertEqual(updated[0]["declared_license_expression"], "GPL-2.0")
+        self.assertEqual(updated[1]["declared_license_expression"], "Apache-2.0")