Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 28 additions & 3 deletions scanpipe/pipes/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -700,20 +700,35 @@ def to_spdx(project, include_files=False):
Return the path of the generated output file.
"""
output_file = project.get_output_file_path("results", "spdx.json")
document_spdx_id = f"SPDXRef-DOCUMENT-{project.uuid}"

discoveredpackage_qs = get_queryset(project, "discoveredpackage")
discovereddependency_qs = get_queryset(project, "discovereddependency")

document_spdx_id = f"SPDXRef-DOCUMENT-{project.uuid}"
packages_as_spdx = []
project_as_root_package = spdx.Package(
spdx_id=f"SPDXRef-scancodeio-project-{project.uuid}",
name=project.name,
files_analyzed=True,
)

packages_as_spdx = [project_as_root_package]
Copy link

@tsteenbe tsteenbe Sep 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tdruez Using project_as_root_package is incorrect imo: it's possible in ScanCode.io to upload multiple archives, in which case there would be multiple root packages. The variable should therefore be projects_as_root_packages, as documentDescribes should be an array of SPDX packages (one for each archive). E.g., if you upload 5 archives to ScanCode.io in a project, then there should be 5 SPDX root elements in documentDescribes of the resulting SPDX file.

In case ScanCode.io is given a single PURL for a code repository as its project input, such as pkg:github/package-url/purl-spec@244fd47e07d1004f0aed9c, then documentDescribes is still an array but should only contain a single package for the code repository that was scanned; see the comment of SPDX maintainer Rose spdx/spdx-spec#395 (comment).

If a single SPDX or CycloneDX SBOM was provided as ScanCode.io input for a project, then documentDescribes should point to the SPDX package for the provided SBOM imo.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In case ScanCode.io is given a single PURL for a code repository as its project input, such as pkg:github/package-url/purl-spec@244fd47e07d1004f0aed9c, then documentDescribes is still an array but should only contain a single package for the code repository that was scanned,
If a single SPDX or CycloneDX SBOM was provided as ScanCode.io input for a project, then documentDescribes should point to the SPDX package for the provided SBOM imo.

@tsteenbe The code was adjusted to use the Project's input as the root package, addressing those 2 points.

The following forms of input are supported:

  • Input manually copied to Project's inputs directory
  • Input uploaded
  • Input fetched (download_url, purl, docker, git, ...)

Using project_as_root_package is incorrect imo: it's possible in ScanCode.io to upload multiple archives, in which case there would be multiple root packages. The variable should therefore be projects_as_root_packages, as documentDescribes should be an array of SPDX packages (one for each archive). E.g., if you upload 5 archives to ScanCode.io in a project, then there should be 5 SPDX root elements in documentDescribes of the resulting SPDX file.

Now, for the multiple inputs case, this will require additional design work and likely some changes in the SCIO architecture to properly track CodebaseResource and DiscoveredPackage objects back to their input origin.

This will be handled in a separate PR, since it first requires further discussion.

Also, note that projects with multiple inputs (e.g. when using the deploy_to_develop pipeline) are not expected to fetch SPDX documents.

license_expressions = []
relationships = []

for package in discoveredpackage_qs:
packages_as_spdx.append(package.as_spdx())
spdx_package = package.as_spdx()
packages_as_spdx.append(spdx_package)

if license_expression := package.declared_license_expression:
license_expressions.append(license_expression)

spdx_relationship = spdx.Relationship(
spdx_id=project_as_root_package.spdx_id,
related_spdx_id=spdx_package.spdx_id,
relationship="DEPENDS_ON",
)
relationships.append(spdx_relationship)

for dependency in discovereddependency_qs:
spdx_relationship = get_dependency_as_spdx_relationship(
dependency,
Expand All @@ -729,10 +744,20 @@ def to_spdx(project, include_files=False):
for resource in get_queryset(project, "codebaseresource").files()
]

# Use the Project (top-level package) as the root element that the SPDX document
# describes.
# This ensures "documentDescribes" points only to the main subject of the SBOM,
# not to every dependency or file in the project.
# See https://github.com/spdx/spdx-spec/issues/395 and
# https://github.com/aboutcode-org/scancode.io/issues/564#issuecomment-3269296563
# for detailed context.
describes = [project_as_root_package.spdx_id]

document = spdx.Document(
spdx_id=document_spdx_id,
name=f"scancodeio_{project.name}",
namespace=f"https://scancode.io/spdxdocs/{project.uuid}",
describes=describes,
creation_info=spdx.CreationInfo(tool=f"ScanCode.io-{scancodeio_version}"),
packages=packages_as_spdx,
files=files_as_spdx,
Expand Down
33 changes: 27 additions & 6 deletions scanpipe/pipes/spdx.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@

Usage::

import pathlib
from scanpipe.pipes import spdx

creation_info = spdx.CreationInfo(
Expand All @@ -53,6 +52,11 @@
tool="SPDXCode-1.0",
)

root_package = spdx.Package(
spdx_id="SPDXRef-project1",
name="project1",
)

package1 = spdx.Package(
spdx_id="SPDXRef-package1",
name="lxml",
Expand All @@ -76,8 +80,9 @@
document = spdx.Document(
name="Document name",
namespace="https://[CreatorWebsite]/[pathToSpdx]/[DocumentName]-[UUID]",
describes=[root_package.spdx_id],
creation_info=creation_info,
packages=[package1],
packages=[root_package, package1],
extracted_licenses=[
spdx.ExtractedLicensingInfo(
license_id="LicenseRef-1",
Expand All @@ -93,7 +98,7 @@
print(document.as_json())

# Validate document
schema = pathlib.Path(spdx.SPDX_JSON_SCHEMA_LOCATION).read_text()
schema = spdx.SPDX_SCHEMA_PATH.read_text()
document.validate(schema)

# Write document to a file:
Expand Down Expand Up @@ -267,17 +272,22 @@ class ExtractedLicensingInfo:
"""

license_id: str
extracted_text: str
extracted_text: str = "NOASSERTION"

name: str = ""
comment: str = ""
see_alsos: list[str] = field(default_factory=list)

def as_dict(self):
"""Return the data as a serializable dict."""
if self.extracted_text.strip():
extracted_text = self.extracted_text
else:
extracted_text = "NOASSERTION"

required_data = {
"licenseId": self.license_id,
"extractedText": self.extracted_text,
"extractedText": extracted_text,
}

optional_data = {
Expand Down Expand Up @@ -542,6 +552,16 @@ class Document:

name: str
namespace: str
# "documentDescribes" identifies the root element(s) that this SPDX document
# describes.
# In most SBOM cases, this will be a single SPDX ID representing the top-level
# package or project (e.g., the root manifest in a repository or the main
# distribution artifact).
# Although defined as an array, it should NOT list every package, file, or snippet.
# Multiple entries are only expected in special, non-SBOM cases
# (e.g., SPDX license lists).
# See https://github.com/spdx/spdx-spec/issues/395 for discussion and clarification.
describes: list
creation_info: CreationInfo
packages: list[Package]

Expand All @@ -562,9 +582,9 @@ def as_dict(self):
"SPDXID": self.spdx_id,
"name": self.safe_document_name(self.name),
"documentNamespace": self.namespace,
"documentDescribes": self.describes,
"creationInfo": self.creation_info.as_dict(),
"packages": [package.as_dict() for package in self.packages],
"documentDescribes": [package.spdx_id for package in self.packages],
}

if self.files:
Expand Down Expand Up @@ -597,6 +617,7 @@ def from_data(cls, data):
data_license=data.get("dataLicense"),
name=data.get("name"),
namespace=data.get("documentNamespace"),
describes=data.get("documentDescribes"),
creation_info=CreationInfo.from_data(data.get("creationInfo", {})),
packages=[
Package.from_data(package_data)
Expand Down
30 changes: 22 additions & 8 deletions scanpipe/tests/data/asgiref/asgiref-3.3.0.spdx.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
"SPDXID": "SPDXRef-DOCUMENT-804c3391-e6f9-415f-bb7a-cb6653853a46",
"name": "scancodeio_asgiref",
"documentNamespace": "https://scancode.io/spdxdocs/804c3391-e6f9-415f-bb7a-cb6653853a46",
"documentDescribes": [
"SPDXRef-scancodeio-project-804c3391-e6f9-415f-bb7a-cb6653853a46"
],
"creationInfo": {
"created": "2000-01-01T01:02:03Z",
"creators": [
Expand All @@ -12,6 +15,15 @@
"licenseListVersion": "3.20"
},
"packages": [
{
"name": "asgiref",
"SPDXID": "SPDXRef-scancodeio-project-804c3391-e6f9-415f-bb7a-cb6653853a46",
"downloadLocation": "NOASSERTION",
"licenseConcluded": "NOASSERTION",
"copyrightText": "NOASSERTION",
"filesAnalyzed": true,
"licenseDeclared": "NOASSERTION"
},
{
"name": "asgiref",
"SPDXID": "SPDXRef-scancodeio-discoveredpackage-9d0bdc32-1117-407a-9908-08d3558dc739",
Expand Down Expand Up @@ -115,16 +127,18 @@
]
}
],
"documentDescribes": [
"SPDXRef-scancodeio-discoveredpackage-9d0bdc32-1117-407a-9908-08d3558dc739",
"SPDXRef-scancodeio-discoveredpackage-7969de5e-5589-4441-bffa-a60e12b43280",
"SPDXRef-scancodeio-discovereddependency-4cff8bf8-197c-4698-a43a-5c793586c780",
"SPDXRef-scancodeio-discovereddependency-4c5c1313-3850-4f81-ac27-8d496080d667",
"SPDXRef-scancodeio-discovereddependency-f983278c-22f1-43e1-ba2b-a020d659531b",
"SPDXRef-scancodeio-discovereddependency-98aeddb5-b81a-43d4-ac56-dc873a589fdf"
],
"files": [],
"relationships": [
{
"spdxElementId": "SPDXRef-scancodeio-project-804c3391-e6f9-415f-bb7a-cb6653853a46",
"relatedSpdxElement": "SPDXRef-scancodeio-discoveredpackage-9d0bdc32-1117-407a-9908-08d3558dc739",
"relationshipType": "DEPENDS_ON"
},
{
"spdxElementId": "SPDXRef-scancodeio-project-804c3391-e6f9-415f-bb7a-cb6653853a46",
"relatedSpdxElement": "SPDXRef-scancodeio-discoveredpackage-7969de5e-5589-4441-bffa-a60e12b43280",
"relationshipType": "DEPENDS_ON"
},
{
"spdxElementId": "SPDXRef-scancodeio-discovereddependency-4cff8bf8-197c-4698-a43a-5c793586c780",
"relatedSpdxElement": "SPDXRef-scancodeio-discoveredpackage-9d0bdc32-1117-407a-9908-08d3558dc739",
Expand Down
34 changes: 27 additions & 7 deletions scanpipe/tests/data/spdx/dependencies.spdx.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
"SPDXID": "SPDXRef-DOCUMENT-b74fe5df-e965-415e-ba65-f38421a0695d",
"name": "scancodeio_analysis",
"documentNamespace": "https://scancode.io/spdxdocs/b74fe5df-e965-415e-ba65-f38421a0695d",
"documentDescribes": [
"SPDXRef-scancodeio-project-b74fe5df-e965-415e-ba65-f38421a0695d"
],
"creationInfo": {
"created": "2000-01-01T01:02:03Z",
"creators": [
Expand All @@ -12,6 +15,15 @@
"licenseListVersion": "3.20"
},
"packages": [
{
"name": "Analysis",
"SPDXID": "SPDXRef-scancodeio-project-b74fe5df-e965-415e-ba65-f38421a0695d",
"downloadLocation": "NOASSERTION",
"licenseConcluded": "NOASSERTION",
"copyrightText": "NOASSERTION",
"filesAnalyzed": true,
"licenseDeclared": "NOASSERTION"
},
{
"name": "a",
"SPDXID": "SPDXRef-scancodeio-discoveredpackage-a83a60de-81bc-4bf4-b48c-dc78e0e658a9",
Expand Down Expand Up @@ -83,14 +95,22 @@
]
}
],
"documentDescribes": [
"SPDXRef-scancodeio-discoveredpackage-a83a60de-81bc-4bf4-b48c-dc78e0e658a9",
"SPDXRef-scancodeio-discoveredpackage-81147701-285f-485c-ba36-9cd3742790b1",
"SPDXRef-scancodeio-discoveredpackage-e391c33e-d7d0-4a97-a3c3-e947375c53d5",
"SPDXRef-scancodeio-discovereddependency-d0e1eab2-9b8b-449b-b9d1-12147ffdd8a8",
"SPDXRef-scancodeio-discovereddependency-29fbe562-a191-44b4-88e8-a9678071ecee"
],
"relationships": [
{
"spdxElementId": "SPDXRef-scancodeio-project-b74fe5df-e965-415e-ba65-f38421a0695d",
"relatedSpdxElement": "SPDXRef-scancodeio-discoveredpackage-a83a60de-81bc-4bf4-b48c-dc78e0e658a9",
"relationshipType": "DEPENDS_ON"
},
{
"spdxElementId": "SPDXRef-scancodeio-project-b74fe5df-e965-415e-ba65-f38421a0695d",
"relatedSpdxElement": "SPDXRef-scancodeio-discoveredpackage-81147701-285f-485c-ba36-9cd3742790b1",
"relationshipType": "DEPENDS_ON"
},
{
"spdxElementId": "SPDXRef-scancodeio-project-b74fe5df-e965-415e-ba65-f38421a0695d",
"relatedSpdxElement": "SPDXRef-scancodeio-discoveredpackage-e391c33e-d7d0-4a97-a3c3-e947375c53d5",
"relationshipType": "DEPENDS_ON"
},
{
"spdxElementId": "SPDXRef-scancodeio-discoveredpackage-81147701-285f-485c-ba36-9cd3742790b1",
"relatedSpdxElement": "SPDXRef-scancodeio-discoveredpackage-a83a60de-81bc-4bf4-b48c-dc78e0e658a9",
Expand Down
4 changes: 2 additions & 2 deletions scanpipe/tests/pipes/test_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -507,8 +507,8 @@ def test_scanpipe_pipes_outputs_to_spdx_dependencies(self, mock_uuid4):

output_file = output.to_spdx(project=project)
results_json = json.loads(output_file.read_text())
self.assertEqual(5, len(results_json["packages"]))
self.assertEqual(3, len(results_json["relationships"]))
self.assertEqual(6, len(results_json["packages"]))
self.assertEqual(6, len(results_json["relationships"]))

# Patch the `created` date and tool version
results_json["creationInfo"]["created"] = "2000-01-01T01:02:03Z"
Expand Down
30 changes: 27 additions & 3 deletions scanpipe/tests/pipes/test_spdx.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ def setUp(self):
"https://license1.homepage",
],
}
self.project_as_root_package_data = {
"spdx_id": "SPDXRef-project",
"name": "Project",
}
self.package_data = {
"spdx_id": "SPDXRef-package1",
"name": "lxml",
Expand Down Expand Up @@ -170,7 +174,9 @@ def setUp(self):
"name": "Document name",
"namespace": "https://[CreatorWebsite]/[DocumentName]-[UUID]",
"creation_info": spdx.CreationInfo(**self.creation_info_data),
"describes": [self.project_as_root_package_data["spdx_id"]],
"packages": [
spdx.Package(**self.project_as_root_package_data),
spdx.Package(**self.package_data),
],
"extracted_licenses": [
Expand All @@ -190,6 +196,7 @@ def setUp(self):
"SPDXID": "SPDXRef-DOCUMENT",
"name": "document_name",
"documentNamespace": "https://[CreatorWebsite]/[DocumentName]-[UUID]",
"documentDescribes": ["SPDXRef-project"],
"creationInfo": {
"created": "2022-09-21T13:50:20Z",
"creators": [
Expand All @@ -201,6 +208,15 @@ def setUp(self):
"comment": "Generated with SPDXCode",
},
"packages": [
{
"name": "Project",
"SPDXID": "SPDXRef-project",
"downloadLocation": "NOASSERTION",
"licenseConcluded": "NOASSERTION",
"copyrightText": "NOASSERTION",
"filesAnalyzed": False,
"licenseDeclared": "NOASSERTION",
},
{
"name": "lxml",
"SPDXID": "SPDXRef-package1",
Expand Down Expand Up @@ -228,7 +244,7 @@ def setUp(self):
"referenceLocator": "pkg:pypi/[email protected]",
}
],
}
},
],
"files": [
{
Expand All @@ -247,7 +263,6 @@ def setUp(self):
"licenseComments": "license_comments",
}
],
"documentDescribes": ["SPDXRef-package1"],
"hasExtractedLicensingInfos": [
{
"licenseId": "LicenseRef-1",
Expand Down Expand Up @@ -303,6 +318,15 @@ def test_spdx_extracted_licensing_info_as_dict(self):
licensing_info = spdx.ExtractedLicensingInfo(**self.licensing_info_data)
assert self.licensing_info_spdx_data == licensing_info.as_dict()

def test_spdx_extracted_licensing_info_empty_extracted_text(self):
    """Check that whitespace-only extracted text serializes as "NOASSERTION"."""
    blank_text_info = spdx.ExtractedLicensingInfo(
        license_id="LicenseRef-1",
        extracted_text=" ",
    )
    serialized = blank_text_info.as_dict()
    assert "NOASSERTION" == serialized["extractedText"]

def test_spdx_extracted_licensing_info_from_data(self):
assert spdx.ExtractedLicensingInfo.from_data({})
licensing_info = spdx.ExtractedLicensingInfo.from_data(
Expand Down Expand Up @@ -353,7 +377,7 @@ def test_spdx_relationship_from_data(self):

def test_spdx_document_as_dict(self):
document = spdx.Document(**self.document_data)
assert self.document_spdx_data == document.as_dict()
assert self.document_spdx_data == document.as_dict(), document.as_dict()

def test_spdx_relationship_is_dependency_relationship_property(self):
relationship = spdx.Relationship.from_data(self.relationship_spdx_data)
Expand Down