Skip to content

Commit 29383f0

Browse files
committed
Add new benchmark_purls pipeline #1804
Signed-off-by: tdruez <[email protected]>
1 parent c7ecb48 commit 29383f0

File tree

6 files changed

+142
-0
lines changed

6 files changed

+142
-0
lines changed

CHANGELOG.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ v35.4.0 (unreleased)
1717
- Display the optional steps in the Pipelines autodoc.
1818
https://github.com/aboutcode-org/scancode.io/issues/1822
1919

20+
- Add new ``benchmark_purls`` pipeline.
21+
https://github.com/aboutcode-org/scancode.io/issues/1804
22+
2023
v35.3.0 (2025-08-20)
2124
--------------------
2225

docs/built-in-pipelines.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,14 @@ Analyse Docker Windows Image
4646
:members:
4747
:member-order: bysource
4848

49+
.. _pipeline_benchmark_purls:
50+
51+
Benchmark Purls (addon)
52+
-----------------------
53+
.. autoclass:: scanpipe.pipelines.benchmark_purls.BenchmarkPurls()
54+
:members:
55+
:member-order: bysource
56+
4957
.. _pipeline_collect_strings_gettext:
5058

5159
Collect string with Xgettext (addon)

docs/scanpipe-pipes.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,11 @@ Generic
88
.. automodule:: scanpipe.pipes
99
:members:
1010

11+
Benchmark
12+
---------
13+
.. automodule:: scanpipe.pipes.benchmark
14+
:members:
15+
1116
ClamAV
1217
------
1318
.. automodule:: scanpipe.pipes.clamav

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ run = "scancodeio:combined_run"
135135
analyze_docker_image = "scanpipe.pipelines.analyze_docker:Docker"
136136
analyze_root_filesystem_or_vm_image = "scanpipe.pipelines.analyze_root_filesystem:RootFS"
137137
analyze_windows_docker_image = "scanpipe.pipelines.analyze_docker_windows:DockerWindows"
138+
benchmark_purls = "scanpipe.pipelines.benchmark_purls:BenchmarkPurls"
138139
collect_strings_gettext = "scanpipe.pipelines.collect_strings_gettext:CollectStringsGettext"
139140
collect_symbols_ctags = "scanpipe.pipelines.collect_symbols_ctags:CollectSymbolsCtags"
140141
collect_symbols_pygments = "scanpipe.pipelines.collect_symbols_pygments:CollectSymbolsPygments"
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
22+
23+
from scanpipe.pipelines import Pipeline
24+
from scanpipe.pipes import benchmark
25+
26+
27+
class BenchmarkPurls(Pipeline):
    """
    Validate discovered project packages against a reference list of expected PURLs.

    The reference list is supplied as a plain .txt project input holding one
    PURL per line. An input is picked up as a reference list when either:

    - it is tagged with "purls", or
    - its filename ends with "purls.txt" (e.g., "expected_purls.txt").

    """

    is_addon = True
    download_inputs = False

    @classmethod
    def steps(cls):
        return cls.get_expected_purls, cls.compare_purls

    def get_expected_purls(self):
        """Load the expected PURLs defined in the project inputs."""
        # Stored on the pipeline instance so the following step can reuse it.
        self.expected_purls = benchmark.get_expected_purls(self.project)

    def compare_purls(self):
        """Run the PURLs diff and write the results to a project output file."""
        report_lines = benchmark.compare_purls(self.project, self.expected_purls)
        report_path = self.project.get_output_file_path("benchmark_purls", "txt")
        # One diff entry per line in the resulting text report.
        report_path.write_text("\n".join(report_lines))

scanpipe/pipes/benchmark.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
22+
23+
import difflib
24+
25+
26+
def get_expected_purls(project):
    """
    Load the expected Package URLs (PURLs) from the project's input files.

    A file is considered an expected PURLs source if:
    - Its filename ends with ``purls.txt`` (matched with the ``*purls.txt``
      pattern), or
    - Its input source is tagged with "purls" (e.g. a download URL that
      includes the "#purls" tag).

    Each line in the file should contain one PURL. Leading and trailing
    whitespace is stripped and blank lines are ignored, so that an empty line
    is never reported as an expected PURL.

    Returns a sorted, deduplicated list of PURLs.
    Raises an ``Exception`` when no PURL could be collected, either because no
    matching input was found or because the matching inputs contain no PURL.
    """
    purls_files = list(project.inputs("*purls.txt"))
    purls_files.extend(
        source.path for source in project.inputsources.filter(tag="purls")
    )

    # A set takes care of duplicates, including a file matched by both rules.
    expected_purls = set()
    for file_path in purls_files:
        stripped_lines = (line.strip() for line in file_path.read_text().splitlines())
        expected_purls.update(purl for purl in stripped_lines if purl)

    if not expected_purls:
        raise Exception("Expected PURLs not provided.")

    return sorted(expected_purls)
50+
51+
52+
def compare_purls(project, expected_purls):
    """
    Compare discovered project PURLs against the expected PURLs.

    The ``expected_purls`` are the reference side of the diff and the unique
    PURLs of the project's discovered packages are the compared side. Both
    sides are sorted and deduplicated before the comparison.

    Returns only the differences, as a list of strings:
    - Lines starting with '-' are missing from the project.
    - Lines starting with '+' are unexpected in the project.
    """
    project_packages = project.discoveredpackages.only_package_url_fields()
    project_purls = sorted({package.purl for package in project_packages})
    reference_purls = sorted(set(expected_purls))

    # ndiff() marks lines unique to its first argument with "- " and lines
    # unique to its second argument with "+ ". The expected PURLs must come
    # first so that "-" means "expected but not found in the project".
    diff_result = difflib.ndiff(reference_purls, project_purls)

    # Keep only lines that are diffs (- or +); this drops the unchanged
    # lines ("  " prefix) and the intraline hint lines ("? " prefix).
    return [line for line in diff_result if line.startswith(("-", "+"))]

0 commit comments

Comments
 (0)