Skip to content

Commit 94d4fe6

Browse files
Merge pull request #3423 from nexB/fix-package-scan-only-performance
Fix package scan only performance
2 parents 3282bc0 + 5d8db2c commit 94d4fe6

File tree

37 files changed

+17378
-2007
lines changed

37 files changed

+17378
-2007
lines changed

CHANGELOG.rst

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,17 @@ v32.1.0 (next, roadmap)
4242
See https://github.com/nexB/scancode-toolkit/issues/1745
4343

4444

45-
v32.0.3 - 2023-05-26
45+
v32.0.4 - 2023-06-07
46+
---------------------
47+
48+
This is a minor bugfix release with the following updates:
49+
50+
- Fixes a performance issue arising out of license detection
51+
on files happening in a single-threaded process_codebase step when the
52+
license CLI option is disabled for a package scan.
53+
Reference: https://github.com/nexB/scancode-toolkit/pull/3423
54+
55+
v32.0.3 - 2023-06-06
4656
---------------------
4757

4858
This is a minor bugfix release with the following updates:

setup-mini.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[metadata]
22
name = scancode-toolkit-mini
3-
version = 32.0.3
3+
version = 32.0.4
44
license = Apache-2.0 AND CC-BY-4.0 AND LicenseRef-scancode-other-permissive AND LicenseRef-scancode-other-copyleft
55

66
# description must be on ONE line https://github.com/pypa/setuptools/issues/1390

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[metadata]
22
name = scancode-toolkit
3-
version = 32.0.3
3+
version = 32.0.4
44
license = Apache-2.0 AND CC-BY-4.0 AND LicenseRef-scancode-other-permissive AND LicenseRef-scancode-other-copyleft
55

66
# description must be on ONE line https://github.com/pypa/setuptools/issues/1390

src/packagedcode/build.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414

1515
from commoncode import fileutils
1616

17+
from licensedcode.cache import build_spdx_license_expression
18+
from licensedcode.cache import get_cache
1719
from licensedcode.tokenize import query_tokenizer
1820
from licensedcode.detection import detect_licenses
1921
from licensedcode.detection import get_unknown_license_detection
@@ -122,6 +124,11 @@ def assemble(cls, package_data, resource, codebase, package_adder):
122124
resource=resource,
123125
codebase=codebase,
124126
)
127+
if package.declared_license_expression:
128+
package.declared_license_expression_spdx = str(build_spdx_license_expression(
129+
license_expression=package.declared_license_expression,
130+
licensing=get_cache().licensing,
131+
))
125132

126133
cls.assign_package_to_resources(
127134
package=package,
@@ -132,6 +139,7 @@ def assemble(cls, package_data, resource, codebase, package_adder):
132139

133140
yield package
134141

142+
135143
# we yield this as we do not want this further processed
136144
yield resource
137145

src/packagedcode/licensing.py

Lines changed: 6 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def logger_debug(*args):
5656
return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args))
5757

5858

59-
def add_referenced_license_matches_for_package(resource, codebase, no_licenses):
59+
def add_referenced_license_matches_for_package(resource, codebase):
6060
"""
6161
Return an updated ``resource`` saving it in place, after adding new license
6262
detections to the package manifests detected in this resource, following their
@@ -106,13 +106,7 @@ def add_referenced_license_matches_for_package(resource, codebase, no_licenses):
106106
if not referenced_resource:
107107
continue
108108

109-
if no_licenses:
110-
referenced_license_detections = get_license_detection_mappings(
111-
location=referenced_resource.location
112-
)
113-
114-
else:
115-
referenced_license_detections = referenced_resource.license_detections
109+
referenced_license_detections = referenced_resource.license_detections
116110

117111
if referenced_license_detections:
118112
modified = True
@@ -160,7 +154,7 @@ def add_referenced_license_matches_for_package(resource, codebase, no_licenses):
160154
yield resource
161155

162156

163-
def add_referenced_license_detection_from_package(resource, codebase, no_licenses):
157+
def add_referenced_license_detection_from_package(resource, codebase):
164158
"""
165159
Return an updated ``resource`` saving it in place, after adding new license
166160
matches (licenses and license_expressions) following their Rule
@@ -209,7 +203,6 @@ def add_referenced_license_detection_from_package(resource, codebase, no_license
209203
sibling_license_detections, _le = get_license_detections_from_sibling_file(
210204
resource=root_resource,
211205
codebase=codebase,
212-
no_licenses=no_licenses,
213206
)
214207
if TRACE:
215208
logger_debug(
@@ -278,12 +271,10 @@ def add_referenced_license_detection_from_package(resource, codebase, no_license
278271
yield resource
279272

280273

281-
def add_license_from_sibling_file(resource, codebase, no_licenses):
274+
def add_license_from_sibling_file(resource, codebase):
282275
"""
283276
Given a resource and its codebase object, assign licenses to the package
284277
detections in that resource, from the sibling files of it.
285-
286-
If `no_license` is True, then license scan (for resources) is disabled.
287278
"""
288279
if TRACE:
289280
logger_debug(f'packagedcode.licensing: add_license_from_sibling_file: resource: {resource.path}')
@@ -303,7 +294,6 @@ def add_license_from_sibling_file(resource, codebase, no_licenses):
303294
license_detections, license_expression = get_license_detections_from_sibling_file(
304295
resource=resource,
305296
codebase=codebase,
306-
no_licenses=no_licenses,
307297
)
308298
if not license_detections:
309299
return
@@ -333,13 +323,11 @@ def is_legal_or_readme(resource):
333323
return False
334324

335325

336-
def get_license_detections_from_sibling_file(resource, codebase, no_licenses):
326+
def get_license_detections_from_sibling_file(resource, codebase):
337327
"""
338328
Return `license_detections`, a list of LicenseDetection objects and a
339329
`license_expression`, given a resource and its codebase object, from
340330
the sibling files of the resource.
341-
342-
If `no_license` is True, then license scan (for resources) is disabled.
343331
"""
344332
siblings = []
345333

@@ -357,15 +345,7 @@ def get_license_detections_from_sibling_file(resource, codebase, no_licenses):
357345

358346
license_detections = []
359347
for sibling in siblings:
360-
if no_licenses:
361-
detections = get_license_detection_mappings(
362-
location=sibling.location,
363-
analysis=DetectionCategory.PACKAGE_ADD_FROM_SIBLING_FILE.value,
364-
post_scan=True,
365-
)
366-
license_detections.extend(detections)
367-
else:
368-
license_detections.extend(sibling.license_detections)
348+
license_detections.extend(sibling.license_detections)
369349

370350
if not license_detections:
371351
return [], None

src/packagedcode/plugin_package.py

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -194,17 +194,19 @@ def process_codebase(self, codebase, strip_root=False, **kwargs):
194194
Also perform additional package license detection that depends on either
195195
file license detection or the package detections.
196196
"""
197-
no_licenses = False
197+
has_licenses = hasattr(codebase.root, 'license_detections')
198198

199199
# These steps add proper license detections to package_data and hence
200200
# this is performed before top level packages creation
201201
for resource in codebase.walk(topdown=False):
202-
if not hasattr(resource, 'license_detections'):
203-
no_licenses = True
202+
if not has_licenses:
203+
#TODO: Add the steps where we detect licenses from files for only a package scan
204+
# in the multiprocessing get_package_data API function
205+
continue
204206

205207
# If we don't detect license in package_data but there is license detected in file
206208
# we add the license expression from the file to a package
207-
modified = add_license_from_file(resource, codebase, no_licenses)
209+
modified = add_license_from_file(resource, codebase)
208210
if TRACE and modified:
209211
logger_debug(f'packagedcode: process_codebase: add_license_from_file: modified: {modified}')
210212

@@ -213,30 +215,30 @@ def process_codebase(self, codebase, strip_root=False, **kwargs):
213215

214216
# If there are referenced files in an extracted license statement, we follow
215217
# the references, look for license detections and add them back
216-
modified = list(add_referenced_license_matches_for_package(resource, codebase, no_licenses))
218+
modified = list(add_referenced_license_matches_for_package(resource, codebase))
217219
if TRACE and modified:
218220
logger_debug(f'packagedcode: process_codebase: add_referenced_license_matches_for_package: modified: {modified}')
219221

220222
# If there is a LICENSE file on the same level as the manifest, and no license
221223
# is detected in the package_data, we add the license from the file
222-
modified = add_license_from_sibling_file(resource, codebase, no_licenses)
224+
modified = add_license_from_sibling_file(resource, codebase)
223225
if TRACE and modified:
224226
logger_debug(f'packagedcode: process_codebase: add_license_from_sibling_file: modified: {modified}')
225227

226228
# Create codebase-level packages and dependencies
227229
create_package_and_deps(codebase, strip_root=strip_root, **kwargs)
228230

229-
if not no_licenses:
231+
if has_licenses:
230232
# This step is dependent on top level packages
231233
for resource in codebase.walk(topdown=False):
232234
# If there is an unknown reference to a package we add the license
233235
# from the package license detection
234-
modified = list(add_referenced_license_detection_from_package(resource, codebase, no_licenses))
236+
modified = list(add_referenced_license_detection_from_package(resource, codebase))
235237
if TRACE and modified:
236238
logger_debug(f'packagedcode: process_codebase: add_referenced_license_matches_from_package: modified: {modified}')
237239

238240

239-
def add_license_from_file(resource, codebase, no_licenses):
241+
def add_license_from_file(resource, codebase):
240242
"""
241243
Given a Resource, check if the detected package_data doesn't have license detections
242244
and the file has license detections, and if so, populate the package_data license
@@ -248,10 +250,7 @@ def add_license_from_file(resource, codebase, no_licenses):
248250
if not resource.is_file:
249251
return
250252

251-
if no_licenses:
252-
license_detections_file = get_license_detection_mappings(location=resource.location)
253-
else:
254-
license_detections_file = resource.license_detections
253+
license_detections_file = resource.license_detections
255254

256255
if TRACE:
257256
logger_debug(f'add_license_from_file: license_detections_file: {license_detections_file}')

src/scancode_config.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -132,11 +132,12 @@ def _create_dir(location):
132132
# 4. hardcoded This is the default, fallback version in case package is not installed or we
133133
# do not have a proper version otherwise.
134134
if not __version__:
135-
__version__ = '32.0.3'
135+
__version__ = '32.0.4'
136136

137137
#######################
138138
# used to warn user when the version is out of date
139-
__release_date__ = datetime.datetime(2023, 6, 6)
139+
# this is (year, month, day)
140+
__release_date__ = datetime.datetime(2023, 6, 7)
140141

141142
# See https://github.com/nexB/scancode-toolkit/issues/2653 for more information
142143
# on the data format version

tests/formattedcode/test_output_cyclonedx.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -228,22 +228,22 @@ def test_cyclonedx_plugin_does_not_fail_without_packages():
228228
def test_cyclonedx_plugin_json():
229229
test_dir = test_env.get_test_loc('cyclonedx/simple')
230230
result_file = test_env.get_temp_file('cyclonedx.json')
231-
run_scan_click(['-p', test_dir, '--cyclonedx', result_file])
231+
run_scan_click(['--package', test_dir, '--cyclonedx', result_file])
232232
expected_file = test_env.get_test_loc('cyclonedx/simple-expected.json')
233233
check_cyclone_output(expected_file, result_file, regen=REGEN_TEST_FIXTURES)
234234

235235

236236
def test_cyclonedx_plugin_json_simple_package_icu():
237237
test_dir = test_env.get_test_loc('cyclonedx/simple-icu')
238238
result_file = test_env.get_temp_file('cyclonedx.json')
239-
run_scan_click(['-p', test_dir, '--cyclonedx', result_file])
239+
run_scan_click(['--package', '--license', test_dir, '--cyclonedx', result_file])
240240
expected_file = test_env.get_test_loc('cyclonedx/simple-icu-expected.json')
241241
check_cyclone_output(expected_file, result_file, regen=REGEN_TEST_FIXTURES)
242242

243243

244244
def test_cyclonedx_plugin_xml_components_and_dependencies_are_serialized_correctly():
245245
test_dir = test_env.get_test_loc('cyclonedx/simple')
246246
result_file = test_env.get_temp_file('cyclonedx.xml')
247-
run_scan_click(['-p', test_dir, '--cyclonedx-xml', result_file])
247+
run_scan_click(['--package', test_dir, '--cyclonedx-xml', result_file])
248248
expected_file = test_env.get_test_loc('cyclonedx/expected.xml')
249249
check_cyclone_xml_output(expected_file, result_file, regen=REGEN_TEST_FIXTURES)

tests/packagedcode/data/build/buck/end2end-expected.json

Lines changed: 3 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -247,29 +247,9 @@
247247
"vcs_url": null,
248248
"copyright": null,
249249
"holder": null,
250-
"declared_license_expression": "apache-2.0",
251-
"declared_license_expression_spdx": "Apache-2.0",
252-
"license_detections": [
253-
{
254-
"license_expression": "apache-2.0",
255-
"matches": [
256-
{
257-
"score": 100.0,
258-
"start_line": 1,
259-
"end_line": 1,
260-
"matched_length": 3,
261-
"match_coverage": 100.0,
262-
"matcher": "1-hash",
263-
"license_expression": "apache-2.0",
264-
"rule_identifier": "spdx_license_id_apache-2.0_for_apache-2.0.RULE",
265-
"rule_relevance": 100,
266-
"rule_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/rules/spdx_license_id_apache-2.0_for_apache-2.0.RULE",
267-
"matched_text": "apache-2.0"
268-
}
269-
],
270-
"identifier": "apache_2_0-d66ab77d-a5cc-7104-e702-dc7df61fe9e8"
271-
}
272-
],
250+
"declared_license_expression": null,
251+
"declared_license_expression_spdx": null,
252+
"license_detections": [],
273253
"other_license_expression": null,
274254
"other_license_expression_spdx": null,
275255
"other_license_detections": [],

0 commit comments

Comments
 (0)