Skip to content

Commit bb04420

Browse files
authored
Merge pull request #2505 from nexB/2021-04-license-updates
Update license detection
2 parents 806db9d + 4a89a9b commit bb04420

File tree

14,233 files changed

+59288
-696
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

14,233 files changed

+59288
-696
lines changed

CHANGELOG.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ License scanning:
3737
- Add new command line option to filter ignorable copyrights when included
3838
in licenses.
3939

40+
- Add new and improved license detection rules.
41+
Thank you to:
42+
- Sebastian Thomas @sebathomas
43+
- Till Jaeger @LeChasseur
4044

4145

4246
v21.3.31
@@ -93,6 +97,9 @@ Copyright scanning:
9397
- Allow calling copyright detection from text lines to ease integration
9498
Thank you to Jelmer Vernooij @jelmer
9599

100+
- Fixed copyright truncation bug
101+
Thank you to Akanksha Garg @akugarg
102+
96103

97104
Package scanning:
98105
~~~~~~~~~~~~~~~~~

azure-pipelines.yml

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,15 +61,35 @@ jobs:
6161
bin/pytest -n 3 -vvs --test-suite=all \
6262
tests/licensedcode/test_detection_datadriven_external.py
6363
64-
license_validate1: |
64+
license_validate_basic: |
6565
bin/pytest -n 3 -vvs --test-suite=validate \
6666
tests/licensedcode/test_detection_validate.py \
6767
-k TestValidateLicenseBasic
6868
69-
license_validate2: |
69+
license_validate_extended_1: |
7070
bin/pytest -n 3 -vvs --test-suite=validate \
7171
tests/licensedcode/test_detection_validate.py \
72-
-k TestValidateLicenseExtended
72+
-k TestValidateLicenseExtended1
73+
74+
license_validate_extended_2: |
75+
bin/pytest -n 3 -vvs --test-suite=validate \
76+
tests/licensedcode/test_detection_validate.py \
77+
-k TestValidateLicenseExtended2
78+
79+
license_validate_extended_3: |
80+
bin/pytest -n 3 -vvs --test-suite=validate \
81+
tests/licensedcode/test_detection_validate.py \
82+
-k TestValidateLicenseExtended3
83+
84+
license_validate_extended_4: |
85+
bin/pytest -n 3 -vvs --test-suite=validate \
86+
tests/licensedcode/test_detection_validate.py \
87+
-k TestValidateLicenseExtended4
88+
89+
license_validate_extended_5: |
90+
bin/pytest -n 3 -vvs --test-suite=validate \
91+
tests/licensedcode/test_detection_validate.py \
92+
-k TestValidateLicenseExtended5
7393
7494
license_cache: |
7595
bin/pytest -n 3 -vvs --test-suite=all \
Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
# -*- coding: utf-8 -*-
2+
#
3+
# Copyright (c) nexB Inc. and others. All rights reserved.
4+
# ScanCode is a trademark of nexB Inc.
5+
# SPDX-License-Identifier: Apache-2.0
6+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
7+
# See https://github.com/nexB/scancode-toolkit for support or download.
8+
# See https://aboutcode.org for more information about nexB OSS projects.
9+
#
10+
11+
import click
12+
13+
from licensedcode.tokenize import ngrams
14+
15+
import synclic
16+
17+
"""
18+
A script to generate false-positive license detection rules from lists of SPDX
19+
licenses.
20+
21+
Common license detection tools use lists of SPDX license ids to support their operations.
22+
As a result, scanning these lists reports many license matches, all of which are false positives.
23+
24+
Here we fetch all released SPDX licenses lists and generate false positives
25+
using these approaches to have a reasonable set of combinations of license ids
26+
as found in the wild:
27+
28+
1. For each SPDX license list release, we consider these lists:
29+
- all IDs
30+
- all non-deprecated IDs
31+
- all licenses
32+
- all non-deprecated licenses
33+
- all exceptions
34+
- all non-deprecated exceptions
35+
36+
We generate lists of ids only and lists of ids and names.
37+
38+
2. For each of these lists we sort them:
39+
- respecting case
40+
- ignoring case
41+
42+
3. For each of these sorted lists we collect sub-sequences of 6 licenses, one
43+
per line and generate a false positive RULE from that.
44+
45+
If a RULE already exists, it will be skipped.
46+
"""
47+
48+
TRACE = False
49+
50+
template = '''----------------------------------------
51+
is_false_positive: yes
52+
notes: a sequence of SPDX license ids and names is not a license
53+
---
54+
{}
55+
'''
56+
57+
58+
@click.command()
@click.argument(
    'license_dir', type=click.Path(), metavar='DIR')
@click.argument(
    'output', type=click.Path(), metavar='FILE')
@click.option(
    '--commitish', type=str, default=None,
    help='An optional commitish to use for SPDX license data instead of the latest release.')
@click.option(
    '--from-list', default=None, type=click.Path(), metavar='LIST_FILE',
    help='Use file with a list of entries to ignore instead')
@click.option(
    '-n', '--ngrams-length', type=int, default=6,
    help='Number of elements in a sub-sequence when generating a rule.')
@click.option(
    '-t', '--trace', is_flag=True, default=False,
    help='Print execution trace.')
@click.help_option('-h', '--help')
def cli(license_dir, output, commitish=None, from_list=None, trace=False, ngrams_length=6):
    """
    Generate ScanCode false-positive license detection rules from lists of SPDX
    licenses. Save these in FILE for use with buildrules.

    The `spdx` directory is used as a temp store for fetched SPDX licenses.

    When a LIST_FILE is provided, its lines are used as the single list of
    entries and no SPDX license data is fetched.
    """
    global TRACE
    TRACE = trace

    if not from_list:
        spdx_source = synclic.SpdxSource(external_base_dir=license_dir)

        # skip_oddities=False asks synclic not to drop or special-case the
        # entries it otherwise filters (such as trailing "+" ids), so that
        # the generated lists cover the ids found in a real SPDX list.
        spdx_by_key = spdx_source.get_licenses(
            commitish=commitish,
            skip_oddities=False,
        )

        all_licenses_and_exceptions = []
        all_licenses_and_exceptions_non_deprecated = []
        licenses = []
        exceptions = []
        licenses_non_deprecated = []
        exceptions_non_deprecated = []

        # The order of this list drives the order of the generated rules.
        lists_of_licenses = [
            all_licenses_and_exceptions,
            all_licenses_and_exceptions_non_deprecated,
            licenses,
            exceptions,
            licenses_non_deprecated,
            exceptions_non_deprecated,
        ]

        # Dispatch each SPDX entry to every list it belongs to.
        for lspdx in spdx_by_key.values():
            all_licenses_and_exceptions.append(lspdx)
            is_deprecated = lspdx.is_deprecated
            if not is_deprecated:
                all_licenses_and_exceptions_non_deprecated.append(lspdx)
            if lspdx.is_exception:
                exceptions.append(lspdx)
                if not is_deprecated:
                    exceptions_non_deprecated.append(lspdx)
            else:
                licenses.append(lspdx)
                if not is_deprecated:
                    licenses_non_deprecated.append(lspdx)

        # Sort first respecting case, then ignoring case.
        sort_keys = (
            lambda x: x.spdx_license_key,
            lambda x: x.spdx_license_key.lower(),
        )

        # For each list and each sort order, emit an "ids only" list followed
        # by an "id and name" list.
        lists_of_sorted_licenses = []
        for lic_list in lists_of_licenses:
            for sort_key in sort_keys:
                ordered = sorted(lic_list, key=sort_key)

                as_ids = [l.spdx_license_key for l in ordered]
                lists_of_sorted_licenses.append(as_ids)

                as_id_names = [f'{l.spdx_license_key} {l.name}' for l in ordered]
                lists_of_sorted_licenses.append(as_id_names)

    else:
        # Read explicitly as UTF-8 rather than the platform default encoding.
        with open(from_list, encoding='utf-8') as inp:
            lists_of_sorted_licenses = [inp.read().splitlines()]

    # License names may contain non-ASCII characters: write explicitly as
    # UTF-8 so that the output does not depend on the platform default.
    with open(output, 'w', encoding='utf-8') as o:
        for lic_list in lists_of_sorted_licenses:
            write_ngrams(texts=lic_list, output=o, ngram_length=ngrams_length)

        # Closing separator written once after the last generated rule.
        o.write('----------------------------------------\n')
158+
159+
160+
def write_ngrams(texts, output, _seen=set(), ngram_length=6):
    """
    Write a rule to the `output` file-like object for each ngram of the
    `texts` list that was not written before.

    Each ngram of `ngram_length` entries returned by `ngrams()` is joined with
    one entry per line and rendered with the module-level `template`.

    `_seen` is a deliberately shared mutable default: it keeps the texts
    written by earlier calls so that the same text is written only once
    across all calls that do not pass their own set.
    """
    for entries in ngrams(texts, ngram_length=ngram_length):
        rule_text = '\n'.join(entries)
        if rule_text not in _seen:
            _seen.add(rule_text)
            output.write(template.format(rule_text))
169+
170+
171+
if __name__ == '__main__':
172+
cli()

etc/scripts/licenses/synclic.py

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -162,15 +162,15 @@ def __init__(self, external_base_dir):
162162
if not exists(self.new_dir):
163163
mkdir(self.new_dir)
164164

165-
def get_licenses(self, scancode_licenses, **kwargs):
165+
def get_licenses(self, scancode_licenses=None, **kwargs):
166166
"""
167167
Return a mapping of key -> ScanCode License objects either fetched
168168
externally or loaded from the existing `self.original_dir`
169169
"""
170170
print('Fetching and storing external licenses in:', self.original_dir)
171171

172172
licenses = []
173-
for lic, text in self.fetch_licenses(scancode_licenses, **kwargs):
173+
for lic, text in self.fetch_licenses(scancode_licenses=scancode_licenses, **kwargs):
174174
try:
175175
with io.open(lic.text_file, 'w', encoding='utf-8')as tf:
176176
tf.write(text)
@@ -336,10 +336,19 @@ class SpdxSource(ExternalLicensesSource):
336336
'notes',
337337
)
338338

339-
def fetch_licenses(self, scancode_licenses, commitish=None, from_repo=SPDX_DEFAULT_REPO):
339+
def fetch_licenses(
340+
self,
341+
scancode_licenses=None,
342+
commitish=None,
343+
skip_oddities=True,
344+
from_repo=SPDX_DEFAULT_REPO,
345+
):
340346
"""
341-
Yield License objects fetched from the latest SPDX license list.
342-
Use the latest tagged version or the `commitish` is provided.
347+
Yield License objects fetched from the latest SPDX license list. Use the
348+
latest tagged version or the `commitish` if provided.
349+
If skip_oddities is True, some oddities are skipped or handled
350+
specially, such as licenses with a trailing + or foreign language
351+
licenses.
343352
"""
344353
if not commitish:
345354
# get latest tag
@@ -361,32 +370,40 @@ def fetch_licenses(self, scancode_licenses, commitish=None, from_repo=SPDX_DEFAU
361370
and ('/json/details/' in path or '/json/exceptions/' in path)):
362371
continue
363372
if TRACE_FETCH: print('Loading license:', path)
364-
if path.endswith('+.json'):
373+
if skip_oddities and path.endswith('+.json'):
365374
# Skip the old plus licenses. We use them in
366375
# ScanCode, but they are deprecated in SPDX.
367376
continue
368377
details = json.loads(archive.read(path))
369-
lic = self.build_license(details, scancode_licenses)
378+
lic = self.build_license(
379+
mapping=details,
380+
scancode_licenses=scancode_licenses,
381+
skip_oddities=skip_oddities,
382+
)
383+
370384
if lic:
371385
yield lic
372386

373-
def build_license(self, mapping, scancode_licenses):
387+
def build_license(self, mapping, skip_oddities=True, scancode_licenses=None):
374388
"""
375389
Return a ScanCode License object built from an SPDX license mapping.
390+
If skip_oddities is True, some oddities are skipped or handled
391+
specially, such as licenses with a trailing + or foreign language
392+
licenses.
376393
"""
377394
spdx_license_key = mapping.get('licenseId') or mapping.get('licenseExceptionId')
378395
assert spdx_license_key
379396
spdx_license_key = spdx_license_key.strip()
380397
key = spdx_license_key.lower()
381398

382399
# TODO: Not yet available in ScanCode
383-
is_foreign = key in scancode_licenses.non_english_by_spdx_key
384-
if is_foreign:
400+
is_foreign = scancode_licenses and key in scancode_licenses.non_english_by_spdx_key
401+
if skip_oddities and is_foreign:
385402
if TRACE: print('Skipping NON-english license FOR NOW:', key)
386403
return
387404

388405
# these keys have a complicated history
389-
if key in set([
406+
if skip_oddities and key in set([
390407
'gpl-1.0', 'gpl-2.0', 'gpl-3.0',
391408
'lgpl-2.0', 'lgpl-2.1', 'lgpl-3.0',
392409
'agpl-1.0', 'agpl-2.0', 'agpl-3.0',
@@ -399,7 +416,7 @@ def build_license(self, mapping, scancode_licenses):
399416
return
400417

401418
deprecated = mapping.get('isDeprecatedLicenseId', False)
402-
if deprecated:
419+
if skip_oddities and deprecated:
403420
# we use concrete keys for some plus/or later versions for
404421
# simplicity and override SPDX deprecation for these
405422
if key.endswith('+'):

setup.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,7 @@ norecursedirs =
225225
plugins
226226
plugins/*/tests/data
227227
plugins-builtin
228+
z-todo-*
228229

229230
python_files = *.py
230231

src/cluecode/copyrights.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1033,6 +1033,14 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
10331033
(r'^November$', 'NN'),
10341034
(r'^December$', 'NN'),
10351035

1036+
(r'^Name[\.,]?$', 'NN'),
1037+
(r'^Co-Author[\.,]?$', 'NN'),
1038+
(r'^Author\'s$', 'NN'),
1039+
(r'^Co-Author\'s$', 'NN'),
1040+
# the Universal Copyright Convention (1971 Paris text).
1041+
(r'^Convention[\.,]?$', 'NN'),
1042+
(r'^Paris[\.,]?$', 'NN'),
1043+
10361044
# we do not include Jan and Jun that are common enough first names
10371045
(r'^(Feb|Mar|Apr|May|Jul|Aug|Sep|Oct|Nov|Dec)$', 'NN'),
10381046
(r'^(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)$', 'NN'),
@@ -2590,6 +2598,14 @@ def refine_names(s, prefixes):
25902598
'copyright (c) <holders>',
25912599
'copyright (c) , and others',
25922600
'copyright from license',
2601+
'and/or the universal copyright convention 1971',
2602+
'universal copyright convention',
2603+
'copyright 2005 m. y. name',
2604+
'copyright 2005 m. y.',
2605+
'copyright 2003 m. y. name',
2606+
'copyright 2003 m. y.',
2607+
'copyright 2001 m. y. name',
2608+
'copyright 2001 m. y.',
25932609
])
25942610

25952611
################################################################################

0 commit comments

Comments
 (0)