Skip to content

Commit 132884d

Browse files
Merge pull request #3345 from nexB/fix-unknown-license-detection
Fix unknown license detection
2 parents 133b411 + 9a976f0 commit 132884d

File tree

10 files changed

+318
-13
lines changed

10 files changed

+318
-13
lines changed

CHANGELOG.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,12 @@ License detection:
197197

198198
See https://github.com/nexB/scancode-toolkit/issues/3219
199199

200+
- Fixed a bug in the ``--unknown-licenses`` option where
201+
a scan would crash when this option was used without the ``--matched-text``
202+
option. This now works correctly and is also better tested.
203+
204+
See https://github.com/nexB/scancode-toolkit/issues/3343
205+
200206
v31.2.5 - 2023-04-21
201207
----------------------------------
202208

src/licensedcode/match_unknown.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
TRACE = False
2424

2525
if TRACE:
26+
use_print = True
2627
import logging
2728
import sys
2829

@@ -31,6 +32,9 @@
3132
def logger_debug(*args):
3233
return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args))
3334

35+
if use_print:
36+
logger_debug = print
37+
3438
logging.basicConfig(stream=sys.stdout)
3539
logger.setLevel(logging.DEBUG)
3640

@@ -142,22 +146,25 @@ def match_unknowns(
142146
unknown_ngram_length=unknown_ngram_length,
143147
)
144148

149+
# build match from merged matched ngrams
150+
qspans = (Span(qstart, qend) for qstart, qend in matched_ngrams)
151+
qspan = Span().union(*qspans)
152+
145153
if TRACE:
146154
tokens_by_tid = idx.tokens_by_tid
147155

148156
def get_tokens(_toks):
149157
return (' '.join(tokens_by_tid[t] for t in _toks))
150158

151159
print('match_unknowns: matched_ngrams')
152-
for qstart, qend, matched_toks in matched_ngrams:
160+
161+
for qstart, qend in matched_ngrams:
162+
_span = Span(qstart, qend)
163+
_tokens = [query_tokens[qpos] for qpos in _span]
153164
print(
154165
' ', 'qstart', qstart,
155166
'qend', qend,
156-
'matched_toks', get_tokens(matched_toks))
157-
158-
# build match from merged matched ngrams
159-
qspans = (Span(qstart, qend) for qstart, qend in matched_ngrams)
160-
qspan = Span().union(*qspans)
167+
'matched_toks', get_tokens(_tokens))
161168

162169
if not qspan:
163170
return
@@ -169,7 +176,8 @@ def get_tokens(_toks):
169176
match_len = len(qspan)
170177

171178
if TRACE:
172-
print('match_unknowns: matched_span:', get_tokens(matched_tokens))
179+
#print('match_unknowns: matched_span:', get_tokens(matched_tokens))
180+
print('match_unknowns: qspan, match_len, matched_span:', qspan, match_len, matched_tokens)
173181

174182
# we use the query side to build the ispans
175183
ispan = Span(0, match_len)
@@ -180,9 +188,8 @@ def get_tokens(_toks):
180188
try:
181189
match_start_line = line_by_pos[qspan.start]
182190
match_end_line = line_by_pos[qspan.end]
183-
except:
184-
print('empty span:', qspan)
185-
raise
191+
except Exception as e:
192+
raise Exception('empty span:', qspan) from e
186193

187194
text = ''.join(get_full_qspan_matched_text(
188195
match_qspan=qspan,

src/licensedcode/models.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2542,14 +2542,12 @@ def __attrs_post_init__(self, *args, **kwargs):
25422542
self.identifier = f'license-detection-unknown-{self._unique_id}'
25432543

25442544
self.license_expression = UNKNOWN_LICENSE_KEY
2545-
# note that this could be shared across rules as an optimization
2545+
# TODO: this could be shared across rules as an optimization
25462546
self.license_expression_object = self.licensing.parse(UNKNOWN_LICENSE_KEY)
25472547
self.is_license_notice = True
25482548
self.notes = 'Unknown license based on a composite of license words.'
25492549
self.is_synthetic = True
25502550
self.setup()
2551-
# called only for its side effects
2552-
self.tokens()
25532551

25542552

25552553
@attr.s(slots=True, repr=False)
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
---
2+
key: apache-2.0
3+
short_name: Apache 2.0
4+
name: Apache License 2.0
5+
category: Permissive
6+
owner: Apache Software Foundation
7+
homepage_url: http://www.apache.org/licenses/
8+
spdx_license_key: Apache-2.0
9+
---
10+
11+
7. Disclaimer of Warranty. Unless required by applicable law or
12+
agreed to in writing, Licensor provides the Work (and each
13+
Contributor provides its Contributions) on an "AS IS" BASIS,
14+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
15+
implied, including, without limitation, any warranties or conditions
16+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
17+
PARTICULAR PURPOSE. You are solely responsible for determining the
18+
appropriateness of using or redistributing the Work and assume any
19+
risks associated with Your exercise of permissions under this License.
20+
21+
22+
Licensed under the Apache License, Version 2.0 (the "License");
23+
you may not use this file except in compliance with the License.
24+
You may obtain a copy of the License at
25+
26+
http://www.apache.org/licenses/LICENSE-2.0
27+
28+
Unless required by applicable law or agreed to in writing, software
29+
distributed under the License is distributed on an "AS IS" BASIS,
30+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
31+
See the License for the specific language governing permissions and
32+
limitations under the License.
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
---
2+
key: gpl-2.0-plus
3+
short_name: GPL 2.0 or later
4+
name: GNU General Public License 2.0 or later
5+
category: Copyleft
6+
owner: Free Software Foundation (FSF)
7+
homepage_url: http://www.gnu.org/licenses/old-licenses/gpl-2.0-standalone.html
8+
spdx_license_key: GPL-2.0-or-later
9+
---
10+
11+
12+
This program is distributed in the hope that it will be useful, but WITHOUT ANY
13+
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
14+
PARTICULAR PURPOSE. See the GNU General Public License for more details.
15+
16+
17+
BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
18+
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
19+
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
20+
PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
21+
OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
22+
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
23+
TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
24+
PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
25+
REPAIR OR CORRECTION.
26+
27+
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
28+
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
29+
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
30+
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
31+
OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
32+
TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
33+
YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
34+
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
35+
POSSIBILITY OF SUCH DAMAGES.
36+
37+
If you develop a new program, and you want it to be of the greatest
38+
possible use to the public, the best way to achieve this is to make it
39+
free software which everyone can redistribute and change under these terms.
40+
41+
To do so, attach the following notices to the program. It is safest
42+
to attach them to the start of each source file to most effectively
43+
convey the exclusion of warranty; and each file should have at least
44+
the "copyright" line and a pointer to where the full notice is found.
45+
46+
This program is free software; you can redistribute it and/or modify
47+
it under the terms of the GNU General Public License as published by
48+
the Free Software Foundation; either version 2 of the License, or
49+
(at your option) any later version.
50+
51+
This program is distributed in the hope that it will be useful,
52+
but WITHOUT ANY WARRANTY; without even the implied warranty of
53+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
54+
GNU General Public License for more details.
55+
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
---
2+
license_expression: gpl-2.0-plus
3+
is_license_notice: yes
4+
---
5+
6+
License:
7+
8+
This package is free software; you can redistribute it and/or modify
9+
it under the terms of the GNU General Public License as published by
10+
the Free Software Foundation; either version 2 of the License, or
11+
(at your option) any later version.
12+
13+
This package is distributed in the hope that it will be useful,
14+
but WITHOUT ANY WARRANTY; without even the implied warranty of
15+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16+
GNU General Public License for more details.
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
{
2+
"license_detections": [
3+
{
4+
"identifier": "unknown-b0897d47-1c91-9898-2364-2e4d1a34b6fd",
5+
"license_expression": "unknown",
6+
"detection_count": 1
7+
}
8+
],
9+
"files": [
10+
{
11+
"path": "unknown.txt",
12+
"type": "file",
13+
"detected_license_expression": "unknown",
14+
"detected_license_expression_spdx": "LicenseRef-scancode-unknown",
15+
"license_detections": [
16+
{
17+
"license_expression": "unknown",
18+
"matches": [
19+
{
20+
"score": 86.89,
21+
"start_line": 1,
22+
"end_line": 10,
23+
"matched_length": 53,
24+
"match_coverage": 100.0,
25+
"matcher": "6-unknown",
26+
"license_expression": "unknown",
27+
"rule_identifier": "license-detection-unknown-296da2cbc15d2bba73baa1359cda5fc8bf39b942",
28+
"rule_relevance": 100,
29+
"rule_url": null
30+
}
31+
],
32+
"identifier": "unknown-b0897d47-1c91-9898-2364-2e4d1a34b6fd"
33+
}
34+
],
35+
"license_clues": [],
36+
"percentage_of_license_text": 86.89,
37+
"scan_errors": []
38+
}
39+
]
40+
}
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
{
2+
"license_detections": [
3+
{
4+
"identifier": "unknown-b0897d47-1c91-9898-2364-2e4d1a34b6fd",
5+
"license_expression": "unknown",
6+
"detection_count": 1,
7+
"detection_log": []
8+
}
9+
],
10+
"files": [
11+
{
12+
"path": "unknown.txt",
13+
"type": "file",
14+
"detected_license_expression": "unknown",
15+
"detected_license_expression_spdx": "LicenseRef-scancode-unknown",
16+
"license_detections": [
17+
{
18+
"license_expression": "unknown",
19+
"matches": [
20+
{
21+
"score": 86.89,
22+
"start_line": 1,
23+
"end_line": 10,
24+
"matched_length": 53,
25+
"match_coverage": 100.0,
26+
"matcher": "6-unknown",
27+
"license_expression": "unknown",
28+
"rule_identifier": "license-detection-unknown-296da2cbc15d2bba73baa1359cda5fc8bf39b942",
29+
"rule_relevance": 100,
30+
"rule_url": null,
31+
"matched_text": "form shall mean the preferred form for making\nthe purposes of this definition control\n[software] [is] [modified] [by] [someone] [else]\n\n\n\nrepresent, as a whole, an original work of authorship. For the purposes\n of this License, Derivative Works shall not include works that remain\n separable from, or merely link (or bind by name) [to] [the] interfaces of,\n the Work and Derivative Works thereof."
32+
}
33+
],
34+
"detection_log": [
35+
"unknown-match"
36+
],
37+
"identifier": "unknown-b0897d47-1c91-9898-2364-2e4d1a34b6fd"
38+
}
39+
],
40+
"license_clues": [],
41+
"percentage_of_license_text": 86.89,
42+
"scan_errors": []
43+
}
44+
]
45+
}
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
form shall mean the preferred form for making
2+
the purposes of this definition control
3+
software is modified by someone else
4+
5+
6+
7+
represent, as a whole, an original work of authorship. For the purposes
8+
of this License, Derivative Works shall not include works that remain
9+
separable from, or merely link (or bind by name) to the interfaces of,
10+
the Work and Derivative Works thereof.

0 commit comments

Comments
 (0)