Skip to content

Commit 5d33b05

Browse files
authored
Merge pull request #2527 from nexB/2525-copyright
Correct problematic regexes #2525
2 parents e34bc1e + 9de1b6b commit 5d33b05

File tree

26 files changed

+498
-325
lines changed

26 files changed

+498
-325
lines changed

etc/scripts/licenses/buildrules.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -235,8 +235,8 @@ def cli(licenses_file):
235235
print('Skipping already added rule with text for:', base_name)
236236
else:
237237
rules_tokens.add(rule_tokens)
238-
rulerec.dump()
239238
models.update_ignorables(rulerec, verbose=False)
239+
rulerec.dump()
240240
print('Rule added:', 'file://' + rulerec.data_file, '\n', 'file://' + rulerec.text_file,)
241241

242242

src/cluecode/copyrights.py

Lines changed: 40 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,15 @@
2525
if os.environ.get('SCANCODE_DEBUG_COPYRIGHT_DEEP'):
2626
TRACE_DEEP = 1
2727

28+
TRACE_TOK = False or os.environ.get('SCANCODE_DEBUG_COPYRIGHT_TOKEN', False)
29+
2830

2931
# Tracing flags
3032
def logger_debug(*args):
3133
pass
3234

3335

34-
if TRACE or TRACE_DEEP:
36+
if TRACE or TRACE_DEEP or TRACE_TOK:
3537
import logging
3638

3739
logger = logging.getLogger(__name__)
@@ -98,7 +100,18 @@ def detect_copyrights_from_lines(numbered_lines, copyrights=True, holders=True,
98100
"""
99101
detector = CopyrightDetector()
100102

101-
for candidates in candidate_lines(numbered_lines):
103+
candidate_lines_groups = candidate_lines(numbered_lines)
104+
if TRACE:
105+
candidate_lines_groups = list(candidate_lines_groups)
106+
logger_debug(
107+
f'detect_copyrights_from_lines: ALL groups of candidate '
108+
f'lines collected: {len(candidate_lines_groups)}')
109+
110+
for candidates in candidate_lines_groups:
111+
if TRACE:
112+
from pprint import pformat
113+
can = pformat(candidates, width=160)
114+
logger_debug(f' detect_copyrights_from_lines: processing candidates group:\n{can}')
102115

103116
detections = detector.detect(
104117
numbered_lines=candidates,
@@ -109,13 +122,17 @@ def detect_copyrights_from_lines(numbered_lines, copyrights=True, holders=True,
109122
include_allrights=include_allrights
110123
)
111124

125+
if TRACE:
126+
detections = list(detections)
127+
logger_debug(f' detect_copyrights_from_lines: {detections}')
128+
112129
for detection in detections:
113130
# tuple of type, string, start, end
114131
yield detection
132+
115133
if time() > deadline:
116134
break
117135

118-
119136
################################################################################
120137
# DETECTION PROPER
121138
################################################################################
@@ -149,8 +166,15 @@ def detect(self, numbered_lines,
149166
numbered_lines = list(numbered_lines)
150167
start_line = numbered_lines[0][0]
151168
end_line = numbered_lines[-1][0]
169+
170+
if TRACE: logger_debug(f'CopyrightDetector:numbered_lines: {numbered_lines}')
171+
152172
tokens = self.get_tokens(numbered_lines)
153173

174+
if TRACE:
175+
tokens = list(tokens)
176+
logger_debug(f'CopyrightDetector:tokens: {tokens}')
177+
154178
if not tokens:
155179
return
156180

@@ -242,9 +266,9 @@ def get_tokens(self, numbered_lines):
242266
tokens_append = tokens.append
243267

244268
for _line_number, line in numbered_lines:
245-
if TRACE: logger_debug(' get_tokens: bare line: ' + repr(line))
269+
if TRACE_TOK: logger_debug(' get_tokens: bare line: ' + repr(line))
246270
line = prepare_text_line(line)
247-
if TRACE: logger_debug(' get_tokens:preped line: ' + repr(line))
271+
if TRACE_TOK: logger_debug(' get_tokens:preped line: ' + repr(line))
248272
for tok in splitter(line):
249273
# strip trailing single quotes and ignore empties
250274
tok = tok.strip("' ")
@@ -254,7 +278,9 @@ def get_tokens(self, numbered_lines):
254278
tok = tok.lstrip('@').strip()
255279
if tok and tok not in (':',):
256280
tokens_append(tok)
257-
if TRACE: logger_debug(' get_tokens:tokens: ' + repr(tokens))
281+
if TRACE_TOK:
282+
logger_debug(' get_tokens:tokens: ' + repr(tokens))
283+
logger_debug(' get_tokens: ALL tokens collected')
258284
return tokens
259285

260286
@classmethod
@@ -442,7 +468,7 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
442468

443469
# CamELCaseeXXX is typically JUNK such as code variable names
444470
# AzaAzaaaAz BBSDSB002923,
445-
(r'^([A-Z][a-z]+){3,}[A-Z]+[0-9]*,?$', 'JUNK'),
471+
(r'^([A-Z][a-z]+){3,20}[A-Z]+[0-9]*,?$', 'JUNK'),
446472

447473
# multiple parens (at least two (x) groups) is a sign of junk
448474
# such as in (1)(ii)(OCT
@@ -501,7 +527,7 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
501527
(r'^Last$', 'JUNK'),
502528
(r'^[Rr]eleased?$', 'JUNK'),
503529
(r'^[Cc]opyrighting$', 'JUNK'),
504-
(r'^Authori.*$', 'JUNK'),
530+
(r'^[Aa]uthori.*$', 'JUNK'),
505531
(r'^such$', 'JUNK'),
506532
(r'^[Aa]ssignments?[.,]?$', 'JUNK'),
507533
(r'^[Bb]uild$', 'JUNK'),
@@ -805,7 +831,7 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
805831
(r'^Assignment', 'NN'),
806832
(r'^Atomic$', 'NN'),
807833
(r'^Attribution$', 'NN'),
808-
(r'^Authored$', 'NN'),
834+
(r'^[Aa]uthored$', 'NN'),
809835
(r'^Baslerstr\.?$', 'NN'),
810836
(r'^BSD$', 'NN'),
811837
(r'^BUT$', 'NN'),
@@ -1446,8 +1472,8 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
14461472
############################################################################
14471473

14481474
# email start-at-end: <sebastian.classen at freenet.ag>: <EMAIL_START> <AT> <EMAIL_END>
1449-
(r'^<([a-zA-Z]+[a-zA-Z\.]){3,}$', 'EMAIL_START'),
1450-
(r'^[a-zA-Z\.]{2,}>$', 'EMAIL_END'),
1475+
(r'^<([a-zA-Z]+[a-zA-Z\.]){2,5}$', 'EMAIL_START'),
1476+
(r'^[a-zA-Z\.]{2,5}>$', 'EMAIL_END'),
14511477

14521478
# a .sh shell script is NOT an email.
14531479
(r'^.*\.sh\.?$', 'JUNK'),
@@ -3037,9 +3063,9 @@ def is_end_of_statement(chars_only_line):
30373063

30383064
def candidate_lines(numbered_lines):
30393065
"""
3040-
Yield lists of candidate lines where each list element is a tuple of
3041-
(line number, line text) given an iterable of numbered_lines as tuples of
3042-
(line number, line text) .
3066+
Yield groups of candidate lines as list where each list element is a tuple
3067+
of (line number, line text) given an iterable of numbered_lines as tuples
3068+
of (line number, line text).
30433069
30443070
A candidate line is a line of text that may contain copyright statements.
30453071
A few lines before and after a candidate line are also included.
@@ -3082,7 +3108,6 @@ def candidate_lines(numbered_lines):
30823108
previous_chars = chars_only
30833109
if TRACE: logger_debug(' candidate_lines: line is candidate')
30843110

3085-
30863111
elif 's>' in line:
30873112
# this is for debian-style <s></s> copyright name tags
30883113
# the state is now "in copyright"

src/cluecode/plugin_filter_clues.py

Lines changed: 22 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,8 @@ def logger_debug(*args):
3838
@post_scan_impl
3939
class RedundantCluesFilter(PostScanPlugin):
4040
"""
41-
Filter redundant clues (copyrights, authors, emails, and urls) that are already
42-
contained in another more important scan result.
41+
Filter redundant clues (copyrights, authors, emails, and urls) that are
42+
already contained in a matched license text.
4343
"""
4444
sort_order = 1
4545

@@ -275,13 +275,14 @@ def filter_ignorable_clues(detections, rules_by_id):
275275

276276
def filter_values(attributes, ignorables, value_key='value', strip=''):
277277
"""
278-
Yield filtered `attributes` based on line positions and values found in a
279-
ignorables.
278+
Yield filtered ``attributes`` based on line positions and values found in a
279+
``ignorables`` Ignorables object. Use the ``value_key`` key for getting the
280+
value.
280281
281282
`attributes` is a list of mappings that contain a `start_line`, `end_line`
282283
and a `value_key` key.
283284
284-
Optionally strip `strip` from the the values.
285+
Optionally strip the ``strip`` characters from the values.
285286
"""
286287
for item in attributes:
287288
if TRACE:
@@ -296,7 +297,7 @@ def filter_values(attributes, ignorables, value_key='value', strip=''):
296297

297298
for ign in ignorables:
298299
if TRACE: logger_debug(' filter_values: ign:', ign)
299-
if (ls in ign.lines_range or el in ign.lines_range) and val in ign.value:
300+
if (ls in ign.lines_range or el in ign.lines_range) and val in ign.value:
300301
ignored = True
301302
if TRACE: logger_debug(' filter_values: skipped')
302303
break
@@ -307,13 +308,12 @@ def filter_values(attributes, ignorables, value_key='value', strip=''):
307308

308309
def collect_ignorables(license_matches, rules_by_id):
309310
"""
310-
Collect and return an ignorable Clues object built from `license_matches`
311-
matched licenses which is the list of "licenses" objects returned in JSON
312-
results.
311+
Collect and return an Ignorables object built from ``license_matches``
312+
matched licenses list of "licenses" objects returned in ScanCode JSON
313+
results and the ``rules_by_id`` mapping of Rule objects by identifier.
313314
314-
The value of each ignorable list of clues is a set of (set of
315-
lines number, set of ignorable values). The return values is a mapping
316-
{label: ignorables}.
315+
The value of each ignorable list of clues is a set of (set of line numbers,
316+
set of ignorable values).
317317
"""
318318
emails = set()
319319
urls = set()
@@ -323,16 +323,17 @@ def collect_ignorables(license_matches, rules_by_id):
323323

324324
if not license_matches:
325325
return Ignorables(
326-
copyrights=frozenset(copyrights),
327-
holders=frozenset(holders),
328-
authors=frozenset(authors),
329-
urls=frozenset(urls),
330-
emails=frozenset(emails),
331-
)
326+
copyrights=frozenset(copyrights),
327+
holders=frozenset(holders),
328+
authors=frozenset(authors),
329+
urls=frozenset(urls),
330+
emails=frozenset(emails),
331+
)
332332
# build tuple of (set of line numbers, set of ignorable values)
333333
for lic in license_matches:
334334

335-
if TRACE: logger_debug('collect_ignorables: license:', lic['key'], lic['score'])
335+
if TRACE:
336+
logger_debug('collect_ignorables: license:', lic['key'], lic['score'])
336337

337338
matched_rule = lic.get('matched_rule', {})
338339
rid = matched_rule.get('identifier')
@@ -341,7 +342,8 @@ def collect_ignorables(license_matches, rules_by_id):
341342
# ignore poor partial matches
342343
# TODO: there must be a better way using coverage
343344
if match_coverage < 90:
344-
if TRACE: logger_debug(' collect_ignorables: skipping, match_coverage under 90%')
345+
if TRACE:
346+
logger_debug(' collect_ignorables: skipping, match_coverage under 90%')
345347
continue
346348

347349
if not rid:

src/licensedcode/data/licenses/bitstream.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,4 @@ spdx_license_key: LicenseRef-scancode-bitstream
88
text_urls:
99
- http://www.gnome.org/fonts/#Final_Bitstream_Vera_Fonts
1010
minimum_coverage: 50
11-
ignorable_authors:
12-
- authorization from the Gnome Foundation or Bitstream Inc.
11+

src/licensedcode/data/licenses/xfree86-1.0.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,4 @@ spdx_license_key: LicenseRef-scancode-xfree86-1.0
88
text_urls:
99
- http://www.xfree86.org/current/LICENSE5.html
1010
minimum_coverage: 80
11-
ignorable_authors:
12-
- authorization from the XFree86 Project
11+

src/licensedcode/data/licenses/xfree86-1.1.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,5 @@ faq_url: http://www.xfree86.org/current/LICENSE4.html
1111
minimum_coverage: 70
1212
ignorable_authors:
1313
- The XFree86 Project, Inc (http://www.xfree86.org/)
14-
- authorization from The XFree86 Project, Inc.
1514
ignorable_urls:
1615
- http://www.xfree86.org/

src/licensedcode/data/rules/other-permissive_181.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,5 @@ notes: Seen in https://github.com/apple-opensource-mirror/X11/blob/43e6e6c55db66
55
This is a mix of Apache-1.1 and xfree86-1.1
66
ignorable_authors:
77
- X-Oz Technologies (http://www.x-oz.com/)
8-
- authorization from X-Oz Technologies
98
ignorable_urls:
109
- http://www.x-oz.com/
Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
license_expression: x11-opengroup
22
is_license_text: yes
33
minimum_coverage: 50
4-
ignorable_authors:
5-
- authorization from The Open Group
4+

src/licensedcode/data/rules/x11-opengroup_2.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,4 @@ is_license_text: yes
33
relevance: 90
44
minimum_coverage: 90
55
notes: truncated text
6-
ignorable_authors:
7-
- authorization from The Open Group
6+
Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
license_expression: x11-opengroup
22
is_license_text: yes
33
relevance: 100
4-
ignorable_authors:
5-
- authorization from The Open Group
4+

0 commit comments

Comments
 (0)