Skip to content

Commit 2d5b269

Browse files
committed
Improve copyright detection
Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent e71f8af commit 2d5b269

File tree

67 files changed

+382
-176
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

67 files changed

+382
-176
lines changed

src/cluecode/copyrights.py

Lines changed: 81 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -12,20 +12,22 @@
1212
import re
1313
import string
1414
import sys
15+
16+
from collections import deque
1517
from time import time
1618

1719
import attr
20+
21+
from commoncode.text import toascii
22+
from commoncode.text import unixlinesep
1823
from pygmars import lex
1924
from pygmars import parse
2025
from pygmars import Token
2126
from pygmars.tree import Tree
2227

23-
from commoncode.text import toascii
24-
from commoncode.text import unixlinesep
2528

2629
from cluecode import copyrights_hint
27-
from textcode.markup import strip_debian_markup
28-
from textcode.markup import strip_markup_text
30+
from textcode.markup import strip_known_markup_from_text
2931

3032
# Tracing flags
3133
TRACE = False or os.environ.get('SCANCODE_DEBUG_COPYRIGHT', False)
@@ -167,10 +169,10 @@ def detect_copyrights_from_lines(
167169
else:
168170
detector = DETECTOR
169171

170-
candidate_lines_groups = collect_candidate_lines(numbered_lines)
172+
candidate_lines_groups = list(collect_candidate_lines(numbered_lines))
171173

172174
if TRACE or TRACE_TOK:
173-
candidate_lines_groups = list(candidate_lines_groups)
175+
candidate_lines_groups = candidate_lines_groups
174176
logger_debug(
175177
f'detect_copyrights_from_lines: ALL groups of candidate '
176178
f'lines collected: {len(candidate_lines_groups)}',
@@ -386,6 +388,7 @@ def get_tokens(numbered_lines, splitter=re.compile(r'[\t =;]+').split):
386388
if TRACE_TOK:
387389
logger_debug(' get_tokens: bare line: ' + repr(line))
388390

391+
# keep or skip empty lines
389392
if not line.strip():
390393
stripped = last_line.lower().strip(string.punctuation)
391394
if (
@@ -398,11 +401,10 @@ def get_tokens(numbered_lines, splitter=re.compile(r'[\t =;]+').split):
398401
pos += 1
399402
last_line = ""
400403
continue
404+
401405
if TRACE_TOK:
402406
logger_debug(' get_tokens: before preped line: ' + repr(line))
403407

404-
# line = prepare_text_line(line)
405-
406408
last_line = line
407409

408410
if TRACE_TOK:
@@ -801,6 +803,9 @@ def build_detection_from_node(
801803
(r'^Earth$', 'NN'),
802804
(r'^Maps/Google$', 'NN'),
803805

806+
# verbatim star
807+
(r'^\*$', 'JUNK'),
808+
804809
(r'^([A-Z][a-z]+){3,}$', 'JUNK'),
805810

806811
############################################################################
@@ -919,6 +924,8 @@ def build_detection_from_node(
919924
(r'^WARRANTS?$', 'JUNK'),
920925
(r'^WARRANTYS?$', 'JUNK'),
921926

927+
(r'^Row\(', 'JUNK'),
928+
922929
(r'^hispagestyle$', 'JUNK'),
923930
(r'^Generic$', 'JUNK'),
924931
(r'^generate-', 'JUNK'),
@@ -1890,6 +1897,8 @@ def build_detection_from_node(
18901897
(r'^(SPRL|srl)[\.,]?$', 'COMP'),
18911898
# Poland
18921899
(r'^(sp\.|o\.o\.)$', 'COMP'),
1900+
# Germany: eingetragener Kaufmann (registered merchant)
1901+
(r'^(e\.K\.|e\.Kfm\.|e\.Kfr\.)$', 'COMP'),
18931902

18941903
# company suffix : AS: this is frequent beyond Norway.
18951904
(r'^AS', 'CAPS'),
@@ -2952,6 +2961,10 @@ def build_detection_from_node(
29522961
# Copyright (C) 1999-2000 VA Linux Systems
29532962
COPYRIGHT: {<COPY> <COPY> <YR-RANGE> <CAPS> <NN|LINUX> <NNP>} #2280-1
29542963
2964+
# Russ Dill <[email protected]> 2001-2003
2965+
# Rewrited by Vladimir Oleynik <[email protected]> (C) 2003
2966+
COPYRIGHT: {<NAME-EMAIL> <YR-RANGE> <AUTH2> <BY> <NAME-EMAIL> <COPY> <YR-RANGE>} #22793.5
2967+
29552968
COPYRIGHT2: {<COPY>+ <NN|CAPS>? <YR-RANGE>+ <PN>*} #2280
29562969
29572970
# using #2280 above: Copyright 2018 Developers of the Rand project
@@ -3241,10 +3254,6 @@ def build_detection_from_node(
32413254
# 1995-2003 by Internet Software Consortium
32423255
COPYRIGHT: {<COPYRIGHT> <NN> <YR-RANGE> <BY> <COMPANY> } #1615
32433256
3244-
# Russ Dill <[email protected]> 2001-2003
3245-
# Rewrited by Vladimir Oleynik <[email protected]> (C) 2003
3246-
COPYRIGHT: {<NAME-EMAIL> <YR-RANGE> <AUTH2> <BY> <NAME-EMAIL> <COPY> <YR-RANGE>} #22793.5
3247-
32483257
# portions copyright The Internet Society, Tom Tromey and Red Hat, Inc.
32493258
COPYRIGHT: {<PORTIONS> <COPY> <NN> <NAME>} #157998
32503259
@@ -3647,6 +3656,7 @@ def refine_names(s, prefixes):
36473656
r'Copyright \(c\) 2021 Dot',
36483657
r'^\(c\) \(c\) B$',
36493658
r'^\(c\) group$',
3659+
r'^\(c\) \(c\) A$',
36503660
]
36513661

36523662
# a collection of junk matcher callables
@@ -4175,7 +4185,7 @@ def is_end_of_statement(chars_only_line):
41754185
)
41764186

41774187

4178-
remove_non_chars = re.compile(r'[^a-z0-9]').sub
4188+
remove_non_chars = re.compile(r'[^a-z0-9]', re.IGNORECASE).sub
41794189

41804190
has_trailing_year = re.compile(r'(?:19\d\d|20[0-4]\d)+$').findall
41814191

@@ -4189,8 +4199,9 @@ def collect_candidate_lines(numbered_lines):
41894199
A candidate line is a line of text that may contain copyright statements.
41904200
A few lines before and after a candidate line are also included.
41914201
"""
4192-
candidates = []
4202+
candidates = deque()
41934203
candidates_append = candidates.append
4204+
candidates_clear = candidates.clear
41944205

41954206
# used as a state and line counter
41964207
in_copyright = 0
@@ -4216,10 +4227,10 @@ def collect_candidate_lines(numbered_lines):
42164227
candidates_append((ln, prepared,))
42174228

42184229
if TRACE:
4219-
logger_debug(f' collect_candidate_lines: is EOS: yielding candidates\n {candidates!r}\n')
4230+
logger_debug(f' collect_candidate_lines: is EOS: yielding candidates\n {list(candidates)!r}\n')
42204231

4221-
yield candidates
4222-
candidates = []
4232+
yield list(candidates)
4233+
candidates_clear()
42234234
in_copyright = 0
42244235
previous_chars = None
42254236

@@ -4253,35 +4264,35 @@ def collect_candidate_lines(numbered_lines):
42534264
):
42544265

42554266
if TRACE:
4256-
logger_debug(f' collect_candidate_lines: empty: yielding candidates\n {candidates!r}\n')
4267+
logger_debug(f' collect_candidate_lines: empty: yielding candidates\n {list(candidates)!r}\n')
42574268

4258-
yield candidates
4259-
candidates = []
4269+
yield list(candidates)
4270+
candidates_clear()
42604271
in_copyright = 0
42614272
previous_chars = None
42624273

42634274
else:
4264-
candidates_append((ln, line,))
4275+
candidates_append((ln, prepared,))
42654276
# and decrement our state
42664277
in_copyright -= 1
42674278
if TRACE:
42684279
logger_debug(' collect_candidate_lines: line is in copyright')
42694280

42704281
elif candidates:
42714282
if TRACE:
4272-
logger_debug(f' collect_candidate_lines: not in COP: yielding candidates\n {candidates!r}\n')
4283+
logger_debug(f' collect_candidate_lines: not in COP: yielding candidates\n {list(candidates)!r}\n')
42734284

4274-
yield candidates
4275-
candidates = []
4285+
yield list(candidates)
4286+
candidates_clear()
42764287
in_copyright = 0
42774288
previous_chars = None
42784289

42794290
# finally
42804291
if candidates:
42814292
if TRACE:
4282-
logger_debug(f'collect_candidate_lines: finally yielding candidates\n {candidates!r}\n')
4293+
logger_debug(f'collect_candidate_lines: finally yielding candidates\n {list(candidates)!r}\n')
42834294

4284-
yield candidates
4295+
yield list(candidates)
42854296

42864297
################################################################################
42874298
# TEXT PRE PROCESSING
@@ -4299,12 +4310,29 @@ def collect_candidate_lines(numbered_lines):
42994310

43004311
# less common rem comment line prefix in dos
43014312
# less common dnl comment line prefix in autotools am/in
4302-
remove_comment_markers = re.compile(r'^(rem|\@rem|dnl)\s+').sub
4313+
remove_weird_comment_markers = re.compile(r'^(rem|\@rem|dnl)\s+').sub
43034314

43044315
# common comment line prefix in man pages
43054316
remove_man_comment_markers = re.compile(r'\."').sub
43064317

43074318

4319+
def remove_code_comment_markers(s):
    r"""
    Return the string ``s`` with common code comment markers removed.

    Every ``/*``, ``*/``, ``*``, ``#`` and ``%`` found anywhere in ``s`` is
    replaced by a single space. Then spaces, backslashes, slashes, stars,
    hashes, percent signs and semicolons are stripped from both ends of the
    result. A backslash, slash or semicolon located inside the text is kept
    as-is: only the ends are stripped of these.

    This docstring is a raw string so that the backslashes in the examples
    below reach doctest unchanged.

    >>> remove_code_comment_markers(r"\*#%; /\/*a*/b/*c\d#e%f \*#%; /")
    'a b c\\d e f'
    >>> remove_code_comment_markers("/* Copyright (c) 2024 Foo; Bar */")
    'Copyright (c) 2024 Foo; Bar'
    >>> remove_code_comment_markers("")
    ''
    """
    return (s
        # two-character C-style delimiters first, before the lone stars
        .replace('/*', ' ')
        .replace('*/', ' ')
        # then single-character markers anywhere in the text
        .replace('*', ' ')
        .replace('#', ' ')
        .replace('%', ' ')
        # finally trim leftover markers and spaces at both ends only
        .strip(' \\/*#%;')
    )
4334+
4335+
43084336
def prepare_text_line(line):
43094337
"""
43104338
Prepare a text ``line`` for copyright detection.
@@ -4324,19 +4352,20 @@ def prepare_text_line(line):
43244352
logger_debug(' prepare_text_line: after remove_printf_format_codes: ' + repr(line))
43254353

43264354
# less common comment line prefixes
4327-
line = remove_comment_markers(' ', line)
4355+
line = remove_weird_comment_markers(' ', line)
43284356
if TRACE_TOK:
4329-
logger_debug(' prepare_text_line: after remove_comment_markers: ' + repr(line))
4357+
logger_debug(' prepare_text_line: after remove_weird_comment_markers: ' + repr(line))
43304358

43314359
line = remove_man_comment_markers(' ', line)
4332-
43334360
if TRACE_TOK:
43344361
logger_debug(' prepare_text_line: after remove_man_comment_markers: ' + repr(line))
43354362

4363+
line = remove_code_comment_markers(line)
4364+
if TRACE_TOK:
4365+
logger_debug(' prepare_text_line: after remove_code_comment_markers: ' + repr(line))
4366+
43364367
line = (line
43374368
# C and C++ style comment markers
4338-
.replace('/*', ' ').replace('*/', ' ')
4339-
.strip().strip('/*#')
43404369
# in rst
43414370
.replace('|copy|', ' (c) ')
43424371
# uncommon pipe chars in some ASCII art
@@ -4368,6 +4397,11 @@ def prepare_text_line(line):
43684397
.replace('\\XA9', ' (c) ')
43694398
.replace('\\A9', ' (c) ')
43704399
.replace('\\a9', ' (c) ')
4400+
.replace('<A9>', ' (c) ')
4401+
.replace('XA9;', ' (c) ')
4402+
.replace('Xa9;', ' (c) ')
4403+
.replace('xA9;', ' (c) ')
4404+
.replace('xa9;', ' (c) ')
43714405
# \xc2 is a Â
43724406
.replace('\xc2', '')
43734407
.replace('\\xc2', '')
@@ -4393,18 +4427,22 @@ def prepare_text_line(line):
43934427
.replace('&amp;', '&')
43944428
.replace('&#38;', '&')
43954429
.replace('&gt;', '>')
4430+
.replace('&gt', '>')
43964431
.replace('&#62;', '>')
43974432
.replace('&lt;', '<')
4433+
.replace('&lt', '<')
43984434
.replace('&#60;', '<')
43994435

44004436
# normalize (possibly repeated) quotes to unique single quote '
44014437
# backticks ` and "
44024438
.replace('`', "'")
44034439
.replace('"', "'")
4404-
# u nicode prefix in Python strings
4440+
# u unicode prefix in legacy Python2 strings
44054441
.replace(" u'", " '")
44064442
# see https://github.com/nexB/scancode-toolkit/issues/3667
44074443
.replace('§', " ")
4444+
# keep http
4445+
.replace('<http', " http")
44084446
)
44094447

44104448
if TRACE_TOK:
@@ -4427,15 +4465,18 @@ def prepare_text_line(line):
44274465
# replace ('
44284466
.replace('("', ' ')
44294467
# some trailing garbage ')
4430-
.replace(u"')", ' ')
4431-
.replace(u"],", ' ')
4468+
.replace("')", ' ')
4469+
.replace("],", ' ')
44324470
)
44334471
if TRACE_TOK:
44344472
logger_debug(' prepare_text_line: after replacements2: ' + repr(line))
44354473

4436-
line = strip_markup_text(line)
44374474
# note that we do not replace the debian tag by a space: we remove it
4438-
line = strip_debian_markup(line)
4475+
# These are the legacy "Debian" copyright file <s> </s> markup tags seen in
4476+
# older copyright files. Note we replace by nothing.
4477+
line = line.replace("</s>", "").replace("<s>", "").replace("<s/>", "")
4478+
4479+
line = strip_known_markup_from_text(line)
44394480

44404481
if TRACE_TOK:
44414482
logger_debug(' prepare_text_line: after strip_markup: ' + repr(line))
@@ -4455,14 +4496,13 @@ def prepare_text_line(line):
44554496
# normalize to ascii text
44564497
line = toascii(line, translit=True)
44574498

4499+
# remove stars
4500+
line = line.strip(' *')
4501+
44584502
# normalize to use only LF as line endings so we can split correctly
44594503
# and keep line endings
44604504
line = unixlinesep(line)
44614505

4462-
# strip verbatim back slash and comment signs again at both ends of a line
4463-
# FIXME: this is done at the start of this function already
4464-
line = line.strip('\\/*#%;')
4465-
44664506
# normalize spaces
44674507
line = ' '.join(line.split())
44684508

tests/cluecode/data/authors/author_russ_c-c.c.yml

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,8 @@ what:
44
- copyrights
55
- holders
66
copyrights:
7-
- Russ Dill <[email protected]> 2001-2003
8-
- Vladimir Oleynik <[email protected]> (c) 2003
7+
- Russ Dill <[email protected]> 2001-2003 Rewrited by Vladimir Oleynik <[email protected]> (c)
8+
2003
99
holders:
10-
- Russ Dill
11-
- Vladimir Oleynik <[email protected]>
10+
- Russ Dill Rewrited by Vladimir Oleynik
1211
notes: these are detected as copyrights, not authors

tests/cluecode/data/authors/hdp.c.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@ what:
33
- authors_summary
44
- copyrights
55
copyrights:
6-
- Copyright (c) 2010 GSyC/LibreSoft, Universidad Rey Juan Carlos. Authors Santiago Carot Nemesio at gmail.com
6+
- Copyright (c) 2010 GSyC/LibreSoft, Universidad Rey Juan Carlos. Authors Santiago Carot Nemesio

tests/cluecode/data/copyright_fossology/testdata118.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,4 +39,4 @@ holders:
3939
- Guy Eric Schalnat, Group 42, Inc.
4040
- Jean-loup Gailly and Mark Adler
4141
authors:
42-
- artofcode LLC. <http://artofcode.com/>
42+
- artofcode LLC. http://artofcode.com

tests/cluecode/data/copyright_fossology/testdata119.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,4 +39,4 @@ holders:
3939
- Guy Eric Schalnat, Group 42, Inc.
4040
- Jean-loup Gailly and Mark Adler
4141
authors:
42-
- artofcode LLC. <http://artofcode.com/>
42+
- artofcode LLC. http://artofcode.com

tests/cluecode/data/copyright_fossology/testdata128.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,4 +39,4 @@ holders:
3939
- Guy Eric Schalnat, Group 42, Inc.
4040
- Jean-loup Gailly and Mark Adler
4141
authors:
42-
- artofcode LLC. <http://artofcode.com/>
42+
- artofcode LLC. http://artofcode.com

tests/cluecode/data/copyright_fossology/testdata14.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ what:
44
- authors
55
copyrights:
66
- Copyright (c) 1999-2006 Trolltech ASA, Norway
7-
- Copyright (c) 2006 trolltech.html' Trolltech
7+
- Copyright (c) 2006 trolltech.html Trolltech
88
holders:
99
- Trolltech ASA, Norway
10-
- trolltech.html' Trolltech
10+
- trolltech.html Trolltech

0 commit comments

Comments
 (0)