Skip to content

Commit 9159a27

Browse files
committed
Improve copyright detection in markup
Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent 341c9c1 commit 9159a27

File tree

16 files changed

+314
-243
lines changed

16 files changed

+314
-243
lines changed

src/cluecode/copyrights.py

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ def detect_copyrights(
107107
numbered_lines = numbered_text_lines(location, demarkup=demarkup)
108108
numbered_lines = list(numbered_lines)
109109

110-
if TRACE:
110+
if TRACE or TRACE_TOK:
111111
logger_debug('detect_copyrights: numbered_lines')
112112
for nl in numbered_lines:
113113
logger_debug(' numbered_line:', repr(nl))
@@ -167,7 +167,7 @@ def detect_copyrights_from_lines(
167167

168168
candidate_lines_groups = candidate_lines(numbered_lines)
169169

170-
if TRACE:
170+
if TRACE or TRACE_TOK:
171171
candidate_lines_groups = list(candidate_lines_groups)
172172
logger_debug(
173173
f'detect_copyrights_from_lines: ALL groups of candidate '
@@ -245,7 +245,7 @@ def detect(self,
245245
if not numbered_lines:
246246
return
247247

248-
if TRACE:
248+
if TRACE or TRACE_TOK:
249249
logger_debug(f'CopyrightDetector: numbered_lines: {numbered_lines}')
250250

251251
tokens = list(get_tokens(numbered_lines))
@@ -633,10 +633,12 @@ def build_detection_from_node(
633633
# (c)opyright and (c)opyleft, we ignore case
634634
(r'^(?i:\(c\)opy(rights?|righted|left))$', 'COPY'),
635635

636-
# opyright and opyleft, we ignore case
636+
# truncated opyright and opyleft, we ignore case
637637
(r'^(?i:opy(rights?|righted|left|lefted)[\.\,]?)$', 'COPY'),
638638
(r'^//opylefted$', 'COPY'),
639639
(r"^c'opylefted$", 'COPY'),
640+
# typo in cppyright
641+
(r'^[Cc]ppyright[\.\,]?$', 'COPY'),
640642

641643
# with a trailing comma
642644
(r'^Copyright,$', 'COPY'),
@@ -782,6 +784,7 @@ def build_detection_from_node(
782784
(r'^[Pp]rocedures?$', 'JUNK'),
783785
(r'^You$', 'JUNK'),
784786
(r'^Everyone$', 'JUNK'),
787+
(r'^[Ff]unded$', 'JUNK'),
785788
(r'^Unless$', 'JUNK'),
786789
(r'^rant$', 'JUNK'),
787790
(r'^Subject$', 'JUNK'),
@@ -1244,6 +1247,7 @@ def build_detection_from_node(
12441247
(r'^Each$', 'NN'),
12451248
(r'^Education$', 'NN'),
12461249
(r'^Extended', 'NN'),
1250+
(r'^Every$', 'NN'),
12471251
(r'^Digitized', 'NN'),
12481252
(r'^END$', 'NN'),
12491253
(r'^Entity$', 'NN'),
@@ -3490,10 +3494,14 @@ def remove_dupe_copyright_words(c):
34903494
c = c.replace('SPDX-FileCopyrightText', 'Copyright')
34913495
c = c.replace('SPDX-SnippetCopyrightText', 'Copyright')
34923496
c = c.replace('Bundle-Copyright', 'Copyright')
3493-
# from .net assemblies
3494-
c = c.replace('AssemblyCopyright', 'Copyright')
3495-
c = c.replace('AppCopyright', 'Copyright')
3496-
3497+
c = (
3498+
# from .net assemblies
3499+
c.replace('AssemblyCopyright', 'Copyright')
3500+
.replace('AppCopyright', 'Copyright')
3501+
# typos
3502+
.replace('Cppyright', 'Copyright')
3503+
.replace('cppyright', 'Copyright')
3504+
)
34973505
# various prefix to the word copyright seen in binaries
34983506
# TODO use a regex instead
34993507
c = c.replace('BCopyright', 'Copyright')
@@ -3858,6 +3866,11 @@ def candidate_lines(numbered_lines):
38583866
# used as a state and line counter
38593867
in_copyright = 0
38603868

3869+
if TRACE_TOK:
3870+
numbered_lines = list(numbered_lines)
3871+
logger_debug(
3872+
f'candidate_lines: numbered_lines: {numbered_lines!r}')
3873+
38613874
# the previous line (chars only)
38623875
previous_chars = None
38633876
for numbered_line in numbered_lines:
@@ -4024,21 +4037,29 @@ def prepare_text_line(line, dedeb=True, to_ascii=True):
40244037
40254038
If ``to_ascii`` convert the text to ASCII characters.
40264039
"""
4040+
if TRACE_TOK:
4041+
logger_debug(' prepare_text_line: initial: ' + repr(line))
4042+
40274043
# remove some junk in man pages: \(co
40284044
line = (line
40294045
.replace('\\\\ co', ' ')
40304046
.replace('\\ co', ' ')
40314047
.replace('(co ', ' ')
40324048
)
40334049
line = remove_printf_format_codes(' ', line)
4050+
if TRACE_TOK:
4051+
logger_debug(' prepare_text_line: after remove_printf_format_codes: ' + repr(line))
4052+
40344053

40354054
# less common comment line prefixes
40364055
line = remove_comment_markers(' ', line)
4056+
if TRACE_TOK:
4057+
logger_debug(' prepare_text_line: after remove_comment_markers: ' + repr(line))
40374058

40384059
line = remove_man_comment_markers(' ', line)
40394060

40404061
if TRACE_TOK:
4041-
logger_debug(' get_tokens: WIP line1: ' + repr(line))
4062+
logger_debug(' prepare_text_line: after remove_man_comment_markers: ' + repr(line))
40424063

40434064
line = (line
40444065
# C and C++ style comment markers
@@ -4103,6 +4124,10 @@ def prepare_text_line(line, dedeb=True, to_ascii=True):
41034124
.replace('`', u"'")
41044125
.replace('"', u"'")
41054126
)
4127+
4128+
if TRACE_TOK:
4129+
logger_debug(' prepare_text_line: after replacements: ' + repr(line))
4130+
41064131
# keep only one quote
41074132
line = fold_consecutive_quotes(u"'", line)
41084133

src/licensedcode/data/licenses/bsl-1.0.LICENSE

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@ other_urls:
1111
- https://mariadb.com/bsl-faq-adopting
1212
- https://mariadb.com/products/mariadb-enterprise
1313
ignorable_copyrights:
14-
- (c) 2016 MariaDB Corporation
14+
- (c) 2016 MariaDB Corporation Ab
1515
ignorable_holders:
16-
- MariaDB Corporation
16+
- MariaDB Corporation Ab
1717
ignorable_urls:
1818
- https://mariadb.com/products/mariadb-enterprise
1919
---

src/licensedcode/data/rules/other-permissive_341.RULE

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@ license_expression: other-permissive
33
is_license_text: yes
44
notes: https://github.com/DataJuggler/DataTier.Net/blob/cdd3493436b4c081db19829e7db3bed9389bf4b5/DataTier.Net/ProjectTemplates/DataTier.Net6.DataTemplate/Working/templates/DataTier.Net6.ClassLibrary/License/License.txt
55
ignorable_copyrights:
6-
- Copyright (c) 2022 - Data Juggler. Do
6+
- Copyright (c) 2022 - Data Juggler
77
ignorable_holders:
8-
- Data Juggler. Do
8+
- Data Juggler
99
---
1010

1111
DataJuggler Do What You Want License

src/textcode/markup.py

Lines changed: 36 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
#
1010

1111
from collections import Counter
12-
import logging
1312
import os
1413
import re
1514

@@ -20,7 +19,25 @@
2019
Extract text from HTML, XML and related angular markup-like files.
2120
"""
2221

23-
logger = logging.getLogger(__name__)
22+
# Tracing flags
23+
TRACE = False or os.environ.get('SCANCODE_DEBUG_TEXT_ANALYSIS', False)
24+
25+
26+
# No-op debug logger: replaced below by a real logging function when TRACE is set
27+
def logger_debug(*args):
28+
pass
29+
30+
31+
if TRACE:
32+
import logging
33+
import sys
34+
35+
logger = logging.getLogger(__name__)
36+
logging.basicConfig(stream=sys.stdout)
37+
logger.setLevel(logging.DEBUG)
38+
39+
def logger_debug(*args):
40+
return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args))
2441

2542
bin_dir = os.path.join(os.path.dirname(__file__), 'bin')
2643

@@ -46,7 +63,7 @@ def is_markup(location):
4663

4764
with open(location, 'rb') as f:
4865
start = as_unicode(f.read(1024))
49-
66+
5067
return is_markup_text(start)
5168

5269

@@ -86,9 +103,14 @@ def demarkup(location):
86103
from textcode.analysis import unicode_text_lines
87104

88105
for line in unicode_text_lines(location):
106+
if TRACE:
107+
logger_debug(f'demarkup: {line} : demarked: {demarkup_text(line)}')
89108
yield demarkup_text(line)
90109

91110

111+
get_tags_and_entities = re.compile(r'(</?[^\s></]+(?:>|\s)?|&[^\s&]+;|href|[\'"]?\/\>)', re.IGNORECASE).split
112+
113+
92114
def demarkup_text(text):
93115
"""
94116
Return text lightly stripped from markup. The whitespaces are collapsed to
@@ -100,18 +122,22 @@ def demarkup_text(text):
100122
kept_tags = (
101123
'lic', 'copy', 'www', 'http', 'auth', 'contr', 'leg', 'inc', '@',
102124
'<s>', '</s>', '169', 'a9'
103-
)
125+
)
104126

105127
# find start and closing tags or the first white space whichever comes first
106-
# or entities
107-
# this regex is such that ' '.join(tags.split(a))==a
128+
# or entities. The split pattern is a single capturing group so separators are kept: ''.join(get_tags_and_entities(a)) == a
108129

109-
tags_ents = re.compile(r'(</?[^\s></]+(?:>|\s)?|&[^\s&]+;|href|[\'"]?\/\>)', re.IGNORECASE).split
130+
tags_and_ents = get_tags_and_entities(text)
131+
if TRACE:
132+
logger_debug(f'demarkup_text: {text!r}')
133+
logger_debug(f'demarkup_text: tags_and_ents: {tags_and_ents}')
110134

111135
cleaned = []
112-
for token in tags_ents(text):
113-
if token.lower().startswith(('<', '&', 'href', '=')) and not any(k in token.lower() for k in kept_tags):
136+
cleaned_append = cleaned.append
137+
for token in tags_and_ents:
138+
tlow = token.lower()
139+
if tlow.startswith(('<', '&', 'href',)) and not any(k in tlow for k in kept_tags):
114140
continue
115141
else:
116-
cleaned.append(token)
142+
cleaned_append(token)
117143
return u' '.join(cleaned)

tests/cluecode/data/copyrights/babkin_txt.txt.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ what:
44
- holders_summary
55
copyrights:
66
- Copyright (c) North
7-
- Copyright (c) South Copyright (c)
7+
- Copyright (c) South
88
- Copyright (c) 2001 by the TTF2PT1 project
99
- Copyright (c) 2001 by Sergey Babkin
1010
holders:

tests/cluecode/data/copyrights/index.html.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ copyrights:
66
- Copyright (c) 2002-2004 James W. Newkirk, Michael C. Two, Alexei A. Vorontsov
77
- Copyright (c) 2000-2002 Philip A. Craig
88
- Copyright (c) 2003-2008, Terence Parr
9-
- Copyright (c) 1994-2011 Lua.org
9+
- Copyright (c) 1994-2011 Lua.org, PUC-Rio
1010
- Copyright (c) 2000,2001,2002,2003,2004,2006,2007 Keith Packard
1111
- Copyright (c) 2005 Patrick Lam
1212
- Copyright (c) 2009 Roozbeh Pournader
@@ -29,7 +29,7 @@ holders:
2929
- James W. Newkirk, Michael C. Two, Alexei A. Vorontsov
3030
- Philip A. Craig
3131
- Terence Parr
32-
- Lua.org
32+
- Lua.org, PUC-Rio
3333
- Keith Packard
3434
- Patrick Lam
3535
- Roozbeh Pournader

tests/cluecode/data/copyrights/index.txt.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ copyrights:
66
- Copyright (c) 2002-2004 James W. Newkirk, Michael C. Two, Alexei A. Vorontsov
77
- Copyright (c) 2000-2002 Philip A. Craig
88
- Copyright (c) 2003-2008, Terence Parr
9-
- Copyright (c) 1994-2011 Lua.org
9+
- Copyright (c) 1994-2011 Lua.org, PUC-Rio
1010
- Copyright (c) 2000,2001,2002,2003,2004,2006,2007 Keith Packard
1111
- Copyright (c) 2005 Patrick Lam
1212
- Copyright (c) 2009 Roozbeh Pournader
@@ -29,7 +29,7 @@ holders:
2929
- James W. Newkirk, Michael C. Two, Alexei A. Vorontsov
3030
- Philip A. Craig
3131
- Terence Parr
32-
- Lua.org
32+
- Lua.org, PUC-Rio
3333
- Keith Packard
3434
- Patrick Lam
3535
- Roozbeh Pournader

tests/cluecode/data/copyrights/openoffice_org_report_builder_bin.copyright.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ copyrights:
1818
- Copyright (c) 2002 France Telecom
1919
- Copyright (c) 1990-2003 Sleepycat Software
2020
- Copyright (c) 1990, 1993, 1994, 1995 The Regents of the University of California
21-
- Copyright (c) 2003 by Bitstream, Inc.
22-
- Cppyright Copyright (c) 2006 by Tavmjong Bah
21+
- Copyright (c) 2003 by Bitstream, Inc. Cppyright
22+
- Copyright (c) 2006 by Tavmjong Bah
2323
- Copyright (c) 2007 Red Hat, Inc
2424
- Copyright (c) 2007 Red Hat, Inc.
2525
- Copyright 2000-2003 Beman Dawes

tests/cluecode/data/ics/bluetooth-glib-gio-xdgmime/xdgmimealias.h.yml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,7 @@ holders:
99
- Red Hat, Inc.
1010
- Matthias Clasen
1111
holders_summary:
12-
- value: Red Hat
13-
count: 1
1412
- value: Matthias Clasen
1513
count: 1
16-
14+
- value: Red Hat
15+
count: 1

tests/cluecode/test_copyrights_basic.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -442,7 +442,7 @@ def test_copyright_lines_babkin_txt(self):
442442
test_file = self.get_test_loc('copyrights_basic/babkin_txt.txt')
443443
expected = [
444444
copyrights.CopyrightDetection('Copyright (c) North', 1, 1),
445-
copyrights.CopyrightDetection('Copyright (c) South Copyright (c)', 2, 3),
445+
copyrights.CopyrightDetection('Copyright (c) South', 2, 3),
446446
copyrights.CopyrightDetection('Copyright (c) 2001 by the TTF2PT1 project', 4, 4),
447447
copyrights.CopyrightDetection('Copyright (c) 2001 by Sergey Babkin', 5, 5),
448448
]

0 commit comments

Comments
 (0)