Skip to content

Commit 6ae938a

Browse files
committed
Improve copyright detection
* handle edge cases such as some words with HTML < or > #110
* improve some university and company name detection such as subsidiaries #110
* do not detect certain authors as copyrights
Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent 36efea3 commit 6ae938a

File tree

3 files changed

+126
-149
lines changed

3 files changed

+126
-149
lines changed

src/cluecode/copyrights.py

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ def detect(location):
108108
')')
109109

110110
_YEAR_YEAR = (r'('
111-
# fixme v ....the underscore below is suspicious
111+
# fixme v ....the underscore below is suspicious
112112
'19[6-9][0-9][\.,\-]_[6-9][0-9]' # 1960-99
113113
'|'
114114
'19[6-9][0-9][\.,\-]+[0-9]' # 1998-9
@@ -227,7 +227,7 @@ def detect(location):
227227
# note the leading @ .... this may be a source of problems
228228
(r'.?(@?([Cc]opyright)s?:?|[Cc]opr\.?|[(][Cc][)]|(COPYRIGHT)S?:?)', 'COPY'),
229229

230-
# copyright in markup, until we strip markup: apache'>Copyright
230+
# copyright in markup, until we strip markup: apache'>Copyright or left'>Copyright
231231
(r'[A-Za-z0-9]+[\'">]+[Cc]opyright', 'COPY'),
232232

233233
# AT&T (the company), needs special handling
@@ -249,6 +249,8 @@ def detect(location):
249249
(r'^([Ll][Ll][CcPp]|[Ll][Tt][Dd])\.?,?$', 'COMP'),
250250
(r'^([Ll][Ll][CcPp]|[Ll][Tt][Dd])\.$', 'COMP'),
251251
(r'^L\.P\.$', 'COMP'),
252+
(r'^[Ss]ubsidiar(y|ies)$', 'COMP'),
253+
(r'^[Ss]ubsidiary\(\-ies\)$', 'COMP'),
252254
# company suffix : SA, SAS, AG, AB, AS, CO, labs followed by a dot
253255
(r'^(S\.?A\.?S?|Sas|sas|AG|AB|Labs?|[Cc][Oo]\.|Research|INRIA).?$', 'COMP'),
254256
# (german) company suffix
@@ -440,6 +442,9 @@ def detect(location):
440442
# the Regents of the University of California
441443
COMPANY: {<BY>? <NN> <NNP> <OF> <NN> <UNI> <OF> <COMPANY|NAME|NAME2|NAME3><COMP>?} #130
442444
445+
# Free Software Foundation, Inc.
446+
COMPANY: {<NNP> <NNP> <COMP> <COMP>} #135
447+
443448
# Corporation/COMP for/NN National/NNP Research/COMP Initiatives/NNP
444449
COMPANY: {<COMP> <NN> <NNP> <COMP> <NNP>} #140
445450
@@ -471,7 +476,7 @@ def detect(location):
471476
NAME: {<NNP> <PN>? <NNP>+} #360
472477
NAME: {<NNP> <NNP>} #370
473478
474-
NAME: {<NNP> <NN> <EMAIL>} #390
479+
NAME: {<NNP> <NN|NNP> <EMAIL>} #390
475480
NAME: {<NNP> <PN|VAN>? <PN|VAN>? <NNP>} #400
476481
NAME: {<NNP> <NN> <NNP>} #410
477482
NAME: {<NNP> <COMMIT>} #420
@@ -486,7 +491,8 @@ def detect(location):
486491
COMPANY: {<NNP> <IN> <NN>? <COMPANY>} #510
487492
488493
NAME2: {<NAME> <EMAIL>} #530
489-
NAME3: {<YR-RANGE> <NAME2|COMPANY>+} #540
494+
NAME3: {<YR-RANGE> <NAME2|COMPANY>+} #535
495+
NAME3: {<YR-RANGE> <NAME2|COMPANY>+ <CC> <YR-RANGE>} #540
490496
NAME: {<NAME|NAME2>+ <OF> <NNP> <OF> <NN>? <COMPANY>} #550
491497
NAME: {<NAME|NAME2>+ <CC|OF>? <NAME|NAME2|COMPANY>} #560
492498
NAME3: {<YR-RANGE> <NAME>+} #570
@@ -577,6 +583,9 @@ def detect(location):
577583
# by the a href http wtforms.simplecodes.com WTForms Team
578584
COMPANY: {<BY> <NN>+ <COMP|COMPANY>} #1420
579585
586+
# the Regents of the University of California, Sun Microsystems, Inc., Scriptics Corporation
587+
COMPANY: {<NN> <NNP> <OF> <NN> <UNI> <OF> <COMPANY>+}
588+
580589
581590
# "And" some name
582591
ANDCO: {<CC>+ <NN> <NNP>+<UNI|COMP>?} #1430
@@ -987,7 +996,7 @@ def detect(self, numbered_lines):
987996
Return a sequence of tuples (copyrights, authors, years, holders)
988997
detected in a sequence of numbered line tuples.
989998
"""
990-
from nltk.tree import Tree
999+
from nltk.tree import Tree
9911000
numbered_lines = list(numbered_lines)
9921001
numbers = [n for n, _l in numbered_lines]
9931002
start_line = min(numbers)
@@ -1104,7 +1113,7 @@ def is_candidate(line):
11041113
if marker in line:
11051114
logger.debug('is_candidate: %(marker)r in line:\n%(line)r' % locals())
11061115
return True
1107-
1116+
11081117

11091118
def has_content(line):
11101119
"""
@@ -1306,7 +1315,7 @@ def prepare_text_line(line):
13061315
# uncommon pipe chars in some ascii art
13071316
line = line.replace('|', ' ')
13081317

1309-
# normalize copyright signs and spacing aournd them
1318+
# normalize copyright signs and spacing around them
13101319
line = line.replace('(C)', ' (c) ')
13111320
line = line.replace('(c)', ' (c) ')
13121321
# the case of \251 is tested by 'weirdencoding.h'
@@ -1336,7 +1345,6 @@ def prepare_text_line(line):
13361345
# some trailing garbage ')
13371346
line = line.replace("')", ' ')
13381347

1339-
13401348
# note that we do not replace the debian tag by a space: we remove it
13411349
# TODO: use POS tag: (r'^(?:\<s\>).*(?:\<s\\/>)$', 'NAME'),
13421350
line = re_sub(DEBIAN_COPYRIGHT_TAGS_RE(), '', line)
@@ -1361,6 +1369,12 @@ def prepare_text_line(line):
13611369
line = line.replace('\\t', ' ')
13621370
line = line.replace('\\0', ' ')
13631371

1372+
# in apache'>Copyright replace ">" by "> "
1373+
line = line.replace('>', '> ')
1374+
line = line.replace('>  ', '> ')
1375+
line = line.replace('<', ' <')
1376+
line = line.replace('  <', ' <')
1377+
13641378
# TODO: Why?
13651379
# replace contiguous spaces with only one occurrence
13661380
# line = re.sub(WHITESPACE_RE(), ' ', text)

tests/cluecode/test_copyrights.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1262,6 +1262,7 @@ def test_copyright_in_html_comments(self):
12621262
]
12631263
check_detection(expected, test_file)
12641264

1265+
@expectedFailure
12651266
def test_copyright_in_html_incorrect(self):
12661267
test_file = self.get_test_loc('copyrights/copyright_in_html_incorrect-detail_9_html.html')
12671268
expected = [
@@ -1648,7 +1649,7 @@ def test_copyright_libpoppler3_copyright(self):
16481649
def test_copyright_libqt4_scripttools_copyright(self):
16491650
test_file = self.get_test_loc('copyrights/copyright_libqt4_scripttools_copyright-libqt_scripttools_copyright.copyright')
16501651
expected = [
1651-
'(c) 2008-2009 Nokia Corporation',
1652+
'(c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies)',
16521653
'(c) 1994-2008 Trolltech ASA',
16531654
]
16541655
check_detection(expected, test_file)
@@ -3620,7 +3621,6 @@ def test_copyright_super_tech_c(self):
36203621
]
36213622
check_detection(expected, test_file)
36223623

3623-
@expectedFailure
36243624
def test_copyright_tcl_copyright(self):
36253625
test_file = self.get_test_loc('copyrights/copyright_tcl_copyright-tcl_copyright.copyright')
36263626
expected = [
@@ -3968,7 +3968,7 @@ def test_copyright_should_not_contain_leading_or_trailing_colon(self):
39683968

39693969
def test_copyright_in_markup_should_not_be_truncated(self):
39703970
test_file = self.get_test_loc('copyrights/copyright_in_html.html')
3971-
expected = ["(c) Copyright 2010 by the <a href http://wtforms.simplecodes.com'>WTForms Team"]
3971+
expected = ["(c) Copyright 2010 by the <a href http://wtforms.simplecodes.com'> WTForms Team"]
39723972
check_detection(expected, test_file)
39733973

39743974
def test_copyright_should_not_have_trailing_garbage(self):
@@ -4161,14 +4161,14 @@ def test_copyright_various(self):
41614161
Copyright (C) 2000 - various; see CREDITS, ChangeLog, and sources
41624162
The libwmf Library is free software; you can redistribute it and/or
41634163
'''.splitlines(False)
4164-
expected = ['Copyright (c) 2000 - various'] # ; see CREDITS, ChangeLog, and sources
4164+
expected = ['Copyright (c) 2000 - various'] # ; see CREDITS, ChangeLog, and sources
41654165
check_detection(expected, test_lines)
41664166

41674167
def test_copyright_natural_docs(self):
41684168
test_lines = '''
41694169
// Search script generated by doxygen
41704170
// Copyright (C) 2009 by Dimitri van Heesch.
4171-
4171+
41724172
// The code in this file is loosly based on main.js, part of Natural Docs,
41734173
// which is Copyright (C) 2003-2008 Greg Valure
41744174
// Natural Docs is licensed under the GPL.
@@ -4194,3 +4194,12 @@ def test_copyright_and_authors_mixed(self):
41944194
u'Copyright (c) 1988, 1993 The Regents of the University of California.'
41954195
]
41964196
check_detection(expected, test_lines)
4197+
4198+
def test_copyright_word_in_html(self):
4199+
test_lines = '''
4200+
<td width="40%" align="left">Copyright &copy; 2010 Nokia Corporation and/or its subsidiary(-ies)</td>
4201+
'''.splitlines(False)
4202+
expected = [
4203+
u'Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies)',
4204+
]
4205+
check_detection(expected, test_lines)

0 commit comments

Comments
 (0)