Skip to content

Commit 5294521

Browse files
committed
Improve copyright detection
- Start detecting "is held by"
- Do not include some trailing junk

Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent d1cf644 commit 5294521

File tree

10 files changed

+43
-37
lines changed

10 files changed

+43
-37
lines changed

src/cluecode/copyrights.py

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -279,18 +279,22 @@ def detect(self,
279279
'YR-RANGE', 'YR-AND', 'YR', 'YR-PLUS', 'BARE-YR',
280280
'EMAIL', 'URL',
281281
'HOLDER', 'AUTHOR',
282+
'IS', 'HELD',
283+
282284
])
283285

284286
non_holder_labels_mini = frozenset([
285287
'COPY',
286288
'YR-RANGE', 'YR-AND', 'YR', 'YR-PLUS', 'BARE-YR',
287289
'HOLDER', 'AUTHOR',
290+
'IS', 'HELD',
288291
])
289292

290293
non_authors_labels = frozenset([
291294
'COPY',
292295
'YR-RANGE', 'YR-AND', 'YR', 'YR-PLUS', 'BARE-YR',
293296
'HOLDER', 'AUTHOR',
297+
'IS', 'HELD',
294298
])
295299

296300
# then walk the parse_tree, collecting copyrights, years and authors
@@ -703,6 +707,11 @@ def build_detection_from_node(
703707
(r'^[Rr]éservés[\.,]*$', 'RESERVED'),
704708
(r'^[Rr]eserves[\.,]*$', 'RESERVED'),
705709

710+
# used to detect "copyright is held by..."
711+
(r'^is$', 'IS'),
712+
(r'^are$', 'IS'),
713+
(r'^held$', 'HELD'),
714+
706715
# TODO: in Dutch Alle rechten voorbehouden.
707716
# TODO: in Spanish Reservados todos los derechos
708717

@@ -736,6 +745,9 @@ def build_detection_from_node(
736745
# JUNK proper
737746
############################################################################
738747

748+
# words joined by at least 3 dashes (any case), e.g. "no-trailing-white-space"; a two-dash token such as "enforce-trailing-newline" does not match
749+
(r'^((\w+-){3,}\w+)$', 'JUNK'),
750+
739751
# path with trailing year-like are NOT a year as in
740752
# Landroid/icu/impl/IDNA2003 : treat as JUNK
741753
(r'^[^\\/]+[\\/][^\\/]+[\\/].*$', 'JUNK'),
@@ -825,7 +837,6 @@ def build_detection_from_node(
825837
(r'^Idata$', 'JUNK'),
826838
(r'^[Cc]ontributed?$', 'JUNK'),
827839
(r'^[Ff]unctions?$', 'JUNK'),
828-
(r'^[Nn]otices?$', 'JUNK'),
829840
(r'^[Mm]ust$', 'JUNK'),
830841
(r'^ISUPPER?$', 'JUNK'),
831842
(r'^ISLOWER$', 'JUNK'),
@@ -891,8 +902,6 @@ def build_detection_from_node(
891902

892903
(r'^providing$', 'JUNK'),
893904
(r'^Execute$', 'JUNK'),
894-
(r'^NOTICE[.,]*$', 'JUNK'),
895-
(r'^[Nn]otice[.,]*$', 'JUNK'),
896905
(r'^passes$', 'JUNK'),
897906
(r'^Should$', 'JUNK'),
898907
(r'^[Ll]icensing\@?$', 'JUNK'),
@@ -964,7 +973,6 @@ def build_detection_from_node(
964973
(r'^Much$', 'JUNK'),
965974
(r'^remains?,?$', 'JUNK'),
966975
(r'^earlier$', 'JUNK'),
967-
(r'^is$', 'JUNK'),
968976
(r'^[lL]aws?$', 'JUNK'),
969977
(r'^Insert$', 'JUNK'),
970978
(r'^url$', 'JUNK'),
@@ -986,7 +994,6 @@ def build_detection_from_node(
986994
(r'^interfaces?,?$', 'JUNK'),
987995
(r'^than$', 'JUNK'),
988996
(r'^whom$', 'JUNK'),
989-
(r'^are$', 'JUNK'),
990997
(r'^However,?$', 'JUNK'),
991998
(r'^[Cc]ollectively$', 'JUNK'),
992999
(r'^following$', 'JUNK'),
@@ -1365,7 +1372,8 @@ def build_detection_from_node(
13651372
(r'^Neither$', 'NN'),
13661373
(r'^Norwegian$', 'NN'),
13671374
(r'^Notes?$', 'NN'),
1368-
(r'^NOTICE', 'NN'),
1375+
(r'^NOTICE[\.\,]?$', 'NN'),
1376+
(r'^[Nn]otices?[\.,]?$', 'NN'),
13691377
(r'^NOT$', 'NN'),
13701378
(r'^NULL$', 'NN'),
13711379
(r'^Objects?$', 'NN'),
@@ -2764,8 +2772,8 @@ def build_detection_from_node(
27642772
# portions copyright
27652773
COPYRIGHT: {<PORTIONS> <COPYRIGHT|COPYRIGHT2>} #2610
27662774
2767-
#copyright notice (3dfx Interactive, Inc. 1999), (notice is JUNK)
2768-
COPYRIGHT: {<COPY> <JUNK> <COMPANY> <YR-RANGE>} #2620
2775+
#copyright notice (3dfx Interactive, Inc. 1999),
2776+
COPYRIGHT: {<COPY> <NN> <COMPANY> <YR-RANGE>} #2620
27692777
27702778
# Copyright (C) <2013>, GENIVI Alliance, Inc.
27712779
COPYRIGHT: {<COPYRIGHT2> <ANDCO>} #2625
@@ -2977,7 +2985,7 @@ def build_detection_from_node(
29772985
COPYRIGHT: {<COPY> <HOLDER> <NAME>} #83000
29782986
29792987
#holder is Tim Hudson ([email protected]).
2980-
COPYRIGHT: {<HOLDER> <JUNK> <NAME-EMAIL>} #83001
2988+
COPYRIGHT: {<HOLDER> <IS> <NAME-EMAIL>} #83001
29812989
29822990
# Copyright lowRISC contributors.
29832991
COPYRIGHT: {<COPY> <NN> <CONTRIBUTORS>} #83002
@@ -2991,14 +2999,19 @@ def build_detection_from_node(
29912999
# Copyright OProfile authors
29923000
COPYRIGHT: {<COPY> <NN>?<NNP>+ <AUTHS>} #83004
29933001
3002+
#######################################
3003+
# Copyright is held by ....
3004+
#######################################
3005+
# Copyright is held by ....
3006+
COPYRIGHT: {<COPY> <IS> <HELD> <BY> <NNP|NAME|COMPANYNAME-EMAIL>+ } #10989898
29943007
29953008
29963009
#######################################
29973010
# Authors
29983011
#######################################
29993012
30003013
# SPDX-FileContributor special case
3001-
AUTHOR: {<SPDX-CONTRIB> <CCOMPANY|NAME|NAME-EMAIL|NAME-YEAR|EMAIL> <COMPANY|NAME|NAME-EMAIL|NAME-YEAR|EMAIL|NN>? } #264000
3014+
AUTHOR: {<SPDX-CONTRIB> <COMPANY|NAME|NAME-EMAIL|NAME-YEAR|EMAIL> <COMPANY|NAME|NAME-EMAIL|NAME-YEAR|EMAIL|NN>? } #264000
30023015
30033016
# developed by Project Mayo.
30043017
AUTHOR: {<AUTH2>+ <BY> <COMPANY> <NNP>} #2645-1
@@ -3635,11 +3648,11 @@ def strip_trailing_period(s):
36353648

36363649
is_single_word = len(s.split()) == 1
36373650

3638-
# U.S.A., e.V., M.I.T. and similar
3651+
# U.S.A., e.V., M.I.T. and similar
36393652
if s[-2].isupper() and not is_single_word:
36403653
return s
36413654

3642-
# S.A., e.v., b.v. and other
3655+
# S.A., e.v., b.v. and other
36433656
if s[-3] == '.':
36443657
return s
36453658

tests/cluecode/data/copyright_fossology/testdata118_raw

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -334,7 +334,7 @@ For files in the libpng directory:
334334
Willem van Schaik</s>
335335

336336
libpng versions 0.89, June 1996, through 0.96, May 1997, are
337-
<s>Copyright (c) 1996, 1997 Andreas Dilger Distributed</s> according to the
337+
<s>Copyright (c) 1996, 1997 Andreas Dilger</s> Distributed according to the
338338
same disclaimer and license as libpng-0.88, with the following
339339
individuals added to the list of Contributing <t>Authors:
340340

tests/cluecode/data/copyright_fossology/testdata119_raw

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -334,7 +334,7 @@ For files in the libpng directory:
334334
Willem van Schaik</s>
335335

336336
libpng versions 0.89, June 1996, through 0.96, May 1997, are
337-
<s>Copyright (c) 1996, 1997 Andreas Dilger Distributed</s> according to the
337+
<s>Copyright (c) 1996, 1997 Andreas Dilger</s> Distributed according to the
338338
same disclaimer and license as libpng-0.88, with the following
339339
individuals added to the list of Contributing <s>Authors:
340340

tests/cluecode/data/copyright_fossology/testdata128_raw

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -334,7 +334,7 @@ For files in the libpng directory:
334334
Willem van Schaik</t>
335335

336336
libpng versions 0.89, June 1996, through 0.96, May 1997, are
337-
<s>Copyright (c) 1996, 1997 Andreas Dilger Distributed</s> according to the
337+
<s>Copyright (c) 1996, 1997 Andreas Dilger</s> Distributed according to the
338338
same disclaimer and license as libpng-0.88, with the following
339339
individuals added to the list of Contributing Authors:<t>
340340

tests/cluecode/data/copyright_fossology/testdata23_raw

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -229,8 +229,8 @@
229229
</div>
230230
<table xmlns:rev="http://www.cs.rpi.edu/~gregod/boost/tools/doc/revision" width="100%"><tr>
231231
<td align="left"></td>
232-
<td align="right"><div class="copyright-footer"><s>Copyright (c) 2003 - 2008 Christopher M. Kohlhoff
233-
Distributed</s> <p>under the Boost Software License, Version 1.0. (See accompanying
232+
<td align="right"><div class="copyright-footer"><s>Copyright (c) 2003 - 2008 Christopher M. Kohlhoff</s>
233+
Distributed <p>under the Boost Software License, Version 1.0. (See accompanying
234234
file LICENSE_1_0.txt or copy at <a href="http://www.boost.org/LICENSE_1_0.txt" target="_top">http://www.boost.org/LICENSE_1_0.txt</a>)
235235
</p>
236236
</div></td>

tests/cluecode/data/copyrights/with_trailing_words.js.yml

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ copyrights:
77
- Copyright 2009-2015 Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors
88
- Copyright 2009-2015 Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors
99
- Copyright 2009-2015 Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors
10-
Distributed
1110
- (c) Varun Malhotra 2013 Source Code https://github.com/softvar/json2html
1211
- Copyright 2015, Mycompany
1312
- Copyright 2015, Mycompany
@@ -16,18 +15,16 @@ holders:
1615
- The Dojo Foundation
1716
- Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors
1817
- Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors
19-
- Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors Distributed
18+
- Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors
2019
- Varun Malhotra Source Code
2120
- Mycompany
2221
- Mycompany
2322
- Mycompany
2423
holders_summary:
24+
- value: Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors
25+
count: 3
2526
- value: Mycompany
2627
count: 3
27-
- value: Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors
28-
count: 2
29-
- value: Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors Distributed
30-
count: 1
3128
- value: The Dojo Foundation
3229
count: 1
3330
- value: Varun Malhotra Source Code

tests/cluecode/data/generated/copyright_49.txt.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,5 @@ what:
44
- authors
55
copyrights:
66
- Copyright (c) 2006, Industrial Light & Magic
7-
- copyright held by others as indicated
87
holders:
98
- Industrial Light & Magic
10-
- others as indicated

tests/cluecode/data/ics/chromium-chrome-browser-resources/about_credits.html.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ copyrights:
6060
- Copyright (c) 2004, 2006-2009 Glenn Randers-Pehrson
6161
- Copyright (c) 2000-2002 Glenn Randers-Pehrson
6262
- Copyright (c) 1998, 1999 Glenn Randers-Pehrson
63-
- Copyright (c) 1996, 1997 Andreas Dilger Distributed
63+
- Copyright (c) 1996, 1997 Andreas Dilger
6464
- Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
6565
- Copyright (c) 2001-2006 Cisco Systems, Inc.
6666
- Copyright (c) 2010, Google Inc.
@@ -165,7 +165,7 @@ holders:
165165
- Glenn Randers-Pehrson
166166
- Glenn Randers-Pehrson
167167
- Glenn Randers-Pehrson
168-
- Andreas Dilger Distributed
168+
- Andreas Dilger
169169
- Guy Eric Schalnat, Group 42, Inc.
170170
- Cisco Systems, Inc.
171171
- Google Inc.
@@ -247,7 +247,7 @@ holders_summary:
247247
count: 1
248248
- value: Analog Devices Inc.
249249
count: 1
250-
- value: Andreas Dilger Distributed
250+
- value: Andreas Dilger
251251
count: 1
252252
- value: Andrew Tridgell
253253
count: 1

tests/cluecode/data/ics/chromium-chrome-common-extensions-docs-examples-extensions-oauth_contacts/NOTICE.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,13 @@ copyrights:
66
- copyright unitedHeroes.net
77
- Copyright (c) 2009, unitedHeroes.net
88
- Copyright Paul Johnston 2000 - 2002. Other contributors Greg Holt, Andrew Kepert, Ydnar,
9-
Lostinet Distributed
9+
Lostinet
1010
holders:
1111
- unitedHeroes.net
1212
- unitedHeroes.net
13-
- Paul Johnston - Other contributors Greg Holt, Andrew Kepert, Ydnar, Lostinet Distributed
13+
- Paul Johnston - Other contributors Greg Holt, Andrew Kepert, Ydnar, Lostinet
1414
holders_summary:
1515
- value: unitedHeroes.net
1616
count: 2
17-
- value: Paul Johnston - Other contributors Greg Holt, Andrew Kepert, Ydnar, Lostinet Distributed
17+
- value: Paul Johnston - Other contributors Greg Holt, Andrew Kepert, Ydnar, Lostinet
1818
count: 1

tests/cluecode/data/ics/qemu-distrib-libpng-1.2.19/png.h.yml

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ copyrights:
99
- Copyright (c) 2004, 2006-2007 Glenn Randers-Pehrson
1010
- Copyright (c) 2000-2002 Glenn Randers-Pehrson
1111
- Copyright (c) 1998, 1999, 2000 Glenn Randers-Pehrson
12-
- Copyright (c) 1996, 1997 Andreas Dilger Distributed
12+
- Copyright (c) 1996, 1997 Andreas Dilger
1313
- Copyright (c) 1995, 1996 Guy Eric Schalnat, Group 42, Inc.
1414
holders:
1515
- Glenn Randers-Pehrson
@@ -18,14 +18,12 @@ holders:
1818
- Glenn Randers-Pehrson
1919
- Glenn Randers-Pehrson
2020
- Glenn Randers-Pehrson
21-
- Andreas Dilger Distributed
21+
- Andreas Dilger
2222
- Guy Eric Schalnat, Group 42, Inc.
2323
holders_summary:
2424
- value: Glenn Randers-Pehrson
2525
count: 4
26+
- value: Andreas Dilger
27+
count: 2
2628
- value: Guy Eric Schalnat, Group 42, Inc.
2729
count: 2
28-
- value: Andreas Dilger
29-
count: 1
30-
- value: Andreas Dilger Distributed
31-
count: 1

0 commit comments

Comments
 (0)