Skip to content

Commit 1847eb4

Browse files
committed
Improve copyright detection
Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent 1f94c9d commit 1847eb4

File tree

62 files changed

+195
-226
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

62 files changed

+195
-226
lines changed

src/cluecode/copyrights.py

Lines changed: 109 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -215,8 +215,16 @@ class CopyrightDetector(object):
215215
"""
216216

217217
def __init__(self):
218-
self.lexer = lex.Lexer(patterns)
219-
self.parser = parse.Parser(grammar, trace=TRACE_DEEP, validate=VALIDATE)
218+
"""
219+
Initialize this detector with a lexer and a parser.
220+
"""
221+
self.lexer = lex.Lexer(matchers=PATTERNS)
222+
self.parser = parse.Parser(
223+
grammar=GRAMMAR,
224+
loop=1,
225+
trace=TRACE_DEEP,
226+
validate=VALIDATE,
227+
)
220228

221229
def detect(self,
222230
numbered_lines,
@@ -591,7 +599,7 @@ def build_detection_from_node(
591599
_YEAR_THEN_YEAR_SHORT = fr'({_YEAR_OR_YEAR_YEAR_WITH_PUNCT}({_YEAR_SHORT_PUNCT})*)'
592600
_YEAR_DASH_PRESENT = _YEAR + r'[\-~]? ?[Pp]resent\.?,?'
593601

594-
patterns = [
602+
PATTERNS = [
595603
############################################################################
596604
# COPYRIGHT
597605
############################################################################
@@ -877,6 +885,7 @@ def build_detection_from_node(
877885
(r'^DISCLAIMS?$', 'JUNK'),
878886
(r'^SPECIFICALLY$', 'JUNK'),
879887

888+
(r'^identifying', 'JUNK'),
880889
(r'^IDENTIFICATION$', 'JUNK'),
881890
(r'^WARRANTIE?S?$', 'JUNK'),
882891
(r'^WARRANTS?$', 'JUNK'),
@@ -1204,6 +1213,7 @@ def build_detection_from_node(
12041213
# NN often used in conjunction with copyright
12051214
(r'^[Ss]tatements?.?$', 'JUNK'),
12061215
(r'^issues?.?$', 'JUNK'),
1216+
(r'^retain?.?$', 'JUNK'),
12071217

12081218
############################################################################
12091219
# Nouns and proper Nouns
@@ -1267,6 +1277,7 @@ def build_detection_from_node(
12671277
(r'^But$', 'NN'),
12681278
(r'^Builders?\.?$', 'NN'),
12691279
(r'^Cacute$', 'NN'),
1280+
(r'^CD$', 'JUNK'),
12701281
(r'^Cell.$', 'NN'),
12711282
(r'^Change\.?[lL]og$', 'NN'),
12721283
(r'^CHANGElogger$', 'NN'),
@@ -1304,17 +1315,18 @@ def build_detection_from_node(
13041315
(r'^Education$', 'NN'),
13051316
(r'^Extended', 'NN'),
13061317
(r'^Every$', 'NN'),
1307-
(r'^Exhibit$', 'NN'),
1318+
(r'^EXHIBIT$', 'JUNK'),
1319+
(r'^Exhibit$', 'JUNK'),
13081320
(r'^Digitized', 'NN'),
13091321
(r'^[Ds]istributed?.?$', 'NN'),
1322+
(r'^Distributions?', 'NN'),
13101323
(r'^Multiply$', 'NN'),
13111324
(r'^Convert$', 'NN'),
13121325
(r'^Compute$', 'NN'),
13131326
(r'^Case$', 'NN'),
13141327
(r'^Hessian$', 'NN'),
13151328
(r'^Include', 'NN'),
13161329
(r'^Downstream', 'NN'),
1317-
(r'^Distributions?', 'NN'),
13181330
(r'^Volumes?', 'NN'),
13191331
(r'^Manuals?.?', 'NN'),
13201332
(r'^Update.?', 'NN'),
@@ -1435,8 +1447,8 @@ def build_detection_from_node(
14351447
(r'^Packaging$', 'NN'),
14361448
(r'^Patent', 'NN'),
14371449
(r'^Pentium$', 'NN'),
1438-
(r'^[Pp]ermission', 'NN'),
1439-
(r'^PERMISSIONS?', 'NN'),
1450+
(r'^[Pp]ermission', 'JUNK'),
1451+
(r'^PERMISSIONS?', 'JUNK'),
14401452
(r'^PGP$', 'NN'),
14411453
(r'^Phrase', 'NN'),
14421454
(r'^Plugin', 'NN'),
@@ -1871,15 +1883,16 @@ def build_detection_from_node(
18711883

18721884
# same for developed, etc...
18731885
(r'^[Cc]oded$', 'AUTH2'),
1874-
(r'^[Rr]ecoded$', 'AUTH2'),
1875-
(r'^[Mm]odified$', 'AUTH2'),
1876-
(r'^[Cc]reated$', 'AUTH2'),
1886+
(r'^\(?[Rr]ecoded$', 'AUTH2'),
1887+
(r'^\(?[Mm]odified$', 'AUTH2'),
1888+
(r'^\(?[Cc]reated$', 'AUTH2'),
18771889
# written is often misspelled
1878-
(r'^[Ww]ritt?e[dn]$', 'AUTH2'),
1890+
(r'^\(?[Ww]ritt?e[dn]$', 'AUTH2'),
18791891
# rewritten is often misspelled
1880-
(r'^[Rr]ewritt?e[dn]$', 'AUTH2'),
1881-
(r'^[Mm]aintained$', 'AUTH2'),
1882-
(r'^[Dd]eveloped$', 'AUTH2'),
1892+
(r'^\(?[Rr]ewritt?e[dn]$', 'AUTH2'),
1893+
(r'^\(?[Mm]aintained$', 'AUTH2'),
1894+
(r'^\(?[Dd]eveloped$', 'AUTH2'),
1895+
(r'^\(?[Au]thored$', 'AUTH2'),
18831896

18841897
# committers is interesting, and so has a tag of its own
18851898
(r'[Cc]ommitters\.?,?', 'COMMIT'),
@@ -2162,7 +2175,13 @@ def build_detection_from_node(
21622175
# Comments in the Grammar are lines that start with #
21632176
# End-of-line comments are rule descriptions.
21642177
# One rule per line.
2165-
grammar = """
2178+
2179+
USE_MAIN_BRANCH = False or os.environ.get('SCANCODE_COPYRIGHT_USE_MAIN_BRANCH', False)
2180+
2181+
if USE_MAIN_BRANCH:
2182+
from cluecode.copyrightorig import grammar as GRAMMAR
2183+
else:
2184+
GRAMMAR = """
21662185
21672186
#######################################
21682187
# YEARS
@@ -2173,7 +2192,7 @@ def build_detection_from_node(
21732192
YR-RANGE: {<CD|BARE-YR>? <YR> <BARE-YR>?} #40
21742193
YR-RANGE: {<YR>+ <BARE-YR>? } #50
21752194
YR-AND: {<CC>? <YR>+ <CC>+ <YR>} #60
2176-
YR-RANGE: {<YR-AND>+} #70|
2195+
YR-RANGE: {<YR-AND>+} #70
21772196
YR-RANGE: {<YR-RANGE>+ <DASH|TO> <YR-RANGE>+} #71
21782197
YR-RANGE: {<YR-RANGE>+ <DASH>?} #72
21792198
@@ -2336,7 +2355,7 @@ def build_detection_from_node(
23362355
NAME: {<NNP|PN>+ <NNP>+} #351
23372356
23382357
# Distributed Management Task Force
2339-
# NAME: {<NN> <NNP>{3}} #881111
2358+
NAME: {<NN> <NNP>{3}} #881111
23402359
23412360
# @author <a href="mailto:[email protected]">Stephane Hillion</a>
23422361
NAME: { <NN>? <NN>? <EMAIL> <NAME> } #351.1
@@ -2407,10 +2426,10 @@ def build_detection_from_node(
24072426
NAME-YEAR: {<YR-RANGE> <NNP>+ <CAPS>?} #5612
24082427
24092428
#Academy of Motion Picture Arts and Sciences
2410-
NAME: {<NAME> <CC> <NNP>} # 561
2429+
NAME: {<NAME> <CC> <NNP>} #561
24112430
24122431
# Adam Weinberger and the GNOME Foundation
2413-
NAME: {<CC> <NN> <COMPANY>} # 565
2432+
ANDCO: {<CC> <NN> <COMPANY>} #565
24142433
24152434
# (c) 1991-1992, Thomas G. Lane , Part of the Independent JPEG Group's
24162435
NAME: {<PORTIONS> <OF> <NN> <NAME>+} #566
@@ -2497,6 +2516,8 @@ def build_detection_from_node(
24972516
# <s>Timothy Terriberry</s>, <s>CSIRO</s>, and other contributors
24982517
ANDCO: {<CC> <CAPS|COMPANY|NAME|NAME-EMAIL|NAME-YEAR>+} #960
24992518
2519+
COMPANY: {<COMPANY|NAME|NAME-EMAIL|NAME-YEAR> <ANDCO>+} #970
2520+
25002521
# Copyright © 1998-2009 Bill Spitzak ([email protected] ) and others,
25012522
COMPANY: {<COMPANY|NAME|NAME-EMAIL|NAME-YEAR> <PARENS>? <ANDCO>+} #970
25022523
@@ -2596,7 +2617,7 @@ def build_detection_from_node(
25962617
# Copyright 2015 The Error Prone Authors.
25972618
NAME: {<NN> <NAME> <CONTRIBUTORS|AUTHS>} #196023
25982619
2599-
# Copyright (C) <s>Suresh P <[email protected]></s>
2620+
# Copyright (C) <s>Suresh P <[email protected]></s>
26002621
NAME: {<NNP> <PN> <EMAIL>} #19601.1
26012622
26022623
# Copyright or Copr. Mines Paristech, France - Mark NOBLE, Alexandrine GESRET
@@ -2611,9 +2632,14 @@ def build_detection_from_node(
26112632
# Copyright (C) 1998-2001 VideoLAN ( Johan Bilien <[email protected]> and Gildas Bazin <[email protected]> )
26122633
NAME: {<PARENS> <NAME> <PARENS>} #19653
26132634
2635+
# by the Initial Developer
2636+
INITIALDEV: {<BY>? <NN> <NN> <MAINT>} #19663
2637+
2638+
# UNIVERSITY OF CHICAGO
2639+
NAME: {<UNI> <OF> <CAPS>} #19673
26142640
2615-
################################# #COPYRIGHT: {<COPY> <COPY> <MIT>} #1802
2616-
######
2641+
2642+
#######################################
26172643
# VARIOUS FORMS OF COPYRIGHT
26182644
#######################################
26192645
@@ -2646,12 +2672,13 @@ def build_detection_from_node(
26462672
# Bart Hanssens from FedICT
26472673
COPYRIGHT: {<COPY>+ <NAME-YEAR> <NN> <CAPS> <NN> <OF> <COMPANY> <NAME>} #83005
26482674
2675+
# Gracenote, Inc., copyright © 2000-2008 Gracenote.
2676+
# Gracenote Software, copyright © 2000-2008 Gracenote.
2677+
# COPYRIGHT: {<COMPANY> <COPY>{1,2} <NAME-YEAR>} #157999.12
2678+
26492679
# Copyright (c) Ian F. Darwin 1986, 1987, 1989, 1990, 1991, 1992, 1994, 1995.
26502680
COPYRIGHT: {<COPY>+ <NAME|NAME-EMAIL|NAME-YEAR>+ <YR-RANGE>*} #157999
26512681
2652-
# portions copyright The Internet Society, Tom Tromey and Red Hat, Inc.
2653-
COPYRIGHT: {<PORTIONS> <COPY> <NN> <NAME>} #157998
2654-
26552682
COPYRIGHT: {<COPY>+ <CAPS|NNP>+ <CC> <NN> <COPY> <YR-RANGE>?} #1590
26562683
26572684
# // (c) (C) → ©
@@ -2740,8 +2767,8 @@ def build_detection_from_node(
27402767
# (c) Copyright 1985-1999 SOME TECHNOLOGY SYSTEMS
27412768
COPYRIGHT2: {<COPY> <COPY> <YR-RANGE> <CAPS> <CAPS> <CAPS>? <CAPS>?} #2271
27422769
2743-
# Minpack Copyright Notice (1999) University of Chicago
2744-
COPYRIGHT: {<COPY> <NOTICE> <NAME-YEAR>} #2273.1
2770+
# Copyright Notice (1999) University of Chicago. All rights reserved
2771+
COPYRIGHT: {<COPY> <NOTICE> <NAME-YEAR> <ALLRIGHTRESERVED>? } #2271.1
27452772
27462773
# NAME-COPY is a name with a trailing copyright
27472774
# Daisy (c) 1998
@@ -2789,7 +2816,6 @@ def build_detection_from_node(
27892816
# Copyright (C) 2006 XStream committers.
27902817
# Copyright (c) 2019-2021, Open source contributors.
27912818
# Copyright 2007 ZXing authors
2792-
# Copyright (c) 2002 the Initial Developer
27932819
# Copyright (c) 2024 bgme <[email protected]>.
27942820
COPYRIGHT: {<COPY>+ <YR-RANGE> <NN>+ <CONTRIBUTORS|COMMIT|AUTHS|MAINT>? <EMAIL>? <ALLRIGHTRESERVED>?} #22793.3
27952821
@@ -3081,9 +3107,6 @@ def build_detection_from_node(
30813107
# Copyright OProfile authors
30823108
COPYRIGHT: {<COPY> <NN>?<NNP>+ <AUTHS>} #83004
30833109
3084-
# (C) Distributed Management Task Force (Distributed is an NN)
3085-
# COPYRIGHT: {<COPY> <NN> <NAME>} #83010
3086-
30873110
# Copyright (c) 2014 The Rust Project Developers
30883111
COPYRIGHT: {<COPYRIGHT> <MAINT> } #83020
30893112
@@ -3092,15 +3115,45 @@ def build_detection_from_node(
30923115
30933116
# Copyright: 2004-2007 by Internet Systems Consortium, Inc. ("ISC")
30943117
# 1995-2003 by Internet Software Consortium
3095-
COPYRIGHT: {<YR-RANGE> <BY> <COMPANY> } #1615
3118+
COPYRIGHT: {<COPYRIGHT> <NN> <YR-RANGE> <BY> <COMPANY> } #1615
3119+
3120+
# Russ Dill <[email protected]> 2001-2003
3121+
# Rewrited by Vladimir Oleynik <[email protected]> (C) 2003
3122+
COPYRIGHT: {<NAME-EMAIL> <YR-RANGE> <AUTH2> <BY> <NAME-EMAIL> <COPY> <YR-RANGE>} #22793.5
3123+
3124+
# portions copyright The Internet Society, Tom Tromey and Red Hat, Inc.
3125+
COPYRIGHT: {<PORTIONS> <COPY> <NN> <NAME>} #157998
3126+
3127+
# Minpack Copyright Notice (1999) University of Chicago
3128+
COPYRIGHT: {<COPY> <NOTICE> <NAME-YEAR>} #2273.1
3129+
3130+
# Portions created by the Initial Developer are Copyright (C)
3131+
# the Initial Developer. All Rights Reserved.
3132+
COPYRIGHT: {<PORTIONS> <AUTH2> <INITIALDEV> <IS> <COPY|COPYRIGHT2>+ <YR-RANGE>? <INITIALDEV>} #2609.1
3133+
3134+
# Portions created by the Initial Developer are Copyright (C)
3135+
# the Initial Developer. All Rights Reserved.
3136+
# and
3137+
# Portions created by the Initial Developer are Copyright (C) 2002
3138+
# the Initial Developer. All Rights Reserved.
3139+
COPYRIGHT: {<COPYRIGHT|COPYRIGHT2> <INITIALDEV> <ALLRIGHTRESERVED>?} #2609.2
3140+
3141+
# Copyright (C) the Initial Developer.
3142+
COPYRIGHT: {<COPY>+ <INITIALDEV>} #35012
3143+
3144+
# (C) Distributed Management Task Force (Distributed is an NN)
3145+
# COPYRIGHT: {<COPY> <NN>? <NAME>} #83010
3146+
3147+
# Gracenote, Inc., copyright © 2000-2008 Gracenote.
3148+
# Gracenote Software, copyright © 2000-2008 Gracenote.
3149+
COPYRIGHT: {<COMPANY> <COPY>{1,2} <NAME-YEAR>} #157999.12
30963150
30973151
#######################################
30983152
# Copyright is held by ....
30993153
#######################################
31003154
# Copyright is held by ....
31013155
COPYRIGHT: {<COPY> <IS> <HELD> <BY> <NNP|NAME|COMPANYNAME-EMAIL>+ } #10989898
31023156
3103-
31043157
#######################################
31053158
# Authors
31063159
#######################################
@@ -3120,6 +3173,9 @@ def build_detection_from_node(
31203173
# @author [email protected] (Anatol Pomazau)
31213174
AUTHOR: {<AUTH|CONTRIBUTORS|AUTHS>+ <NN>? <COMPANY|NAME|YR-RANGE>* <BY>? <EMAIL>+ <NAME>?} #2650
31223175
3176+
# developed by the National Center for Supercomputing Applications at the University of Illinois at Urbana-Champaign
3177+
AUTHOR: {<AUTH|CONTRIBUTORS|AUTHS>+ <NN>? <COMPANY|NAME|NAME-EMAIL|NAME-YEAR>+ <NN>? <COMPANY|NAME|NAME-EMAIL|NAME-YEAR>+ <YR-RANGE>*} #2660
3178+
31233179
AUTHOR: {<AUTH|CONTRIBUTORS|AUTHS>+ <NN>? <COMPANY|NAME|NAME-EMAIL|NAME-YEAR>+ <YR-RANGE>*} #2660
31243180
31253181
# developed by the Center for Information
@@ -3145,7 +3201,7 @@ def build_detection_from_node(
31453201
AUTHOR: {<AUTH|AUTHS|AUTH2> <BY>? <NNP> <CC> <PN>} #2761
31463202
31473203
# developed by the National Center for Supercomputing Applications at the University of Illinois at Urbana-Champaign
3148-
AUTHOR: {<AUTHOR> <NN> <NAME> <NAME>} #2762
3204+
AUTHOR: {<AUTHOR> <NN> <NAME|COMPANY>+ } #2762
31493205
31503206
# created by Axel Metzger and Till Jaeger, Institut fur Rechtsfragen der Freien und Open Source Software
31513207
AUTHOR: {<AUTH2> <CC> <AUTHOR> <NN> <NAME> <NN> <NN> <NNP>} #2645-4
@@ -3194,7 +3250,8 @@ def build_detection_from_node(
31943250
# Copyright (c) 2015 Jon Schlinkert, contributors.
31953251
COPYRIGHT: { <COPYRIGHT> <CONTRIBUTORS>} #420121
31963252
3197-
3253+
# J. Schoenwaelder, Copyright (c) 1999
3254+
# COPYRIGHT: {<NAME> <COPYRIGHT>} #22793.7
31983255
31993256
#######################################
32003257
# Last resort catch all ending with ALLRIGHTRESERVED
@@ -3246,6 +3303,19 @@ def refine_copyright(c):
32463303
return c.strip()
32473304

32483305

3306+
def remove_dupe_holder(h):
3307+
"""
3308+
Remove duplicated holders
3309+
"""
3310+
dupes_holders = {
3311+
"the Initial Developer the Initial Developer": "the Initial Developer",
3312+
}
3313+
for src, tgt in dupes_holders.items():
3314+
if src in h:
3315+
h = h.replace(src, tgt)
3316+
return h
3317+
3318+
32493319
def refine_holder(h):
32503320
"""
32513321
Refine a detected holder.
@@ -3278,6 +3348,7 @@ def refine_holder(h):
32783348
h = h.strip('+- ')
32793349
h = strip_trailing_period(h)
32803350
h = h.strip('+- ')
3351+
h = remove_dupe_holder(h)
32813352
h = ' '.join(h.split())
32823353
if h and h.lower() not in HOLDERS_JUNK:
32833354
return h
@@ -3694,7 +3765,9 @@ def remove_some_extra_words_and_punct(c):
36943765
c = c.replace(".net'", ".net")
36953766
c = c.replace("mailto:", "")
36963767
c = c.replace("@see", "")
3697-
return c
3768+
if c.endswith('as represented by'):
3769+
c, _, _ = c.partition('as represented by')
3770+
return c.strip()
36983771

36993772

37003773
def strip_prefixes(s, prefixes=()):

src/licensedcode/data/licenses/cups.LICENSE

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ ignorable_holders:
1616
- Jelmer Vernooij
1717
ignorable_authors:
1818
- Apple Inc.
19-
- permission from Apple Inc.
2019
ignorable_urls:
2120
- http://www.cups.org/
2221
---
@@ -74,4 +73,4 @@ Altered source versions must be plainly marked as such, and must not be misrepre
7473
This notice may not be removed or altered from any source distribution.
7574
Trademarks
7675

77-
CUPS and the CUPS logo (the "CUPS Marks") are trademarks of Apple Inc. Apple grants you a non-exclusive and non-transferable right to use the CUPS Marks in any direct port or binary distribution incorporating CUPS software and in any promotional material therefor. You agree that your products will meet the highest levels of quality and integrity for similar goods, not be unlawful, and be developed, manufactured, and distributed in compliance with this license. You will not interfere with Apple's rights in the CUPS Marks, and all use of the CUPS Marks shall inure to the benefit of Apple. This license does not apply to use of the CUPS Marks in a derivative products, which requires prior written permission from Apple Inc.
76+
CUPS and the CUPS logo (the "CUPS Marks") are trademarks of Apple Inc. Apple grants you a non-exclusive and non-transferable right to use the CUPS Marks in any direct port or binary distribution incorporating CUPS software and in any promotional material therefor. You agree that your products will meet the highest levels of quality and integrity for similar goods, not be unlawful, and be developed, manufactured, and distributed in compliance with this license. You will not interfere with Apple's rights in the CUPS Marks, and all use of the CUPS Marks shall inure to the benefit of Apple. This license does not apply to use of the CUPS Marks in a derivative products, which requires prior written permission from Apple Inc.

src/licensedcode/data/licenses/efsl-1.0.LICENSE

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,11 @@ other_urls:
1313
- https://www.eclipse.org/legal/epl-2.0
1414
- https://www.opensource.org/licenses/EPL-2.0
1515
ignorable_copyrights:
16-
- COPYRIGHT HOLDERS AND THE ECLIPSE FOUNDATION
1716
- Copyright (c) Eclipse Foundation
1817
- Copyright (c) Eclipse Foundation, Inc.
1918
ignorable_holders:
2019
- Eclipse Foundation
2120
- Eclipse Foundation, Inc.
22-
- THE ECLIPSE FOUNDATION
2321
---
2422

2523
Eclipse Foundation Specification License - v1.0
@@ -73,4 +71,4 @@ DOCUMENT OR THE PERFORMANCE OR IMPLEMENTATION OF THE CONTENTS THEREOF.
7371
The name and trademarks of the copyright holders or the Eclipse Foundation may
7472
NOT be used in advertising or publicity pertaining to this document or its
7573
contents without specific, written prior permission. Title to copyright in this
76-
document will at all times remain with copyright holders.
74+
document will at all times remain with copyright holders.

0 commit comments

Comments
 (0)