@@ -215,8 +215,16 @@ class CopyrightDetector(object):
215
215
"""
216
216
217
217
def __init__(self):
    """
    Set up this detector: attach a lexer built from PATTERNS and a parser
    built from GRAMMAR.
    """
    self.lexer = lex.Lexer(matchers=PATTERNS)
    # Collect the parser settings in one place before building the parser.
    parser_options = dict(
        grammar=GRAMMAR,
        loop=1,
        trace=TRACE_DEEP,
        validate=VALIDATE,
    )
    self.parser = parse.Parser(**parser_options)
220
228
221
229
def detect (self ,
222
230
numbered_lines ,
@@ -591,7 +599,7 @@ def build_detection_from_node(
591
599
_YEAR_THEN_YEAR_SHORT = fr'({ _YEAR_OR_YEAR_YEAR_WITH_PUNCT } ({ _YEAR_SHORT_PUNCT } )*)'
592
600
_YEAR_DASH_PRESENT = _YEAR + r'[\-~]? ?[Pp]resent\.?,?'
593
601
594
- patterns = [
602
+ PATTERNS = [
595
603
############################################################################
596
604
# COPYRIGHT
597
605
############################################################################
@@ -877,6 +885,7 @@ def build_detection_from_node(
877
885
(r'^DISCLAIMS?$' , 'JUNK' ),
878
886
(r'^SPECIFICALLY$' , 'JUNK' ),
879
887
888
+ (r'^identifying' , 'JUNK' ),
880
889
(r'^IDENTIFICATION$' , 'JUNK' ),
881
890
(r'^WARRANTIE?S?$' , 'JUNK' ),
882
891
(r'^WARRANTS?$' , 'JUNK' ),
@@ -1204,6 +1213,7 @@ def build_detection_from_node(
1204
1213
# NN often used in conjunction with copyright
1205
1214
(r'^[Ss]tatements?.?$' , 'JUNK' ),
1206
1215
(r'^issues?.?$' , 'JUNK' ),
1216
+ (r'^retain?.?$' , 'JUNK' ),
1207
1217
1208
1218
############################################################################
1209
1219
# Nouns and proper Nouns
@@ -1267,6 +1277,7 @@ def build_detection_from_node(
1267
1277
(r'^But$' , 'NN' ),
1268
1278
(r'^Builders?\.?$' , 'NN' ),
1269
1279
(r'^Cacute$' , 'NN' ),
1280
+ (r'^CD$' , 'JUNK' ),
1270
1281
(r'^Cell.$' , 'NN' ),
1271
1282
(r'^Change\.?[lL]og$' , 'NN' ),
1272
1283
(r'^CHANGElogger$' , 'NN' ),
@@ -1304,17 +1315,18 @@ def build_detection_from_node(
1304
1315
(r'^Education$' , 'NN' ),
1305
1316
(r'^Extended' , 'NN' ),
1306
1317
(r'^Every$' , 'NN' ),
1307
- (r'^Exhibit$' , 'NN' ),
1318
+ (r'^EXHIBIT$' , 'JUNK' ),
1319
+ (r'^Exhibit$' , 'JUNK' ),
1308
1320
(r'^Digitized' , 'NN' ),
1309
1321
(r'^[Ds]istributed?.?$' , 'NN' ),
1322
+ (r'^Distributions?' , 'NN' ),
1310
1323
(r'^Multiply$' , 'NN' ),
1311
1324
(r'^Convert$' , 'NN' ),
1312
1325
(r'^Compute$' , 'NN' ),
1313
1326
(r'^Case$' , 'NN' ),
1314
1327
(r'^Hessian$' , 'NN' ),
1315
1328
(r'^Include' , 'NN' ),
1316
1329
(r'^Downstream' , 'NN' ),
1317
- (r'^Distributions?' , 'NN' ),
1318
1330
(r'^Volumes?' , 'NN' ),
1319
1331
(r'^Manuals?.?' , 'NN' ),
1320
1332
(r'^Update.?' , 'NN' ),
@@ -1435,8 +1447,8 @@ def build_detection_from_node(
1435
1447
(r'^Packaging$' , 'NN' ),
1436
1448
(r'^Patent' , 'NN' ),
1437
1449
(r'^Pentium$' , 'NN' ),
1438
- (r'^[Pp]ermission' , 'NN ' ),
1439
- (r'^PERMISSIONS?' , 'NN ' ),
1450
+ (r'^[Pp]ermission' , 'JUNK ' ),
1451
+ (r'^PERMISSIONS?' , 'JUNK ' ),
1440
1452
(r'^PGP$' , 'NN' ),
1441
1453
(r'^Phrase' , 'NN' ),
1442
1454
(r'^Plugin' , 'NN' ),
@@ -1871,15 +1883,16 @@ def build_detection_from_node(
1871
1883
1872
1884
# same for developed, etc...
1873
1885
(r'^[Cc]oded$' , 'AUTH2' ),
1874
- (r'^[Rr]ecoded$' , 'AUTH2' ),
1875
- (r'^[Mm]odified$' , 'AUTH2' ),
1876
- (r'^[Cc]reated$' , 'AUTH2' ),
1886
+ (r'^\(? [Rr]ecoded$' , 'AUTH2' ),
1887
+ (r'^\(? [Mm]odified$' , 'AUTH2' ),
1888
+ (r'^\(? [Cc]reated$' , 'AUTH2' ),
1877
1889
# written is often misspelled
1878
- (r'^[Ww]ritt?e[dn]$' , 'AUTH2' ),
1890
+ (r'^\(? [Ww]ritt?e[dn]$' , 'AUTH2' ),
1879
1891
# rewritten is often misspelled
1880
- (r'^[Rr]ewritt?e[dn]$' , 'AUTH2' ),
1881
- (r'^[Mm]aintained$' , 'AUTH2' ),
1882
- (r'^[Dd]eveloped$' , 'AUTH2' ),
1892
+ (r'^\(?[Rr]ewritt?e[dn]$' , 'AUTH2' ),
1893
+ (r'^\(?[Mm]aintained$' , 'AUTH2' ),
1894
+ (r'^\(?[Dd]eveloped$' , 'AUTH2' ),
1895
+ (r'^\(?[Au]thored$' , 'AUTH2' ),
1883
1896
1884
1897
# committers is interesting, and so gets a tag of its own
1885
1898
(r'[Cc]ommitters\.?,?' , 'COMMIT' ),
@@ -2162,7 +2175,13 @@ def build_detection_from_node(
2162
2175
# Comments in the Grammar are lines that start with #
2163
2176
# End-of-line comments are rule descriptions.
2164
2177
# One rule per line.
2165
- grammar = """
2178
+
2179
+ USE_MAIN_BRANCH = False or os .environ .get ('SCANCODE_COPYRIGHT_USE_MAIN_BRANCH' , False )
2180
+
2181
+ if USE_MAIN_BRANCH :
2182
+ from cluecode .copyrightorig import grammar as GRAMMAR
2183
+ else :
2184
+ GRAMMAR = """
2166
2185
2167
2186
#######################################
2168
2187
# YEARS
@@ -2173,7 +2192,7 @@ def build_detection_from_node(
2173
2192
YR-RANGE: {<CD|BARE-YR>? <YR> <BARE-YR>?} #40
2174
2193
YR-RANGE: {<YR>+ <BARE-YR>? } #50
2175
2194
YR-AND: {<CC>? <YR>+ <CC>+ <YR>} #60
2176
- YR-RANGE: {<YR-AND>+} #70|
2195
+ YR-RANGE: {<YR-AND>+} #70
2177
2196
YR-RANGE: {<YR-RANGE>+ <DASH|TO> <YR-RANGE>+} #71
2178
2197
YR-RANGE: {<YR-RANGE>+ <DASH>?} #72
2179
2198
@@ -2336,7 +2355,7 @@ def build_detection_from_node(
2336
2355
NAME: {<NNP|PN>+ <NNP>+} #351
2337
2356
2338
2357
# Distributed Management Task Force
2339
- # NAME: {<NN> <NNP>{3}} #881111
2358
+ NAME: {<NN> <NNP>{3}} #881111
2340
2359
2341
2360
# @author <a href="mailto:[email protected] ">Stephane Hillion</a>
2342
2361
NAME: { <NN>? <NN>? <EMAIL> <NAME> } #351.1
@@ -2407,10 +2426,10 @@ def build_detection_from_node(
2407
2426
NAME-YEAR: {<YR-RANGE> <NNP>+ <CAPS>?} #5612
2408
2427
2409
2428
#Academy of Motion Picture Arts and Sciences
2410
- NAME: {<NAME> <CC> <NNP>} # 561
2429
+ NAME: {<NAME> <CC> <NNP>} #561
2411
2430
2412
2431
# Adam Weinberger and the GNOME Foundation
2413
- NAME : {<CC> <NN> <COMPANY>} # 565
2432
+ ANDCO : {<CC> <NN> <COMPANY>} #565
2414
2433
2415
2434
# (c) 1991-1992, Thomas G. Lane , Part of the Independent JPEG Group's
2416
2435
NAME: {<PORTIONS> <OF> <NN> <NAME>+} #566
@@ -2497,6 +2516,8 @@ def build_detection_from_node(
2497
2516
# <s>Timothy Terriberry</s>, <s>CSIRO</s>, and other contributors
2498
2517
ANDCO: {<CC> <CAPS|COMPANY|NAME|NAME-EMAIL|NAME-YEAR>+} #960
2499
2518
2519
+ COMPANY: {<COMPANY|NAME|NAME-EMAIL|NAME-YEAR> <ANDCO>+} #970
2520
+
2500
2521
# Copyright © 1998-2009 Bill Spitzak ([email protected] ) and others,
2501
2522
COMPANY: {<COMPANY|NAME|NAME-EMAIL|NAME-YEAR> <PARENS>? <ANDCO>+} #970
2502
2523
@@ -2596,7 +2617,7 @@ def build_detection_from_node(
2596
2617
# Copyright 2015 The Error Prone Authors.
2597
2618
NAME: {<NN> <NAME> <CONTRIBUTORS|AUTHS>} #196023
2598
2619
2599
- # Copyright (C) <s>Suresh P <[email protected] ></s>
2620
+ # Copyright (C) <s>Suresh P <[email protected] ></s>
2600
2621
NAME: {<NNP> <PN> <EMAIL>} #19601.1
2601
2622
2602
2623
# Copyright or Copr. Mines Paristech, France - Mark NOBLE, Alexandrine GESRET
@@ -2611,9 +2632,14 @@ def build_detection_from_node(
2611
2632
# Copyright (C) 1998-2001 VideoLAN ( Johan Bilien <[email protected] > and Gildas Bazin <[email protected] > )
2612
2633
NAME: {<PARENS> <NAME> <PARENS>} #19653
2613
2634
2635
+ # by the Initial Developer
2636
+ INITIALDEV: {<BY>? <NN> <NN> <MAINT>} #19663
2637
+
2638
+ # UNIVERTSITY OF CHICAGO
2639
+ NAME: {<UNI> <OF> <CAPS>} #19673
2614
2640
2615
- ################################# #COPYRIGHT: {<COPY> <COPY> <MIT>} #1802
2616
- ######
2641
+
2642
+ #######################################
2617
2643
# VARIOUS FORMS OF COPYRIGHT
2618
2644
#######################################
2619
2645
@@ -2646,12 +2672,13 @@ def build_detection_from_node(
2646
2672
# Bart Hanssens from FedICT
2647
2673
COPYRIGHT: {<COPY>+ <NAME-YEAR> <NN> <CAPS> <NN> <OF> <COMPANY> <NAME>} #83005
2648
2674
2675
+ # Gracenote, Inc., copyright © 2000-2008 Gracenote.
2676
+ # Gracenote Software, copyright © 2000-2008 Gracenote.
2677
+ # COPYRIGHT: {<COMPANY> <COPY>{1,2} <NAME-YEAR>} #157999.12
2678
+
2649
2679
# Copyright (c) Ian F. Darwin 1986, 1987, 1989, 1990, 1991, 1992, 1994, 1995.
2650
2680
COPYRIGHT: {<COPY>+ <NAME|NAME-EMAIL|NAME-YEAR>+ <YR-RANGE>*} #157999
2651
2681
2652
- # portions copyright The Internet Society, Tom Tromey and Red Hat, Inc.
2653
- COPYRIGHT: {<PORTIONS> <COPY> <NN> <NAME>} #157998
2654
-
2655
2682
COPYRIGHT: {<COPY>+ <CAPS|NNP>+ <CC> <NN> <COPY> <YR-RANGE>?} #1590
2656
2683
2657
2684
# // (c) (C) → ©
@@ -2740,8 +2767,8 @@ def build_detection_from_node(
2740
2767
# (c) Copyright 1985-1999 SOME TECHNOLOGY SYSTEMS
2741
2768
COPYRIGHT2: {<COPY> <COPY> <YR-RANGE> <CAPS> <CAPS> <CAPS>? <CAPS>?} #2271
2742
2769
2743
- # Minpack Copyright Notice (1999) University of Chicago
2744
- COPYRIGHT: {<COPY> <NOTICE> <NAME-YEAR>} #2273 .1
2770
+ # Copyright Notice (1999) University of Chicago. All rights reserved
2771
+ COPYRIGHT: {<COPY> <NOTICE> <NAME-YEAR> <ALLRIGHTRESERVED>? } #2271 .1
2745
2772
2746
2773
# NAME-COPY is a name with a trailing copyright
2747
2774
# Daisy (c) 1998
@@ -2789,7 +2816,6 @@ def build_detection_from_node(
2789
2816
# Copyright (C) 2006 XStream committers.
2790
2817
# Copyright (c) 2019-2021, Open source contributors.
2791
2818
# Copyright 2007 ZXing authors
2792
- # Copyright (c) 2002 the Initial Developer
2793
2819
# Copyright (c) 2024 bgme <[email protected] >.
2794
2820
COPYRIGHT: {<COPY>+ <YR-RANGE> <NN>+ <CONTRIBUTORS|COMMIT|AUTHS|MAINT>? <EMAIL>? <ALLRIGHTRESERVED>?} #22793.3
2795
2821
@@ -3081,9 +3107,6 @@ def build_detection_from_node(
3081
3107
# Copyright OProfile authors
3082
3108
COPYRIGHT: {<COPY> <NN>?<NNP>+ <AUTHS>} #83004
3083
3109
3084
- # (C) Distributed Management Task Force (Distributed is an NN)
3085
- # COPYRIGHT: {<COPY> <NN> <NAME>} #83010
3086
-
3087
3110
# Copyright (c) 2014 The Rust Project Developers
3088
3111
COPYRIGHT: {<COPYRIGHT> <MAINT> } #83020
3089
3112
@@ -3092,15 +3115,45 @@ def build_detection_from_node(
3092
3115
3093
3116
# Copyright: 2004-2007 by Internet Systems Consortium, Inc. ("ISC")
3094
3117
# 1995-2003 by Internet Software Consortium
3095
- COPYRIGHT: {<YR-RANGE> <BY> <COMPANY> } #1615
3118
+ COPYRIGHT: {<COPYRIGHT> <NN> <YR-RANGE> <BY> <COMPANY> } #1615
3119
+
3120
+ # Russ Dill <[email protected] > 2001-2003
3121
+ # Rewrited by Vladimir Oleynik <[email protected] > (C) 2003
3122
+ COPYRIGHT: {<NAME-EMAIL> <YR-RANGE> <AUTH2> <BY> <NAME-EMAIL> <COPY> <YR-RANGE>} #22793.5
3123
+
3124
+ # portions copyright The Internet Society, Tom Tromey and Red Hat, Inc.
3125
+ COPYRIGHT: {<PORTIONS> <COPY> <NN> <NAME>} #157998
3126
+
3127
+ # Minpack Copyright Notice (1999) University of Chicago
3128
+ COPYRIGHT: {<COPY> <NOTICE> <NAME-YEAR>} #2273.1
3129
+
3130
+ # Portions created by the Initial Developer are Copyright (C)
3131
+ # the Initial Developer. All Rights Reserved.
3132
+ COPYRIGHT: {<PORTIONS> <AUTH2> <INITIALDEV> <IS> <COPY|COPYRIGHT2>+ <YR-RANGE>? <INITIALDEV>} #2609.1
3133
+
3134
+ # Portions created by the Initial Developer are Copyright (C)
3135
+ # the Initial Developer. All Rights Reserved.
3136
+ # and
3137
+ # Portions created by the Initial Developer are Copyright (C) 2002
3138
+ # the Initial Developer. All Rights Reserved.
3139
+ COPYRIGHT: {<COPYRIGHT|COPYRIGHT2> <INITIALDEV> <ALLRIGHTRESERVED>?} #2609.2
3140
+
3141
+ # Copyright (C) the Initial Developer.
3142
+ COPYRIGHT: {<COPY>+ <INITIALDEV>} #35012
3143
+
3144
+ # (C) Distributed Management Task Force (Distributed is an NN)
3145
+ # COPYRIGHT: {<COPY> <NN>? <NAME>} #83010
3146
+
3147
+ # Gracenote, Inc., copyright © 2000-2008 Gracenote.
3148
+ # Gracenote Software, copyright © 2000-2008 Gracenote.
3149
+ COPYRIGHT: {<COMPANY> <COPY>{1,2} <NAME-YEAR>} #157999.12
3096
3150
3097
3151
#######################################
3098
3152
# Copyright is held by ....
3099
3153
#######################################
3100
3154
# Copyright is held by ....
3101
3155
COPYRIGHT: {<COPY> <IS> <HELD> <BY> <NNP|NAME|COMPANYNAME-EMAIL>+ } #10989898
3102
3156
3103
-
3104
3157
#######################################
3105
3158
# Authors
3106
3159
#######################################
@@ -3120,6 +3173,9 @@ def build_detection_from_node(
3120
3173
# @author [email protected] (Anatol Pomazau)
3121
3174
AUTHOR: {<AUTH|CONTRIBUTORS|AUTHS>+ <NN>? <COMPANY|NAME|YR-RANGE>* <BY>? <EMAIL>+ <NAME>?} #2650
3122
3175
3176
+ # developed by the National Center for Supercomputing Applications at the University of Illinois at Urbana-Champaign
3177
+ AUTHOR: {<AUTH|CONTRIBUTORS|AUTHS>+ <NN>? <COMPANY|NAME|NAME-EMAIL|NAME-YEAR>+ <NN>? <COMPANY|NAME|NAME-EMAIL|NAME-YEAR>+ <YR-RANGE>*} #2660
3178
+
3123
3179
AUTHOR: {<AUTH|CONTRIBUTORS|AUTHS>+ <NN>? <COMPANY|NAME|NAME-EMAIL|NAME-YEAR>+ <YR-RANGE>*} #2660
3124
3180
3125
3181
# developed by the Center for Information
@@ -3145,7 +3201,7 @@ def build_detection_from_node(
3145
3201
AUTHOR: {<AUTH|AUTHS|AUTH2> <BY>? <NNP> <CC> <PN>} #2761
3146
3202
3147
3203
# developed by the National Center for Supercomputing Applications at the University of Illinois at Urbana-Champaign
3148
- AUTHOR: {<AUTHOR> <NN> <NAME> <NAME> } #2762
3204
+ AUTHOR: {<AUTHOR> <NN> <NAME|COMPANY>+ } #2762
3149
3205
3150
3206
# created by Axel Metzger and Till Jaeger, Institut fur Rechtsfragen der Freien und Open Source Software
3151
3207
AUTHOR: {<AUTH2> <CC> <AUTHOR> <NN> <NAME> <NN> <NN> <NNP>} #2645-4
@@ -3194,7 +3250,8 @@ def build_detection_from_node(
3194
3250
# Copyright (c) 2015 Jon Schlinkert, contributors.
3195
3251
COPYRIGHT: { <COPYRIGHT> <CONTRIBUTORS>} #420121
3196
3252
3197
-
3253
+ # J. Schoenwaelder, Copyright (c) 1999
3254
+ # COPYRIGHT: {<NAME> <COPYRIGHT>} #22793.7
3198
3255
3199
3256
#######################################
3200
3257
# Last resort catch all ending with ALLRIGHTRESERVED
@@ -3246,6 +3303,19 @@ def refine_copyright(c):
3246
3303
return c .strip ()
3247
3304
3248
3305
3306
# Holder strings in which the same holder phrase was captured twice in a row,
# mapped to the single-occurrence form that replaces them.
_DUPLICATED_HOLDERS = {
    "the Initial Developer the Initial Developer": "the Initial Developer",
}


def remove_dupe_holder(h):
    """
    Return the holder string ``h`` with known duplicated holder phrases
    collapsed to a single occurrence.

    A falsy ``h`` (``None`` or an empty string) is returned unchanged.
    Strings that contain none of the known duplicated phrases are returned
    as-is.
    """
    if not h:
        return h

    for src, tgt in _DUPLICATED_HOLDERS.items():
        # One str.replace() pass substitutes non-overlapping matches only, so
        # a phrase repeated three or more times would still leave a duplicate
        # behind. Repeat until none remains. This terminates because each
        # target is shorter than its source and does not contain it.
        while src in h:
            h = h.replace(src, tgt)
    return h
3317
+
3318
+
3249
3319
def refine_holder (h ):
3250
3320
"""
3251
3321
Refine a detected holder.
@@ -3278,6 +3348,7 @@ def refine_holder(h):
3278
3348
h = h .strip ('+- ' )
3279
3349
h = strip_trailing_period (h )
3280
3350
h = h .strip ('+- ' )
3351
+ h = remove_dupe_holder (h )
3281
3352
h = ' ' .join (h .split ())
3282
3353
if h and h .lower () not in HOLDERS_JUNK :
3283
3354
return h
@@ -3694,7 +3765,9 @@ def remove_some_extra_words_and_punct(c):
3694
3765
c = c .replace (".net'" , ".net" )
3695
3766
c = c .replace ("mailto:" , "" )
3696
3767
c = c .replace ("@see" , "" )
3697
- return c
3768
+ if c .endswith ('as represented by' ):
3769
+ c , _ , _ = c .partition ('as represented by' )
3770
+ return c .strip ()
3698
3771
3699
3772
3700
3773
def strip_prefixes (s , prefixes = ()):
0 commit comments