@@ -451,6 +451,9 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
451451 # such as in (1)(ii)(OCT
452452 (r'^.*\(.*\).*\(.*\).*$' , 'JUNK' ),
453453
454+ # parens such as (1) or (a) are a sign of junk, but of course NOT (c)
455+ (r'^\(([abdefghi\d]|ii|iii)\)$' , 'JUNK' ),
456+
454457 # found in crypto certificates and LDAP
455458 (r'^O=$' , 'JUNK' ),
456459 (r'^OU=?$' , 'JUNK' ),
@@ -502,7 +505,8 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
502505 (r'^[Rr]eleased?$' , 'JUNK' ),
503506 (r'^[Cc]opyrighting$' , 'JUNK' ),
504507 (r'^Authori.*$' , 'JUNK' ),
505-
508+ (r'^such$' , 'JUNK' ),
509+ (r'^[Aa]ssignments?[.,]?$' , 'JUNK' ),
506510 (r'^[Bb]uild$' , 'JUNK' ),
507511 (r'^[Ss]tring$' , 'JUNK' ),
508512 (r'^Implementation-Vendor$' , 'JUNK' ),
@@ -618,6 +622,7 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
618622 (r'^Updates$' , 'JUNK' ),
619623 (r'^Record-keeping$' , 'JUNK' ),
620624 (r'^Privacy$' , 'JUNK' ),
625+ (r'^within$' , 'JUNK' ),
621626
622627 # various trailing words that are junk
623628 (r'^Copyleft$' , 'JUNK' ),
@@ -666,6 +671,7 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
666671 (r'^However,?$' , 'JUNK' ),
667672 (r'^[Cc]ollectively$' , 'JUNK' ),
668673 (r'^following$' , 'JUNK' ),
674+ (r'^file\.$' , 'JUNK' ),
669675
670676 # junk when HOLDER(S): typically used in disclaimers instead
671677 (r'^HOLDER\(S\)$' , 'JUNK' ),
@@ -739,6 +745,8 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
739745 (r'^GA$' , 'JUNK' ),
740746 (r'^unzip$' , 'JUNK' ),
741747 (r'^EULA' , 'JUNK' ),
748+ (r'^Terms?[.,]?$' , 'JUNK' ),
749+ (r'^Non-Assertion$' , 'JUNK' ),
742750
743751 # this is not Copr.
744752 (r'^Coproduct,?[,\.]?$$' , 'JUNK' ),
@@ -747,6 +755,7 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
747755 (r'^CONTRIBUTORS?[,\.]?$' , 'JUNK' ),
748756 (r'^OTHERS?[,\.]?$' , 'JUNK' ),
749757 (r'^Contributors?\:[,\.]?$' , 'JUNK' ),
758+ (r'^Version$' , 'JUNK' ),
750759
751760 ############################################################################
752761 # Nouns and proper Nouns
@@ -846,6 +855,7 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
846855 (r'^GPL\'d' , 'NN' ),
847856 (r'^Gnome$' , 'NN' ),
848857 (r'^GnuPG$' , 'NN' ),
858+ (r'^Government.' , 'NNP' ),
849859 (r'^Government' , 'NN' ),
850860 (r'^Grants?\.?,?$' , 'NN' ),
851861 (r'^Header' , 'NN' ),
@@ -946,6 +956,7 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
946956 (r'^Section' , 'NN' ),
947957 (r'^Send$' , 'NN' ),
948958 (r'^Separa' , 'NN' ),
959+ (r'^Service$' , 'NN' ),
949960 (r'^Several$' , 'NN' ),
950961 (r'^SIGN$' , 'NN' ),
951962 (r'^Site\.?$' , 'NN' ),
@@ -983,7 +994,6 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
983994 (r'^VALUE$' , 'NN' ),
984995 (r'^Various' , 'NN' ),
985996 (r'^Vendor' , 'NN' ),
986- (r'^Version' , 'NN' ),
987997 (r'^VIEW$' , 'NN' ),
988998 (r'^Visit' , 'NN' ),
989999 (r'^Website' , 'NN' ),
@@ -993,8 +1003,11 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
9931003 (r'^WPA$' , 'NN' ),
9941004 (r'^Xalan$' , 'NN' ),
9951005 (r'^YOUR' , 'NN' ),
1006+ (r'^Your' , 'NN' ),
9961007 (r'^DateTime' , 'NN' ),
9971008 (r'^Create$' , 'NN' ),
1009+ (r'^Engine\.$' , 'NN' ),
1010+ (r'^While$' , 'NN' ),
9981011
9991012 # Hours/Date/Day/Month text references
10001013 (r'^am$' , 'NN' ),
@@ -1091,10 +1104,15 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
10911104 (r'werken$' , 'NNP' ),
10921105 (r'various\.?$' , 'NNP' ),
10931106
1107+ # treat Attributable as proper noun as it is seen in Author tags such as in:
1108+ # @author not attributable
1109+ (r'^[Aa]ttributable$' , 'NNP' ),
1110+
10941111 # rarer caps
10951112 # EPFL-LRC/ICA
10961113 (r'^[A-Z]{3,6}-[A-Z]{3,6}/[A-Z]{3,6}' , 'NNP' ),
10971114
1115+
10981116 ############################################################################
10991117 # Named entities: companies, groups, universities, etc
11001118 ############################################################################
@@ -1221,6 +1239,9 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
12211239 (r'^[Cc]ontribut(ors|ing)\.?$' , 'CONTRIBUTORS' ),
12221240 (r'^contributors,$' , 'CONTRIBUTORS' ),
12231241
1242+ (r'^Contributor[,.]?$' , 'NN' ),
1243+ (r'^Licensor[,.]?$' , 'NN' ),
1244+
12241245 # same for developed, etc...
12251246 (r'^[Cc]oded$' , 'AUTH2' ),
12261247 (r'^[Rr]ecoded$' , 'AUTH2' ),
@@ -1416,6 +1437,8 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
14161437 (r'^<([a-zA-Z]+[a-zA-Z\.]){3,}$' , 'EMAIL_START' ),
14171438 (r'^[a-zA-Z\.]{2,}>$' , 'EMAIL_END' ),
14181439
1440+ # a .sh shell script is NOT an email.
1441+ (r'^.*\.sh\.?$' , 'JUNK' ),
14191442 # email eventually in parens or brackets with some trailing punct.
14201443 (r'^[\<\(]?[a-zA-Z0-9]+[a-zA-Z0-9\+_\-\.\%]*(@|at)[a-zA-Z0-9][a-zA-Z0-9\+_\-\.\%]+\.[a-zA-Z]{2,5}?[\>\)\.\,]*$' , 'EMAIL' ),
14211444
@@ -1704,7 +1727,7 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
17041727 # and other contributors
17051728 NAME: {<CC> <NN>? <CONTRIBUTORS>} #644
17061729
1707- NAME: {<NNP|CAPS>+ <AUTHS|CONTRIBUTORS>} #660
1730+ NAME: {<NNP|CAPS>+ <AUTHS|AUTHDOT| CONTRIBUTORS>} #660
17081731
17091732 NAME: {<VAN|OF> <NAME>} #680
17101733 NAME: {<NAME-YEAR> <COMP|COMPANY>} #690
@@ -1970,7 +1993,17 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
19701993 COPYRIGHT: { <COPY> <COPY> <YR-RANGE> <CONTRIBUTORS> <OTH> } #2276
19711994
19721995 # copyrighted by Object Computing, Inc., St. Louis Missouri, Copyright (C) 2002, all rights reserved.
1973- COPYRIGHT: {<COPYRIGHT> <COPY>+ <YR-RANGE> <ALLRIGHTRESERVED>} #2290
1996+ COPYRIGHT: {<COPYRIGHT> <COPY>+ <YR-RANGE> <ALLRIGHTRESERVED>} #2278
1997+
1998+ # copyrighted by Object Computing, Inc., St. Louis Missouri, Copyright (C) 2002, all rights reserved.
1999+ COPYRIGHT: {<COPYRIGHT> <COPY>+ <YR-RANGE> <ALLRIGHTRESERVED>} #2279
2000+
2001+ # Copyright (c) 2004, The Codehaus
2002+ COPYRIGHT: {<COPY> <COPY> <YR-RANGE> <NN> <NNP>} #22790
2003+
2004+ # Copyright (c) 2017 odahcam
2005+ COPYRIGHT: {<COPY> <COPY> <YR-RANGE> <NN> <ALLRIGHTRESERVED>} #22791
2006+ COPYRIGHT: {<COPY> <COPY> <YR-RANGE> <NN>} #22792
19742007
19752008 COPYRIGHT2: {<COPY>+ <NN|CAPS>? <YR-RANGE>+ <PN>*} #2280
19762009
@@ -2036,6 +2069,7 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
20362069 # Copyright 2008 TJ <[email protected] > 20372070 COPYRIGHT: {<COPYRIGHT2> <EMAIL>} #2636
20382071
2072+ # Copyright RUSS DILL Russ <[email protected] > 20392073 COPYRIGHT: {<COPYRIGHT> <CAPS> <NAME-EMAIL>} #2637
20402074
20412075 # maintainer Norbert Tretkowski <[email protected] > 2005-04-16 @@ -2137,7 +2171,7 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
21372171 COPYRIGHT: {<COMPANY> <ALLRIGHTRESERVED> <COPYRIGHT2>} #3030
21382172
21392173 # Copyright (c) 2000 United States Government as represented by the Secretary of the Navy. All rights reserved.
2140- COPYRIGHT: {<COPYRIGHT> <NN> <NN> <NN> <BY> <NN> <NAME> <ALLRIGHTRESERVED>} #3035
2174+ COPYRIGHT: {<COPYRIGHT> <NN> <NN> <NN|NNP > <BY> <NN> <NAME> <ALLRIGHTRESERVED>} #3035
21412175
21422176 # Copyright (c) 2007-2008, Y Giridhar Appaji Nag <[email protected] > 21432177 COPYRIGHT: {<COPYRIGHT> <COMPANY|NAME|NAME-EMAIL|NAME-YEAR>+} #3040
@@ -2209,6 +2243,9 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
22092243 # Copyright (C) 2005-2006 dann frazier <[email protected] > 22102244 COPYRIGHT: {<COPYRIGHT2> <NN> <NN> <EMAIL>} #999991
22112245
2246+ # Copyright (c) 2008 Intel Corporation / Qualcomm Inc.
2247+ COPYRIGHT: {<COPYRIGHT> <DASH> <COMPANY>} #copydash-co
2248+
22122249#######################################
22132250# Authors
22142251#######################################
@@ -2252,13 +2289,22 @@ def as_str(cls, node, ignores=frozenset(), include_allrights=False):
22522289 # developed by the XML DB Initiative http//www.xmldb.org
22532290 AUTHOR: {<AUTH2> <COMPANY>} #2645-7
22542291
2292+ # Author not attributable
2293+ AUTHOR: {<AUTH> <NN> <NNP>} #not attributable
2294+
2295+ # author (Panagiotis Tsirigotis)
2296+ AUTHOR: {<AUTH> <NNP><NNP>+} #author Foo Bar
2297+
2298+
22552299#######################################
2256- # Mixed AUTHORS and COPYRIGHTS
2300+ # Mixed AUTHOR and COPYRIGHT
22572301#######################################
22582302
22592303 # Compounded statements using authors
2260- # found in some rare cases with a long list of authors.
2261- COPYRIGHT: {<COPY> <BY> <AUTHOR>+ <YR-RANGE>*} #2800
2304+
2305+ # Copyright by Daniel K. Gebhart
2306+ # Also found in some rare cases with a long list of authors.
2307+ COPYRIGHT: {<COPY> <BY>? <AUTHOR>+ <YR-RANGE>*} #2800-1
22622308
22632309 COPYRIGHT: {<AUTHOR> <COPYRIGHT2>} #2820
22642310 COPYRIGHT: {<AUTHOR> <YR-RANGE>} #2830
@@ -2312,6 +2358,7 @@ def refine_copyright(c):
23122358 c = strip_balanced_edge_parens (c )
23132359 c = strip_suffixes (c , suffixes = COPYRIGHTS_SUFFIXES )
23142360 c = strip_trailing_period (c )
2361+ c = c .strip ("'" )
23152362 return c .strip ()
23162363
23172364
@@ -2334,8 +2381,6 @@ def refine_holder(h):
23342381 h = h .strip ()
23352382 h = strip_trailing_period (h )
23362383 h = h .strip ()
2337- h = strip_balanced_edge_parens (h )
2338- h = h .strip ()
23392384 if h and h .lower () not in HOLDERS_JUNK :
23402385 return h
23412386
@@ -2351,6 +2396,10 @@ def refine_author(a):
23512396 a = a .strip ()
23522397 a = strip_trailing_period (a )
23532398 a = a .strip ()
2399+ a = strip_balanced_edge_parens (a )
2400+ a = a .strip ()
2401+ a = refine_names (a , prefixes = AUTHORS_PREFIXES )
2402+ a = a .strip ()
23542403 if a and a .lower () not in AUTHORS_JUNK :
23552404 return a
23562405
@@ -2365,6 +2414,8 @@ def refine_names(s, prefixes):
23652414 s = strip_all_unbalanced_parens (s )
23662415 s = strip_some_punct (s )
23672416 s = s .strip ()
2417+ s = strip_balanced_edge_parens (s )
2418+ s = s .strip ()
23682419 s = strip_prefixes (s , prefixes )
23692420 s = s .strip ()
23702421 return s
@@ -2543,6 +2594,7 @@ def refine_names(s, prefixes):
25432594 'author\' ' ,
25442595 'authors,' ,
25452596 'authorship' ,
2597+ 'or' ,
25462598 ])
25472599))
25482600
@@ -2561,6 +2613,11 @@ def refine_names(s, prefixes):
25612613 'company' ,
25622614 'contributing project' ,
25632615 'its author' ,
2616+ 'gnomovision' ,
2617+ 'would' ,
2618+ 'may' ,
2619+ 'attributions' ,
2620+ 'the' ,
25642621])
25652622
25662623################################################################################
0 commit comments