@@ -108,7 +108,7 @@ def detect(location):
108108')' )
109109
110110_YEAR_YEAR = (r'('
111- # fixme v ....the underscore below is suspicious
111+ # fixme v ....the underscore below is suspicious
112112 '19[6-9][0-9][\.,\-]_[6-9][0-9]' # 1960-99
113113 '|'
114114 '19[6-9][0-9][\.,\-]+[0-9]' # 1998-9
@@ -227,7 +227,7 @@ def detect(location):
227227 # note the leading @ .... this may be a source of problems
228228 (r'.?(@?([Cc]opyright)s?:?|[Cc]opr\.?|[(][Cc][)]|(COPYRIGHT)S?:?)' , 'COPY' ),
229229
230- # copyright in markup, until we strip markup: apache'>Copyright
230+ # copyright in markup, until we strip markup: apache'>Copyright or left'>Copyright
231231 (r'[A-Za-z0-9]+[\'">]+[Cc]opyright' , 'COPY' ),
232232
233233 # AT&T (the company), needs special handling
@@ -249,6 +249,8 @@ def detect(location):
249249 (r'^([Ll][Ll][CcPp]|[Ll][Tt][Dd])\.?,?$' , 'COMP' ),
250250 (r'^([Ll][Ll][CcPp]|[Ll][Tt][Dd])\.$' , 'COMP' ),
251251 (r'^L\.P\.$' , 'COMP' ),
252+ (r'^[Ss]ubsidiar(y|ies)$' , 'COMP' ),
253+ (r'^[Ss]ubsidiary\(\-ies\)$' , 'COMP' ),
252254 # company suffix : SA, SAS, AG, AB, AS, CO, labs followed by a dot
253255 (r'^(S\.?A\.?S?|Sas|sas|AG|AB|Labs?|[Cc][Oo]\.|Research|INRIA).?$' , 'COMP' ),
254256 # (german) company suffix
@@ -440,6 +442,9 @@ def detect(location):
440442 # the Regents of the University of California
441443 COMPANY: {<BY>? <NN> <NNP> <OF> <NN> <UNI> <OF> <COMPANY|NAME|NAME2|NAME3><COMP>?} #130
442444
445+ # Free Software Foundation, Inc.
446+ COMPANY: {<NNP> <NNP> <COMP> <COMP>} #135
447+
443448 # Corporation/COMP for/NN National/NNP Research/COMP Initiatives/NNP
444449 COMPANY: {<COMP> <NN> <NNP> <COMP> <NNP>} #140
445450
@@ -471,7 +476,7 @@ def detect(location):
471476 NAME: {<NNP> <PN>? <NNP>+} #360
472477 NAME: {<NNP> <NNP>} #370
473478
474- NAME: {<NNP> <NN> <EMAIL>} #390
479+ NAME: {<NNP> <NN|NNP > <EMAIL>} #390
475480 NAME: {<NNP> <PN|VAN>? <PN|VAN>? <NNP>} #400
476481 NAME: {<NNP> <NN> <NNP>} #410
477482 NAME: {<NNP> <COMMIT>} #420
@@ -486,7 +491,8 @@ def detect(location):
486491 COMPANY: {<NNP> <IN> <NN>? <COMPANY>} #510
487492
488493 NAME2: {<NAME> <EMAIL>} #530
489- NAME3: {<YR-RANGE> <NAME2|COMPANY>+} #540
494+ NAME3: {<YR-RANGE> <NAME2|COMPANY>+} #535
495+ NAME3: {<YR-RANGE> <NAME2|COMPANY>+ <CC> <YR-RANGE>} #540
490496 NAME: {<NAME|NAME2>+ <OF> <NNP> <OF> <NN>? <COMPANY>} #550
491497 NAME: {<NAME|NAME2>+ <CC|OF>? <NAME|NAME2|COMPANY>} #560
492498 NAME3: {<YR-RANGE> <NAME>+} #570
@@ -577,6 +583,9 @@ def detect(location):
577583# by the a href http wtforms.simplecodes.com WTForms Team
578584 COMPANY: {<BY> <NN>+ <COMP|COMPANY>} #1420
579585
586+ # the Regents of the University of California, Sun Microsystems, Inc., Scriptics Corporation
587+ COMPANY: {<NN> <NNP> <OF> <NN> <UNI> <OF> <COMPANY>+}
588+
580589
581590# "And" some name
582591 ANDCO: {<CC>+ <NN> <NNP>+<UNI|COMP>?} #1430
@@ -987,7 +996,7 @@ def detect(self, numbered_lines):
987996 Return a sequence of tuples (copyrights, authors, years, holders)
988997 detected in a sequence of numbered line tuples.
989998 """
990- from nltk .tree import Tree
999+ from nltk .tree import Tree
9911000 numbered_lines = list (numbered_lines )
9921001 numbers = [n for n , _l in numbered_lines ]
9931002 start_line = min (numbers )
@@ -1104,7 +1113,7 @@ def is_candidate(line):
11041113 if marker in line :
11051114 logger .debug ('is_candidate: %(marker)r in line:\n %(line)r' % locals ())
11061115 return True
1107-
1116+
11081117
11091118def has_content (line ):
11101119 """
@@ -1306,7 +1315,7 @@ def prepare_text_line(line):
13061315 # uncommon pipe chars in some ascii art
13071316 line = line .replace ('|' , ' ' )
13081317
1309- # normalize copyright signs and spacing aournd them
1318+ # normalize copyright signs and spacing around them
13101319 line = line .replace ('(C)' , ' (c) ' )
13111320 line = line .replace ('(c)' , ' (c) ' )
13121321 # the case of \251 is tested by 'weirdencoding.h'
@@ -1336,7 +1345,6 @@ def prepare_text_line(line):
13361345 # some trailing garbage ')
13371346 line = line .replace ("')" , ' ' )
13381347
1339-
13401348 # note that we do not replace the debian tag by a space: we remove it
13411349 # TODO: use POS tag: (r'^(?:\<s\>).*(?:\<s\\/>)$', 'NAME'),
13421350 line = re_sub (DEBIAN_COPYRIGHT_TAGS_RE (), '' , line )
@@ -1361,6 +1369,12 @@ def prepare_text_line(line):
13611369 line = line .replace ('\\ t' , ' ' )
13621370 line = line .replace ('\\ 0' , ' ' )
13631371
1372+ # in apache'>Copyright replace ">" by "> "
1373+ line = line .replace ('>' , '> ' )
1374+ line = line .replace ('> ' , '> ' )
1375+ line = line .replace ('<' , ' <' )
1376+ line = line .replace (' <' , ' <' )
1377+
13641378 # TODO: Why?
13651379 # replace contiguous spaces with only one occurrence
13661380 # line = re.sub(WHITESPACE_RE(), ' ', text)
0 commit comments