Improve copyright detection

pombredanne · pombredanne · commit 6ae938a7cb22 · 2017-10-04T23:27:24.000-07:00
* handle edge cases such as some words with HTML < or > #110 * improve some university and company name detection such as subsidiaries #110 * do not detect certain authors as copyrights Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
diff --git a/src/cluecode/copyrights.py b/src/cluecode/copyrights.py
@@ -108,7 +108,7 @@ def detect(location):
 ')')
 
 _YEAR_YEAR = (r'('
-              # fixme   v ....the underscore below is suspicious 
+              # fixme   v ....the underscore below is suspicious
     '19[6-9][0-9][\.,\-]_[6-9][0-9]'  # 1960-99
     '|'
     '19[6-9][0-9][\.,\-]+[0-9]'  # 1998-9
@@ -227,7 +227,7 @@ def detect(location):
     # note the leading @ .... this may be a source of problems
     (r'.?(@?([Cc]opyright)s?:?|[Cc]opr\.?|[(][Cc][)]|(COPYRIGHT)S?:?)', 'COPY'),
 
-    # copyright in markup, until we strip markup: apache'>Copyright
+    # copyright in markup, until we strip markup: apache'>Copyright or left'>Copyright
     (r'[A-Za-z0-9]+[\'">]+[Cc]opyright', 'COPY'),
 
     # AT&T (the company), needs special handling
@@ -249,6 +249,8 @@ def detect(location):
     (r'^([Ll][Ll][CcPp]|[Ll][Tt][Dd])\.?,?$', 'COMP'),
     (r'^([Ll][Ll][CcPp]|[Ll][Tt][Dd])\.$', 'COMP'),
     (r'^L\.P\.$', 'COMP'),
+    (r'^[Ss]ubsidiar(y|ies)$', 'COMP'),
+    (r'^[Ss]ubsidiary\(\-ies\)$', 'COMP'),
     # company suffix : SA, SAS, AG, AB, AS, CO, labs followed by a dot
     (r'^(S\.?A\.?S?|Sas|sas|AG|AB|Labs?|[Cc][Oo]\.|Research|INRIA).?$', 'COMP'),
     # (german) company suffix
@@ -440,6 +442,9 @@ def detect(location):
     # the Regents of the University of California
     COMPANY: {<BY>? <NN> <NNP> <OF> <NN> <UNI> <OF> <COMPANY|NAME|NAME2|NAME3><COMP>?}        #130
 
+   # Free Software Foundation, Inc.
+    COMPANY: {<NNP> <NNP> <COMP> <COMP>}       #135
+
    # Corporation/COMP for/NN  National/NNP Research/COMP Initiatives/NNP
     COMPANY: {<COMP> <NN> <NNP> <COMP> <NNP>}       #140
 
@@ -471,7 +476,7 @@ def detect(location):
     NAME: {<NNP> <PN>? <NNP>+}        #360
     NAME: {<NNP> <NNP>}        #370
 
-    NAME: {<NNP> <NN> <EMAIL>}        #390
+    NAME: {<NNP> <NN|NNP> <EMAIL>}        #390
     NAME: {<NNP> <PN|VAN>? <PN|VAN>? <NNP>}        #400
     NAME: {<NNP> <NN> <NNP>}        #410
     NAME: {<NNP> <COMMIT>}        #420
@@ -486,7 +491,8 @@ def detect(location):
     COMPANY: {<NNP> <IN> <NN>? <COMPANY>}        #510
 
     NAME2: {<NAME> <EMAIL>}        #530
-    NAME3: {<YR-RANGE> <NAME2|COMPANY>+}        #540
+    NAME3: {<YR-RANGE> <NAME2|COMPANY>+}        #535
+    NAME3: {<YR-RANGE> <NAME2|COMPANY>+ <CC> <YR-RANGE>}        #540
     NAME: {<NAME|NAME2>+ <OF> <NNP> <OF> <NN>? <COMPANY>}        #550
     NAME: {<NAME|NAME2>+ <CC|OF>? <NAME|NAME2|COMPANY>}        #560
     NAME3: {<YR-RANGE> <NAME>+}        #570
@@ -577,6 +583,9 @@ def detect(location):
 # by the a href http wtforms.simplecodes.com WTForms Team
     COMPANY: {<BY> <NN>+ <COMP|COMPANY>}        #1420
 
+# the Regents of the University of California, Sun Microsystems, Inc., Scriptics Corporation
+  COMPANY: {<NN> <NNP> <OF> <NN> <UNI> <OF> <COMPANY>+}
+
 
 # "And" some name
     ANDCO: {<CC>+ <NN> <NNP>+<UNI|COMP>?}        #1430
@@ -987,7 +996,7 @@ def detect(self, numbered_lines):
         Return a sequence of tuples (copyrights, authors, years, holders)
         detected in a sequence of numbered line tuples.
         """
-        from nltk.tree import Tree 
+        from nltk.tree import Tree
         numbered_lines = list(numbered_lines)
         numbers = [n for n, _l in numbered_lines]
         start_line = min(numbers)
@@ -1104,7 +1113,7 @@ def is_candidate(line):
             if marker in line:
                 logger.debug('is_candidate: %(marker)r in line:\n%(line)r' % locals())
                 return True
-            
+
 
 def has_content(line):
     """
@@ -1306,7 +1315,7 @@ def prepare_text_line(line):
     # un common pipe chars in some ascii art
     line = line.replace('|', ' ')
 
-    # normalize copyright signs and spacing aournd them
+    # normalize copyright signs and spacing around them
     line = line.replace('(C)', ' (c) ')
     line = line.replace('(c)', ' (c) ')
     # the case of \251 is tested by 'weirdencoding.h'
@@ -1336,7 +1345,6 @@ def prepare_text_line(line):
     # some trailing garbage ')
     line = line.replace("')", '  ')
 
-
     # note that we do not replace the debian tag by a space:  we remove it
     # TODO: use POS tag:     (r'^(?:\<s\>).*(?:\<s\\/>)$', 'NAME'),
     line = re_sub(DEBIAN_COPYRIGHT_TAGS_RE(), '', line)
@@ -1361,6 +1369,12 @@ def prepare_text_line(line):
     line = line.replace('\\t', ' ')
     line = line.replace('\\0', ' ')
 
+    # add one space after ">" and one before "<" so that in apache'>Copyright the word is split from the markup
+    line = line.replace('>', '> ')
+    line = line.replace('>  ', '> ')
+    line = line.replace('<', ' <')
+    line = line.replace('  <', ' <')
+
     # TODO: Why?
     # replace contiguous spaces with only one occurrence
     # line = re.sub(WHITESPACE_RE(), ' ', text)
diff --git a/tests/cluecode/test_copyrights.py b/tests/cluecode/test_copyrights.py
@@ -1262,6 +1262,7 @@ def test_copyright_in_html_comments(self):
         ]
         check_detection(expected, test_file)
 
+    @expectedFailure
     def test_copyright_in_html_incorrect(self):
         test_file = self.get_test_loc('copyrights/copyright_in_html_incorrect-detail_9_html.html')
         expected = [
@@ -1648,7 +1649,7 @@ def test_copyright_libpoppler3_copyright(self):
     def test_copyright_libqt4_scripttools_copyright(self):
         test_file = self.get_test_loc('copyrights/copyright_libqt4_scripttools_copyright-libqt_scripttools_copyright.copyright')
         expected = [
-            '(c) 2008-2009 Nokia Corporation',
+            '(c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies)',
             '(c) 1994-2008 Trolltech ASA',
         ]
         check_detection(expected, test_file)
@@ -3620,7 +3621,6 @@ def test_copyright_super_tech_c(self):
         ]
         check_detection(expected, test_file)
 
-    @expectedFailure
     def test_copyright_tcl_copyright(self):
         test_file = self.get_test_loc('copyrights/copyright_tcl_copyright-tcl_copyright.copyright')
         expected = [
@@ -3968,7 +3968,7 @@ def test_copyright_should_not_contain_leading_or_trailing_colon(self):
 
     def test_copyright_in_markup_should_not_be_truncated(self):
         test_file = self.get_test_loc('copyrights/copyright_in_html.html')
-        expected = ["(c) Copyright 2010 by the <a href http://wtforms.simplecodes.com'>WTForms Team"]
+        expected = ["(c) Copyright 2010 by the <a href http://wtforms.simplecodes.com'> WTForms Team"]
         check_detection(expected, test_file)
 
     def test_copyright_should_not_have_trailing_garbage(self):
@@ -4161,14 +4161,14 @@ def test_copyright_various(self):
             Copyright (C) 2000 - various; see CREDITS, ChangeLog, and sources
             The libwmf Library is free software; you can redistribute it and/or
         '''.splitlines(False)
-        expected = ['Copyright (c) 2000 - various'] # ; see CREDITS, ChangeLog, and sources
+        expected = ['Copyright (c) 2000 - various']  # ; see CREDITS, ChangeLog, and sources
         check_detection(expected, test_lines)
 
     def test_copyright_natural_docs(self):
         test_lines = '''
             // Search script generated by doxygen
             // Copyright (C) 2009 by Dimitri van Heesch.
-            
+
             // The code in this file is loosly based on main.js, part of Natural Docs,
             // which is Copyright (C) 2003-2008 Greg Valure
             // Natural Docs is licensed under the GPL.
@@ -4194,3 +4194,12 @@ def test_copyright_and_authors_mixed(self):
             u'Copyright (c) 1988, 1993 The Regents of the University of California.'
         ]
         check_detection(expected, test_lines)
+
+    def test_copyright_word_in_html(self):
+        test_lines = '''
+            <td width="40%" align="left">Copyright &copy; 2010 Nokia Corporation and/or its subsidiary(-ies)</td>
+        '''.splitlines(False)
+        expected = [
+            u'Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies)',
+        ]
+        check_detection(expected, test_lines)
diff --git a/tests/cluecode/test_copyrights_ics.py b/tests/cluecode/test_copyrights_ics.py