Combine markup stripping code in markup.py

pombredanne · pombredanne · commit f6696fe5d7fe · 2024-09-09T18:50:27.000+02:00
We had code in two places. Now all is combined in one place.
The code has further been streamlined and refactored for clarity
and simplicity, in combination with copyright candidate selection.

Also amend tests as needed.

Signed-off-by: Philippe Ombredanne <pombredanne@nexb.com>
diff --git a/src/cluecode/copyrights.py b/src/cluecode/copyrights.py
diff --git a/src/textcode/markup.py b/src/textcode/markup.py
@@ -94,20 +94,10 @@ def is_markup_text(text):
     return has_tags and balanced
 
 
-def demarkup(location):
-    """
-    Return an iterator of unicode text lines for the file at `location` lightly
-    stripping markup if the file is some kind of markup, such as HTML, XML, PHP,
-    etc. The whitespaces are collapsed to one space.
-    """
-    from textcode.analysis import unicode_text_lines
-
-    for line in unicode_text_lines(location, decrlf=True):
-        if TRACE:
-            logger_debug(f'demarkup: {line} : demarked: {demarkup_text(line)}')
-        yield demarkup_text(line)
-
-
+"""
+Find start and closing tags or the first white space whichever comes first or entities.
+This regex is such that ' '.join(tags.split(a))==a
+"""
 get_tags_and_entities = re.compile(
     r'('
     r'</?[^\s></]+(?:>'
@@ -126,24 +116,35 @@ def demarkup(location):
 ).split
 
 
+def is_kept_tags(t):
+    """
+    Return True if a tag should be kepts, base on its opening tag name or content
+
+    """
+    return t and any(kp in t for kp in (
+        'lic', 'copy',
+        'auth', 'contr',
+        # URLs
+        'www', 'http',
+        # legal
+        'leg',
+        # <s> are from legacy Debian copyright files.
+        '<s>', '</s>',
+        # encoded copyright signs
+        '@',
+        '169', 'a9',
+        # in <red hat inc>
+        'red',
+        'inc',
+        # also keep dates as in <2003-2009>
+        )) or t[1].isdigit() or t[-1].isdigit()
+
+
 def demarkup_text(text):
     """
     Return text lightly stripped from markup. The whitespaces are collapsed to
     one space.
     """
-
-    # keep the opening tag name of certain tags that contains these strings
-    # note: <s> are from debian copyright files
-    kept_tags = (
-        'lic', 'copy', 'www', 'http', 'auth', 'contr', 'leg', 'inc', '@',
-        '<s>', '</s>', '169', 'a9',
-        # in <red hat
-        'red'
-    )
-
-    # find start and closing tags or the first white space whichever comes first
-    # or entities. This regex is such that ' '.join(tags.split(a))==a
-
     tags_and_ents = get_tags_and_entities(text)
     if TRACE:
         logger_debug(f'demarkup_text: {text!r}')
@@ -153,8 +154,54 @@ def demarkup_text(text):
     cleaned_append = cleaned.append
     for token in tags_and_ents:
         tlow = token.lower()
-        if tlow.startswith(('<', '/>', '"/>', "'/>", '&', 'href',)) and not any(k in tlow for k in kept_tags):
+        if tlow.startswith(('<', '/>', '"/>', "'/>", '&', 'href',)) and not is_kept_tags(tlow):
             cleaned_append(' ')
         else:
             cleaned_append(token)
     return ''.join(cleaned)
+
+
+# This catches tags but does not remove the text inside tags.
+_remove_tags = re.compile(
+    r'<'
+     r'[(-\-)\?\!\%\/]?'
+     r'[a-gi-vx-zA-GI-VX-Z][a-zA-Z#\"\=\s\.\;\:\%\&?!,\+\*\-_\/]*'
+     r'[a-zA-Z0-9#\"\=\s\.\;\:\%\&?!,\+\*\-_\/]+'
+    r'\/?>',
+    re.MULTILINE | re.UNICODE
+)
+
+remove_tags = _remove_tags.sub
+split_tags = _remove_tags.sub
+
+
+def strip_markup_text(text):
+    """
+    Strip markup tags from ``text``.
+    """
+    return remove_tags(' ', text)
+
+
+def strip_debian_markup(text):
+    """
+    Remove "Debian" legacy copyright file <s> </s> markup tags seen in
+    older copyright files.
+    """
+    return text.replace('</s>', '').replace('<s>', '').replace('<s/>', '')
+
+
+def demarkup(location, stripper=demarkup_text):
+    """
+    Return an iterator of unicode text lines for the file at `location` lightly
+    stripping markup if the file is some kind of markup, such as HTML, XML, PHP,
+    etc. The whitespaces are collapsed to one space.
+
+    Use the ``stripper`` callable, one of demarkup_text or strip_markup_text.
+    """
+    from textcode.analysis import unicode_text_lines
+
+    for line in unicode_text_lines(location):
+        if TRACE:
+            logger_debug(f'demarkup: {line} : demarked: {demarkup(line)}')
+        yield stripper(line)
+
diff --git a/tests/cluecode/data/authors/author_russ_c-c.c b/tests/cluecode/data/authors/author_russ_c-c.c
@@ -6,17 +6,4 @@
  * Russ Dill <Russ.Dill@asu.edu> 2001-2003
  * Rewrited by Vladimir Oleynik <dzo@simtreas.ru> (C) 2003
  *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
+ */
diff --git a/tests/cluecode/data/authors/hdp.c.yml b/tests/cluecode/data/authors/hdp.c.yml
@@ -3,4 +3,4 @@ what:
   - authors_summary
   - copyrights
 copyrights:
-  - Copyright (c) 2010 GSyC/LibreSoft, Universidad Rey Juan Carlos. Authors Santiago Carot Nemesio
+  - Copyright (c) 2010 GSyC/LibreSoft, Universidad Rey Juan Carlos. Authors Santiago Carot Nemesio at gmail.com
diff --git a/tests/cluecode/data/copyrights/afferogplv3-AfferoGPLv.yml b/tests/cluecode/data/copyrights/afferogplv3-AfferoGPLv.yml
@@ -3,7 +3,7 @@ what:
   - holders
   - holders_summary
 copyrights:
-  - Copyright (c) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+  - Copyright (c) 2007 Free Software Foundation, Inc. http://fsf.org/
 holders:
   - Free Software Foundation, Inc.
 holders_summary:
diff --git a/tests/cluecode/data/copyrights/misco2/html_allright.txt.yml b/tests/cluecode/data/copyrights/misco2/html_allright.txt.yml
@@ -4,4 +4,4 @@ what:
   - holders_summary
   - authors
 copyrights:
-  - Copyright (c) 2003-2022, a
+  - Copyright (c) 2003-2022
diff --git a/tests/cluecode/data/ics/dnsmasq/COPYING-v3.yml b/tests/cluecode/data/ics/dnsmasq/COPYING-v3.yml
@@ -3,7 +3,7 @@ what:
   - holders
   - holders_summary
 copyrights:
-  - Copyright (c) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+  - Copyright (c) 2007 Free Software Foundation, Inc. http://fsf.org/
 holders:
   - Free Software Foundation, Inc.
 holders_summary:
diff --git a/tests/cluecode/data/ics/guava/guava.ipr.yml b/tests/cluecode/data/ics/guava/guava.ipr.yml
@@ -2,6 +2,6 @@ what:
   - copyrights
   - holders
 copyrights:
-  - Copyright (c) & 36 today.year Google Inc.
+  - Copyright (c) today.year Google Inc.
 holders:
   - Google Inc.
diff --git a/tests/cluecode/test_copyrights_basic.py b/tests/cluecode/test_copyrights_basic.py
@@ -13,6 +13,8 @@
 import cluecode_test_utils  # NOQA
 from commoncode.testcase import FileBasedTesting
 from cluecode import copyrights
+from textcode import markup
+from cluecode.copyrights import prepare_text_line
 
 
 class TestTextPreparation(FileBasedTesting):
@@ -28,56 +30,56 @@ def test_strip_leading_numbers(self):
 
     def test_prepare_text_line(self):
         cp = 'test (C) all rights reserved'
-        result = copyrights.prepare_text_line(cp)
+        result = prepare_text_line(cp)
         assert result == 'test (c) all rights reserved'
 
     def test_prepare_text_line_debian(self):
         cp = 'Parts Copyright (c) 1992 <s>Uri Blumentha<s>l, I</s>BM</s>'
-        result = copyrights.prepare_text_line(cp)
+        result = prepare_text_line(cp)
         assert result == 'Parts Copyright (c) 1992 Uri Blumenthal, IBM'
 
     def test_prepare_text_line_does_not_truncate_transliterable_unicode(self):
         cp = 'Muła'
-        result = copyrights.prepare_text_line(cp)
+        result = prepare_text_line(cp)
         assert result == 'Mula'
 
-    def test_strip_markup(self):
+    def test_strip_markup_removes_debian_legacy_s_tags(self):
         cp = 'Parts Copyright (c) 1992 <s>Uri Blumentha<s>l, I</s>BM</s>'
-        result = copyrights.strip_markup(cp)
+        result = markup.strip_markup_text(cp)
         assert result == 'Parts Copyright (c) 1992 Uri Blumenthal, IBM'
 
     def test_prepare_text_line_removes_C_comments(self):
         cp = '/*  Copyright 1996-2005, 2008-2011 by   */'
-        result = copyrights.prepare_text_line(cp)
+        result = prepare_text_line(cp)
         assert result == 'Copyright 1996-2005, 2008-2011 by'
 
     def test_prepare_text_line_removes_C_comments2(self):
         cp = '/*  David Turner, Robert Wilhelm, and Werner Lemberg. */'
-        result = copyrights.prepare_text_line(cp)
+        result = prepare_text_line(cp)
         assert result == 'David Turner, Robert Wilhelm, and Werner Lemberg.'
 
     def test_prepare_text_line_removes_Cpp_comments(self):
         cp = '//  David Turner, Robert Wilhelm, and Werner Lemberg. */'
-        result = copyrights.prepare_text_line(cp)
+        result = prepare_text_line(cp)
         assert result == 'David Turner, Robert Wilhelm, and Werner Lemberg.'
 
     def test_prepare_text_line_does_not_damage_urls(self):
         cp = 'copyright (c) 2000 World Wide Web Consortium, http://www.w3.org'
-        result = copyrights.prepare_text_line(cp)
+        result = prepare_text_line(cp)
         assert result == 'copyright (c) 2000 World Wide Web Consortium, http://www.w3.org'
 
     def test_is_end_of_statement(self):
         line = '''          "All rights reserved\\n"'''
-        _line, char_only_line = copyrights.prep_line(line)
+        _line, char_only_line = prepare_text_line(line)
         assert copyrights.is_end_of_statement(char_only_line)
 
-    def test_candidate_lines_simple(self):
+    def test_collect_candidate_lines_simple(self):
         lines = [(1, ' test (C) all rights reserved')]
-        result = list(copyrights.candidate_lines(lines))
+        result = list(copyrights.collect_candidate_lines(lines))
         expected = [[(1, ' test (C) all rights reserved')]]
         assert result == expected
 
-    def test_candidate_lines_complex(self):
+    def test_collect_candidate_lines_complex(self):
         lines = '''
            Apache Xalan (Xalan XSLT processor)
            Copyright 1999-2006 The Apache Software Foundation
@@ -118,57 +120,57 @@ def test_candidate_lines_complex(self):
             [(22, '           this product includes software developed by the following:')]
         ]
 
-        result = list(copyrights.candidate_lines(enumerate(lines, 1)))
+        result = list(copyrights.collect_candidate_lines(enumerate(lines, 1)))
         assert result == expected
 
     def test_is_candidates_should_not_select_line_with_bare_full_year(self):
         line = '2012'
-        line, _char_only = copyrights.prep_line(line)
+        line, _char_only = prepare_text_line(line)
         assert not copyrights.is_candidate(line)
 
     def test_is_candidates_should_not_select_line_with_full_year_before_160_and_after_2018(self):
         line = '1959 2019'
-        line, _char_only = copyrights.prep_line(line)
+        line, _char_only = prepare_text_line(line)
         assert not copyrights.is_candidate(line)
 
     def test_is_candidate_should_not_select_line_with_only_two_digit_numbers(self):
         line = 'template<class V> struct v_iter<V, mpl::int_<10> > { typedef typename V::item10 type; typedef v_iter<V, mpl::int_<10 + 1> > next; };'
-        line, _char_only = copyrights.prep_line(line)
+        line, _char_only = prepare_text_line(line)
         assert not copyrights.is_candidate(line)
 
     def test_is_candidate_should_select_line_with_sign(self):
         line = 'template<class V> struct v_iter<V, mpl::int_<10> (c) { typedef typename V::item10 type; typedef v_iter<V, mpl::int_<10 + 1> > next; };'
-        line, _char_only = copyrights.prep_line(line)
+        line, _char_only = prepare_text_line(line)
         assert copyrights.is_candidate(line)
 
     def test_is_candidate_should_not_select_line_with_junk_hex(self):
         line = '01061C3F5280CD4AC504152B81E452BD82015442014'
-        line, _char_only = copyrights.prep_line(line)
+        line, _char_only = prepare_text_line(line)
         assert not copyrights.is_candidate(line)
 
     def test_is_candidate_should_select_line_with_a_trailing_years(self):
         line = '01061C3F5280CD4AC504152B81E452BD820154 2014\n'
-        line, _char_only = copyrights.prep_line(line)
+        line, _char_only = prepare_text_line(line)
         assert copyrights.is_candidate(line)
 
     def test_is_candidate_should_select_line_with_proper_years(self):
         line = '01061C3F5280CD4AC504152B81E452BD820154 2014-'
-        line, _char_only = copyrights.prep_line(line)
+        line, _char_only = prepare_text_line(line)
         assert copyrights.is_candidate(line)
 
     def test_is_candidate_should_select_line_with_proper_years2(self):
         line = '01061C3F5280CD4,2016 152B81E452BD820154'
-        line, _char_only = copyrights.prep_line(line)
+        line, _char_only = prepare_text_line(line)
         assert copyrights.is_candidate(line)
 
     def test_is_candidate_should_select_line_with_dashed_year(self):
         line = 'pub   1024D/CCD6F801 2006-11-15'
-        line, _char_only = copyrights.prep_line(line)
+        line, _char_only = prepare_text_line(line)
         assert copyrights.is_candidate(line)
 
     def test_is_candidate_should_select_line_with_iso_date_year(self):
         line = 'sig 3 ccd6f801 2006-11-15 nathan mittler <nathan.mittler@gmail.com>'
-        line, _char_only = copyrights.prep_line(line)
+        line, _char_only = prepare_text_line(line)
         assert copyrights.is_candidate(line)
 
     def test_is_candidate_should_not_select_lines_made_only_of_punct_and_digits(self):
@@ -189,7 +191,7 @@ def test_is_candidate_should_not_select_lines_made_only_of_punct_and_digits(self
             '''.splitlines()
 
         for line in lines:
-            line, _ = copyrights.prep_line(line)
+            line, _ = prepare_text_line(line)
             assert not copyrights.is_candidate(line)