Skip to content

Commit f6696fe

Browse files
committed
Combine markup stripping code in markup.py
We had code in two places. Now all is combined in one place. The code has further been streamlined and refactored for clarity and simplicity, in combination with copyright candidate selection. Also amend tests as needed. Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent e76e5ab commit f6696fe

File tree

9 files changed

+182
-202
lines changed

9 files changed

+182
-202
lines changed

src/cluecode/copyrights.py

Lines changed: 74 additions & 130 deletions
Large diffs are not rendered by default.

src/textcode/markup.py

Lines changed: 75 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -94,20 +94,10 @@ def is_markup_text(text):
9494
return has_tags and balanced
9595

9696

97-
def demarkup(location):
98-
"""
99-
Return an iterator of unicode text lines for the file at `location` lightly
100-
stripping markup if the file is some kind of markup, such as HTML, XML, PHP,
101-
etc. The whitespaces are collapsed to one space.
102-
"""
103-
from textcode.analysis import unicode_text_lines
104-
105-
for line in unicode_text_lines(location, decrlf=True):
106-
if TRACE:
107-
logger_debug(f'demarkup: {line} : demarked: {demarkup_text(line)}')
108-
yield demarkup_text(line)
109-
110-
97+
"""
98+
Find start and closing tags or the first white space whichever comes first or entities.
99+
This regex is such that ' '.join(tags.split(a))==a
100+
"""
111101
get_tags_and_entities = re.compile(
112102
r'('
113103
r'</?[^\s></]+(?:>'
@@ -126,24 +116,35 @@ def demarkup(location):
126116
).split
127117

128118

119+
def is_kept_tags(t):
    """
    Return True if a tag should be kept, based on its opening tag name or
    content.

    ``t`` is a single token string (a tag, an entity or a tag fragment). The
    substring checks are case-sensitive and all markers are lowercase, so
    callers are expected to pass a lowercased token.

    A token is kept when it contains one of the known markers (license,
    copyright, author, URL, legal, email or encoded copyright sign hints) or
    when its second or last character is a digit, as in dates like
    <2003-2009>. Empty and single-character tokens are handled without
    raising.
    """
    if not t:
        return False

    kept_markers = (
        'lic', 'copy',
        'auth', 'contr',
        # URLs
        'www', 'http',
        # legal
        'leg',
        # <s> are from legacy Debian copyright files.
        '<s>', '</s>',
        # encoded copyright signs
        '@',
        '169', 'a9',
        # in <red hat inc>
        'red',
        'inc',
    )
    if any(marker in t for marker in kept_markers):
        return True

    # also keep dates as in <2003-2009>
    # t[1:2] is an empty string for a one-character token where t[1] would
    # raise an IndexError: ''.isdigit() is False.
    return t[1:2].isdigit() or t[-1].isdigit()
141+
142+
129143
def demarkup_text(text):
130144
"""
131145
Return text lightly stripped from markup. The whitespaces are collapsed to
132146
one space.
133147
"""
134-
135-
# keep the opening tag name of certain tags that contains these strings
136-
# note: <s> are from debian copyright files
137-
kept_tags = (
138-
'lic', 'copy', 'www', 'http', 'auth', 'contr', 'leg', 'inc', '@',
139-
'<s>', '</s>', '169', 'a9',
140-
# in <red hat
141-
'red'
142-
)
143-
144-
# find start and closing tags or the first white space whichever comes first
145-
# or entities. This regex is such that ' '.join(tags.split(a))==a
146-
147148
tags_and_ents = get_tags_and_entities(text)
148149
if TRACE:
149150
logger_debug(f'demarkup_text: {text!r}')
@@ -153,8 +154,54 @@ def demarkup_text(text):
153154
cleaned_append = cleaned.append
154155
for token in tags_and_ents:
155156
tlow = token.lower()
156-
if tlow.startswith(('<', '/>', '"/>', "'/>", '&', 'href',)) and not any(k in tlow for k in kept_tags):
157+
if tlow.startswith(('<', '/>', '"/>', "'/>", '&', 'href',)) and not is_kept_tags(tlow):
157158
cleaned_append(' ')
158159
else:
159160
cleaned_append(token)
160161
return ''.join(cleaned)
162+
163+
164+
# This catches markup tags but does not remove the text found between tags.
# The pattern matches, in order:
# - an opening '<',
# - one optional leading character such as '/', '?', '!' or '%',
# - one ASCII letter that is neither h/H nor w/W,
# - any run of letters and selected punctuation or whitespace,
# - at least one more character from the same set, digits included,
# - an optional '/' and the closing '>'.
# A tag therefore needs at least two characters between '<' and '>': short
# tags such as <s> or </s> are not matched by this pattern.
_remove_tags = re.compile(
    r'<'
    r'[(-\-)\?\!\%\/]?'
    r'[a-gi-vx-zA-GI-VX-Z][a-zA-Z#\"\=\s\.\;\:\%\&?!,\+\*\-_\/]*'
    r'[a-zA-Z0-9#\"\=\s\.\;\:\%\&?!,\+\*\-_\/]+'
    r'\/?>',
    re.MULTILINE | re.UNICODE
)

# Callable as remove_tags(replacement, text).
remove_tags = _remove_tags.sub
# NOTE(review): split_tags is bound to .sub, exactly like remove_tags. If a
# split on tags was intended this should likely be _remove_tags.split:
# confirm against callers before changing.
split_tags = _remove_tags.sub
176+
177+
178+
def strip_markup_text(text):
    """
    Return ``text`` where each markup tag matched by the module-level tag
    regex is replaced by a single space. The text between tags is kept.
    """
    stripped = _remove_tags.sub(' ', text)
    return stripped
183+
184+
185+
def strip_debian_markup(text):
    """
    Return ``text`` without the legacy "Debian" copyright file markup tags
    </s>, <s> and <s/> seen in older copyright files. The text enclosed by
    these tags is kept.
    """
    # Tags are removed one after the other, in this order.
    for debian_tag in ('</s>', '<s>', '<s/>'):
        text = text.replace(debian_tag, '')
    return text
191+
192+
193+
def demarkup(location, stripper=demarkup_text):
    """
    Return an iterator of unicode text lines for the file at `location` lightly
    stripping markup if the file is some kind of markup, such as HTML, XML, PHP,
    etc.

    Each line is passed through the ``stripper`` callable, one of
    demarkup_text or strip_markup_text: it takes a text line and returns the
    stripped text, which is yielded as-is.
    """
    # NOTE(review): imported locally, presumably to avoid a circular import
    # between textcode.markup and textcode.analysis - confirm.
    from textcode.analysis import unicode_text_lines

    for line in unicode_text_lines(location):
        # Strip once and reuse the result for both tracing and output. The
        # trace used to call demarkup(line), which only created a new
        # generator object with the text line as a location and never logged
        # the stripped text.
        stripped = stripper(line)
        if TRACE:
            logger_debug(f'demarkup: {line} : demarked: {stripped}')
        yield stripped
207+

tests/cluecode/data/authors/author_russ_c-c.c

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,4 @@
66
* Russ Dill <[email protected]> 2001-2003
77
* Rewrited by Vladimir Oleynik <[email protected]> (C) 2003
88
*
9-
* This program is free software; you can redistribute it and/or modify
10-
* it under the terms of the GNU General Public License as published by
11-
* the Free Software Foundation; either version 2 of the License, or
12-
* (at your option) any later version.
13-
*
14-
* This program is distributed in the hope that it will be useful,
15-
* but WITHOUT ANY WARRANTY; without even the implied warranty of
16-
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17-
* GNU General Public License for more details.
18-
*
19-
* You should have received a copy of the GNU General Public License
20-
* along with this program; if not, write to the Free Software
21-
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22-
*/
9+
*/

tests/cluecode/data/authors/hdp.c.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@ what:
33
- authors_summary
44
- copyrights
55
copyrights:
6-
- Copyright (c) 2010 GSyC/LibreSoft, Universidad Rey Juan Carlos. Authors Santiago Carot Nemesio
6+
- Copyright (c) 2010 GSyC/LibreSoft, Universidad Rey Juan Carlos. Authors Santiago Carot Nemesio at gmail.com

tests/cluecode/data/copyrights/afferogplv3-AfferoGPLv.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ what:
33
- holders
44
- holders_summary
55
copyrights:
6-
- Copyright (c) 2007 Free Software Foundation, Inc. <http://fsf.org/>
6+
- Copyright (c) 2007 Free Software Foundation, Inc. http://fsf.org/
77
holders:
88
- Free Software Foundation, Inc.
99
holders_summary:

tests/cluecode/data/copyrights/misco2/html_allright.txt.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,4 @@ what:
44
- holders_summary
55
- authors
66
copyrights:
7-
- Copyright (c) 2003-2022, a
7+
- Copyright (c) 2003-2022

tests/cluecode/data/ics/dnsmasq/COPYING-v3.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ what:
33
- holders
44
- holders_summary
55
copyrights:
6-
- Copyright (c) 2007 Free Software Foundation, Inc. <http://fsf.org/>
6+
- Copyright (c) 2007 Free Software Foundation, Inc. http://fsf.org/
77
holders:
88
- Free Software Foundation, Inc.
99
holders_summary:

tests/cluecode/data/ics/guava/guava.ipr.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,6 @@ what:
22
- copyrights
33
- holders
44
copyrights:
5-
- Copyright (c) & 36 today.year Google Inc.
5+
- Copyright (c) today.year Google Inc.
66
holders:
77
- Google Inc.

tests/cluecode/test_copyrights_basic.py

Lines changed: 27 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
import cluecode_test_utils # NOQA
1414
from commoncode.testcase import FileBasedTesting
1515
from cluecode import copyrights
16+
from textcode import markup
17+
from cluecode.copyrights import prepare_text_line
1618

1719

1820
class TestTextPreparation(FileBasedTesting):
@@ -28,56 +30,56 @@ def test_strip_leading_numbers(self):
2830

2931
def test_prepare_text_line(self):
3032
cp = 'test (C) all rights reserved'
31-
result = copyrights.prepare_text_line(cp)
33+
result = prepare_text_line(cp)
3234
assert result == 'test (c) all rights reserved'
3335

3436
def test_prepare_text_line_debian(self):
3537
cp = 'Parts Copyright (c) 1992 <s>Uri Blumentha<s>l, I</s>BM</s>'
36-
result = copyrights.prepare_text_line(cp)
38+
result = prepare_text_line(cp)
3739
assert result == 'Parts Copyright (c) 1992 Uri Blumenthal, IBM'
3840

3941
def test_prepare_text_line_does_not_truncate_transliterable_unicode(self):
4042
cp = 'Muła'
41-
result = copyrights.prepare_text_line(cp)
43+
result = prepare_text_line(cp)
4244
assert result == 'Mula'
4345

44-
def test_strip_markup(self):
46+
def test_strip_markup_removes_debian_legacy_s_tags(self):
4547
cp = 'Parts Copyright (c) 1992 <s>Uri Blumentha<s>l, I</s>BM</s>'
46-
result = copyrights.strip_markup(cp)
48+
result = markup.strip_markup_text(cp)
4749
assert result == 'Parts Copyright (c) 1992 Uri Blumenthal, IBM'
4850

4951
def test_prepare_text_line_removes_C_comments(self):
5052
cp = '/* Copyright 1996-2005, 2008-2011 by */'
51-
result = copyrights.prepare_text_line(cp)
53+
result = prepare_text_line(cp)
5254
assert result == 'Copyright 1996-2005, 2008-2011 by'
5355

5456
def test_prepare_text_line_removes_C_comments2(self):
5557
cp = '/* David Turner, Robert Wilhelm, and Werner Lemberg. */'
56-
result = copyrights.prepare_text_line(cp)
58+
result = prepare_text_line(cp)
5759
assert result == 'David Turner, Robert Wilhelm, and Werner Lemberg.'
5860

5961
def test_prepare_text_line_removes_Cpp_comments(self):
6062
cp = '// David Turner, Robert Wilhelm, and Werner Lemberg. */'
61-
result = copyrights.prepare_text_line(cp)
63+
result = prepare_text_line(cp)
6264
assert result == 'David Turner, Robert Wilhelm, and Werner Lemberg.'
6365

6466
def test_prepare_text_line_does_not_damage_urls(self):
6567
cp = 'copyright (c) 2000 World Wide Web Consortium, http://www.w3.org'
66-
result = copyrights.prepare_text_line(cp)
68+
result = prepare_text_line(cp)
6769
assert result == 'copyright (c) 2000 World Wide Web Consortium, http://www.w3.org'
6870

6971
def test_is_end_of_statement(self):
7072
line = ''' "All rights reserved\\n"'''
71-
_line, char_only_line = copyrights.prep_line(line)
73+
_line, char_only_line = prepare_text_line(line)
7274
assert copyrights.is_end_of_statement(char_only_line)
7375

74-
def test_candidate_lines_simple(self):
76+
def test_collect_candidate_lines_simple(self):
7577
lines = [(1, ' test (C) all rights reserved')]
76-
result = list(copyrights.candidate_lines(lines))
78+
result = list(copyrights.collect_candidate_lines(lines))
7779
expected = [[(1, ' test (C) all rights reserved')]]
7880
assert result == expected
7981

80-
def test_candidate_lines_complex(self):
82+
def test_collect_candidate_lines_complex(self):
8183
lines = '''
8284
Apache Xalan (Xalan XSLT processor)
8385
Copyright 1999-2006 The Apache Software Foundation
@@ -118,57 +120,57 @@ def test_candidate_lines_complex(self):
118120
[(22, ' this product includes software developed by the following:')]
119121
]
120122

121-
result = list(copyrights.candidate_lines(enumerate(lines, 1)))
123+
result = list(copyrights.collect_candidate_lines(enumerate(lines, 1)))
122124
assert result == expected
123125

124126
def test_is_candidates_should_not_select_line_with_bare_full_year(self):
125127
line = '2012'
126-
line, _char_only = copyrights.prep_line(line)
128+
line, _char_only = prepare_text_line(line)
127129
assert not copyrights.is_candidate(line)
128130

129131
def test_is_candidates_should_not_select_line_with_full_year_before_160_and_after_2018(self):
130132
line = '1959 2019'
131-
line, _char_only = copyrights.prep_line(line)
133+
line, _char_only = prepare_text_line(line)
132134
assert not copyrights.is_candidate(line)
133135

134136
def test_is_candidate_should_not_select_line_with_only_two_digit_numbers(self):
135137
line = 'template<class V> struct v_iter<V, mpl::int_<10> > { typedef typename V::item10 type; typedef v_iter<V, mpl::int_<10 + 1> > next; };'
136-
line, _char_only = copyrights.prep_line(line)
138+
line, _char_only = prepare_text_line(line)
137139
assert not copyrights.is_candidate(line)
138140

139141
def test_is_candidate_should_select_line_with_sign(self):
140142
line = 'template<class V> struct v_iter<V, mpl::int_<10> (c) { typedef typename V::item10 type; typedef v_iter<V, mpl::int_<10 + 1> > next; };'
141-
line, _char_only = copyrights.prep_line(line)
143+
line, _char_only = prepare_text_line(line)
142144
assert copyrights.is_candidate(line)
143145

144146
def test_is_candidate_should_not_select_line_with_junk_hex(self):
145147
line = '01061C3F5280CD4AC504152B81E452BD82015442014'
146-
line, _char_only = copyrights.prep_line(line)
148+
line, _char_only = prepare_text_line(line)
147149
assert not copyrights.is_candidate(line)
148150

149151
def test_is_candidate_should_select_line_with_a_trailing_years(self):
150152
line = '01061C3F5280CD4AC504152B81E452BD820154 2014\n'
151-
line, _char_only = copyrights.prep_line(line)
153+
line, _char_only = prepare_text_line(line)
152154
assert copyrights.is_candidate(line)
153155

154156
def test_is_candidate_should_select_line_with_proper_years(self):
155157
line = '01061C3F5280CD4AC504152B81E452BD820154 2014-'
156-
line, _char_only = copyrights.prep_line(line)
158+
line, _char_only = prepare_text_line(line)
157159
assert copyrights.is_candidate(line)
158160

159161
def test_is_candidate_should_select_line_with_proper_years2(self):
160162
line = '01061C3F5280CD4,2016 152B81E452BD820154'
161-
line, _char_only = copyrights.prep_line(line)
163+
line, _char_only = prepare_text_line(line)
162164
assert copyrights.is_candidate(line)
163165

164166
def test_is_candidate_should_select_line_with_dashed_year(self):
165167
line = 'pub 1024D/CCD6F801 2006-11-15'
166-
line, _char_only = copyrights.prep_line(line)
168+
line, _char_only = prepare_text_line(line)
167169
assert copyrights.is_candidate(line)
168170

169171
def test_is_candidate_should_select_line_with_iso_date_year(self):
170172
line = 'sig 3 ccd6f801 2006-11-15 nathan mittler <[email protected]>'
171-
line, _char_only = copyrights.prep_line(line)
173+
line, _char_only = prepare_text_line(line)
172174
assert copyrights.is_candidate(line)
173175

174176
def test_is_candidate_should_not_select_lines_made_only_of_punct_and_digits(self):
@@ -189,7 +191,7 @@ def test_is_candidate_should_not_select_lines_made_only_of_punct_and_digits(self
189191
'''.splitlines()
190192

191193
for line in lines:
192-
line, _ = copyrights.prep_line(line)
194+
line, _ = prepare_text_line(line)
193195
assert not copyrights.is_candidate(line)
194196

195197

0 commit comments

Comments
 (0)