Skip to content

Commit 7d46ce9

Browse files
committed
Refine marker stripper for copyrights
* Discard some placeholder tags such as <year>
* Remove specific code for Debian tags
* Streamline code for clarity

Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent b3caf23 commit 7d46ce9

30 files changed

+2021
-5854
lines changed

src/textcode/markup.py

Lines changed: 64 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,14 @@
88
# See https://aboutcode.org for more information about nexB OSS projects.
99
#
1010

11-
from collections import Counter
1211
import os
1312
import re
1413

14+
from collections import Counter
15+
from functools import partial
16+
1517
from commoncode.text import as_unicode
1618
from typecode import get_type
17-
from functools import partial
1819

1920
"""
2021
Extract plain text from HTML, XML and related angular markup-like files and texts.
@@ -40,9 +41,6 @@ def logger_debug(*args):
4041
def logger_debug(*args):
4142
return logger.debug(" ".join(isinstance(a, str) and a or repr(a) for a in args))
4243

43-
44-
bin_dir = os.path.join(os.path.dirname(__file__), "bin")
45-
4644
extensions = (
4745
".html",
4846
".htm",
@@ -249,53 +247,20 @@ def demarkup_text(text):
249247
return get_demarkuped_text(text, splitter=split_on_tags_and_entities, keeper=KEEPER)
250248

251249

252-
"""
253-
Split text on tags start and end
254-
"""
255-
split_on_tags = re.compile(r"(< */? *[a-z]+[a-z0-9]* */?>?|>)", re.IGNORECASE).split
256-
257-
KEPT_MARKUP2 = (
258-
"lic",
259-
"copy",
260-
"auth",
261-
"contr",
262-
# legal
263-
"leg",
264-
# encoded copyright signs
265-
"@",
266-
"169",
267-
"a9",
268-
# in <red hat inc>
269-
"red",
270-
"inc",
271-
">",
272-
)
273-
274-
MARKUP_MARKERS2 = (
275-
"<",
276-
">",
277-
"/>",
278-
'"/>',
279-
"'/>",
280-
"&",
281-
"href",
282-
)
283-
284-
KEEPER2 = partial(is_kept_tag, markup_markers=MARKUP_MARKERS2, kept_markup=KEPT_MARKUP2)
285-
286-
287-
def strip_markup_text(text):
288-
"""
289-
Strip markup tags from ``text``.
250+
def demarkup(location, stripper=demarkup_text):
290251
"""
291-
return get_demarkuped_text(text, splitter=split_on_tags, keeper=KEEPER2)
292-
252+
Return an iterator of unicode text lines for the file at `location` lightly
253+
stripping markup if the file is some kind of markup, such as HTML, XML, PHP,
254+
etc. The whitespaces are collapsed to one space.
293255
294-
def strip_known_markup_from_text(text):
295-
"""
296-
Strip markup tags from ``text`` using a list of tags
256+
Use the ``stripper`` callable, one of demarkup_text or strip_markup_text.
297257
"""
298-
return get_demarkuped_text(text, splitter=split_on_tags, keeper=tag_keeper)
258+
from textcode.analysis import unicode_text_lines
259+
260+
for line in unicode_text_lines(location):
261+
if TRACE:
262+
logger_debug(f"demarkup: {line} : demarked: {demarkup(line)}")
263+
yield stripper(line)
299264

300265

301266
# ## Old style stripper
@@ -317,28 +282,30 @@ def strip_markup_text_legacy(text):
317282
return remove_tags_legacy(" ", text).strip()
318283

319284

320-
def strip_debian_markup(text):
321-
"""
322-
Remove "Debian" legacy copyright file <s> </s> markup tags seen in
323-
older copyright files. Note we replace by nothing.
324-
"""
325-
return text.replace("</s>", "").replace("<s>", "").replace("<s/>", "")
285+
"""
286+
Split text on tags start and end
287+
"""
288+
split_on_tags = re.compile(
289+
r"("
290+
# a tag
291+
# URL
292+
r"<https?://[^<>\"\']+>"
293+
r"|<www[^<>\"\']+>"
294+
r"|< */? *[a-z]+[a-z0-9@\-\._\+]* */? *>?"
295+
# emails
296+
r"|mailto:"
297+
r"|>"
298+
r"| "
299+
r")",
300+
re.IGNORECASE,
301+
).split
326302

327303

328-
def demarkup(location, stripper=demarkup_text):
304+
def strip_known_markup_from_text(text):
329305
"""
330-
Return an iterator of unicode text lines for the file at `location` lightly
331-
stripping markup if the file is some kind of markup, such as HTML, XML, PHP,
332-
etc. The whitespaces are collapsed to one space.
333-
334-
Use the ``stripper`` callable, one of demarkup_text or strip_markup_text.
306+
Strip markup tags from ``text`` using a list of tags
335307
"""
336-
from textcode.analysis import unicode_text_lines
337-
338-
for line in unicode_text_lines(location):
339-
if TRACE:
340-
logger_debug(f"demarkup: {line} : demarked: {demarkup(line)}")
341-
yield stripper(line)
308+
return get_demarkuped_text(text, splitter=split_on_tags, keeper=keep_tag)
342309

343310

344311
ALL_TAGS = frozenset(
@@ -1512,8 +1479,9 @@ def demarkup(location, stripper=demarkup_text):
15121479
"/ruby>",
15131480
"ruby>",
15141481
" ruby>",
1515-
# special "debian" legacy tag for copyright holders
1516-
# "s",
1482+
"<s>",
1483+
"<s/>",
1484+
"</s>",
15171485
"<samp",
15181486
"<samp ",
15191487
"<samp>",
@@ -1943,6 +1911,9 @@ def demarkup(location, stripper=demarkup_text):
19431911
"/xmp>",
19441912
"xmp>",
19451913
" xmp>",
1914+
# not XML/HTML
1915+
"<year>",
1916+
"<name>",
19461917
# common XML namespaces
19471918
"http://www.w3.org/1998/math/mathml",
19481919
"http://www.w3.org/1999/xhtml",
@@ -1953,17 +1924,37 @@ def demarkup(location, stripper=demarkup_text):
19531924
]
19541925
)
19551926

1927+
19561928
SKIP_ATTRIBUTES = (
1957-
"href",
1929+
"href=",
19581930
"class=",
19591931
"width=",
1932+
"@end",
1933+
"@group",
1934+
"mailto:",
19601935
)
19611936

19621937

1963-
def tag_keeper(token, skips_tags=ALL_TAGS, skip_attributes=SKIP_ATTRIBUTES):
1938+
KEEP_MARKERS = (
1939+
"copyright",
1940+
"author",
1941+
"legal",
1942+
)
1943+
1944+
1945+
def keep_tag(token, skips_tags=ALL_TAGS, skip_attributes=SKIP_ATTRIBUTES, kept_tags=KEEP_MARKERS):
19641946
"""
19651947
Return True if a tag should be kept, base on a list of tag name or content.
1966-
Always keep debian-style legacy <s> tags and digit-only tags
19671948
"""
19681949
tlow = token.lower()
1969-
return tlow not in skips_tags and not tlow.startswith(skip_attributes)
1950+
1951+
if any(k in tlow for k in kept_tags):
1952+
return True
1953+
1954+
if tlow.startswith(skip_attributes):
1955+
return False
1956+
1957+
if tlow in skips_tags or tlow == ">":
1958+
return False
1959+
1960+
return True

tests/cluecode/data/copyrights/misco2/copyrighted-by-colon.txt.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ what:
44
- holders_summary
55
- authors
66
copyrights:
7-
- Copyright 2003 Sun Microsystems, Inc.
7+
- Copyright (c) 2003 Sun Microsystems, Inc.
88
holders:
99
- Sun Microsystems, Inc.
1010
holders_summary:

tests/cluecode/data/copyrights/misco2/html_allright.txt.yml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,10 @@ what:
44
- holders_summary
55
- authors
66
copyrights:
7-
- Copyright (c) 2003-2022
7+
- Copyright (c) 2003-2022, CKSource Holding sp. z o.o.
8+
holders:
9+
- CKSource Holding sp. z o.o.
10+
holders_summary:
11+
- value: CKSource Holding sp. z o.o.
12+
count: 1
13+

0 commit comments

Comments
 (0)