 # See https://aboutcode.org for more information about nexB OSS projects.
 #

-from collections import Counter
 import os
 import re

+from collections import Counter
+from functools import partial
+
 from commoncode.text import as_unicode
 from typecode import get_type
-from functools import partial

 """
 Extract plain text from HTML, XML and related angular markup-like files and texts.
@@ -40,9 +41,6 @@ def logger_debug(*args):
 def logger_debug(*args):
     return logger.debug(" ".join(isinstance(a, str) and a or repr(a) for a in args))

-
-bin_dir = os.path.join(os.path.dirname(__file__), "bin")
-
 extensions = (
     ".html",
     ".htm",
@@ -249,53 +247,20 @@ def demarkup_text(text):
     return get_demarkuped_text(text, splitter=split_on_tags_and_entities, keeper=KEEPER)


-"""
-Split text on tags start and end
-"""
-split_on_tags = re.compile(r"(< */? *[a-z]+[a-z0-9]* */?>?|>)", re.IGNORECASE).split
-
-KEPT_MARKUP2 = (
-    "lic",
-    "copy",
-    "auth",
-    "contr",
-    # legal
-    "leg",
-    # encoded copyright signs
-    "@",
-    "169",
-    "a9",
-    # in <red hat inc>
-    "red",
-    "inc",
-    ">",
-)
-
-MARKUP_MARKERS2 = (
-    "<",
-    ">",
-    "/>",
-    '"/>',
-    "'/>",
-    "&",
-    "href",
-)
-
-KEEPER2 = partial(is_kept_tag, markup_markers=MARKUP_MARKERS2, kept_markup=KEPT_MARKUP2)
-
-
-def strip_markup_text(text):
-    """
-    Strip markup tags from ``text``.
+def demarkup(location, stripper=demarkup_text):
     """
-    return get_demarkuped_text(text, splitter=split_on_tags, keeper=KEEPER2)
-
+    Return an iterator of unicode text lines for the file at `location` lightly
+    stripping markup if the file is some kind of markup, such as HTML, XML, PHP,
+    etc. The whitespaces are collapsed to one space.

-def strip_known_markup_from_text(text):
-    """
-    Strip markup tags from ``text`` using a list of tags
+    Use the ``stripper`` callable, one of demarkup_text or strip_markup_text.
     """
-    return get_demarkuped_text(text, splitter=split_on_tags, keeper=tag_keeper)
+    from textcode.analysis import unicode_text_lines
+
+    for line in unicode_text_lines(location):
+        if TRACE:
+            logger_debug(f"demarkup: {line}: demarked: {demarkup(line)}")
+        yield stripper(line)


 ### Old style stripper
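
For context, a minimal usage sketch of the new demarkup() generator added in this hunk. It assumes this module is importable as textcode.markup and uses a hypothetical "sample.html" path; any callable taking a text line and returning stripped text should work as the stripper.

# Minimal usage sketch of the demarkup() generator added above.
# Assumptions: this module is importable as textcode.markup and
# "sample.html" is a hypothetical markup file on disk.
from textcode.markup import demarkup, demarkup_text, strip_known_markup_from_text

# Lightly strip markup line by line, collapsing whitespace to single spaces.
for line in demarkup("sample.html", stripper=demarkup_text):
    print(line)

# Strip more aggressively using the known-tags list reworked later in this change.
for line in demarkup("sample.html", stripper=strip_known_markup_from_text):
    print(line)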
@@ -317,28 +282,30 @@ def strip_markup_text_legacy(text):
     return remove_tags_legacy(" ", text).strip()


-def strip_debian_markup(text):
-    """
-    Remove "Debian" legacy copyright file <s> </s> markup tags seen in
-    older copyright files. Note we replace by nothing.
-    """
-    return text.replace("</s>", "").replace("<s>", "").replace("<s/>", "")
+"""
+Split text on tags start and end
+"""
+split_on_tags = re.compile(
+    r"("
+    # a tag
+    # URL
+    r"<https?://[^<>\"\']+>"
+    r"|<www[^<>\"\']+>"
+    r"|< */? *[a-z]+[a-z0-9@\-\._\+]* */? *>?"
+    # emails
+    r"|mailto:"
+    r"|>"
+    r"| "
+    r")",
+    re.IGNORECASE,
+).split


-def demarkup(location, stripper=demarkup_text):
+def strip_known_markup_from_text(text):
     """
-    Return an iterator of unicode text lines for the file at `location` lightly
-    stripping markup if the file is some kind of markup, such as HTML, XML, PHP,
-    etc. The whitespaces are collapsed to one space.
-
-    Use the ``stripper`` callable, one of demarkup_text or strip_markup_text.
+    Strip markup tags from ``text`` using a list of tags
     """
-    from textcode.analysis import unicode_text_lines
-
-    for line in unicode_text_lines(location):
-        if TRACE:
-            logger_debug(f"demarkup: {line}: demarked: {demarkup(line)}")
-        yield stripper(line)
+    return get_demarkuped_text(text, splitter=split_on_tags, keeper=keep_tag)


 ALL_TAGS = frozenset(
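
A rough, self-contained illustration of how the new split_on_tags splitter tokenizes a line: because the whole pattern is one capturing group, re.split() keeps tags, URLs and spaces as separate tokens that the keeper callable can then accept or drop. The sample string and the inlined copy of the pattern are illustrative only.

# Self-contained sketch of the split_on_tags behavior added above
# (the pattern is inlined here for illustration rather than imported).
import re

split_on_tags = re.compile(
    r"("
    r"<https?://[^<>\"\']+>"
    r"|<www[^<>\"\']+>"
    r"|< */? *[a-z]+[a-z0-9@\-\._\+]* */? *>?"
    r"|mailto:"
    r"|>"
    r"| "
    r")",
    re.IGNORECASE,
).split

text = '<p>Copyright (c) 2024 <a href="https://example.com">Example Co.</a></p>'
tokens = [t for t in split_on_tags(text) if t.strip()]
# Tags such as <p> and </a> become standalone tokens, while plain words like
# "Copyright" and "2024" survive as chunks for the keeper callable to keep.
print(tokens)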
@@ -1512,8 +1479,9 @@ def demarkup(location, stripper=demarkup_text):
     "/ruby>",
     "ruby>",
     " ruby>",
-    # special "debian" legacy tag for copyright holders
-    # "s",
+    "<s>",
+    "<s/>",
+    "</s>",
     "<samp",
     "<samp ",
     "<samp>",
@@ -1943,6 +1911,9 @@ def demarkup(location, stripper=demarkup_text):
     "/xmp>",
     "xmp>",
     " xmp>",
+    # not XML/HTML
+    "<year>",
+    "<name>",
     # common XML namespaces
     "http://www.w3.org/1998/math/mathml",
     "http://www.w3.org/1999/xhtml",
@@ -1953,17 +1924,37 @@ def demarkup(location, stripper=demarkup_text):
     ]
 )

+
 SKIP_ATTRIBUTES = (
-    "href",
+    "href=",
     "class=",
     "width=",
+    "@end",
+    "@group",
+    "mailto:",
 )


-def tag_keeper(token, skips_tags=ALL_TAGS, skip_attributes=SKIP_ATTRIBUTES):
+KEEP_MARKERS = (
+    "copyright",
+    "author",
+    "legal",
+)
+
+
+def keep_tag(token, skips_tags=ALL_TAGS, skip_attributes=SKIP_ATTRIBUTES, kept_tags=KEEP_MARKERS):
     """
     Return True if a tag should be kept, base on a list of tag name or content.
-    Always keep debian-style legacy <s> tags and digit-only tags
     """
     tlow = token.lower()
-    return tlow not in skips_tags and not tlow.startswith(skip_attributes)
+
+    if any(k in tlow for k in kept_tags):
+        return True
+
+    if tlow.startswith(skip_attributes):
+        return False
+
+    if tlow in skips_tags or tlow == ">":
+        return False
+
+    return True
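
The decision order of the new keep_tag() can be illustrated with a small stand-alone sketch; the tiny stand-in tag set below replaces the very large ALL_TAGS frozenset, while the attribute and marker tuples mirror the ones added above.

# Stand-alone sketch mirroring the keep/skip logic of keep_tag() above.
# SAMPLE_SKIP_TAGS is a tiny stand-in for the large ALL_TAGS frozenset.
SAMPLE_SKIP_TAGS = frozenset(["<div>", "<span>", "</p>", "<year>", "<name>"])
SAMPLE_SKIP_ATTRIBUTES = ("href=", "class=", "width=", "@end", "@group", "mailto:")
SAMPLE_KEEP_MARKERS = ("copyright", "author", "legal")


def keep_tag_sketch(token):
    tlow = token.lower()
    # 1. Anything mentioning copyright/author/legal is always kept.
    if any(k in tlow for k in SAMPLE_KEEP_MARKERS):
        return True
    # 2. Attribute-like tokens are dropped.
    if tlow.startswith(SAMPLE_SKIP_ATTRIBUTES):
        return False
    # 3. Known markup tags and a bare ">" are dropped.
    if tlow in SAMPLE_SKIP_TAGS or tlow == ">":
        return False
    # 4. Everything else (plain text) is kept.
    return True


print(keep_tag_sketch("<div>"))             # False: known markup tag
print(keep_tag_sketch('href="https://x"'))  # False: skipped attribute
print(keep_tag_sketch("<copyright>"))       # True: contains a keep marker
print(keep_tag_sketch("Example Co."))       # True: plain text is kept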