Skip to content

Commit e5f44cb

Browse files
committed
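Fix extraction from nodes without children
len(node) is the number of child elements, so it returns 0 for an element that contains only text; the old `len(html) == 0` check therefore treated such nodes as empty and returned ''. Handling of empty content is improved by using create_root_node from parsel.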
1 parent ba05527 commit e5f44cb

File tree

3 files changed

+20
-6
lines changed

3 files changed

+20
-6
lines changed

html_text/html_text.py

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
import lxml.etree
66
from lxml.html.clean import Cleaner
77
import parsel
8+
from parsel.selector import create_root_node
89

910

1011
NEWLINE_TAGS = frozenset([
@@ -39,16 +40,14 @@ def _cleaned_html_tree(html):
3940
if isinstance(html, lxml.html.HtmlElement):
4041
tree = html
4142
else:
42-
parser = lxml.html.HTMLParser(encoding='utf8')
43-
tree = lxml.html.fromstring(html.encode('utf8'), parser=parser)
43+
tree = parse_html(html)
4444
return _clean_html(tree)
4545

4646

4747
def parse_html(html):
4848
""" Create an lxml.html.HtmlElement from a string with html.
4949
"""
50-
parser = lxml.html.HTMLParser(encoding='utf8')
51-
return lxml.html.fromstring(html.encode('utf8'), parser=parser)
50+
return create_root_node(html, lxml.html.HTMLParser)
5251

5352

5453
_whitespace = re.compile(r'\s+')
@@ -196,7 +195,7 @@ def extract_text(html,
196195
Default newline and double newline tags can be found in
197196
`html_text.NEWLINE_TAGS` and `html_text.DOUBLE_NEWLINE_TAGS`.
198197
"""
199-
if html is None or len(html) == 0:
198+
if html is None:
200199
return ''
201200
cleaned = _cleaned_html_tree(html)
202201
return _html_to_text(

setup.py

100644 → 100755
File mode changed.

tests/test_html_text.py

Lines changed: 16 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,8 @@ def test_declared_encoding(all_options):
3838

3939
def test_empty(all_options):
4040
assert extract_text(u'', **all_options) == ''
41+
assert extract_text(u' ', **all_options) == ''
42+
assert extract_text(None, **all_options) == ''
4143

4244

4345
def test_extract_text_from_tree(all_options):
@@ -47,6 +49,14 @@ def test_extract_text_from_tree(all_options):
4749
assert extract_text(tree, **all_options) == u'Hello, world!'
4850

4951

52+
def test_extract_text_from_node(all_options):
53+
html = (u'<html><style>.div {}</style>'
54+
'<body><p>Hello, world!</p></body></html>')
55+
tree = parse_html(html)
56+
node = tree.xpath('//p')[0]
57+
assert extract_text(node, **all_options) == u'Hello, world!'
58+
59+
5060
def test_inline_tags_whitespace(all_options):
5161
html = u'<span>field</span><span>value of</span><span></span>'
5262
assert extract_text(html, **all_options) == u'field value of'
@@ -79,17 +89,22 @@ def test_bad_punct_whitespace():
7989
assert text == u'trees = webstruct.load_trees("train/*.html")'
8090

8191

82-
def test_selector(all_options):
92+
def test_selectors(all_options):
8393
html = (u'<span><span id="extract-me">text<a>more</a>'
8494
'</span>and more text <a> and some more</a> <a></a> </span>')
95+
# Selector
8596
sel = cleaned_selector(html)
8697
assert selector_to_text(sel, **all_options) == 'text more and more text and some more'
98+
99+
# SelectorList
87100
subsel = sel.xpath('//span[@id="extract-me"]')
88101
assert selector_to_text(subsel, **all_options) == 'text more'
89102
subsel = sel.xpath('//a')
90103
assert selector_to_text(subsel, **all_options) == 'more and some more'
91104
subsel = sel.xpath('//a[@id="extract-me"]')
92105
assert selector_to_text(subsel, **all_options) == ''
106+
subsel = sel.xpath('//foo')
107+
assert selector_to_text(subsel, **all_options) == ''
93108

94109

95110
def test_guess_layout():

0 commit comments

Comments
 (0)