fix extraction from nodes without children

kmike · kmike · commit e5f44cb83b6d · 2018-09-25T23:21:01.000+05:00
len(node) is the number of child elements, so it returns 0 for an element that contains only text; the old `len(html) == 0` check therefore treated such nodes as empty.

Handling of empty content is improved by using create_root_node from parsel.
diff --git a/html_text/html_text.py b/html_text/html_text.py
@@ -5,6 +5,7 @@
 import lxml.etree
 from lxml.html.clean import Cleaner
 import parsel
+from parsel.selector import create_root_node
 
 
 NEWLINE_TAGS = frozenset([
@@ -39,16 +40,14 @@ def _cleaned_html_tree(html):
     if isinstance(html, lxml.html.HtmlElement):
         tree = html
     else:
-        parser = lxml.html.HTMLParser(encoding='utf8')
-        tree = lxml.html.fromstring(html.encode('utf8'), parser=parser)
+        tree = parse_html(html)
     return _clean_html(tree)
 
 
 def parse_html(html):
     """ Create an lxml.html.HtmlElement from a string with html.
     """
-    parser = lxml.html.HTMLParser(encoding='utf8')
-    return lxml.html.fromstring(html.encode('utf8'), parser=parser)
+    return create_root_node(html, lxml.html.HTMLParser)
 
 
 _whitespace = re.compile(r'\s+')
@@ -196,7 +195,7 @@ def extract_text(html,
     Default newline and double newline tags can be found in
     `html_text.NEWLINE_TAGS` and `html_text.DOUBLE_NEWLINE_TAGS`.
     """
-    if html is None or len(html) == 0:
+    if html is None:
         return ''
     cleaned = _cleaned_html_tree(html)
     return _html_to_text(
diff --git a/setup.py b/setup.py
diff --git a/tests/test_html_text.py b/tests/test_html_text.py
@@ -38,6 +38,8 @@ def test_declared_encoding(all_options):
 
 def test_empty(all_options):
     assert extract_text(u'', **all_options) == ''
+    assert extract_text(u' ', **all_options) == ''
+    assert extract_text(None, **all_options) == ''
 
 
 def test_extract_text_from_tree(all_options):
@@ -47,6 +49,14 @@ def test_extract_text_from_tree(all_options):
     assert extract_text(tree, **all_options) == u'Hello, world!'
 
 
+def test_extract_text_from_node(all_options):
+    html = (u'<html><style>.div {}</style>'
+            '<body><p>Hello,   world!</p></body></html>')
+    tree = parse_html(html)
+    node = tree.xpath('//p')[0]
+    assert extract_text(node, **all_options) == u'Hello, world!'
+
+
 def test_inline_tags_whitespace(all_options):
     html = u'<span>field</span><span>value  of</span><span></span>'
     assert extract_text(html, **all_options) == u'field value of'
@@ -79,17 +89,22 @@ def test_bad_punct_whitespace():
     assert text == u'trees = webstruct.load_trees("train/*.html")'
 
 
-def test_selector(all_options):
+def test_selectors(all_options):
     html = (u'<span><span id="extract-me">text<a>more</a>'
             '</span>and more text <a> and some more</a> <a></a> </span>')
+    # Selector
     sel = cleaned_selector(html)
     assert selector_to_text(sel, **all_options) == 'text more and more text and some more'
+
+    # SelectorList
     subsel = sel.xpath('//span[@id="extract-me"]')
     assert selector_to_text(subsel, **all_options) == 'text more'
     subsel = sel.xpath('//a')
     assert selector_to_text(subsel, **all_options) == 'more and some more'
     subsel = sel.xpath('//a[@id="extract-me"]')
     assert selector_to_text(subsel, **all_options) == ''
+    subsel = sel.xpath('//foo')
+    assert selector_to_text(subsel, **all_options) == ''
 
 
 def test_guess_layout():