Refactor Lexbor empty text detection to ASCII whitespace only

pygarap · web-flow · commit 51efb931ea73 · 2025-11-29T11:28:25.000+04:00
diff --git a/selectolax/lexbor.pxd b/selectolax/lexbor.pxd
@@ -241,12 +241,15 @@ cdef class LexborNode:
         lxb_dom_node_t *node
         public LexborHTMLParser parser
     cdef bint _is_node_type(self, lxb_dom_node_type_t expected_type)
-    cdef bint _is_empty_text_node(self, lxb_dom_node_t *node)
 
     @staticmethod
     cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser)
 
 
+cdef bint is_empty_text_node(lxb_dom_node_t *node)
+cdef inline bint _is_whitespace_only(const lxb_char_t *buffer, size_t buffer_length) nogil
+
+
 cdef class LexborCSSSelector:
     cdef lxb_css_parser_t* parser
     cdef lxb_selectors_t * selectors
diff --git a/selectolax/lexbor.pyi b/selectolax/lexbor.pyi
@@ -189,8 +189,9 @@ class LexborNode:
             If ``True``, apply ``str.strip()`` to each fragment before joining to
             remove surrounding whitespace. Defaults to ``False``.
         skip_empty : bool, optional
-            Exclude text nodes that ``lxb_dom_node_is_empty`` considers empty when
-            ``True``. Defaults to ``False``.
+            Exclude text nodes whose content is only ASCII whitespace (space,
+            tab, newline, form feed or carriage return) when ``True``.
+            Defaults to ``False``.
 
         Returns
         -------
@@ -424,8 +425,9 @@ class LexborNode:
             When ``True``, yield text nodes in addition to element nodes. Defaults
             to ``False``.
         skip_empty : bool, optional
-            When ``include_text`` is ``True``, ignore text nodes that
-            ``lxb_dom_node_is_empty`` deems empty. Defaults to ``False``.
+            When ``include_text`` is ``True``, ignore text nodes made up solely
+            of ASCII whitespace (space, tab, newline, form feed or carriage
+            return). Defaults to ``False``.
 
         Yields
         ------
@@ -512,8 +514,9 @@ class LexborNode:
             When ``True``, include text nodes in the traversal sequence. Defaults
             to ``False``.
         skip_empty : bool, optional
-            Skip empty text nodes (as determined by ``lxb_dom_node_is_empty``)
-            when ``include_text`` is ``True``. Defaults to ``False``.
+            Skip text nodes that contain only ASCII whitespace (space, tab,
+            newline, form feed or carriage return) when ``include_text`` is
+            ``True``. Defaults to ``False``.
 
         Yields
         ------
@@ -783,8 +786,9 @@ class LexborNode:
         Returns
         -------
         bool
-            ``True`` when the node is a text node and ``lxb_dom_node_is_empty``
-            reports that it contains no characters.
+            ``True`` when the node is a text node whose data consists solely of
+            ASCII whitespace characters (space, tab, newline, form feed or
+            carriage return).
         """
         ...
 
@@ -915,8 +919,9 @@ class LexborHTMLParser:
         deep : bool, default True
             If True, includes text from all child nodes.
         skip_empty : bool, optional
-            Exclude text nodes that ``lxb_dom_node_is_empty`` considers empty when
-            ``True``. Defaults to ``False``.
+            Exclude text nodes whose content is only ASCII whitespace (space,
+            tab, newline, form feed or carriage return) when ``True``.
+            Defaults to ``False``.
 
         Returns
         -------
diff --git a/selectolax/lexbor.pyx b/selectolax/lexbor.pyx
@@ -324,8 +324,9 @@ cdef class LexborHTMLParser:
         deep : bool, default True
             If True, includes text from all child nodes.
         skip_empty : bool, optional
-            Exclude text nodes that ``lxb_dom_node_is_empty`` considers empty when
-            ``True``. Defaults to ``False``.
+            Exclude text nodes whose content is only ASCII whitespace (space,
+            tab, newline, form feed or carriage return) when ``True``.
+            Defaults to ``False``.
 
         Returns
         -------
diff --git a/selectolax/lexbor/node.pxi b/selectolax/lexbor/node.pxi
@@ -149,8 +149,9 @@ cdef class LexborNode:
             If ``True``, apply ``str.strip()`` to each fragment before joining to
             remove surrounding whitespace. Defaults to ``False``.
         skip_empty : bool, optional
-            Exclude text nodes that ``lxb_dom_node_is_empty`` considers empty when
-            ``True``. Defaults to ``False``.
+            Exclude text nodes whose content is only ASCII whitespace (space,
+            tab, newline, form feed or carriage return) when ``True``.
+            Defaults to ``False``.
 
         Returns
         -------
@@ -174,7 +175,7 @@ cdef class LexborNode:
                 if node.type == LXB_DOM_NODE_TYPE_TEXT:
                     text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> node).data)
                     if text != NULL:
-                        if not skip_empty or not self._is_empty_text_node(node):
+                        if not skip_empty or not is_empty_text_node(node):
                             py_text = text.decode(_ENCODING)
                             container.append(py_text)
                 node = node.next
@@ -440,8 +441,9 @@ cdef class LexborNode:
             When ``True``, yield text nodes in addition to element nodes. Defaults
             to ``False``.
         skip_empty : bool, optional
-            When ``include_text`` is ``True``, ignore text nodes that
-            ``lxb_dom_node_is_empty`` deems empty. Defaults to ``False``.
+            When ``include_text`` is ``True``, ignore text nodes made up solely
+            of ASCII whitespace (space, tab, newline, form feed or carriage
+            return). Defaults to ``False``.
 
         Yields
         ------
@@ -457,7 +459,7 @@ cdef class LexborNode:
             if node.type == LXB_DOM_NODE_TYPE_TEXT and not include_text:
                 node = node.next
                 continue
-            if node.type == LXB_DOM_NODE_TYPE_TEXT and include_text and skip_empty and self._is_empty_text_node(node):
+            if node.type == LXB_DOM_NODE_TYPE_TEXT and include_text and skip_empty and is_empty_text_node(node):
                 node = node.next
                 continue
 
@@ -594,8 +596,9 @@ cdef class LexborNode:
             When ``True``, include text nodes in the traversal sequence. Defaults
             to ``False``.
         skip_empty : bool, optional
-            Skip empty text nodes (as determined by ``lxb_dom_node_is_empty``)
-            when ``include_text`` is ``True``. Defaults to ``False``.
+            Skip text nodes that contain only ASCII whitespace (space, tab,
+            newline, form feed or carriage return) when ``include_text`` is
+            ``True``. Defaults to ``False``.
 
         Yields
         ------
@@ -609,7 +612,7 @@ cdef class LexborNode:
 
         while node != NULL:
             if include_text or node.type != LXB_DOM_NODE_TYPE_TEXT:
-                if not skip_empty or not self._is_empty_text_node(node):
+                if not skip_empty or not is_empty_text_node(node):
                     lxb_node = LexborNode.new(<lxb_dom_node_t *> node, self.parser)
                     yield lxb_node
 
@@ -1039,22 +1042,11 @@ cdef class LexborNode:
         Returns
         -------
         bool
-            ``True`` when the node is a text node and
-            ``lxb_dom_node_is_empty`` reports that its parent subtree contains
-            only whitespace (or nothing).
+            ``True`` when the node is a text node whose character data consists
+            only of ASCII whitespace characters (space, tab, newline, form feed
+            or carriage return).
         """
-        return self._is_empty_text_node(self.node)
-
-    cdef inline bint _is_empty_text_node(self, lxb_dom_node_t *node):
-        if node.type != LXB_DOM_NODE_TYPE_TEXT:
-            return False
-
-        # lexbor's emptiness check walks children of the passed node; for a
-        # text node we need to evaluate its parent so the text itself is
-        # inspected.
-        if node.parent != NULL:
-            return lxb_dom_node_is_empty(node.parent)
-        return lxb_dom_node_is_empty(node)
+        return is_empty_text_node(self.node)
 
 
 @cython.internal
diff --git a/selectolax/lexbor/util.pxi b/selectolax/lexbor/util.pxi
@@ -18,3 +18,70 @@ def parse_fragment(html: str):
     if they are missing. This function does not add these tags.
     """
     return do_parse_fragment(html, LexborHTMLParser)
+
+
+cdef inline bint is_empty_text_node(lxb_dom_node_t *text_node):
+    """
+    Check whether a node is a text node made up solely of HTML ASCII whitespace.
+
+    Parameters
+    ----------
+    text_node : lxb_dom_node_t *
+        Pointer to the node that should be inspected; ``NULL`` is accepted.
+
+    Returns
+    -------
+    bint
+        ``True`` if ``text_node`` is a text node whose character data is empty
+        or contains only space, tab, newline, form feed, or carriage return
+        bytes; ``False`` otherwise, including when ``text_node`` is ``NULL``.
+    """
+    if text_node == NULL or text_node.type != LXB_DOM_NODE_TYPE_TEXT:
+        return False
+
+    cdef lxb_dom_character_data_t *text_character_data = <lxb_dom_character_data_t *> text_node
+    cdef lexbor_str_t *text_buffer = &text_character_data.data
+    cdef size_t text_length = text_buffer.length
+    cdef lxb_char_t *text_bytes = text_buffer.data
+
+    return _is_whitespace_only(text_bytes, text_length)
+
+cdef inline bint _is_whitespace_only(const lxb_char_t *buffer, size_t buffer_length) nogil:
+    """
+    Determine whether a byte buffer consists only of HTML ASCII whitespace.
+
+    Parameters
+    ----------
+    buffer : const lxb_char_t *
+        Pointer to the buffer to inspect; ``NULL`` is treated as empty.
+    buffer_length : size_t
+        Number of bytes available in ``buffer``.
+
+    Returns
+    -------
+    bint
+        ``True`` if ``buffer`` is ``NULL``, empty, or contains only space
+        (0x20), tab (0x09), line feed (0x0A), form feed (0x0C), or carriage
+        return (0x0D) bytes; otherwise ``False``.
+
+    Notes
+    -----
+    Mirrors Lexbor's ``lexbor_utils_whitespace(chr, !=, &&)`` macro and is
+    declared ``nogil`` so it can be called without holding the GIL.
+    """
+    cdef const lxb_char_t *cursor = buffer
+    cdef const lxb_char_t *end
+    cdef lxb_char_t current_char
+
+    if buffer == NULL or buffer_length == 0:
+        return True
+
+    end = buffer + buffer_length  # only after the guard: no arithmetic on a NULL pointer
+    while cursor < end:
+        current_char = cursor[0]
+        if (current_char != ' ' and current_char != '\t' and current_char != '\n'
+                and current_char != '\f' and current_char != '\r'):
+            return False
+        cursor += 1
+
+    return True
diff --git a/tests/test_lexbor.py b/tests/test_lexbor.py
@@ -1,8 +1,14 @@
 """Tests for functionality that is only supported by lexbor backend."""
 
+from inspect import cleandoc
+
 from selectolax.lexbor import LexborHTMLParser, parse_fragment
 
 
+def clean_doc(text: str) -> str:  # normalise indentation with inspect.cleandoc, then append "\n"
+    return f"{cleandoc(text)}\n"
+
+
 def test_reads_inner_html():
     html = """<div id="main"><div>Hi</div><div id="updated">2025-09-27</div></div>"""
     parser = LexborHTMLParser(html)
@@ -120,6 +126,80 @@ def test_traverse_respects_skip_empty_on_text_nodes():
     assert ", ".join(children) == "div, span, -text, title"
 
 
+def test_traverse_with_skip_empty_on_a_full_html_document():  # same document traversed with skip_empty off, then on
+    html = clean_doc(
+        """
+        <!doctype html>
+        <html lang="en">
+          <head>
+            <meta charset="utf-8">
+            <meta name="viewport" content="width=device-width,initial-scale=1">
+            <title>Title!</title>
+            <!-- My crazy comment -->
+          </head>
+          <body>
+            <p>Hello <strong>World</strong>!</p>
+            <div hidden draggable="true" translate="no" contenteditable="true" tabindex="3">
+              Div
+            </div>
+          </body>
+        </html>
+        """
+    )
+    parser = LexborHTMLParser(html)
+    children = [
+        (node.tag, node.text_content)
+        for node in parser.root.traverse(include_text=True, skip_empty=False)  # whitespace-only text nodes are yielded
+    ]
+    assert children == [
+        ("html", None),
+        ("head", None),
+        ("-text", "\n    "),
+        ("meta", None),
+        ("-text", "\n    "),
+        ("meta", None),
+        ("-text", "\n    "),
+        ("title", None),
+        ("-text", "Title!"),
+        ("-text", "\n    "),
+        ("-comment", None),
+        ("-text", "\n  "),
+        ("-text", "\n  "),
+        ("body", None),
+        ("-text", "\n    "),
+        ("p", None),
+        ("-text", "Hello "),
+        ("strong", None),
+        ("-text", "World"),
+        ("-text", "!"),
+        ("-text", "\n    "),
+        ("div", None),
+        ("-text", "\n      Div\n    "),
+        ("-text", "\n  \n\n"),
+    ]
+    children = [
+        (node.tag, node.text_content)
+        for node in parser.root.traverse(include_text=True, skip_empty=True)  # whitespace-only text nodes are skipped
+    ]
+    assert children == [
+        ("html", None),
+        ("head", None),
+        ("meta", None),
+        ("meta", None),
+        ("title", None),
+        ("-text", "Title!"),
+        ("-comment", None),
+        ("body", None),
+        ("p", None),
+        ("-text", "Hello "),
+        ("strong", None),
+        ("-text", "World"),
+        ("-text", "!"),
+        ("div", None),
+        ("-text", "\n      Div\n    "),
+    ]
+
+
 def test_is_empty_text_node_property():
     parser = LexborHTMLParser("<div><span>\n \n</span><title>X</title></div>")
     text_node = parser.css_first("span").first_child