rushter · rushter · Nov 29, 2025 · Nov 27, 2025 · Nov 27, 2025 · Nov 27, 2025
diff --git a/selectolax/lexbor.pxd b/selectolax/lexbor.pxd
@@ -242,6 +242,7 @@ cdef class LexborNode:
         public LexborHTMLParser parser
     cdef bint _is_node_type(self, lxb_dom_node_type_t expected_type)
     cdef bint _is_empty_text_node(self, lxb_dom_node_t *node)
+    cdef inline bint _is_whitespace_only(self, const lxb_char_t *buffer, size_t buffer_length) nogil
 
     @staticmethod
     cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser)

diff --git a/selectolax/lexbor.pyi b/selectolax/lexbor.pyi
@@ -189,8 +189,9 @@ class LexborNode:
             If ``True``, apply ``str.strip()`` to each fragment before joining to
             remove surrounding whitespace. Defaults to ``False``.
         skip_empty : bool, optional
-            Exclude text nodes that ``lxb_dom_node_is_empty`` considers empty when
-            ``True``. Defaults to ``False``.
+            Exclude text nodes whose content is only ASCII whitespace (space,
+            tab, newline, form feed or carriage return) when ``True``.
+            Defaults to ``False``.
 
         Returns
         -------
@@ -424,8 +425,9 @@ class LexborNode:
             When ``True``, yield text nodes in addition to element nodes. Defaults
             to ``False``.
         skip_empty : bool, optional
-            When ``include_text`` is ``True``, ignore text nodes that
-            ``lxb_dom_node_is_empty`` deems empty. Defaults to ``False``.
+            When ``include_text`` is ``True``, ignore text nodes made up solely
+            of ASCII whitespace (space, tab, newline, form feed or carriage
+            return). Defaults to ``False``.
 
         Yields
         ------
@@ -512,8 +514,9 @@ class LexborNode:
             When ``True``, include text nodes in the traversal sequence. Defaults
             to ``False``.
         skip_empty : bool, optional
-            Skip empty text nodes (as determined by ``lxb_dom_node_is_empty``)
-            when ``include_text`` is ``True``. Defaults to ``False``.
+            Skip text nodes that contain only ASCII whitespace (space, tab,
+            newline, form feed or carriage return) when ``include_text`` is
+            ``True``. Defaults to ``False``.
 
         Yields
         ------
@@ -783,8 +786,9 @@ class LexborNode:
         Returns
         -------
         bool
-            ``True`` when the node is a text node and ``lxb_dom_node_is_empty``
-            reports that it contains no characters.
+            ``True`` when the node is a text node whose data consists solely of
+            ASCII whitespace characters (space, tab, newline, form feed or
+            carriage return).
         """
         ...
 
@@ -915,8 +919,9 @@ class LexborHTMLParser:
         deep : bool, default True
             If True, includes text from all child nodes.
         skip_empty : bool, optional
-            Exclude text nodes that ``lxb_dom_node_is_empty`` considers empty when
-            ``True``. Defaults to ``False``.
+            Exclude text nodes whose content is only ASCII whitespace (space,
+            tab, newline, form feed or carriage return) when ``True``.
+            Defaults to ``False``.
 
         Returns
         -------

diff --git a/selectolax/lexbor.pyx b/selectolax/lexbor.pyx
@@ -324,8 +324,9 @@ cdef class LexborHTMLParser:
         deep : bool, default True
             If True, includes text from all child nodes.
         skip_empty : bool, optional
-            Exclude text nodes that ``lxb_dom_node_is_empty`` considers empty when
-            ``True``. Defaults to ``False``.
+            Exclude text nodes whose content is only ASCII whitespace (space,
+            tab, newline, form feed or carriage return) when ``True``.
+            Defaults to ``False``.
 
         Returns
         -------

diff --git a/selectolax/lexbor/node.pxi b/selectolax/lexbor/node.pxi
@@ -149,8 +149,9 @@ cdef class LexborNode:
             If ``True``, apply ``str.strip()`` to each fragment before joining to
             remove surrounding whitespace. Defaults to ``False``.
         skip_empty : bool, optional
-            Exclude text nodes that ``lxb_dom_node_is_empty`` considers empty when
-            ``True``. Defaults to ``False``.
+            Exclude text nodes whose content is only ASCII whitespace (space,
+            tab, newline, form feed or carriage return) when ``True``.
+            Defaults to ``False``.
 
         Returns
         -------
@@ -440,8 +441,9 @@ cdef class LexborNode:
             When ``True``, yield text nodes in addition to element nodes. Defaults
             to ``False``.
         skip_empty : bool, optional
-            When ``include_text`` is ``True``, ignore text nodes that
-            ``lxb_dom_node_is_empty`` deems empty. Defaults to ``False``.
+            When ``include_text`` is ``True``, ignore text nodes made up solely
+            of ASCII whitespace (space, tab, newline, form feed or carriage
+            return). Defaults to ``False``.
 
         Yields
         ------
@@ -594,8 +596,9 @@ cdef class LexborNode:
             When ``True``, include text nodes in the traversal sequence. Defaults
             to ``False``.
         skip_empty : bool, optional
-            Skip empty text nodes (as determined by ``lxb_dom_node_is_empty``)
-            when ``include_text`` is ``True``. Defaults to ``False``.
+            Skip text nodes that contain only ASCII whitespace (space, tab,
+            newline, form feed or carriage return) when ``include_text`` is
+            ``True``. Defaults to ``False``.
 
         Yields
         ------
@@ -1039,23 +1042,77 @@ cdef class LexborNode:
         Returns
         -------
         bool
-            ``True`` when the node is a text node and
-            ``lxb_dom_node_is_empty`` reports that its parent subtree contains
-            only whitespace (or nothing).
+            ``True`` when the node is a text node whose character data consists
+            only of ASCII whitespace characters (space, tab, newline, form feed
+            or carriage return).
         """
         return self._is_empty_text_node(self.node)
 
-    cdef inline bint _is_empty_text_node(self, lxb_dom_node_t *node):
-        if node.type != LXB_DOM_NODE_TYPE_TEXT:
+    cdef inline bint _is_empty_text_node(self, lxb_dom_node_t *text_node):
+        """
+        Check whether a node is a text node made up solely of HTML ASCII whitespace.
+
+        Parameters
+        ----------
+        text_node : lxb_dom_node_t *
+            Pointer to the node that should be inspected.
+
+        Returns
+        -------
+        bint
+            ``True`` if ``text_node`` is a text node whose character data contains
+            only space, tab, newline, form feed, or carriage return characters;
+            otherwise ``False``.
+        """
+        if text_node == NULL or text_node.type != LXB_DOM_NODE_TYPE_TEXT:
             return False
 
-        # lexbor's emptiness check walks children of the passed node; for a
-        # text node we need to evaluate its parent so the text itself is
-        # inspected.
-        if node.parent != NULL:
-            return lxb_dom_node_is_empty(node.parent)
-        return lxb_dom_node_is_empty(node)
+        cdef lxb_dom_character_data_t *text_character_data = <lxb_dom_character_data_t *> text_node
+        cdef lexbor_str_t *text_buffer = &text_character_data.data
+        cdef size_t text_length = text_buffer.length
+        cdef lxb_char_t *text_bytes = text_buffer.data
 
+        return self._is_whitespace_only(text_bytes, text_length)
+
+    cdef inline bint _is_whitespace_only(self, const lxb_char_t *buffer, size_t buffer_length) nogil:
+        """
+        Determine whether a byte buffer consists only of HTML ASCII whitespace.
+
+        Parameters
+        ----------
+        buffer : const lxb_char_t *
+            Pointer to the buffer to inspect.
+        buffer_length : size_t
+            Number of bytes available in ``buffer``.
+
+        Returns
+        -------
+        bint
+            ``True`` if ``buffer`` is ``NULL``, empty, or contains only space
+            (0x20), tab (0x09), line feed (0x0A), form feed (0x0C), or carriage
+            return (0x0D) bytes; otherwise ``False``.
+
+        Notes
+        -----
+        Mirrors Lexbor's ``lexbor_utils_whitespace`` macro. Declared ``nogil`` so
+        that callers which have released the GIL can invoke it in hot loops.
+        """
+        cdef const lxb_char_t *cursor = buffer
+        cdef const lxb_char_t *end = buffer + buffer_length
+        cdef lxb_char_t current_char
+
+        if buffer == NULL or buffer_length == 0:
+            return True
+
+        # Inline whitespace check mirroring lexbor_utils_whitespace(chr, !=, &&)
+        while cursor < end:
+            current_char = cursor[0]
+            if (current_char != ' ' and current_char != '\t' and current_char != '\n'
+                    and current_char != '\f' and current_char != '\r'):
+                return False
+            cursor += 1
+
+        return True
 
 @cython.internal
 @cython.final

diff --git a/tests/test_lexbor.py b/tests/test_lexbor.py
@@ -1,8 +1,14 @@
 """Tests for functionality that is only supported by lexbor backend."""
 
+from inspect import cleandoc
+
 from selectolax.lexbor import LexborHTMLParser, parse_fragment
 
 
+def clean_doc(text: str) -> str:
+    return f"{cleandoc(text)}\n"
+
+
 def test_reads_inner_html():
     html = """<div id="main"><div>Hi</div><div id="updated">2025-09-27</div></div>"""
     parser = LexborHTMLParser(html)
@@ -120,6 +126,80 @@ def test_traverse_respects_skip_empty_on_text_nodes():
     assert ", ".join(children) == "div, span, -text, title"
 
 
+def test_traverse_with_skip_empty_on_a_full_html_document():
+    html = clean_doc(
+        """
+        <!doctype html>
+        <html lang="en">
+          <head>
+            <meta charset="utf-8">
+            <meta name="viewport" content="width=device-width,initial-scale=1">
+            <title>Title!</title>
+            <!-- My crazy comment -->
+          </head>
+          <body>
+            <p>Hello <strong>World</strong>!</p>
+            <div hidden draggable="true" translate="no" contenteditable="true" tabindex="3">
+              Div
+            </div>
+          </body>
+        </html>
+        """
+    )
+    parser = LexborHTMLParser(html)
+    children = [
+        (node.tag, node.text_content)
+        for node in parser.root.traverse(include_text=True, skip_empty=False)
+    ]
+    assert children == [
+        ("html", None),
+        ("head", None),
+        ("-text", "\n    "),
+        ("meta", None),
+        ("-text", "\n    "),
+        ("meta", None),
+        ("-text", "\n    "),
+        ("title", None),
+        ("-text", "Title!"),
+        ("-text", "\n    "),
+        ("-comment", None),
+        ("-text", "\n  "),
+        ("-text", "\n  "),
+        ("body", None),
+        ("-text", "\n    "),
+        ("p", None),
+        ("-text", "Hello "),
+        ("strong", None),
+        ("-text", "World"),
+        ("-text", "!"),
+        ("-text", "\n    "),
+        ("div", None),
+        ("-text", "\n      Div\n    "),
+        ("-text", "\n  \n\n"),
+    ]
+    children = [
+        (node.tag, node.text_content)
+        for node in parser.root.traverse(include_text=True, skip_empty=True)
+    ]
+    assert children == [
+        ("html", None),
+        ("head", None),
+        ("meta", None),
+        ("meta", None),
+        ("title", None),
+        ("-text", "Title!"),
+        ("-comment", None),
+        ("body", None),
+        ("p", None),
+        ("-text", "Hello "),
+        ("strong", None),
+        ("-text", "World"),
+        ("-text", "!"),
+        ("div", None),
+        ("-text", "\n      Div\n    "),
+    ]
+
+
 def test_is_empty_text_node_property():
     parser = LexborHTMLParser("<div><span>\n \n</span><title>X</title></div>")
     text_node = parser.css_first("span").first_child