Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions selectolax/lexbor.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,7 @@ cdef class LexborNode:
public LexborHTMLParser parser
cdef bint _is_node_type(self, lxb_dom_node_type_t expected_type)
cdef bint _is_empty_text_node(self, lxb_dom_node_t *node)
cdef inline bint _is_whitespace_only(self, const lxb_char_t *buffer, size_t buffer_length) nogil

@staticmethod
cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser)
Expand Down
25 changes: 15 additions & 10 deletions selectolax/lexbor.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,9 @@ class LexborNode:
If ``True``, apply ``str.strip()`` to each fragment before joining to
remove surrounding whitespace. Defaults to ``False``.
skip_empty : bool, optional
Exclude text nodes that ``lxb_dom_node_is_empty`` considers empty when
``True``. Defaults to ``False``.
Exclude text nodes whose content is only ASCII whitespace (space,
tab, newline, form feed or carriage return) when ``True``.
Defaults to ``False``.

Returns
-------
Expand Down Expand Up @@ -424,8 +425,9 @@ class LexborNode:
When ``True``, yield text nodes in addition to element nodes. Defaults
to ``False``.
skip_empty : bool, optional
When ``include_text`` is ``True``, ignore text nodes that
``lxb_dom_node_is_empty`` deems empty. Defaults to ``False``.
When ``include_text`` is ``True``, ignore text nodes made up solely
of ASCII whitespace (space, tab, newline, form feed or carriage
return). Defaults to ``False``.

Yields
------
Expand Down Expand Up @@ -512,8 +514,9 @@ class LexborNode:
When ``True``, include text nodes in the traversal sequence. Defaults
to ``False``.
skip_empty : bool, optional
Skip empty text nodes (as determined by ``lxb_dom_node_is_empty``)
when ``include_text`` is ``True``. Defaults to ``False``.
Skip text nodes that contain only ASCII whitespace (space, tab,
newline, form feed or carriage return) when ``include_text`` is
``True``. Defaults to ``False``.

Yields
------
Expand Down Expand Up @@ -783,8 +786,9 @@ class LexborNode:
Returns
-------
bool
``True`` when the node is a text node and ``lxb_dom_node_is_empty``
reports that it contains no characters.
``True`` when the node is a text node whose data consists solely of
ASCII whitespace characters (space, tab, newline, form feed or
carriage return).
"""
...

Expand Down Expand Up @@ -915,8 +919,9 @@ class LexborHTMLParser:
deep : bool, default True
If True, includes text from all child nodes.
skip_empty : bool, optional
Exclude text nodes that ``lxb_dom_node_is_empty`` considers empty when
``True``. Defaults to ``False``.
Exclude text nodes whose content is only ASCII whitespace (space,
tab, newline, form feed or carriage return) when ``True``.
Defaults to ``False``.

Returns
-------
Expand Down
5 changes: 3 additions & 2 deletions selectolax/lexbor.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -324,8 +324,9 @@ cdef class LexborHTMLParser:
deep : bool, default True
If True, includes text from all child nodes.
skip_empty : bool, optional
Exclude text nodes that ``lxb_dom_node_is_empty`` considers empty when
``True``. Defaults to ``False``.
Exclude text nodes whose content is only ASCII whitespace (space,
tab, newline, form feed or carriage return) when ``True``.
Defaults to ``False``.

Returns
-------
Expand Down
91 changes: 74 additions & 17 deletions selectolax/lexbor/node.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -149,8 +149,9 @@ cdef class LexborNode:
If ``True``, apply ``str.strip()`` to each fragment before joining to
remove surrounding whitespace. Defaults to ``False``.
skip_empty : bool, optional
Exclude text nodes that ``lxb_dom_node_is_empty`` considers empty when
``True``. Defaults to ``False``.
Exclude text nodes whose content is only ASCII whitespace (space,
tab, newline, form feed or carriage return) when ``True``.
Defaults to ``False``.

Returns
-------
Expand Down Expand Up @@ -440,8 +441,9 @@ cdef class LexborNode:
When ``True``, yield text nodes in addition to element nodes. Defaults
to ``False``.
skip_empty : bool, optional
When ``include_text`` is ``True``, ignore text nodes that
``lxb_dom_node_is_empty`` deems empty. Defaults to ``False``.
When ``include_text`` is ``True``, ignore text nodes made up solely
of ASCII whitespace (space, tab, newline, form feed or carriage
return). Defaults to ``False``.

Yields
------
Expand Down Expand Up @@ -594,8 +596,9 @@ cdef class LexborNode:
When ``True``, include text nodes in the traversal sequence. Defaults
to ``False``.
skip_empty : bool, optional
Skip empty text nodes (as determined by ``lxb_dom_node_is_empty``)
when ``include_text`` is ``True``. Defaults to ``False``.
Skip text nodes that contain only ASCII whitespace (space, tab,
newline, form feed or carriage return) when ``include_text`` is
``True``. Defaults to ``False``.

Yields
------
Expand Down Expand Up @@ -1039,23 +1042,77 @@ cdef class LexborNode:
Returns
-------
bool
``True`` when the node is a text node and
``lxb_dom_node_is_empty`` reports that its parent subtree contains
only whitespace (or nothing).
``True`` when the node is a text node whose character data consists
only of ASCII whitespace characters (space, tab, newline, form feed
or carriage return).
"""
return self._is_empty_text_node(self.node)

cdef inline bint _is_empty_text_node(self, lxb_dom_node_t *node):
if node.type != LXB_DOM_NODE_TYPE_TEXT:
cdef inline bint _is_empty_text_node(self, lxb_dom_node_t *text_node):
"""
Check whether a node is a text node made up solely of HTML ASCII whitespace.

Parameters
----------
text_node : lxb_dom_node_t *
Pointer to the node that should be inspected.

Returns
-------
bint
``True`` if ``text_node`` is a text node whose character data contains
only space, tab, newline, form feed, or carriage return characters;
otherwise ``False``.
"""
if text_node == NULL or text_node.type != LXB_DOM_NODE_TYPE_TEXT:
return False

# lexbor's emptiness check walks children of the passed node; for a
# text node we need to evaluate its parent so the text itself is
# inspected.
if node.parent != NULL:
return lxb_dom_node_is_empty(node.parent)
return lxb_dom_node_is_empty(node)
cdef lxb_dom_character_data_t *text_character_data = <lxb_dom_character_data_t *> text_node
cdef lexbor_str_t *text_buffer = &text_character_data.data
cdef size_t text_length = text_buffer.length
cdef lxb_char_t *text_bytes = text_buffer.data

return self._is_whitespace_only(text_bytes, text_length)

cdef inline bint _is_whitespace_only(self, const lxb_char_t *buffer, size_t buffer_length) nogil:
    """
    Determine whether a byte buffer consists only of HTML ASCII whitespace.

    Parameters
    ----------
    buffer : const lxb_char_t *
        Pointer to the buffer to inspect. May be ``NULL``.
    buffer_length : size_t
        Number of bytes available in ``buffer``.

    Returns
    -------
    bint
        ``True`` if ``buffer`` is ``NULL``, empty, or contains only space
        (0x20), tab (0x09), line feed (0x0A), form feed (0x0C), or carriage
        return (0x0D) bytes; otherwise ``False``.

    Notes
    -----
    Mirrors Lexbor's ``lexbor_utils_whitespace`` macro. The function is
    declared ``nogil`` and touches no Python objects, so it does not need
    the GIL to run.
    """
    cdef size_t index
    cdef lxb_char_t current_char

    # Guard first: no pointer arithmetic or dereference may happen on a
    # NULL buffer, and an empty buffer is trivially "whitespace only".
    if buffer == NULL or buffer_length == 0:
        return True

    # Inline whitespace check mirroring lexbor_utils_whitespace(chr, !=, &&);
    # bail out on the first byte that is not HTML ASCII whitespace.
    for index in range(buffer_length):
        current_char = buffer[index]
        if (current_char != ' ' and current_char != '\t' and current_char != '\n'
                and current_char != '\f' and current_char != '\r'):
            return False

    return True

@cython.internal
@cython.final
Expand Down
80 changes: 80 additions & 0 deletions tests/test_lexbor.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
"""Tests for functionality that is only supported by lexbor backend."""

from inspect import cleandoc

from selectolax.lexbor import LexborHTMLParser, parse_fragment


def clean_doc(text: str) -> str:
    """Return ``text`` normalised by ``inspect.cleandoc`` plus one trailing newline."""
    dedented = cleandoc(text)
    return dedented + "\n"


def test_reads_inner_html():
html = """<div id="main"><div>Hi</div><div id="updated">2025-09-27</div></div>"""
parser = LexborHTMLParser(html)
Expand Down Expand Up @@ -120,6 +126,80 @@ def test_traverse_respects_skip_empty_on_text_nodes():
assert ", ".join(children) == "div, span, -text, title"


def test_traverse_with_skip_empty_on_a_full_html_document():
"""Check ``traverse(include_text=True)`` on a full document with and without ``skip_empty``."""
# A complete document (doctype, head, body) whose markup contains
# whitespace-only text between elements as well as real text content.
html = clean_doc(
"""
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>Title!</title>
<!-- My crazy comment -->
</head>
<body>
<p>Hello <strong>World</strong>!</p>
<div hidden draggable="true" translate="no" contenteditable="true" tabindex="3">
Div
</div>
</body>
</html>
"""
)
parser = LexborHTMLParser(html)
# First pass: skip_empty=False, so whitespace-only text nodes are expected
# to appear in the traversal as "-text" entries.
children = [
(node.tag, node.text_content)
for node in parser.root.traverse(include_text=True, skip_empty=False)
]
assert children == [
("html", None),
("head", None),
("-text", "\n "),
("meta", None),
("-text", "\n "),
("meta", None),
("-text", "\n "),
("title", None),
("-text", "Title!"),
("-text", "\n "),
("-comment", None),
("-text", "\n "),
("-text", "\n "),
("body", None),
("-text", "\n "),
("p", None),
("-text", "Hello "),
("strong", None),
("-text", "World"),
("-text", "!"),
("-text", "\n "),
("div", None),
("-text", "\n Div\n "),
("-text", "\n \n\n"),
]
# Second pass: skip_empty=True. Whitespace-only text nodes are expected to
# be dropped, while text nodes holding any non-whitespace content (including
# the padded "Div" node) and the comment node are expected to remain.
children = [
(node.tag, node.text_content)
for node in parser.root.traverse(include_text=True, skip_empty=True)
]
assert children == [
("html", None),
("head", None),
("meta", None),
("meta", None),
("title", None),
("-text", "Title!"),
("-comment", None),
("body", None),
("p", None),
("-text", "Hello "),
("strong", None),
("-text", "World"),
("-text", "!"),
("div", None),
("-text", "\n Div\n "),
]


def test_is_empty_text_node_property():
parser = LexborHTMLParser("<div><span>\n \n</span><title>X</title></div>")
text_node = parser.css_first("span").first_child
Expand Down
Loading