Skip to content

Commit 51efb93

Browse files
authored
Refactor Lexbor empty text detection to ASCII whitespace only
1 parent 10c17c0 commit 51efb93

File tree

6 files changed

+185
-37
lines changed

6 files changed

+185
-37
lines changed

selectolax/lexbor.pxd

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -241,12 +241,15 @@ cdef class LexborNode:
241241
lxb_dom_node_t *node
242242
public LexborHTMLParser parser
243243
cdef bint _is_node_type(self, lxb_dom_node_type_t expected_type)
244-
cdef bint _is_empty_text_node(self, lxb_dom_node_t *node)
245244

246245
@staticmethod
247246
cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser)
248247

249248

249+
cdef bint is_empty_text_node(lxb_dom_node_t *node)
250+
cdef inline bint _is_whitespace_only(const lxb_char_t *buffer, size_t buffer_length) nogil
251+
252+
250253
cdef class LexborCSSSelector:
251254
cdef lxb_css_parser_t* parser
252255
cdef lxb_selectors_t * selectors

selectolax/lexbor.pyi

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -189,8 +189,9 @@ class LexborNode:
189189
If ``True``, apply ``str.strip()`` to each fragment before joining to
190190
remove surrounding whitespace. Defaults to ``False``.
191191
skip_empty : bool, optional
192-
Exclude text nodes that ``lxb_dom_node_is_empty`` considers empty when
193-
``True``. Defaults to ``False``.
192+
Exclude text nodes whose content is only ASCII whitespace (space,
193+
tab, newline, form feed or carriage return) when ``True``.
194+
Defaults to ``False``.
194195
195196
Returns
196197
-------
@@ -424,8 +425,9 @@ class LexborNode:
424425
When ``True``, yield text nodes in addition to element nodes. Defaults
425426
to ``False``.
426427
skip_empty : bool, optional
427-
When ``include_text`` is ``True``, ignore text nodes that
428-
``lxb_dom_node_is_empty`` deems empty. Defaults to ``False``.
428+
When ``include_text`` is ``True``, ignore text nodes made up solely
429+
of ASCII whitespace (space, tab, newline, form feed or carriage
430+
return). Defaults to ``False``.
429431
430432
Yields
431433
------
@@ -512,8 +514,9 @@ class LexborNode:
512514
When ``True``, include text nodes in the traversal sequence. Defaults
513515
to ``False``.
514516
skip_empty : bool, optional
515-
Skip empty text nodes (as determined by ``lxb_dom_node_is_empty``)
516-
when ``include_text`` is ``True``. Defaults to ``False``.
517+
Skip text nodes that contain only ASCII whitespace (space, tab,
518+
newline, form feed or carriage return) when ``include_text`` is
519+
``True``. Defaults to ``False``.
517520
518521
Yields
519522
------
@@ -783,8 +786,9 @@ class LexborNode:
783786
Returns
784787
-------
785788
bool
786-
``True`` when the node is a text node and ``lxb_dom_node_is_empty``
787-
reports that it contains no characters.
789+
``True`` when the node is a text node whose data consists solely of
790+
ASCII whitespace characters (space, tab, newline, form feed or
791+
carriage return).
788792
"""
789793
...
790794

@@ -915,8 +919,9 @@ class LexborHTMLParser:
915919
deep : bool, default True
916920
If True, includes text from all child nodes.
917921
skip_empty : bool, optional
918-
Exclude text nodes that ``lxb_dom_node_is_empty`` considers empty when
919-
``True``. Defaults to ``False``.
922+
Exclude text nodes whose content is only ASCII whitespace (space,
923+
tab, newline, form feed or carriage return) when ``True``.
924+
Defaults to ``False``.
920925
921926
Returns
922927
-------

selectolax/lexbor.pyx

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -324,8 +324,9 @@ cdef class LexborHTMLParser:
324324
deep : bool, default True
325325
If True, includes text from all child nodes.
326326
skip_empty : bool, optional
327-
Exclude text nodes that ``lxb_dom_node_is_empty`` considers empty when
328-
``True``. Defaults to ``False``.
327+
Exclude text nodes whose content is only ASCII whitespace (space,
328+
tab, newline, form feed or carriage return) when ``True``.
329+
Defaults to ``False``.
329330

330331
Returns
331332
-------

selectolax/lexbor/node.pxi

Lines changed: 16 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -149,8 +149,9 @@ cdef class LexborNode:
149149
If ``True``, apply ``str.strip()`` to each fragment before joining to
150150
remove surrounding whitespace. Defaults to ``False``.
151151
skip_empty : bool, optional
152-
Exclude text nodes that ``lxb_dom_node_is_empty`` considers empty when
153-
``True``. Defaults to ``False``.
152+
Exclude text nodes whose content is only ASCII whitespace (space,
153+
tab, newline, form feed or carriage return) when ``True``.
154+
Defaults to ``False``.
154155
155156
Returns
156157
-------
@@ -174,7 +175,7 @@ cdef class LexborNode:
174175
if node.type == LXB_DOM_NODE_TYPE_TEXT:
175176
text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> node).data)
176177
if text != NULL:
177-
if not skip_empty or not self._is_empty_text_node(node):
178+
if not skip_empty or not is_empty_text_node(node):
178179
py_text = text.decode(_ENCODING)
179180
container.append(py_text)
180181
node = node.next
@@ -440,8 +441,9 @@ cdef class LexborNode:
440441
When ``True``, yield text nodes in addition to element nodes. Defaults
441442
to ``False``.
442443
skip_empty : bool, optional
443-
When ``include_text`` is ``True``, ignore text nodes that
444-
``lxb_dom_node_is_empty`` deems empty. Defaults to ``False``.
444+
When ``include_text`` is ``True``, ignore text nodes made up solely
445+
of ASCII whitespace (space, tab, newline, form feed or carriage
446+
return). Defaults to ``False``.
445447
446448
Yields
447449
------
@@ -457,7 +459,7 @@ cdef class LexborNode:
457459
if node.type == LXB_DOM_NODE_TYPE_TEXT and not include_text:
458460
node = node.next
459461
continue
460-
if node.type == LXB_DOM_NODE_TYPE_TEXT and include_text and skip_empty and self._is_empty_text_node(node):
462+
if node.type == LXB_DOM_NODE_TYPE_TEXT and include_text and skip_empty and is_empty_text_node(node):
461463
node = node.next
462464
continue
463465

@@ -594,8 +596,9 @@ cdef class LexborNode:
594596
When ``True``, include text nodes in the traversal sequence. Defaults
595597
to ``False``.
596598
skip_empty : bool, optional
597-
Skip empty text nodes (as determined by ``lxb_dom_node_is_empty``)
598-
when ``include_text`` is ``True``. Defaults to ``False``.
599+
Skip text nodes that contain only ASCII whitespace (space, tab,
600+
newline, form feed or carriage return) when ``include_text`` is
601+
``True``. Defaults to ``False``.
599602
600603
Yields
601604
------
@@ -609,7 +612,7 @@ cdef class LexborNode:
609612

610613
while node != NULL:
611614
if include_text or node.type != LXB_DOM_NODE_TYPE_TEXT:
612-
if not skip_empty or not self._is_empty_text_node(node):
615+
if not skip_empty or not is_empty_text_node(node):
613616
lxb_node = LexborNode.new(<lxb_dom_node_t *> node, self.parser)
614617
yield lxb_node
615618

@@ -1039,22 +1042,11 @@ cdef class LexborNode:
10391042
Returns
10401043
-------
10411044
bool
1042-
``True`` when the node is a text node and
1043-
``lxb_dom_node_is_empty`` reports that its parent subtree contains
1044-
only whitespace (or nothing).
1045+
``True`` when the node is a text node whose character data consists
1046+
only of ASCII whitespace characters (space, tab, newline, form feed
1047+
or carriage return).
10451048
"""
1046-
return self._is_empty_text_node(self.node)
1047-
1048-
cdef inline bint _is_empty_text_node(self, lxb_dom_node_t *node):
1049-
if node.type != LXB_DOM_NODE_TYPE_TEXT:
1050-
return False
1051-
1052-
# lexbor's emptiness check walks children of the passed node; for a
1053-
# text node we need to evaluate its parent so the text itself is
1054-
# inspected.
1055-
if node.parent != NULL:
1056-
return lxb_dom_node_is_empty(node.parent)
1057-
return lxb_dom_node_is_empty(node)
1049+
return is_empty_text_node(self.node)
10581050

10591051

10601052
@cython.internal

selectolax/lexbor/util.pxi

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,70 @@ def parse_fragment(html: str):
1818
if they are missing. This function does not add these tags.
1919
"""
2020
return do_parse_fragment(html, LexborHTMLParser)
21+
22+
23+
cdef inline bint is_empty_text_node(lxb_dom_node_t *text_node):
    """
    Report whether a node is a text node holding only HTML ASCII whitespace.

    Parameters
    ----------
    text_node : lxb_dom_node_t *
        Node to examine. ``NULL`` is accepted.

    Returns
    -------
    bint
        ``False`` when ``text_node`` is ``NULL`` or is not of type
        ``LXB_DOM_NODE_TYPE_TEXT``. Otherwise the result of
        ``_is_whitespace_only`` applied to the node's character data, i.e.
        ``True`` when that data is empty or made up solely of space, tab,
        newline, form feed, or carriage return bytes.
    """
    cdef lexbor_str_t *payload

    # Guard clauses: only real text nodes can be "empty text".
    if text_node == NULL:
        return False
    if text_node.type != LXB_DOM_NODE_TYPE_TEXT:
        return False

    # Text nodes are viewed through lxb_dom_character_data_t; its ``data``
    # member is the lexbor string carrying the bytes and their length.
    payload = &(<lxb_dom_character_data_t *> text_node).data
    return _is_whitespace_only(payload.data, payload.length)
48+
49+
cdef inline bint _is_whitespace_only(const lxb_char_t *buffer, size_t buffer_length) nogil:
    """
    Determine whether a byte buffer consists only of HTML ASCII whitespace.

    Parameters
    ----------
    buffer : const lxb_char_t *
        Pointer to the first byte to inspect. May be ``NULL``.
    buffer_length : size_t
        Number of bytes available in ``buffer``.

    Returns
    -------
    bint
        ``True`` if ``buffer`` is ``NULL``, ``buffer_length`` is zero, or
        every byte is space (0x20), tab (0x09), line feed (0x0A), form feed
        (0x0C), or carriage return (0x0D); otherwise ``False``.

    Notes
    -----
    Mirrors Lexbor's ``lexbor_utils_whitespace`` macro. The function is
    declared ``nogil`` so it can be called from code that has released the
    GIL; it touches no Python objects.
    """
    cdef size_t offset
    cdef lxb_char_t current_char

    # Check for NULL/empty *before* doing anything with the pointer, so no
    # pointer arithmetic is ever performed on a NULL buffer.
    if buffer == NULL or buffer_length == 0:
        return True

    # Stop at the first byte that is not one of the five whitespace bytes.
    for offset in range(buffer_length):
        current_char = buffer[offset]
        if (current_char != ' ' and current_char != '\t' and current_char != '\n'
                and current_char != '\f' and current_char != '\r'):
            return False

    return True

tests/test_lexbor.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
11
"""Tests for functionality that is only supported by lexbor backend."""
22

3+
from inspect import cleandoc
4+
35
from selectolax.lexbor import LexborHTMLParser, parse_fragment
46

57

8+
def clean_doc(text: str) -> str:
    """Normalize indentation of ``text`` via ``inspect.cleandoc`` and end it with a newline."""
    normalized = cleandoc(text)
    return normalized + "\n"
10+
11+
612
def test_reads_inner_html():
713
html = """<div id="main"><div>Hi</div><div id="updated">2025-09-27</div></div>"""
814
parser = LexborHTMLParser(html)
@@ -120,6 +126,80 @@ def test_traverse_respects_skip_empty_on_text_nodes():
120126
assert ", ".join(children) == "div, span, -text, title"
121127

122128

129+
def test_traverse_with_skip_empty_on_a_full_html_document():
130+
html = clean_doc(
131+
"""
132+
<!doctype html>
133+
<html lang="en">
134+
<head>
135+
<meta charset="utf-8">
136+
<meta name="viewport" content="width=device-width,initial-scale=1">
137+
<title>Title!</title>
138+
<!-- My crazy comment -->
139+
</head>
140+
<body>
141+
<p>Hello <strong>World</strong>!</p>
142+
<div hidden draggable="true" translate="no" contenteditable="true" tabindex="3">
143+
Div
144+
</div>
145+
</body>
146+
</html>
147+
"""
148+
)
149+
parser = LexborHTMLParser(html)
150+
children = [
151+
(node.tag, node.text_content)
152+
for node in parser.root.traverse(include_text=True, skip_empty=False)
153+
]
154+
assert children == [
155+
("html", None),
156+
("head", None),
157+
("-text", "\n "),
158+
("meta", None),
159+
("-text", "\n "),
160+
("meta", None),
161+
("-text", "\n "),
162+
("title", None),
163+
("-text", "Title!"),
164+
("-text", "\n "),
165+
("-comment", None),
166+
("-text", "\n "),
167+
("-text", "\n "),
168+
("body", None),
169+
("-text", "\n "),
170+
("p", None),
171+
("-text", "Hello "),
172+
("strong", None),
173+
("-text", "World"),
174+
("-text", "!"),
175+
("-text", "\n "),
176+
("div", None),
177+
("-text", "\n Div\n "),
178+
("-text", "\n \n\n"),
179+
]
180+
children = [
181+
(node.tag, node.text_content)
182+
for node in parser.root.traverse(include_text=True, skip_empty=True)
183+
]
184+
assert children == [
185+
("html", None),
186+
("head", None),
187+
("meta", None),
188+
("meta", None),
189+
("title", None),
190+
("-text", "Title!"),
191+
("-comment", None),
192+
("body", None),
193+
("p", None),
194+
("-text", "Hello "),
195+
("strong", None),
196+
("-text", "World"),
197+
("-text", "!"),
198+
("div", None),
199+
("-text", "\n Div\n "),
200+
]
201+
202+
123203
def test_is_empty_text_node_property():
124204
parser = LexborHTMLParser("<div><span>\n \n</span><title>X</title></div>")
125205
text_node = parser.css_first("span").first_child

0 commit comments

Comments
 (0)