Skip to content

Commit 9d27a52

Browse files
authored
Introducing the ability to skip empty text nodes in LexborNode! (#187)
1 parent 51ebd42 commit 9d27a52

File tree

4 files changed

+192
-48
lines changed

4 files changed

+192
-48
lines changed

selectolax/lexbor.pxd

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,7 @@ cdef class LexborNode:
234234
lxb_dom_node_t *node
235235
public LexborHTMLParser parser
236236
cdef bint _is_node_type(self, lxb_dom_node_type_t expected_type)
237+
cdef bint _is_empty_text_node(self, lxb_dom_node_t *node)
237238

238239
@staticmethod
239240
cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser)
@@ -299,6 +300,7 @@ cdef extern from "lexbor/dom/dom.h" nogil:
299300
lxb_dom_collection_t * lxb_dom_collection_make(lxb_dom_document_t *document, size_t start_list_size)
300301
lxb_char_t * lxb_dom_node_text_content(lxb_dom_node_t *node, size_t *len)
301302
lxb_status_t lxb_dom_node_text_content_set(lxb_dom_node_t *node, const lxb_char_t *content, size_t len)
303+
bint lxb_dom_node_is_empty(lxb_dom_node_t *node)
302304
void lxb_dom_node_remove(lxb_dom_node_t *node)
303305
void * lxb_dom_document_destroy_text_noi(lxb_dom_document_t *document, lxb_char_t *text)
304306
lxb_dom_node_t * lxb_dom_document_root(lxb_dom_document_t *document)

selectolax/lexbor.pyi

Lines changed: 60 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -167,21 +167,33 @@ class LexborNode:
167167
"""
168168
...
169169

170-
def text(self, deep: bool = True, separator: str = "", strip: bool = False) -> str:
171-
"""Returns the text of the node including text of all its child nodes.
170+
def text(
171+
self,
172+
deep: bool = True,
173+
separator: str = "",
174+
strip: bool = False,
175+
skip_empty: bool = False,
176+
) -> str:
177+
"""Return concatenated text from this node.
172178
173179
Parameters
174180
----------
175-
strip : bool, default False
176-
If true, calls ``str.strip()`` on each text part to remove extra white spaces.
177-
separator : str, default ''
178-
The separator to use when joining text from different nodes.
179-
deep : bool, default True
180-
If True, includes text from all child nodes.
181+
deep : bool, optional
182+
When ``True`` (default), include text from all descendant nodes; when
183+
``False``, only include direct children.
184+
separator : str, optional
185+
String inserted between successive text fragments.
186+
strip : bool, optional
187+
If ``True``, apply ``str.strip()`` to each fragment before joining to
188+
remove surrounding whitespace. Defaults to ``False``.
189+
skip_empty : bool, optional
190+
When ``True``, exclude text nodes that ``lxb_dom_node_is_empty``
191+
considers empty. Defaults to ``False``.
181192
182193
Returns
183194
-------
184195
text : str
196+
Combined textual content assembled according to the provided options.
185197
"""
186198
...
187199

@@ -399,17 +411,25 @@ class LexborNode:
399411
"""
400412
...
401413

402-
def iter(self, include_text: bool = False) -> Iterator[LexborNode]:
403-
"""Iterate over nodes on the current level.
414+
def iter(
415+
self, include_text: bool = False, skip_empty: bool = False
416+
) -> Iterator[LexborNode]:
417+
"""Iterate over direct children of this node.
404418
405419
Parameters
406420
----------
407-
include_text : bool
408-
If True, includes text nodes as well.
421+
include_text : bool, optional
422+
When ``True``, yield text nodes in addition to element nodes. Defaults
423+
to ``False``.
424+
skip_empty : bool, optional
425+
When ``include_text`` is ``True``, ignore text nodes that
426+
``lxb_dom_node_is_empty`` deems empty. Defaults to ``False``.
409427
410428
Yields
411-
-------
412-
node
429+
------
430+
LexborNode
431+
Direct child nodes of this node, filtered according
432+
to the provided options.
413433
"""
414434
...
415435

@@ -479,17 +499,25 @@ class LexborNode:
479499
"""
480500
...
481501

482-
def traverse(self, include_text: bool = False) -> Iterator[LexborNode]:
483-
"""Iterate over all child and next nodes starting from the current level.
502+
def traverse(
503+
self, include_text: bool = False, skip_empty: bool = False
504+
) -> Iterator[LexborNode]:
505+
"""Depth-first traversal starting at the current node.
484506
485507
Parameters
486508
----------
487-
include_text : bool
488-
If True, includes text nodes as well.
509+
include_text : bool, optional
510+
When ``True``, include text nodes in the traversal sequence. Defaults
511+
to ``False``.
512+
skip_empty : bool, optional
513+
Skip empty text nodes (as determined by ``lxb_dom_node_is_empty``)
514+
when ``include_text`` is ``True``. Defaults to ``False``.
489515
490516
Yields
491-
-------
492-
node
517+
------
518+
LexborNode
519+
Nodes encountered in depth-first order beginning with the current
520+
node, filtered according to the provided options.
493521
"""
494522
...
495523

@@ -746,6 +774,18 @@ class LexborNode:
746774
"""Return True if the node represents a document node."""
747775
...
748776

777+
@property
778+
def is_empty_text_node(self) -> bool:
779+
"""Check whether the current node is an empty text node.
780+
781+
Returns
782+
-------
783+
bool
784+
``True`` when the node is a text node and ``lxb_dom_node_is_empty``
785+
reports that its parent subtree contains only whitespace (or nothing).
786+
"""
787+
...
788+
749789
class LexborHTMLParser:
750790
"""The lexbor HTML parser.
751791

selectolax/lexbor/node.pxi

Lines changed: 83 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -135,21 +135,27 @@ cdef class LexborNode:
135135
lxb_dom_document_destroy_text_noi(self.node.owner_document, text)
136136
return unicode_text
137137

138-
def text(self, bool deep=True, str separator='', bool strip=False):
139-
"""Returns the text of the node including text of all its child nodes.
138+
def text(self, bool deep=True, str separator='', bool strip=False, bool skip_empty=False):
139+
"""Return concatenated text from this node.
140140
141141
Parameters
142142
----------
143-
strip : bool, default False
144-
If true, calls ``str.strip()`` on each text part to remove extra white spaces.
145-
separator : str, default ''
146-
The separator to use when joining text from different nodes.
147-
deep : bool, default True
148-
If True, includes text from all child nodes.
143+
deep : bool, optional
144+
When ``True`` (default), include text from all descendant nodes; when
145+
``False``, only include direct children.
146+
separator : str, optional
147+
String inserted between successive text fragments.
148+
strip : bool, optional
149+
If ``True``, apply ``str.strip()`` to each fragment before joining to
150+
remove surrounding whitespace. Defaults to ``False``.
151+
skip_empty : bool, optional
152+
When ``True``, exclude text nodes that ``lxb_dom_node_is_empty``
153+
considers empty. Defaults to ``False``.
149154
150155
Returns
151156
-------
152157
text : str
158+
Combined textual content assembled according to the provided options.
153159
154160
"""
155161
cdef unsigned char * text
@@ -160,23 +166,26 @@ cdef class LexborNode:
160166
if self._is_node_type(LXB_DOM_NODE_TYPE_TEXT):
161167
text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> self.node).data)
162168
if text != NULL:
163-
py_text = text.decode(_ENCODING)
164-
container.append(py_text)
169+
if not skip_empty or not self.is_empty_text_node:
170+
py_text = text.decode(_ENCODING)
171+
container.append(py_text)
165172

166173
while node != NULL:
167174
if node.type == LXB_DOM_NODE_TYPE_TEXT:
168175
text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> node).data)
169176
if text != NULL:
170-
py_text = text.decode(_ENCODING)
171-
container.append(py_text)
177+
if not skip_empty or not self._is_empty_text_node(node):
178+
py_text = text.decode(_ENCODING)
179+
container.append(py_text)
172180
node = node.next
173181
return container.text
174182
else:
175183
container = TextContainer(separator, strip)
176184
if self._is_node_type(LXB_DOM_NODE_TYPE_TEXT):
177185
text = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> self.node).data)
178186
if text != NULL:
179-
container.append(text.decode(_ENCODING))
187+
if not skip_empty or not self.is_empty_text_node:
188+
container.append(text.decode(_ENCODING))
180189

181190
lxb_dom_node_simple_walk(
182191
<lxb_dom_node_t *> self.node,
@@ -422,17 +431,23 @@ cdef class LexborNode:
422431
return value.decode(_ENCODING) if value else None
423432
return None
424433

425-
def iter(self, include_text=False):
426-
"""Iterate over nodes on the current level.
434+
def iter(self, bool include_text = False, bool skip_empty = False):
435+
"""Iterate over direct children of this node.
427436
428437
Parameters
429438
----------
430-
include_text : bool
431-
If True, includes text nodes as well.
439+
include_text : bool, optional
440+
When ``True``, yield text nodes in addition to element nodes. Defaults
441+
to ``False``.
442+
skip_empty : bool, optional
443+
When ``include_text`` is ``True``, ignore text nodes that
444+
``lxb_dom_node_is_empty`` deems empty. Defaults to ``False``.
432445
433446
Yields
434-
-------
435-
node
447+
------
448+
LexborNode
449+
Direct child nodes of this node, filtered according
450+
to the provided options.
436451
"""
437452

438453
cdef lxb_dom_node_t *node = self.node.first_child
@@ -442,11 +457,20 @@ cdef class LexborNode:
442457
if node.type == LXB_DOM_NODE_TYPE_TEXT and not include_text:
443458
node = node.next
444459
continue
460+
if node.type == LXB_DOM_NODE_TYPE_TEXT and include_text and skip_empty and self._is_empty_text_node(node):
461+
node = node.next
462+
continue
445463

446464
next_node = LexborNode.new(<lxb_dom_node_t *> node, self.parser)
447465
yield next_node
448466
node = node.next
449467

468+
def __iter__(self):
469+
return self.iter()
470+
471+
def __next__(self):
472+
return self.next
473+
450474
def unwrap(self, bint delete_empty=False):
451475
"""Replace node with whatever is inside this node.
452476
@@ -561,26 +585,33 @@ cdef class LexborNode:
561585
LexborNode.new(node, self.parser).merge_text_nodes()
562586
node = next_node
563587

564-
def traverse(self, include_text=False):
565-
"""Iterate over all child and next nodes starting from the current level.
588+
def traverse(self, bool include_text = False, bool skip_empty = False):
589+
"""Depth-first traversal starting at the current node.
566590
567591
Parameters
568592
----------
569-
include_text : bool
570-
If True, includes text nodes as well.
593+
include_text : bool, optional
594+
When ``True``, include text nodes in the traversal sequence. Defaults
595+
to ``False``.
596+
skip_empty : bool, optional
597+
Skip empty text nodes (as determined by ``lxb_dom_node_is_empty``)
598+
when ``include_text`` is ``True``. Defaults to ``False``.
571599
572600
Yields
573-
-------
574-
node
601+
------
602+
LexborNode
603+
Nodes encountered in depth-first order beginning with the current
604+
node, filtered according to the provided options.
575605
"""
576606
cdef lxb_dom_node_t * root = self.node
577607
cdef lxb_dom_node_t * node = root
578608
cdef LexborNode lxb_node
579609

580610
while node != NULL:
581-
if not (not include_text and node.type == LXB_DOM_NODE_TYPE_TEXT):
582-
lxb_node = LexborNode.new(<lxb_dom_node_t *> node, self.parser)
583-
yield lxb_node
611+
if include_text or node.type != LXB_DOM_NODE_TYPE_TEXT:
612+
if not skip_empty or not self._is_empty_text_node(node):
613+
lxb_node = LexborNode.new(<lxb_dom_node_t *> node, self.parser)
614+
yield lxb_node
584615

585616
if node.first_child != NULL:
586617
node = node.first_child
@@ -1001,6 +1032,30 @@ cdef class LexborNode:
10011032
"""Return True if the node represents a document node."""
10021033
return self._is_node_type(LXB_DOM_NODE_TYPE_DOCUMENT)
10031034

1035+
@property
1036+
def is_empty_text_node(self) -> bool:
1037+
"""Check whether the current node is an empty text node.
1038+
1039+
Returns
1040+
-------
1041+
bool
1042+
``True`` when the node is a text node and
1043+
``lxb_dom_node_is_empty`` reports that its parent subtree contains
1044+
only whitespace (or nothing).
1045+
"""
1046+
return self._is_empty_text_node(self.node)
1047+
1048+
cdef inline bint _is_empty_text_node(self, lxb_dom_node_t *node):
1049+
if node.type != LXB_DOM_NODE_TYPE_TEXT:
1050+
return False
1051+
1052+
# lexbor's emptiness check walks children of the passed node; for a
1053+
# text node we need to evaluate its parent so the text itself is
1054+
# inspected (along with any sibling nodes under that parent).
1055+
if node.parent != NULL:
1056+
return lxb_dom_node_is_empty(node.parent)
1057+
return lxb_dom_node_is_empty(node)
1058+
10041059

10051060
@cython.internal
10061061
@cython.final

tests/test_lexbor.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,3 +81,50 @@ def test_node_type_helpers():
8181
assert document_node is not None
8282
assert document_node.is_document_node
8383
assert not document_node.is_element_node
84+
85+
86+
def test_text_honors_skip_empty_flag():
87+
parser = LexborHTMLParser("<div><span>value</span><title>\n \n</title></div>")
88+
span = parser.css_first("span")
89+
assert span is not None
90+
assert span.text(deep=False, skip_empty=False) == "value"
91+
assert span.text(deep=False, skip_empty=True) == "value"
92+
title = parser.css_first("title")
93+
assert title is not None
94+
assert title.text(deep=False, skip_empty=False) == "\n \n"
95+
assert title.text(deep=False, skip_empty=True) == ""
96+
97+
98+
def test_iter_includes_text_nodes_when_requested():
99+
parser = LexborHTMLParser("<div><span>value</span><title>\n \n</title></div>")
100+
div = parser.css_first("div")
101+
children = [node for node in div.iter(include_text=True, skip_empty=True)]
102+
assert (
103+
", ".join(
104+
node.tag for node in children[0].iter(include_text=True, skip_empty=True)
105+
)
106+
== "-text"
107+
)
108+
assert (
109+
", ".join(
110+
node.tag for node in children[1].iter(include_text=True, skip_empty=True)
111+
)
112+
== ""
113+
)
114+
115+
116+
def test_traverse_respects_skip_empty_on_text_nodes():
117+
parser = LexborHTMLParser("<div><span>value</span><title>\n \n</title></div>")
118+
div = parser.css_first("div")
119+
children = [node.tag for node in div.traverse(include_text=True, skip_empty=True)]
120+
assert ", ".join(children) == "div, span, -text, title"
121+
122+
123+
def test_is_empty_text_node_property():
124+
parser = LexborHTMLParser("<div><span>\n \n</span><title>X</title></div>")
125+
text_node = parser.css_first("span").first_child
126+
assert text_node.text_content == "\n \n"
127+
assert text_node.is_empty_text_node
128+
text_node = parser.css_first("title").first_child
129+
assert text_node.text_content == "X"
130+
assert not text_node.is_empty_text_node

0 commit comments

Comments
 (0)