@@ -135,21 +135,27 @@ cdef class LexborNode:
135135 lxb_dom_document_destroy_text_noi(self .node.owner_document, text)
136136 return unicode_text
137137
138- def text (self , bool deep = True , str separator = ' ' , bool strip = False ):
139- """ Returns the text of the node including text of all its child nodes .
138+ def text (self , bool deep = True , str separator = ' ' , bool strip = False , bool skip_empty = False ):
139+ """ Return concatenated text from this node.
140140
141141 Parameters
142142 ----------
143- strip : bool, default False
144- If true, calls ``str.strip()`` on each text part to remove extra white spaces.
145- separator : str, default ''
146- The separator to use when joining text from different nodes.
147- deep : bool, default True
148- If True, includes text from all child nodes.
143+ deep : bool, optional
144+ When ``True`` (default), include text from all descendant nodes; when
145+ ``False``, only include direct children.
146+ separator : str, optional
147+ String inserted between successive text fragments.
148+ strip : bool, optional
149+ If ``True``, apply ``str.strip()`` to each fragment before joining to
150+ remove surrounding whitespace. Defaults to ``False``.
151+ skip_empty : bool, optional
152+ Exclude text nodes that ``lxb_dom_node_is_empty`` considers empty when
153+ ``True``. Defaults to ``False``.
149154
150155 Returns
151156 -------
152157 text : str
158+ Combined textual content assembled according to the provided options.
153159
154160 """
155161 cdef unsigned char * text
@@ -160,23 +166,26 @@ cdef class LexborNode:
160166 if self ._is_node_type(LXB_DOM_NODE_TYPE_TEXT):
161167 text = < unsigned char * > lexbor_str_data_noi(& (< lxb_dom_character_data_t * > self .node).data)
162168 if text != NULL :
163- py_text = text.decode(_ENCODING)
164- container.append(py_text)
169+ if not skip_empty or not self .is_empty_text_node:
170+ py_text = text.decode(_ENCODING)
171+ container.append(py_text)
165172
166173 while node != NULL :
167174 if node.type == LXB_DOM_NODE_TYPE_TEXT:
168175 text = < unsigned char * > lexbor_str_data_noi(& (< lxb_dom_character_data_t * > node).data)
169176 if text != NULL :
170- py_text = text.decode(_ENCODING)
171- container.append(py_text)
177+ if not skip_empty or not self ._is_empty_text_node(node):
178+ py_text = text.decode(_ENCODING)
179+ container.append(py_text)
172180 node = node.next
173181 return container.text
174182 else :
175183 container = TextContainer(separator, strip)
176184 if self ._is_node_type(LXB_DOM_NODE_TYPE_TEXT):
177185 text = < unsigned char * > lexbor_str_data_noi(& (< lxb_dom_character_data_t * > self .node).data)
178186 if text != NULL :
179- container.append(text.decode(_ENCODING))
187+ if not skip_empty or not self .is_empty_text_node:
188+ container.append(text.decode(_ENCODING))
180189
181190 lxb_dom_node_simple_walk(
182191 < lxb_dom_node_t * > self .node,
@@ -422,17 +431,23 @@ cdef class LexborNode:
422431 return value.decode(_ENCODING) if value else None
423432 return None
424433
425- def iter (self , include_text = False ):
426- """ Iterate over nodes on the current level .
434+ def iter (self , bool include_text = False , bool skip_empty = False ):
435+ """ Iterate over direct children of this node .
427436
428437 Parameters
429438 ----------
430- include_text : bool
431- If True, includes text nodes as well.
439+ include_text : bool, optional
440+ When ``True``, yield text nodes in addition to element nodes. Defaults
441+ to ``False``.
442+ skip_empty : bool, optional
443+ When ``include_text`` is ``True``, ignore text nodes that
444+ ``lxb_dom_node_is_empty`` deems empty. Defaults to ``False``.
432445
433446 Yields
434- -------
435- node
447+ ------
448+ LexborNode
449+ Child nodes on the same tree level as this node, filtered according
450+ to the provided options.
436451 """
437452
438453 cdef lxb_dom_node_t * node = self .node.first_child
@@ -442,11 +457,20 @@ cdef class LexborNode:
442457 if node.type == LXB_DOM_NODE_TYPE_TEXT and not include_text:
443458 node = node.next
444459 continue
460+ if node.type == LXB_DOM_NODE_TYPE_TEXT and include_text and skip_empty and self ._is_empty_text_node(node):
461+ node = node.next
462+ continue
445463
446464 next_node = LexborNode.new(< lxb_dom_node_t * > node, self .parser)
447465 yield next_node
448466 node = node.next
449467
def __iter__(self):
    """Support ``for child in node`` by delegating to ``iter()``.

    ``iter()`` is called with its default arguments, so text nodes are not
    yielded.
    """
    children = self.iter()
    return children
470+
def __next__(self):
    """Return the value of this node's ``next`` attribute.

    NOTE(review): unlike a conventional iterator, this method never raises
    ``StopIteration`` itself; it simply hands back whatever ``self.next``
    evaluates to. Presumably that is the following sibling (or ``None`` at
    the end) -- confirm against the ``next`` property before using
    ``next(node)`` to drive a loop.
    """
    following = self.next
    return following
473+
450474 def unwrap (self , bint delete_empty = False ):
451475 """ Replace node with whatever is inside this node.
452476
@@ -561,26 +585,33 @@ cdef class LexborNode:
561585 LexborNode.new(node, self .parser).merge_text_nodes()
562586 node = next_node
563587
564- def traverse (self , include_text = False ):
565- """ Iterate over all child and next nodes starting from the current level .
588+ def traverse (self , bool include_text = False , bool skip_empty = False ):
589+ """ Depth-first traversal starting at the current node .
566590
567591 Parameters
568592 ----------
569- include_text : bool
570- If True, includes text nodes as well.
593+ include_text : bool, optional
594+ When ``True``, include text nodes in the traversal sequence. Defaults
595+ to ``False``.
596+ skip_empty : bool, optional
597+ Skip empty text nodes (as determined by ``lxb_dom_node_is_empty``)
598+ when ``include_text`` is ``True``. Defaults to ``False``.
571599
572600 Yields
573- -------
574- node
601+ ------
602+ LexborNode
603+ Nodes encountered in depth-first order beginning with the current
604+ node, filtered according to the provided options.
575605 """
576606 cdef lxb_dom_node_t * root = self .node
577607 cdef lxb_dom_node_t * node = root
578608 cdef LexborNode lxb_node
579609
580610 while node != NULL :
581- if not (not include_text and node.type == LXB_DOM_NODE_TYPE_TEXT):
582- lxb_node = LexborNode.new(< lxb_dom_node_t * > node, self .parser)
583- yield lxb_node
611+ if include_text or node.type != LXB_DOM_NODE_TYPE_TEXT:
612+ if not skip_empty or not self ._is_empty_text_node(node):
613+ lxb_node = LexborNode.new(< lxb_dom_node_t * > node, self .parser)
614+ yield lxb_node
584615
585616 if node.first_child != NULL :
586617 node = node.first_child
@@ -1001,6 +1032,30 @@ cdef class LexborNode:
10011032 """Return True if the node represents a document node."""
10021033 return self._is_node_type(LXB_DOM_NODE_TYPE_DOCUMENT )
10031034
@property
def is_empty_text_node(self) -> bool:
    """Check whether the current node is an empty text node.

    Returns
    -------
    bool
        ``True`` when the node is a text node whose own character data is
        empty or consists solely of ASCII whitespace (space, tab, line
        feed, form feed, carriage return). ``False`` for any other node.
    """
    return self._is_empty_text_node(self.node)

cdef inline bint _is_empty_text_node(self, lxb_dom_node_t *node):
    """Return True if ``node`` is a text node holding only whitespace.

    The node's own character data is inspected directly. Asking
    ``lxb_dom_node_is_empty`` about the parent would make the answer depend
    on every sibling (a whitespace-only text node next to an element would
    never count as empty), and asking it about the text node itself only
    walks the node's children, so a detached text node would always be
    reported as empty regardless of its content.
    """
    cdef unsigned char *data
    cdef unsigned char ch

    if node == NULL or node.type != LXB_DOM_NODE_TYPE_TEXT:
        return False

    data = <unsigned char *> lexbor_str_data_noi(&(<lxb_dom_character_data_t *> node).data)
    if data == NULL:
        # No character buffer at all: nothing to emit, treat as empty.
        return True

    # The buffer is scanned up to its NUL terminator, which is the same
    # assumption the ``text.decode(...)`` calls in this class rely on.
    # The whitespace set mirrors the one lexbor's own emptiness check uses.
    while data[0] != 0:
        ch = data[0]
        if ch != 0x20 and ch != 0x09 and ch != 0x0A and ch != 0x0C and ch != 0x0D:
            return False
        data += 1
    return True
1058+
10041059
10051060@cython.internal
10061061@cython.final
0 commit comments