Skip to content

Commit 2d0afb8

Browse files
committed
Add .inner_html property
1 parent bbbdd0c commit 2d0afb8

File tree

7 files changed

+233
-0
lines changed

7 files changed

+233
-0
lines changed

CHANGES.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
- Fix docstring for `css_first` method
77
- Fix memory leak in `merge_text_nodes` for lexbor backend
88
- Update lexbor backend
9+
- Add `.inner_html` property. It allows getting and setting the inner HTML of a node.
910

1011

1112
## Version 0.3.34

docs/examples.rst

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,82 @@ Ensure exactly one match exists, otherwise raise an error.
173173
174174
ValueError: Expected 1 match, but found 2 matches
175175
176+
HTML manipulation
177+
-----------------
178+
179+
Getting HTML data back
180+
~~~~~~~~~~~~~~~~~~~~~~
181+
182+
You can get HTML data back using `.html` or `.inner_html` properties.
183+
They can be called on any node.
184+
185+
.. code-block:: python
186+
187+
from selectolax.lexbor import LexborHTMLParser
188+
html = """
189+
<div id="main">
190+
<div>Hi there</div>
191+
<div id="updated">2021-08-15</div>
192+
</div>
193+
"""
194+
parser = LexborHTMLParser(html)
195+
node = parser.css_first("#main")
196+
print("Inner html:\n")
197+
print(node.inner_html)
198+
print("\nOuter html:\n")
199+
print(node.html)
200+
201+
**Output:**
202+
203+
.. code-block:: text
204+
Inner html:
205+
206+
<div>Hi there</div>
207+
<div id="updated">2021-08-15</div>
208+
209+
Outer html:
210+
211+
<div id="main">
212+
<div>Hi there</div>
213+
<div id="updated">2021-08-15</div>
214+
</div>
215+
216+
217+
Changing HTML
218+
~~~~~~~~~~~~~~
219+
220+
You can also change HTML by setting the `.inner_html` property.
221+
222+
.. code-block:: python
223+
224+
from selectolax.lexbor import LexborHTMLParser
225+
html = """
226+
<div id="main">
227+
<div>Hi there</div>
228+
</div>
229+
"""
230+
parser = LexborHTMLParser(html)
231+
node = parser.css_first("#main")
232+
print("Old html:\n")
233+
print(node.html)
234+
235+
node.inner_html = "<span>Test</span>"
236+
print("\nNew html:\n")
237+
print(node.html)
238+
239+
**Output:**
240+
241+
Old html:
242+
243+
<div id="main">
244+
<div>Hi there</div>
245+
</div>
246+
247+
New html:
248+
249+
<div id="main"><span>Test</span></div>
250+
251+
176252
DOM Navigation
177253
--------------
178254

selectolax/lexbor.pxd

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,8 @@ cdef extern from "lexbor/html/html.h" nogil:
215215

216216
size_t ref_count
217217

218+
ctypedef struct lxb_html_element_t
219+
218220
# Functions
219221
lxb_html_document_t * lxb_html_document_create()
220222
lxb_status_t lxb_html_document_parse(lxb_html_document_t *document, const lxb_char_t *html, size_t size)
@@ -223,6 +225,9 @@ cdef extern from "lexbor/html/html.h" nogil:
223225
lxb_dom_element_t * lxb_dom_document_element(lxb_dom_document_t *document)
224226

225227
lxb_status_t lxb_html_serialize_tree_str(lxb_dom_node_t *node, lexbor_str_t *str)
228+
lxb_status_t lxb_html_serialize_deep_str(lxb_dom_node_t *node, lexbor_str_t *str)
229+
lxb_html_element_t* lxb_html_element_inner_html_set(lxb_html_element_t *element,
230+
const lxb_char_t *html, size_t size)
226231

227232
cdef class LexborNode:
228233
cdef:

selectolax/lexbor.pyi

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,34 @@ class LexborSelector:
7171
"""
7272
...
7373

74+
@property
75+
def inner_html(self) -> str:
76+
"""Return HTML representation of the child nodes.
77+
78+
Works similar to innerHTML in JavaScript.
79+
Unlike the `.html` property, does not include the current node.
80+
Can be used to set HTML as well. See the setter docstring.
81+
82+
Returns
83+
-------
84+
text : str
85+
"""
86+
...
87+
88+
@inner_html.setter
89+
def inner_html(self, html: str):
90+
"""Set inner HTML to the specified HTML.
91+
92+
Replaces existing data inside the node.
93+
Works similar to innerHTML in JavaScript.
94+
95+
Parameters
96+
----------
97+
html : str
98+
99+
"""
100+
...
101+
74102
class LexborCSSSelector:
75103
def __init__(self): ...
76104
def find(self, query: str, node: LexborNode) -> list[LexborNode]: ...
@@ -608,6 +636,34 @@ class LexborNode:
608636
"""
609637
...
610638

639+
@property
640+
def inner_html(self) -> str:
641+
"""Return HTML representation of the child nodes.
642+
643+
Works similar to innerHTML in JavaScript.
644+
Unlike the `.html` property, does not include the current node.
645+
Can be used to set HTML as well. See the setter docstring.
646+
647+
Returns
648+
-------
649+
text : str
650+
"""
651+
...
652+
653+
@inner_html.setter
654+
def inner_html(self, html: str):
655+
"""Set inner HTML to the specified HTML.
656+
657+
Replaces existing data inside the node.
658+
Works similar to innerHTML in JavaScript.
659+
660+
Parameters
661+
----------
662+
html : str
663+
664+
"""
665+
...
666+
611667
class LexborHTMLParser:
612668
"""The lexbor HTML parser.
613669

selectolax/lexbor.pyx

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -386,3 +386,32 @@ cdef class LexborHTMLParser:
386386
# faster to check if the document is empty which should determine if we have a root
387387
if self.document != NULL:
388388
self.root.unwrap_tags(tags, delete_empty=delete_empty)
389+
390+
@property
def inner_html(self):
    """Return the HTML representation of the root node's children.

    Works similarly to innerHTML in JavaScript.
    Unlike the `.html` property, does not include the current node.
    Can be used to set HTML as well. See the setter docstring.

    Returns
    -------
    text : str
    """
    # Pure delegation: the value is whatever the root node's `inner_html` yields.
    root_node = self.root
    return root_node.inner_html
403+
404+
@inner_html.setter
def inner_html(self, str html):
    """Replace the root node's inner HTML with the given markup.

    Replaces existing data inside the node.
    Works similarly to innerHTML in JavaScript.

    Parameters
    ----------
    html : str
        Markup that is handed to the root node's `inner_html` setter.
    """
    # Pure delegation: parsing and replacement happen in the root node's setter.
    root_node = self.root
    root_node.inner_html = html
417+

selectolax/lexbor/node.pxi

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@ ctypedef fused str_or_LexborNode:
1111
bytes
1212
LexborNode
1313

14+
ctypedef fused str_or_bytes:
15+
str
16+
bytes
17+
1418
cdef inline bytes to_bytes(str_or_LexborNode value):
1519
cdef bytes bytes_val
1620
if isinstance(value, unicode):
@@ -889,6 +893,48 @@ cdef class LexborNode:
889893
container.append(py_text)
890894
return container.text
891895

896+
@property
def inner_html(self):
    """Return the HTML representation of the child nodes.

    Works similarly to innerHTML in JavaScript.
    Unlike the `.html` property, does not include the current node.
    Can be used to set HTML as well. See the setter docstring.

    Returns
    -------
    text : str or None
        Serialized markup of the child nodes, or ``None`` when the
        serialization buffer could not be created or filled.
    """
    cdef lexbor_str_t *lxb_str
    cdef lxb_status_t status

    # Bound up front so the failure path returns None instead of touching
    # an unassigned local.
    html = None

    lxb_str = lexbor_str_create()
    if lxb_str == NULL:
        return None

    status = lxb_html_serialize_deep_str(self.node, lxb_str)
    if status == 0 and lxb_str.data:
        # Same '<-undef>' placeholder removal as before.
        html = lxb_str.data.decode(_ENCODING).replace('<-undef>', '')

    # Release the buffer on every path; previously a failed serialization
    # skipped this call and leaked the allocation.
    lexbor_str_destroy(lxb_str, self.node.owner_document.text, True)
    return html
918+
919+
@inner_html.setter
def inner_html(self, str html):
    """Set the inner HTML of this node to the specified markup.

    Replaces existing data inside the node.
    Works similarly to innerHTML in JavaScript.

    Parameters
    ----------
    html : str
        Markup to parse; it is encoded as UTF-8 before being passed to lexbor.
    """
    cdef bytes encoded = <bytes>html.encode("utf-8")
    # NOTE(review): the pointer returned by lxb_html_element_inner_html_set is
    # discarded, so a failure is not reported to the caller -- confirm whether
    # it should raise.
    # NOTE(review): assumes self.node is an element node, since it is cast to
    # lxb_html_element_t * without a type check -- TODO confirm.
    lxb_html_element_inner_html_set(
        <lxb_html_element_t *>self.node,
        <lxb_char_t *> encoded,
        len(encoded),
    )
937+
892938

893939
@cython.internal
894940
@cython.final

tests/test_lexbor.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
"""Tests for functionality that is only supported by lexbor backend."""
2+
3+
from selectolax.lexbor import LexborHTMLParser
4+
5+
6+
def test_reads_inner_html():
    """Reading `.inner_html` yields the children's markup without the wrapping node."""
    markup = """<div id="main"><div>Hi</div><div id="updated">2025-09-27</div></div>"""
    main_node = LexborHTMLParser(markup).css_first("#main")
    assert main_node.inner_html == """<div>Hi</div><div id="updated">2025-09-27</div>"""
12+
13+
14+
def test_sets_inner_html():
    """Assigning `.inner_html` replaces the node's children with the new markup."""
    markup = """<div id="main"><div>Hi</div><div id="updated">2025-09-27</div></div>"""
    tree = LexborHTMLParser(markup)
    replacement = "<span>Test</span>"
    tree.css_first("#main").inner_html = replacement
    # Look the node up again so the assertion reads the state stored in the tree.
    assert tree.css_first("#main").inner_html == replacement

0 commit comments

Comments
 (0)