Skip to content

Commit 6a8fa19

Browse files
committed
Fix segfauls when doing unwrap or decompose twice
Lexbor backend. We now keep removed nodes in memory instead of deallocating them. In consumes slightly more memory, but does not result in segfaults when users try to remove same nodes twice.
1 parent ea06dac commit 6a8fa19

File tree

6 files changed

+65
-8
lines changed

6 files changed

+65
-8
lines changed

CHANGES.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@
1111
- Optimize performance for`css_first` in lexbor backend
1212
- Fix segfaults when accessing attributes. Resolves #135.
1313
- Add new `.clone` method to lexbor backend. Resolve #117.
14-
- Improve unicode handling for malformed text. Resolved #138.
14+
- Improve unicode handling for malformed text. Resolves #138.
15+
- Fix segfaults when doing double `.decompose`. Resolves #179.
16+
- Fix sefgaults when doing double `.unwrap`. Resolves #169.
1517

1618
## Version 0.3.34
1719

selectolax/lexbor.pyi

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,8 @@ class LexborNode:
382382
def unwrap(self, delete_empty: bool = False) -> None:
383383
"""Replace node with whatever is inside this node.
384384
385+
Does nothing if you perform unwrapping second time on the same node.
386+
385387
Parameters
386388
----------
387389
delete_empty : bool, default False

selectolax/lexbor.pyx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from cpython.bool cimport bool
22
from cpython.exc cimport PyErr_SetObject
33

4+
45
_ENCODING = 'UTF-8'
56

67
include "base.pxi"
@@ -9,6 +10,7 @@ include "lexbor/attrs.pxi"
910
include "lexbor/node.pxi"
1011
include "lexbor/selection.pxi"
1112
include "lexbor/util.pxi"
13+
include "lexbor/node_remove.pxi"
1214

1315
# We don't inherit from HTMLParser here, because it also includes all the C code from Modest.
1416

selectolax/lexbor/node.pxi

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
cimport cython
22
from cpython.exc cimport PyErr_SetNone
33

4+
import logging
5+
46
_TAG_TO_NAME = {
57
0x0005: "- doctype",
68
0x0002: "-text",
@@ -292,9 +294,9 @@ cdef class LexborNode:
292294
raise SelectolaxError("Decomposing the root node is not allowed.")
293295

294296
if recursive:
295-
lxb_dom_node_destroy_deep(<lxb_dom_node_t *> self.node)
297+
node_remove_deep(<lxb_dom_node_t *> self.node)
296298
else:
297-
lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
299+
lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
298300

299301
def strip_tags(self, list tags, bool recursive = False):
300302
"""Remove specified tags from the HTML tree.
@@ -438,6 +440,8 @@ cdef class LexborNode:
438440
def unwrap(self, bint delete_empty=False):
439441
"""Replace node with whatever is inside this node.
440442
443+
Does nothing if you perform unwrapping second time on the same node.
444+
441445
Parameters
442446
----------
443447
delete_empty : bool, default False
@@ -453,9 +457,14 @@ cdef class LexborNode:
453457
454458
Note: by default, empty tags are ignored, use "delete_empty" to change this.
455459
"""
460+
461+
if node_is_removed(<lxb_dom_node_t *> self.node) == 1:
462+
logging.error("Attempt to unwrap removed node. Does nothing.")
463+
return
464+
456465
if self.node.first_child == NULL:
457466
if delete_empty:
458-
lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
467+
lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
459468
return
460469
cdef lxb_dom_node_t* next_node
461470
cdef lxb_dom_node_t* current_node
@@ -470,7 +479,7 @@ cdef class LexborNode:
470479
current_node = next_node
471480
else:
472481
lxb_dom_node_insert_before(self.node, self.node.first_child)
473-
lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
482+
lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
474483

475484
def unwrap_tags(self, list tags, bint delete_empty = False):
476485
"""Unwraps specified tags from the HTML tree.
@@ -610,7 +619,7 @@ cdef class LexborNode:
610619
if new_node == NULL:
611620
raise SelectolaxError("Can't create a new node")
612621
lxb_dom_node_insert_before(self.node, new_node)
613-
lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
622+
lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
614623
elif isinstance(value, LexborNode):
615624
new_node = lxb_dom_document_import_node(
616625
&self.parser.document.dom_document,
@@ -620,7 +629,7 @@ cdef class LexborNode:
620629
if new_node == NULL:
621630
raise SelectolaxError("Can't create a new node")
622631
lxb_dom_node_insert_before(self.node, <lxb_dom_node_t *> new_node)
623-
lxb_dom_node_destroy(<lxb_dom_node_t *> self.node)
632+
lxb_dom_node_remove(<lxb_dom_node_t *> self.node)
624633
else:
625634
raise SelectolaxError("Expected a string or LexborNode instance, but %s found" % type(value).__name__)
626635

selectolax/lexbor/node_remove.pxi

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
2+
cdef lxb_dom_node_t * node_remove_deep(lxb_dom_node_t* root):
3+
cdef lxb_dom_node_t *tmp
4+
cdef lxb_dom_node_t *node = root
5+
6+
while node != NULL:
7+
if node.first_child != NULL:
8+
node = node.first_child
9+
else:
10+
while node != root and node.next == NULL:
11+
tmp = node.parent
12+
lxb_dom_node_remove(node)
13+
node = tmp
14+
15+
if node == root:
16+
lxb_dom_node_remove(node)
17+
break
18+
19+
tmp = node.next
20+
lxb_dom_node_remove(node)
21+
node = tmp
22+
23+
return NULL
24+
25+
cdef bint node_is_removed(lxb_dom_node_t* node):
26+
if node.parent == NULL and node.next == NULL \
27+
and node.prev == NULL:
28+
return 1
29+
return 0

tests/test_lexbor.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""Tests for functionality that is only supported by lexbor backend."""
22

3-
from selectolax.lexbor import LexborHTMLParser
3+
from selectolax.lexbor import LexborHTMLParser, parse_fragment
44

55

66
def test_reads_inner_html():
@@ -37,3 +37,16 @@ def test_node_cloning():
3737
new_node.inner_html = "<div>new</div>"
3838
assert parser.css_first("#main").html != new_node.html
3939
assert new_node.html == '<div id="main"><div>new</div></div>'
40+
41+
42+
def test_double_unwrap_does_not_segfault():
43+
html = """<div><div><div></div></div></div>"""
44+
outer_div = parse_fragment(html)[0]
45+
some_set = set()
46+
47+
inner_div = outer_div.child
48+
assert inner_div is not None
49+
inner_div.unwrap()
50+
inner_div.unwrap()
51+
some_set.add(outer_div.parent)
52+
some_set.add(outer_div.parent)

0 commit comments

Comments
 (0)