Skip to content

Commit ca47344

Browse files
committed
Initial work fixing fragmented parser. Addresses #192
1 parent 8f7c0b6 commit ca47344

File tree

5 files changed

+81
-5
lines changed

5 files changed

+81
-5
lines changed

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,5 +87,5 @@ install: clean ## install the package to the active Python's site-packages
8787
dev:
8888
python setup.py build_ext --inplace --cython --lexbor
8989

90-
dev-static:
91-
python setup.py build_ext --inplace --cython --static
90+
dev-static: clean-build
91+
python setup.py build_ext --inplace --cython --static --disable-modest

selectolax/lexbor.pxd

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ cdef extern from "lexbor/core/core.h" nogil:
2828
LXB_STATUS_STOP
2929

3030
lexbor_str_t* lexbor_str_destroy(lexbor_str_t *str, lexbor_mraw_t *mraw, bint destroy_obj)
31+
3132
lexbor_str_t* lexbor_str_create()
3233
lxb_char_t * lexbor_str_data_noi(lexbor_str_t *str)
3334

@@ -240,10 +241,13 @@ cdef class LexborNode:
240241
cdef:
241242
lxb_dom_node_t *node
242243
public LexborHTMLParser parser
244+
cdef bint _is_fragment_root
245+
243246
cdef bint _is_node_type(self, lxb_dom_node_type_t expected_type)
244247

245248
@staticmethod
246249
cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser)
250+
cdef void set_as_fragment_root(self)
247251

248252

249253
cdef bint is_empty_text_node(lxb_dom_node_t *node)

selectolax/lexbor.pyx

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -167,8 +167,9 @@ cdef class LexborHTMLParser:
167167
)
168168
if fragment_html_node == NULL:
169169
return LXB_STATUS_ERROR
170+
170171
# Use the fragment document returned by lexbor as the parser document.
171-
self.document = <lxb_html_document_t *> fragment_html_node
172+
self.document = <lxb_html_document_t *> fragment_html_node
172173
return LXB_STATUS_OK
173174

174175
def __dealloc__(self):
@@ -220,7 +221,11 @@ cdef class LexborHTMLParser:
220221
"""
221222
if self.document == NULL:
222223
return None
223-
return LexborNode.new(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)
224+
cdef LexborNode node
225+
node = LexborNode.new(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)
226+
if self._is_fragment:
227+
node.set_as_fragment_root()
228+
return node
224229

225230
@property
226231
def body(self):

selectolax/lexbor/node.pxi

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,15 @@ cdef inline bytes to_bytes(str_or_LexborNode value):
3232
cdef class LexborNode:
3333
"""A class that represents HTML node (element)."""
3434

35+
# Mark this node as the root of a fragment parse.
# The flag set here is read when the node's HTML is serialized: a flagged
# node is passed to serialize_fragment() instead of being serialized with a
# single lxb_html_serialize_tree_str() call.
cdef void set_as_fragment_root(self):
36+
self._is_fragment_root = 1
37+
3538
# Factory: wrap a raw lexbor DOM node pointer in a LexborNode.
#   node   -- the lxb_dom_node_t pointer to wrap (stored as-is, not copied)
#   parser -- the LexborHTMLParser stored on the wrapper as `parser`
# Returns the newly created LexborNode.
@staticmethod
3639
cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser):
3740
# Instance is created through __new__ and its fields are filled in by hand.
cdef LexborNode lxbnode = LexborNode.__new__(LexborNode)
3841
lxbnode.node = node
3942
lxbnode.parser = parser
43+
# Every wrapper starts out unflagged; a parser created in fragment mode
# flags its root node afterwards via set_as_fragment_root().
lxbnode._is_fragment_root = 0
4044
return lxbnode
4145

4246
@property
@@ -108,7 +112,10 @@ cdef class LexborNode:
108112
cdef lxb_status_t status
109113

110114
lxb_str = lexbor_str_create()
111-
status = lxb_html_serialize_tree_str(self.node, lxb_str)
115+
if self._is_fragment_root:
116+
status = serialize_fragment(self.node, lxb_str)
117+
else:
118+
status = lxb_html_serialize_tree_str(self.node, lxb_str)
112119
if status == 0 and lxb_str.data:
113120
html = lxb_str.data.decode(_ENCODING).replace('<-undef>', '')
114121
lexbor_str_destroy(lxb_str, self.node.owner_document.text, True)
@@ -1127,3 +1134,13 @@ cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx):
11271134
cls = <TextContainer> ctx
11281135
cls.append(py_str)
11291136
return LEXBOR_ACTION_OK
1137+
1138+
# Serialize `node` and every sibling that follows it into `lxb_str`.
#   node    -- first node to serialize; the walk follows `node.next` until NULL
#   lxb_str -- destination string handed to lxb_html_serialize_tree_str()
#              for each node (presumably appended to on each call --
#              TODO confirm against the lexbor serializer docs)
# Returns the first non-OK status reported by the serializer, otherwise
# LXB_STATUS_OK. A NULL `node` skips the loop and yields LXB_STATUS_OK.
cdef lxb_status_t serialize_fragment(lxb_dom_node_t *node, lexbor_str_t *lxb_str):
1139+
cdef lxb_status_t status
1140+
while node != NULL:
1141+
status = lxb_html_serialize_tree_str(node, lxb_str)
1142+
# Stop at the first failure and hand the error status back to the caller.
if status != LXB_STATUS_OK:
1143+
return status
1144+
node = node.next
1145+
1146+
return LXB_STATUS_OK

tests/test_lexbor.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
from inspect import cleandoc
44

5+
import pytest
6+
57
from selectolax.lexbor import LexborHTMLParser, parse_fragment
68

79

@@ -245,3 +247,51 @@ def test_parser_without_top_level_tags():
245247
is_fragment=True,
246248
)
247249
assert parser.html == "<div><span>\n \n</span><title>X</title></div>"
250+
251+
252+
# Fragment mode with several sibling top-level nodes: two <meta> tags, a
# <title>, a comment and a <p> are parsed with is_fragment=True, and
# parser.html is expected to contain all five nodes in the same order.
# NOTE(review): as rendered here the expected text has one extra empty line
# before the closing quotes compared with the input -- confirm this is intended.
def test_fragment_parser_multiple_nodes_on_the_same_level():
253+
html = clean_doc("""
254+
<meta charset="utf-8">
255+
<meta content="width=device-width,initial-scale=1" name="viewport">
256+
<title>Title!</title>
257+
<!-- My crazy comment -->
258+
<p>Hello <strong>World</strong>!</p>
259+
""")
260+
parser = LexborHTMLParser(html, is_fragment=True)
261+
expected_html = clean_doc("""
262+
<meta charset="utf-8">
263+
<meta content="width=device-width,initial-scale=1" name="viewport">
264+
<title>Title!</title>
265+
<!-- My crazy comment -->
266+
<p>Hello <strong>World</strong>!</p>
267+

268+
""")
269+
assert parser.html == expected_html
270+
271+
272+
# Fragment mode fed a complete document (<html>, <head>, <body>): the test
# expects parser.html to return the same markup unchanged.
# The test is skipped unconditionally with the reason "Currently bugged",
# so it records the intended behaviour rather than the current one.
@pytest.mark.skip(reason="Currently bugged")
273+
def test_fragmented_parser_whole_doc():
274+
html = clean_doc("""
275+
<html lang="en">
276+
<head>
277+
<meta charset="utf-8">
278+
<title>Title!</title>
279+
</head>
280+
<body>
281+
<p>Hello <strong>Lorem Ipsum</strong>!</p>
282+
</body>
283+
</html>
284+
""")
285+
parser = LexborHTMLParser(html, is_fragment=True)
286+
expected_html = clean_doc("""
287+
<html lang="en">
288+
<head>
289+
<meta charset="utf-8">
290+
<title>Title!</title>
291+
</head>
292+
<body>
293+
<p>Hello <strong>Lorem Ipsum</strong>!</p>
294+
</body>
295+
</html>
296+
""")
297+
assert parser.html == expected_html

0 commit comments

Comments
 (0)