Initial work on fixing the fragment parser. Addresses #192.

rushter · rushter · commit ca473441fd83 · 2025-11-30T05:45:44.000+04:00
diff --git a/Makefile b/Makefile
@@ -87,5 +87,5 @@ install: clean ## install the package to the active Python's site-packages
 dev:
 	python setup.py build_ext --inplace --cython --lexbor
 
-dev-static:
-	python setup.py build_ext --inplace --cython --static
+dev-static: clean-build
+	python setup.py build_ext --inplace --cython --static --disable-modest
diff --git a/selectolax/lexbor.pxd b/selectolax/lexbor.pxd
@@ -28,6 +28,7 @@ cdef extern from "lexbor/core/core.h" nogil:
         LXB_STATUS_STOP
 
     lexbor_str_t* lexbor_str_destroy(lexbor_str_t *str, lexbor_mraw_t *mraw, bint destroy_obj)
+
     lexbor_str_t* lexbor_str_create()
     lxb_char_t * lexbor_str_data_noi(lexbor_str_t *str)
 
@@ -240,10 +241,13 @@ cdef class LexborNode:
     cdef:
         lxb_dom_node_t *node
         public LexborHTMLParser parser
+        cdef bint _is_fragment_root
+
     cdef bint _is_node_type(self, lxb_dom_node_type_t expected_type)
 
     @staticmethod
     cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser)
+    cdef void set_as_fragment_root(self)
 
 
 cdef bint is_empty_text_node(lxb_dom_node_t *node)
diff --git a/selectolax/lexbor.pyx b/selectolax/lexbor.pyx
@@ -167,8 +167,9 @@ cdef class LexborHTMLParser:
         )
         if fragment_html_node == NULL:
             return LXB_STATUS_ERROR
+
         # Use the fragment document returned by lexbor as the parser document.
-        self.document = <lxb_html_document_t *> fragment_html_node
+        self.document  = <lxb_html_document_t *> fragment_html_node
         return LXB_STATUS_OK
 
     def __dealloc__(self):
@@ -220,7 +221,11 @@ cdef class LexborHTMLParser:
         """
         if self.document == NULL:
             return None
-        return LexborNode.new(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)
+        cdef LexborNode  node
+        node =  LexborNode.new(<lxb_dom_node_t *> lxb_dom_document_root(&self.document.dom_document), self)
+        if self._is_fragment:
+            node.set_as_fragment_root()
+        return node
 
     @property
     def body(self):
diff --git a/selectolax/lexbor/node.pxi b/selectolax/lexbor/node.pxi
@@ -32,11 +32,15 @@ cdef inline bytes to_bytes(str_or_LexborNode value):
 cdef class LexborNode:
     """A class that represents HTML node (element)."""
 
+    cdef void set_as_fragment_root(self):
+        self._is_fragment_root = 1
+
     @staticmethod
     cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser):
         cdef LexborNode lxbnode = LexborNode.__new__(LexborNode)
         lxbnode.node = node
         lxbnode.parser = parser
+        lxbnode._is_fragment_root = 0
         return lxbnode
 
     @property
@@ -108,7 +112,10 @@ cdef class LexborNode:
         cdef lxb_status_t status
 
         lxb_str = lexbor_str_create()
-        status = lxb_html_serialize_tree_str(self.node, lxb_str)
+        if self._is_fragment_root:
+            status = serialize_fragment(self.node, lxb_str)
+        else:
+            status = lxb_html_serialize_tree_str(self.node, lxb_str)
         if status == 0 and lxb_str.data:
             html = lxb_str.data.decode(_ENCODING).replace('<-undef>', '')
             lexbor_str_destroy(lxb_str, self.node.owner_document.text, True)
@@ -1127,3 +1134,13 @@ cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx):
     cls = <TextContainer> ctx
     cls.append(py_str)
     return LEXBOR_ACTION_OK
+
+cdef lxb_status_t serialize_fragment(lxb_dom_node_t *node, lexbor_str_t *lxb_str):  # serialize `node` and every following sibling into `lxb_str`
+        cdef lxb_status_t status
+        while node != NULL:  # walk the sibling chain through `node.next`
+            status = lxb_html_serialize_tree_str(node, lxb_str)
+            if status != LXB_STATUS_OK:
+                return status  # stop at the first failed serialization and hand its status to the caller
+            node = node.next
+
+        return LXB_STATUS_OK  # also reached when `node` is NULL on entry (nothing serialized)
diff --git a/tests/test_lexbor.py b/tests/test_lexbor.py
@@ -2,6 +2,8 @@
 
 from inspect import cleandoc
 
+import pytest
+
 from selectolax.lexbor import LexborHTMLParser, parse_fragment
 
 
@@ -245,3 +247,51 @@ def test_parser_without_top_level_tags():
         is_fragment=True,
     )
     assert parser.html == "<div><span>\n \n</span><title>X</title></div>"
+
+
+def test_fragment_parser_multiple_nodes_on_the_same_level():
+    html = clean_doc("""
+          <meta charset="utf-8">
+          <meta content="width=device-width,initial-scale=1" name="viewport">
+          <title>Title!</title>
+          <!-- My crazy comment -->
+          <p>Hello <strong>World</strong>!</p>
+    """)
+    parser = LexborHTMLParser(html, is_fragment=True)
+    expected_html = clean_doc("""
+          <meta charset="utf-8">
+          <meta content="width=device-width,initial-scale=1" name="viewport">
+          <title>Title!</title>
+          <!-- My crazy comment -->
+          <p>Hello <strong>World</strong>!</p>
+
+    """)
+    assert parser.html == expected_html
+
+
+@pytest.mark.skip(reason="Currently bugged")
+def test_fragmented_parser_whole_doc():
+    html = clean_doc("""
+        <html lang="en">
+            <head>
+                <meta charset="utf-8">
+                <title>Title!</title>
+            </head>
+            <body>
+                <p>Hello <strong>Lorem Ipsum</strong>!</p>
+            </body>
+        </html>
+    """)
+    parser = LexborHTMLParser(html, is_fragment=True)
+    expected_html = clean_doc("""
+        <html lang="en">
+            <head>
+                <meta charset="utf-8">
+                <title>Title!</title>
+            </head>
+            <body>
+                <p>Hello <strong>Lorem Ipsum</strong>!</p>
+            </body>
+        </html>
+    """)
+    assert parser.html == expected_html