Adds support for extracting the content of HTML comment nodes in the LexborNode class. (#199)

pygarap · web-flow · commit d2a76872327d · 2025-11-29T11:46:10.000+04:00
diff --git a/selectolax/lexbor.pyi b/selectolax/lexbor.pyi
@@ -721,6 +721,26 @@ class LexborNode:
         """
         ...
 
+    @property
+    def comment_content(self) -> str | None:
+        """Extract the textual content of an HTML comment node.
+
+        Returns
+        -------
+        str or None
+            The comment body with leading and trailing whitespace stripped,
+            or ``None`` if the current node is not a comment or its markup
+            is not a well-formed ``<!-- ... -->`` comment.
+
+        Examples
+        --------
+        >>> parse_fragment("<!-- hello -->")[0].comment_content
+        'hello'
+        >>> parse_fragment("<div>not a comment</div>")[0].comment_content is None
+        True
+        """
+        ...
+
     @property
     def inner_html(self) -> str | None:
         """Return HTML representation of the child nodes.
diff --git a/selectolax/lexbor/node.pxi b/selectolax/lexbor/node.pxi
@@ -957,6 +957,31 @@ cdef class LexborNode:
             return container.text
         return None
 
+    @property
+    def comment_content(self) -> str | None:
+        """Extract the textual content of an HTML comment node.
+
+        Returns
+        -------
+        str or None
+            Comment text with surrounding whitespace removed, or ``None`` if
+            this node is not a comment or has no well-formed comment markup.
+
+        Examples
+        --------
+        >>> parse_fragment("<!-- hello -->")[0].comment_content
+        'hello'
+        >>> parse_fragment("<div>not a comment</div>")[0].comment_content is None
+        True
+        """
+        if not self.is_comment_node:
+            return None
+        try:
+            markup = self.html  # None (if html can return it) would raise TypeError below
+            return None if markup is None else extract_html_comment(markup)
+        except (ValueError, AttributeError, IndexError):
+            return None
+
     @property
     def inner_html(self) -> str | None:
         """Return HTML representation of the child nodes.
diff --git a/selectolax/lexbor/util.pxi b/selectolax/lexbor/util.pxi
@@ -1,5 +1,7 @@
 include "../utils.pxi"
 
+import re
+
 
 def create_tag(tag: str):
     """
@@ -20,6 +22,28 @@ def parse_fragment(html: str):
     return do_parse_fragment(html, LexborHTMLParser)
 
 
+def extract_html_comment(text: str) -> str:
+    """Return the whitespace-stripped body of an HTML comment string.
+
+    Args:
+        text: Raw HTML comment, including the ``<!--`` and ``-->`` markers.
+
+    Raises:
+        ValueError: If the input is not a well-formed HTML comment.
+
+    Examples:
+        >>> extract_html_comment("<!-- hello -->")
+        'hello'
+    """
+    # Plain str checks keep this linear; a lazy regex here backtracks quadratically.
+    markup = text.strip()
+    # Seven characters minimum, so "<!-->" cannot reuse dashes for both markers.
+    if len(markup) >= 7 and markup.startswith("<!--") and markup.endswith("-->"):
+        return markup[4:-3].strip()
+    msg = "Input is not a valid HTML comment"
+    raise ValueError(msg)
+
+
 cdef inline bint is_empty_text_node(lxb_dom_node_t *text_node):
     """
     Check whether a node is a text node made up solely of HTML ASCII whitespace.
@@ -46,6 +70,7 @@ cdef inline bint is_empty_text_node(lxb_dom_node_t *text_node):
 
     return _is_whitespace_only(text_bytes, text_length)
 
+
 cdef inline bint _is_whitespace_only(const lxb_char_t *buffer, size_t buffer_length) nogil:
     """
     Determine whether a byte buffer consists only of HTML ASCII whitespace.
diff --git a/tests/test_lexbor.py b/tests/test_lexbor.py
@@ -210,6 +210,14 @@ def test_is_empty_text_node_property():
     assert not text_node.is_empty_text_node
 
 
+def test_comment_content_property() -> None:
+    markup = "<div><span><!-- hello --></span><title>X</title></div>"
+    tree = LexborHTMLParser(markup)
+    comment = tree.css_first("span").first_child  # the comment under <span>
+    assert comment is not None and comment.is_comment_node
+    assert comment.comment_content == "hello"
+
+
 def test_parser_without_top_level_tags():
     parser = LexborHTMLParser(
         "<div><span>\n \n</span><title>X</title></div>", is_fragment=False