File tree Expand file tree Collapse file tree 4 files changed +78
-0
lines changed
Expand file tree Collapse file tree 4 files changed +78
-0
lines changed Original file line number Diff line number Diff line change @@ -721,6 +721,26 @@ class LexborNode:
721721 """
722722 ...
723723
724+ @property
725+ def comment_content (self ) -> str | None :
726+ """Extract the textual content of an HTML comment node.
727+
728+ Returns
729+ -------
730+ str or None
731+ Comment text with surrounding whitespace removed, or ``None`` if
732+ the current node is not a comment or the comment markup cannot be
733+ parsed.
734+
735+ Examples
736+ --------
737+ >>> parse_fragment("<!-- hello -->")[0].comment_content
738+ 'hello'
739+ >>> parse_fragment("<div>not a comment</div>")[0].comment_content is None
740+ True
741+ """
742+ ...
743+
724744 @property
725745 def inner_html (self ) -> str | None :
726746 """Return HTML representation of the child nodes.
Original file line number Diff line number Diff line change @@ -957,6 +957,31 @@ cdef class LexborNode:
957957 return container.text
958958 return None
959959
@property
def comment_content(self) -> str | None:
    """Extract the textual content of an HTML comment node.

    Returns
    -------
    str or None
        Comment text with surrounding whitespace removed, or ``None`` if
        the current node is not a comment, has no serialized HTML, or the
        comment markup cannot be parsed.

    Examples
    --------
    >>> parse_fragment("<!-- hello -->")[0].comment_content
    'hello'
    >>> parse_fragment("<div>not a comment</div>")[0].comment_content is None
    True
    """
    if not self.is_comment_node:
        return None
    try:
        markup = self.html
        # ``extract_html_comment`` applies a regex to its argument, and the
        # ``re`` module raises ``TypeError`` for ``None`` -- an exception the
        # handler below does not cover. Bail out explicitly instead.
        if markup is None:
            return None
        return extract_html_comment(markup)
    except (ValueError, AttributeError, IndexError):
        # Best effort: unparsable comment markup yields ``None``.
        return None
984+
960985 @property
961986 def inner_html (self ) -> str | None:
962987 """Return HTML representation of the child nodes.
Original file line number Diff line number Diff line change 11include " ../utils.pxi"
22
3+ import re
4+
35
46def create_tag (tag: str ):
57 """
@@ -20,6 +22,28 @@ def parse_fragment(html: str):
2022 return do_parse_fragment(html, LexborHTMLParser)
2123
2224
# Compiled once at import time so repeated calls skip the pattern lookup.
# DOTALL lets the body span several lines.
_HTML_COMMENT_RE = re.compile(r"\s*<!--\s*(.*?)\s*-->\s*", flags=re.DOTALL)


def extract_html_comment(text: str) -> str:
    """Extract the inner content of an HTML comment string.

    Args:
        text: Raw HTML comment, including the ``<!--`` and ``-->`` markers.
            Whitespace around the markers is tolerated.

    Returns:
        The comment body with surrounding whitespace stripped.

    Raises:
        ValueError: If the input is not a single well-formed HTML comment
            (missing markers, or a ``-->`` terminator inside the body).

    Examples:
        >>> extract_html_comment("<!-- hello -->")
        'hello'
    """
    msg = "Input is not a valid HTML comment"
    match = _HTML_COMMENT_RE.fullmatch(text)
    if match is None:
        raise ValueError(msg)
    body = match.group(1)
    # ``fullmatch`` forces the lazy group to stretch over an inner ``-->``
    # when more text follows it, so input such as ``<!--a--><p/><!--b-->``
    # would otherwise be accepted as one comment.
    if "-->" in body:
        raise ValueError(msg)
    return body.strip()
46+
2347cdef inline bint is_empty_text_node(lxb_dom_node_t * text_node):
2448 """
2549 Check whether a node is a text node made up solely of HTML ASCII whitespace.
@@ -46,6 +70,7 @@ cdef inline bint is_empty_text_node(lxb_dom_node_t *text_node):
4670
4771 return _is_whitespace_only(text_bytes, text_length)
4872
73+
4974cdef inline bint _is_whitespace_only(const lxb_char_t * buffer , size_t buffer_length) nogil:
5075 """
5176 Determine whether a byte buffer consists only of HTML ASCII whitespace.
Original file line number Diff line number Diff line change @@ -210,6 +210,14 @@ def test_is_empty_text_node_property():
210210 assert not text_node .is_empty_text_node
211211
212212
def test_comment_content_property() -> None:
    """A comment nested inside a ``<span>`` exposes its stripped text."""
    markup = "<div><span><!-- hello --></span><title>X</title></div>"
    comment_node = LexborHTMLParser(markup).css_first("span").first_child
    assert comment_node is not None
    assert comment_node.is_comment_node
    assert comment_node.comment_content == "hello"
219+
220+
213221def test_parser_without_top_level_tags ():
214222 parser = LexborHTMLParser (
215223 "<div><span>\n \n </span><title>X</title></div>" , is_fragment = False
You can’t perform that action at this time.
0 commit comments