Skip to content

Commit d2a7687

Browse files
authored
Adds support for extracting the content of HTML comment nodes in the LexborNode class. (#199)
1 parent 51efb93 commit d2a7687

File tree

4 files changed

+78
-0
lines changed

4 files changed

+78
-0
lines changed

selectolax/lexbor.pyi

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -721,6 +721,26 @@ class LexborNode:
721721
"""
722722
...
723723

724+
@property
725+
def comment_content(self) -> str | None:
726+
"""Extract the textual content of an HTML comment node.
727+
728+
Returns
729+
-------
730+
str or None
731+
Comment text with surrounding whitespace removed, or ``None`` if
732+
the current node is not a comment or the comment markup cannot be
733+
parsed.
734+
735+
Examples
736+
--------
737+
>>> parse_fragment("<!-- hello -->")[0].comment_content
738+
'hello'
739+
>>> parse_fragment("<div>not a comment</div>")[0].comment_content is None
740+
True
741+
"""
742+
...
743+
724744
@property
725745
def inner_html(self) -> str | None:
726746
"""Return HTML representation of the child nodes.

selectolax/lexbor/node.pxi

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -957,6 +957,31 @@ cdef class LexborNode:
957957
return container.text
958958
return None
959959

960+
@property
961+
def comment_content(self) -> str | None:
962+
"""Extract the textual content of an HTML comment node.
963+
964+
Returns
965+
-------
966+
str or None
967+
Comment text with surrounding whitespace removed, or ``None`` if
968+
the current node is not a comment or the comment markup cannot be
969+
parsed.
970+
971+
Examples
972+
--------
973+
>>> parse_fragment("<!-- hello -->")[0].comment_content
974+
'hello'
975+
>>> parse_fragment("<div>not a comment</div>")[0].comment_content is None
976+
True
977+
"""
978+
if not self.is_comment_node:
979+
return None
980+
try:
981+
return extract_html_comment(self.html)
982+
except (ValueError, AttributeError, IndexError):
983+
return None
984+
960985
@property
961986
def inner_html(self) -> str | None:
962987
"""Return HTML representation of the child nodes.

selectolax/lexbor/util.pxi

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
include "../utils.pxi"
22

3+
import re
4+
35

46
def create_tag(tag: str):
57
"""
@@ -20,6 +22,28 @@ def parse_fragment(html: str):
2022
return do_parse_fragment(html, LexborHTMLParser)
2123

2224

25+
def extract_html_comment(text: str) -> str:
    """Extract the inner content of an HTML comment string.

    The input must be a single ``<!-- ... -->`` construct, optionally
    surrounded by whitespace.  The markers are located with plain string
    operations instead of a backtracking regular expression, so the cost
    stays linear in ``len(text)`` even for malformed input that contains
    long runs of whitespace.

    Args:
        text: Raw HTML comment, including the ``<!--`` and ``-->`` markers.

    Returns:
        The comment body with surrounding whitespace stripped.

    Raises:
        TypeError: If ``text`` is not a ``str``.
        ValueError: If the input is not a well-formed HTML comment.

    Examples:
        >>> extract_html_comment("<!-- hello -->")
        'hello'
    """
    if not isinstance(text, str):
        msg = f"expected str, got {type(text).__name__}"
        raise TypeError(msg)

    opener, closer = "<!--", "-->"
    candidate = text.strip()
    # The length check rejects inputs such as "<!-->" in which the opening
    # and closing markers would have to share characters.
    if (
        len(candidate) >= len(opener) + len(closer)
        and candidate.startswith(opener)
        and candidate.endswith(closer)
    ):
        return candidate[len(opener):-len(closer)].strip()

    msg = "Input is not a valid HTML comment"
    raise ValueError(msg)
45+
46+
2347
cdef inline bint is_empty_text_node(lxb_dom_node_t *text_node):
2448
"""
2549
Check whether a node is a text node made up solely of HTML ASCII whitespace.
@@ -46,6 +70,7 @@ cdef inline bint is_empty_text_node(lxb_dom_node_t *text_node):
4670

4771
return _is_whitespace_only(text_bytes, text_length)
4872

73+
4974
cdef inline bint _is_whitespace_only(const lxb_char_t *buffer, size_t buffer_length) nogil:
5075
"""
5176
Determine whether a byte buffer consists only of HTML ASCII whitespace.

tests/test_lexbor.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,14 @@ def test_is_empty_text_node_property():
210210
assert not text_node.is_empty_text_node
211211

212212

213+
def test_comment_content_property() -> None:
    """A comment nested inside ``<span>`` exposes its stripped text."""
    markup = "<div><span><!-- hello --></span><title>X</title></div>"
    comment_node = LexborHTMLParser(markup).css_first("span").first_child
    assert comment_node is not None
    assert comment_node.is_comment_node
    assert comment_node.comment_content == "hello"
219+
220+
213221
def test_parser_without_top_level_tags():
214222
parser = LexborHTMLParser(
215223
"<div><span>\n \n</span><title>X</title></div>", is_fragment=False

0 commit comments

Comments
 (0)