diff --git a/Makefile b/Makefile
index f31b855..702c91e 100644
--- a/Makefile
+++ b/Makefile
@@ -47,7 +47,8 @@ clean-test: ## remove test and coverage artifacts
rm -fr htmlcov/
lint: ## check style with ruff
- ruff selectolax tests
+ ruff check --fix selectolax tests
+ ruff format selectolax tests
mypy selectolax tests
test: ## run tests quickly with the default Python
diff --git a/selectolax/lexbor.pxd b/selectolax/lexbor.pxd
index 177354c..9e577b2 100644
--- a/selectolax/lexbor.pxd
+++ b/selectolax/lexbor.pxd
@@ -233,6 +233,7 @@ cdef class LexborNode:
cdef:
lxb_dom_node_t *node
public LexborHTMLParser parser
+ cdef bint _is_node_type(self, lxb_dom_node_type_t expected_type)
@staticmethod
cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser)
diff --git a/selectolax/lexbor.pyi b/selectolax/lexbor.pyi
index 8212e29..0b1114e 100644
--- a/selectolax/lexbor.pyi
+++ b/selectolax/lexbor.pyi
@@ -40,20 +40,24 @@ class LexborSelector:
def matches(self) -> list[LexborNode]:
"""Returns all possible matches"""
...
+
@property
def any_matches(self) -> bool:
"""Returns True if there are any matches"""
...
+
def text_contains(
self, text: str, deep: bool = True, separator: str = "", strip: bool = False
) -> LexborSelector:
"""Filter all current matches given text."""
...
+
def any_text_contains(
self, text: str, deep: bool = True, separator: str = "", strip: bool = False
) -> bool:
"""Returns True if any node in the current search scope contains specified text"""
...
+
def attribute_longer_than(
self, attribute: str, length: int, start: str | None = None
) -> LexborSelector:
@@ -62,6 +66,7 @@ class LexborSelector:
Similar to string-length in XPath.
"""
...
+
def any_attribute_longer_than(
self, attribute: str, length: int, start: str | None = None
) -> bool:
@@ -108,6 +113,7 @@ class LexborNode:
"""A class that represents HTML node (element)."""
parser: LexborHTMLParser
+
@property
def mem_id(self) -> int: ...
@property
@@ -117,26 +123,32 @@ class LexborNode:
**Deprecated**. Please use `first_child` instead.
"""
...
+
@property
def first_child(self) -> LexborNode | None:
"""Return the first child node."""
...
+
@property
def parent(self) -> LexborNode | None:
"""Return the parent node."""
...
+
@property
def next(self) -> LexborNode | None:
"""Return next node."""
...
+
@property
def prev(self) -> LexborNode | None:
"""Return previous node."""
...
+
@property
def last_child(self) -> LexborNode | None:
"""Return last child node."""
...
+
@property
def html(self) -> str | None:
"""Return HTML representation of the current node including all its child nodes.
@@ -146,6 +158,7 @@ class LexborNode:
text : str
"""
...
+
def __hash__(self) -> int: ...
def text_lexbor(self) -> str:
"""Returns the text of the node including text of all its child nodes.
@@ -153,6 +166,7 @@ class LexborNode:
Uses builtin method from lexbor.
"""
...
+
def text(self, deep: bool = True, separator: str = "", strip: bool = False) -> str:
"""Returns the text of the node including text of all its child nodes.
@@ -170,6 +184,7 @@ class LexborNode:
text : str
"""
...
+
def css(self, query: str) -> list[LexborNode]:
"""Evaluate CSS selector against current node and its child nodes.
@@ -192,6 +207,7 @@ class LexborNode:
selector : list of `Node` objects
"""
...
+
@overload
def css_first(
self, query: str, default: Any = ..., strict: Literal[True] = ...
@@ -213,6 +229,7 @@ class LexborNode:
selector : `LexborNode` object
"""
...
+
@overload
def css_first(
self, query: str, default: DefaultT, strict: bool = False
@@ -234,6 +251,7 @@ class LexborNode:
selector : `LexborNode` object
"""
...
+
@overload
def css_first(
self, query: str, default: None = ..., strict: bool = False
@@ -255,12 +273,15 @@ class LexborNode:
selector : `LexborNode` object
"""
...
+
def any_css_matches(self, selectors: tuple[str]) -> bool:
"""Returns True if any of CSS selectors matches a node"""
...
+
def css_matches(self, selector: str) -> bool:
"""Returns True if CSS selector matches a node."""
...
+
@property
def tag_id(self) -> int: ...
@property
@@ -278,6 +299,7 @@ class LexborNode:
text : str
"""
...
+
def decompose(self, recursive: bool = True) -> None:
"""Remove the current node from the tree.
@@ -294,6 +316,7 @@ class LexborNode:
>>> tag.decompose()
"""
...
+
def strip_tags(self, tags: list[str], recursive: bool = False) -> None:
"""Remove specified tags from the HTML tree.
@@ -314,6 +337,7 @@ class LexborNode:
'
Hello world!
'
"""
...
+
@property
def attributes(self) -> dict[str, str | None]:
"""Get all attributes that belong to the current node.
@@ -333,6 +357,7 @@ class LexborNode:
{'data': None, 'id': 'my_id'}
"""
...
+
@property
def attrs(self) -> LexborAttributes:
"""A dict-like object that is similar to the ``attributes`` property, but operates directly on the Node data.
@@ -361,6 +386,7 @@ class LexborNode:
''
"""
...
+
@property
def id(self) -> str | None:
"""Get the id attribute of the node.
@@ -372,6 +398,7 @@ class LexborNode:
text : str
"""
...
+
def iter(self, include_text: bool = False) -> Iterator[LexborNode]:
"""Iterate over nodes on the current level.
@@ -385,6 +412,7 @@ class LexborNode:
node
"""
...
+
def unwrap(self, delete_empty: bool = False) -> None:
"""Replace node with whatever is inside this node.
@@ -406,6 +434,7 @@ class LexborNode:
Note: by default, empty tags are ignored, use "delete_empty" to change this.
"""
...
+
def unwrap_tags(self, tags: list[str], delete_empty: bool = False) -> None:
"""Unwraps specified tags from the HTML tree.
@@ -429,6 +458,7 @@ class LexborNode:
Note: by default, empty tags are ignored, use "delete_empty" to change this.
"""
...
+
def merge_text_nodes(self) -> None:
"""Iterates over all text nodes and merges all text nodes that are close to each other.
@@ -448,6 +478,7 @@ class LexborNode:
"John Doe"
"""
...
+
def traverse(self, include_text: bool = False) -> Iterator[LexborNode]:
"""Iterate over all child and next nodes starting from the current level.
@@ -461,6 +492,7 @@ class LexborNode:
node
"""
...
+
def replace_with(self, value: bytes | str | LexborNode) -> None:
"""Replace current Node with specified value.
@@ -489,6 +521,7 @@ class LexborNode:
''
"""
...
+
def insert_before(self, value: bytes | str | LexborNode) -> None:
"""Insert a node before the current Node.
@@ -517,6 +550,7 @@ class LexborNode:
Get
Test
'
"""
...
+
def insert_after(self, value: bytes | str | LexborNode) -> None:
"""Insert a node after the current Node.
@@ -545,6 +579,7 @@ class LexborNode:
Get

Test
'
"""
...
+
def insert_child(self, value: bytes | str | LexborNode) -> None:
"""Insert a node inside (at the end of) the current Node.
@@ -573,6 +608,7 @@ class LexborNode:
'
"""
...
+
@property
def raw_value(self) -> NoReturn:
"""Return the raw (unparsed, original) value of a node.
@@ -595,6 +631,7 @@ class LexborNode:
b'<test>'
"""
...
+
def scripts_contain(self, query: str) -> bool:
"""Returns True if any of the script tags contain specified text.
@@ -606,6 +643,7 @@ class LexborNode:
The query to check.
"""
...
+
def script_srcs_contain(self, queries: tuple[str]) -> bool:
"""Returns True if any of the script SRCs attributes contain on of the specified text.
@@ -616,9 +654,11 @@ class LexborNode:
queries : tuple of str
"""
...
+
def remove(self, recursive: bool = True) -> None:
"""An alias for the decompose method."""
...
+
def select(self, query: str | None = None) -> LexborSelector:
"""Select nodes given a CSS selector.
@@ -634,6 +674,7 @@ class LexborNode:
selector : The `Selector` class.
"""
...
+
@property
def text_content(self) -> str | None:
"""Returns the text of the node if it is a text node.
@@ -685,6 +726,26 @@ class LexborNode:
"""
...
+ @property
+ def is_element_node(self) -> bool:
+ """Return True if the node is an element node, False for any other node type."""
+ ...
+
+ @property
+ def is_text_node(self) -> bool:
+ """Return True if the node is a text node, False for any other node type."""
+ ...
+
+ @property
+ def is_comment_node(self) -> bool:
+ """Return True if the node is a comment node, False for any other node type."""
+ ...
+
+ @property
+ def is_document_node(self) -> bool:
+ """Return True if the node is a document node, False for any other node type."""
+ ...
+
class LexborHTMLParser:
"""The lexbor HTML parser.
@@ -705,14 +766,17 @@ class LexborHTMLParser:
def root(self) -> LexborNode | None:
"""Returns root node."""
...
+
@property
def body(self) -> LexborNode | None:
"""Returns document body."""
...
+
@property
def head(self) -> LexborNode | None:
"""Returns document head."""
...
+
def tags(self, name: str) -> list[LexborNode]:
"""Returns a list of tags that match specified name.
@@ -721,6 +785,7 @@ class LexborHTMLParser:
name : str (e.g. div)
"""
...
+
def text(self, deep: bool = True, separator: str = "", strip: bool = False) -> str:
"""Returns the text of the node including text of all its child nodes.
@@ -738,10 +803,12 @@ class LexborHTMLParser:
text : str
"""
...
+
@property
def html(self) -> str | None:
"""Return HTML representation of the page."""
...
+
def css(self, query: str) -> list[LexborNode]:
"""A CSS selector.
@@ -764,6 +831,7 @@ class LexborHTMLParser:
selector : list of `Node` objects
"""
...
+
@overload
def css_first(
self, query: str, default: Any = ..., strict: Literal[True] = ...
@@ -785,6 +853,7 @@ class LexborHTMLParser:
selector : `LexborNode` object
"""
...
+
@overload
def css_first(
self, query: str, default: DefaultT, strict: bool = False
@@ -806,6 +875,7 @@ class LexborHTMLParser:
selector : `LexborNode` object
"""
...
+
@overload
def css_first(
self, query: str, default: None = ..., strict: bool = False
@@ -827,6 +897,7 @@ class LexborHTMLParser:
selector : `LexborNode` object
"""
...
+
def strip_tags(self, tags: list[str], recursive: bool = False) -> None:
"""Remove specified tags from the node.
@@ -847,6 +918,7 @@ class LexborHTMLParser:
'Hello world!
'
"""
...
+
def select(self, query: str | None = None) -> LexborSelector | None:
"""Select nodes give a CSS selector.
@@ -862,9 +934,11 @@ class LexborHTMLParser:
selector : The `Selector` class.
"""
...
+
def any_css_matches(self, selectors: tuple[str]) -> bool:
"""Returns True if any of the specified CSS selectors matches a node."""
...
+
def scripts_contain(self, query: str) -> bool:
"""Returns True if any of the script tags contain specified text.
@@ -876,6 +950,7 @@ class LexborHTMLParser:
The query to check.
"""
...
+
def script_srcs_contain(self, queries: tuple[str]) -> bool:
"""Returns True if any of the script SRCs attributes contain on of the specified text.
@@ -886,6 +961,7 @@ class LexborHTMLParser:
queries : tuple of str
"""
...
+
def css_matches(self, selector: str) -> bool: ...
def merge_text_nodes(self) -> None:
"""Iterates over all text nodes and merges all text nodes that are close to each other.
@@ -906,9 +982,11 @@ class LexborHTMLParser:
"John Doe"
"""
...
+
def clone(self) -> LexborHTMLParser:
"""Clone the current tree."""
...
+
def unwrap_tags(self, tags: list[str], delete_empty: bool = False) -> None:
"""Unwraps specified tags from the HTML tree.
diff --git a/selectolax/lexbor/node.pxi b/selectolax/lexbor/node.pxi
index af95fe1..988889b 100644
--- a/selectolax/lexbor/node.pxi
+++ b/selectolax/lexbor/node.pxi
@@ -22,9 +22,9 @@ ctypedef fused str_or_bytes:
cdef inline bytes to_bytes(str_or_LexborNode value):
cdef bytes bytes_val
if isinstance(value, unicode):
- bytes_val = value.encode("utf-8")
+ bytes_val = value.encode("utf-8")
elif isinstance(value, bytes):
- bytes_val = value
+ bytes_val = value
return bytes_val
@@ -111,7 +111,7 @@ cdef class LexborNode:
status = lxb_html_serialize_tree_str(self.node, lxb_str)
if status == 0 and lxb_str.data:
html = lxb_str.data.decode(_ENCODING).replace('<-undef>', '')
- lexbor_str_destroy(lxb_str, self.node.owner_document.text, True)
+ lexbor_str_destroy(lxb_str, self.node.owner_document.text, True)
return html
return None
@@ -128,7 +128,7 @@ cdef class LexborNode:
cdef lxb_char_t * text
text = lxb_dom_node_text_content(self.node, &str_len)
- if str_len == 0:
+ if str_len == 0:
raise RuntimeError("Can't extract text")
unicode_text = text.decode(_ENCODING)
@@ -153,11 +153,11 @@ cdef class LexborNode:
"""
cdef unsigned char * text
- cdef lxb_dom_node_t* node = self.node.first_child
+ cdef lxb_dom_node_t * node = self.node.first_child
if not deep:
container = TextContainer(separator, strip)
- if self.node != NULL and self.node.type == LXB_DOM_NODE_TYPE_TEXT:
+ if self._is_node_type(LXB_DOM_NODE_TYPE_TEXT):
text = lexbor_str_data_noi(&( self.node).data)
if text != NULL:
py_text = text.decode(_ENCODING)
@@ -173,15 +173,15 @@ cdef class LexborNode:
return container.text
else:
container = TextContainer(separator, strip)
- if self.node.type == LXB_DOM_NODE_TYPE_TEXT:
+ if self._is_node_type(LXB_DOM_NODE_TYPE_TEXT):
text = lexbor_str_data_noi(&( self.node).data)
if text != NULL:
container.append(text.decode(_ENCODING))
lxb_dom_node_simple_walk(
self.node,
- text_callback,
- container
+ text_callback,
+ container
)
return container.text
@@ -241,7 +241,7 @@ cdef class LexborNode:
def any_css_matches(self, tuple selectors):
"""Returns True if any of CSS selectors matches a node"""
for selector in selectors:
- if self.parser.selector.any_matches(selector, self):
+ if self.parser.selector.any_matches(selector, self):
return True
return False
@@ -355,7 +355,7 @@ cdef class LexborNode:
cdef size_t str_len = 0
attributes = dict()
- if self.node.type != LXB_DOM_NODE_TYPE_ELEMENT:
+ if not self._is_node_type(LXB_DOM_NODE_TYPE_ELEMENT):
return attributes
while attr != NULL:
@@ -398,7 +398,7 @@ cdef class LexborNode:
>>> node.html
''
"""
- cdef LexborAttributes attributes = LexborAttributes.create(self.node)
+ cdef LexborAttributes attributes = LexborAttributes.create( self.node)
return attributes
@property
@@ -476,8 +476,8 @@ cdef class LexborNode:
if delete_empty:
lxb_dom_node_remove( self.node)
return
- cdef lxb_dom_node_t* next_node
- cdef lxb_dom_node_t* current_node
+ cdef lxb_dom_node_t * next_node
+ cdef lxb_dom_node_t * current_node
if self.node.first_child.next != NULL:
current_node = self.node.first_child
@@ -548,7 +548,7 @@ cdef class LexborNode:
left_text = lxb_dom_node_text_content(node.prev, &left_length)
right_text = lxb_dom_node_text_content(node, &right_length)
if left_text and right_text:
- combined = (left_text[:left_length]) + (right_text[:right_length])
+ combined = ( left_text[:left_length]) + ( right_text[:right_length])
lxb_dom_node_text_content_set(node, combined, len(combined))
lxb_dom_node_remove(node.prev)
@@ -623,12 +623,12 @@ cdef class LexborNode:
if isinstance(value, (str, bytes, unicode)):
bytes_val = to_bytes(value)
new_node = lxb_dom_document_create_text_node(
- &self.parser.document.dom_document,
- bytes_val, len(bytes_val)
+ &self.parser.document.dom_document,
+ bytes_val, len(bytes_val)
)
if new_node == NULL:
raise SelectolaxError("Can't create a new node")
- lxb_dom_node_insert_before(self.node, new_node)
+ lxb_dom_node_insert_before(self.node, new_node)
lxb_dom_node_remove( self.node)
elif isinstance(value, LexborNode):
new_node = lxb_dom_document_import_node(
@@ -676,12 +676,12 @@ cdef class LexborNode:
if isinstance(value, (str, bytes, unicode)):
bytes_val = to_bytes(value)
new_node = lxb_dom_document_create_text_node(
- &self.parser.document.dom_document,
- bytes_val, len(bytes_val)
+ &self.parser.document.dom_document,
+ bytes_val, len(bytes_val)
)
if new_node == NULL:
raise SelectolaxError("Can't create a new node")
- lxb_dom_node_insert_before(self.node, new_node)
+ lxb_dom_node_insert_before(self.node, new_node)
elif isinstance(value, LexborNode):
new_node = lxb_dom_document_import_node(
&self.parser.document.dom_document,
@@ -727,12 +727,12 @@ cdef class LexborNode:
if isinstance(value, (str, bytes, unicode)):
bytes_val = to_bytes(value)
new_node = lxb_dom_document_create_text_node(
- &self.parser.document.dom_document,
- bytes_val, len(bytes_val)
+ &self.parser.document.dom_document,
+ bytes_val, len(bytes_val)
)
if new_node == NULL:
raise SelectolaxError("Can't create a new node")
- lxb_dom_node_insert_after(self.node, new_node)
+ lxb_dom_node_insert_after(self.node, new_node)
elif isinstance(value, LexborNode):
new_node = lxb_dom_document_import_node(
&self.parser.document.dom_document,
@@ -778,12 +778,12 @@ cdef class LexborNode:
if isinstance(value, (str, bytes, unicode)):
bytes_val = to_bytes(value)
new_node = lxb_dom_document_create_text_node(
- &self.parser.document.dom_document,
- bytes_val, len(bytes_val)
+ &self.parser.document.dom_document,
+ bytes_val, len(bytes_val)
)
if new_node == NULL:
raise SelectolaxError("Can't create a new node")
- lxb_dom_node_insert_child(self.node, new_node)
+ lxb_dom_node_insert_child(self.node, new_node)
elif isinstance(value, LexborNode):
new_node = lxb_dom_document_import_node(
&self.parser.document.dom_document,
@@ -910,9 +910,9 @@ cdef class LexborNode:
text : str or None.
"""
cdef unsigned char * text
- cdef lxb_dom_node_t* node = self.node.first_child
+ cdef lxb_dom_node_t * node = self.node.first_child
cdef TextContainer container
- if self.node == NULL or self.node.type != LXB_DOM_NODE_TYPE_TEXT:
+ if not self._is_node_type(LXB_DOM_NODE_TYPE_TEXT):
return None
text = lexbor_str_data_noi(&( self.node).data)
@@ -921,9 +921,10 @@ cdef class LexborNode:
py_text = text.decode(_ENCODING)
container.append(py_text)
return container.text
+ return None
@property
- def inner_html(self) -> str:
+ def inner_html(self) -> str | None:
"""Return HTML representation of the child nodes.
Works similar to innerHTML in JavaScript.
@@ -942,12 +943,12 @@ cdef class LexborNode:
status = lxb_html_serialize_deep_str(self.node, lxb_str)
if status == 0 and lxb_str.data:
html = lxb_str.data.decode(_ENCODING).replace('<-undef>', '')
- lexbor_str_destroy(lxb_str, self.node.owner_document.text, True)
+ lexbor_str_destroy(lxb_str, self.node.owner_document.text, True)
return html
return None
@inner_html.setter
- def inner_html(self, str html):
+ def inner_html(self, str html) -> None:
"""Set inner HTML to the specified HTML.
Replaces existing data inside the node.
@@ -959,10 +960,10 @@ cdef class LexborNode:
"""
cdef bytes bytes_val
- bytes_val = html.encode("utf-8")
+ bytes_val = html.encode("utf-8")
lxb_html_element_inner_html_set(
- self.node,
- bytes_val, len(bytes_val)
+ self.node,
+ bytes_val, len(bytes_val)
)
def clone(self) -> LexborNode:
@@ -973,10 +974,33 @@ cdef class LexborNode:
It is tied to the current parser instance.
Gets destroyed when parser instance is destroyed.
"""
- cdef lxb_dom_node_t* node
+ cdef lxb_dom_node_t * node
node = lxb_dom_node_clone( self.node, 1)
return LexborNode.new(node, self.parser)
+ cdef inline bint _is_node_type(self, lxb_dom_node_type_t expected_type):
+ return self.node != NULL and self.node.type == expected_type  # NULL is tested first, so a missing node is never dereferenced
+
+ @property
+ def is_element_node(self) -> bool:
+ """Return True if the node is an element node, False for any other node type."""
+ return self._is_node_type(LXB_DOM_NODE_TYPE_ELEMENT)
+
+ @property
+ def is_text_node(self) -> bool:
+ """Return True if the node is a text node, False for any other node type."""
+ return self._is_node_type(LXB_DOM_NODE_TYPE_TEXT)
+
+ @property
+ def is_comment_node(self) -> bool:
+ """Return True if the node is a comment node, False for any other node type."""
+ return self._is_node_type(LXB_DOM_NODE_TYPE_COMMENT)
+
+ @property
+ def is_document_node(self) -> bool:
+ """Return True if the node is a document node, False for any other node type."""
+ return self._is_node_type(LXB_DOM_NODE_TYPE_DOCUMENT)
+
@cython.internal
@cython.final
@@ -1010,14 +1034,13 @@ cdef class TextContainer:
self._text = self._text[:-len(self.separator)]
return self._text
-
cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx):
cdef unsigned char *text
cdef lxb_tag_id_t tag_id = lxb_dom_node_tag_id_noi(node)
if tag_id != LXB_TAG__TEXT:
return LEXBOR_ACTION_OK
- text = lexbor_str_data_noi(&( node).char_data.data)
+ text = lexbor_str_data_noi(&( node).char_data.data)
if not text:
return LEXBOR_ACTION_OK
diff --git a/tests/test_lexbor.py b/tests/test_lexbor.py
index 83d82d3..f29256a 100644
--- a/tests/test_lexbor.py
+++ b/tests/test_lexbor.py
@@ -57,3 +57,27 @@ def test_unicode_selector_works():
tree = LexborHTMLParser(html)
node = tree.css_first('span[data-original-title="Pneu renforcé"]')
assert node.tag == "span"
+
+
+def test_node_type_helpers():
+ html = "text
"
+ parser = LexborHTMLParser(html)
+
+ div_node = parser.css_first("#main")
+ assert div_node.is_element_node
+ assert not div_node.is_text_node
+
+ text_node = div_node.first_child
+ assert text_node is not None
+ assert text_node.is_text_node
+ assert not text_node.is_element_node
+
+ comment_node = div_node.last_child
+ assert comment_node is not None
+ assert comment_node.is_comment_node
+ assert not comment_node.is_text_node
+
+ document_node = parser.root.parent
+ assert document_node is not None
+ assert document_node.is_document_node
+ assert not document_node.is_element_node