diff --git a/Makefile b/Makefile index f31b855..702c91e 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,8 @@ clean-test: ## remove test and coverage artifacts rm -fr htmlcov/ lint: ## check style with ruff - ruff selectolax tests + ruff format selectolax tests + ruff check --fix selectolax tests mypy selectolax tests test: ## run tests quickly with the default Python diff --git a/selectolax/lexbor.pxd b/selectolax/lexbor.pxd index 177354c..9e577b2 100644 --- a/selectolax/lexbor.pxd +++ b/selectolax/lexbor.pxd @@ -233,6 +233,7 @@ cdef class LexborNode: cdef: lxb_dom_node_t *node public LexborHTMLParser parser + cdef bint _is_node_type(self, lxb_dom_node_type_t expected_type) @staticmethod cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser) diff --git a/selectolax/lexbor.pyi b/selectolax/lexbor.pyi index 8212e29..0b1114e 100644 --- a/selectolax/lexbor.pyi +++ b/selectolax/lexbor.pyi @@ -40,20 +40,24 @@ class LexborSelector: def matches(self) -> list[LexborNode]: """Returns all possible matches""" ... + @property def any_matches(self) -> bool: """Returns True if there are any matches""" ... + def text_contains( self, text: str, deep: bool = True, separator: str = "", strip: bool = False ) -> LexborSelector: """Filter all current matches given text.""" ... + def any_text_contains( self, text: str, deep: bool = True, separator: str = "", strip: bool = False ) -> bool: """Returns True if any node in the current search scope contains specified text""" ... + def attribute_longer_than( self, attribute: str, length: int, start: str | None = None ) -> LexborSelector: @@ -62,6 +66,7 @@ class LexborSelector: Similar to string-length in XPath. """ ... + def any_attribute_longer_than( self, attribute: str, length: int, start: str | None = None ) -> bool: @@ -108,6 +113,7 @@ class LexborNode: """A class that represents HTML node (element).""" parser: LexborHTMLParser + @property def mem_id(self) -> int: ... @property @@ -117,26 +123,32 @@ class LexborNode: **Deprecated**. Please use `first_child` instead. """ ... + @property def first_child(self) -> LexborNode | None: """Return the first child node.""" ... + @property def parent(self) -> LexborNode | None: """Return the parent node.""" ... + @property def next(self) -> LexborNode | None: """Return next node.""" ... + @property def prev(self) -> LexborNode | None: """Return previous node.""" ... + @property def last_child(self) -> LexborNode | None: """Return last child node.""" ... + @property def html(self) -> str | None: """Return HTML representation of the current node including all its child nodes. @@ -146,6 +158,7 @@ class LexborNode: text : str """ ... + def __hash__(self) -> int: ... def text_lexbor(self) -> str: """Returns the text of the node including text of all its child nodes. @@ -153,6 +166,7 @@ class LexborNode: Uses builtin method from lexbor. """ ... + def text(self, deep: bool = True, separator: str = "", strip: bool = False) -> str: """Returns the text of the node including text of all its child nodes. @@ -170,6 +184,7 @@ class LexborNode: text : str """ ... + def css(self, query: str) -> list[LexborNode]: """Evaluate CSS selector against current node and its child nodes. @@ -192,6 +207,7 @@ class LexborNode: selector : list of `Node` objects """ ... + @overload def css_first( self, query: str, default: Any = ..., strict: Literal[True] = ... @@ -213,6 +229,7 @@ class LexborNode: selector : `LexborNode` object """ ... + @overload def css_first( self, query: str, default: DefaultT, strict: bool = False @@ -234,6 +251,7 @@ class LexborNode: selector : `LexborNode` object """ ... + @overload def css_first( self, query: str, default: None = ..., strict: bool = False @@ -255,12 +273,15 @@ class LexborNode: selector : `LexborNode` object """ ... + def any_css_matches(self, selectors: tuple[str]) -> bool: """Returns True if any of CSS selectors matches a node""" ... + def css_matches(self, selector: str) -> bool: """Returns True if CSS selector matches a node.""" ... + @property def tag_id(self) -> int: ... @property @@ -278,6 +299,7 @@ class LexborNode: text : str """ ... + def decompose(self, recursive: bool = True) -> None: """Remove the current node from the tree. @@ -294,6 +316,7 @@ class LexborNode: >>> tag.decompose() """ ... + def strip_tags(self, tags: list[str], recursive: bool = False) -> None: """Remove specified tags from the HTML tree. @@ -314,6 +337,7 @@ class LexborNode: '
Hello world!
' """ ... + @property def attributes(self) -> dict[str, str | None]: """Get all attributes that belong to the current node. @@ -333,6 +357,7 @@ class LexborNode: {'data': None, 'id': 'my_id'} """ ... + @property def attrs(self) -> LexborAttributes: """A dict-like object that is similar to the ``attributes`` property, but operates directly on the Node data. @@ -361,6 +386,7 @@ class LexborNode: '
' """ ... + @property def id(self) -> str | None: """Get the id attribute of the node. @@ -372,6 +398,7 @@ class LexborNode: text : str """ ... + def iter(self, include_text: bool = False) -> Iterator[LexborNode]: """Iterate over nodes on the current level. @@ -385,6 +412,7 @@ class LexborNode: node """ ... + def unwrap(self, delete_empty: bool = False) -> None: """Replace node with whatever is inside this node. @@ -406,6 +434,7 @@ class LexborNode: Note: by default, empty tags are ignored, use "delete_empty" to change this. """ ... + def unwrap_tags(self, tags: list[str], delete_empty: bool = False) -> None: """Unwraps specified tags from the HTML tree. @@ -429,6 +458,7 @@ class LexborNode: Note: by default, empty tags are ignored, use "delete_empty" to change this. """ ... + def merge_text_nodes(self) -> None: """Iterates over all text nodes and merges all text nodes that are close to each other. @@ -448,6 +478,7 @@ class LexborNode: "John Doe" """ ... + def traverse(self, include_text: bool = False) -> Iterator[LexborNode]: """Iterate over all child and next nodes starting from the current level. @@ -461,6 +492,7 @@ class LexborNode: node """ ... + def replace_with(self, value: bytes | str | LexborNode) -> None: """Replace current Node with specified value. @@ -489,6 +521,7 @@ class LexborNode: '
Get
Test
' """ ... + def insert_before(self, value: bytes | str | LexborNode) -> None: """Insert a node before the current Node. @@ -517,6 +550,7 @@ class LexborNode:
Get
Test
' """ ... + def insert_after(self, value: bytes | str | LexborNode) -> None: """Insert a node after the current Node. @@ -545,6 +579,7 @@ class LexborNode:
Get
Test
' """ ... + def insert_child(self, value: bytes | str | LexborNode) -> None: """Insert a node inside (at the end of) the current Node. @@ -573,6 +608,7 @@ class LexborNode:
Get
Laptop
Test
' """ ... + @property def raw_value(self) -> NoReturn: """Return the raw (unparsed, original) value of a node. @@ -595,6 +631,7 @@ class LexborNode: b'<test>' """ ... + def scripts_contain(self, query: str) -> bool: """Returns True if any of the script tags contain specified text. @@ -606,6 +643,7 @@ class LexborNode: The query to check. """ ... + def script_srcs_contain(self, queries: tuple[str]) -> bool: """Returns True if any of the script SRCs attributes contain on of the specified text. @@ -616,9 +654,11 @@ class LexborNode: queries : tuple of str """ ... + def remove(self, recursive: bool = True) -> None: """An alias for the decompose method.""" ... + def select(self, query: str | None = None) -> LexborSelector: """Select nodes given a CSS selector. @@ -634,6 +674,7 @@ class LexborNode: selector : The `Selector` class. """ ... + @property def text_content(self) -> str | None: """Returns the text of the node if it is a text node. @@ -685,6 +726,26 @@ class LexborNode: """ ... + @property + def is_element_node(self) -> bool: + """Return True if the node represents an element node.""" + ... + + @property + def is_text_node(self) -> bool: + """Return True if the node represents a text node.""" + ... + + @property + def is_comment_node(self) -> bool: + """Return True if the node represents a comment node.""" + ... + + @property + def is_document_node(self) -> bool: + """Return True if the node represents a document node.""" + ... + class LexborHTMLParser: """The lexbor HTML parser. @@ -705,14 +766,17 @@ class LexborHTMLParser: def root(self) -> LexborNode | None: """Returns root node.""" ... + @property def body(self) -> LexborNode | None: """Returns document body.""" ... + @property def head(self) -> LexborNode | None: """Returns document head.""" ... + def tags(self, name: str) -> list[LexborNode]: """Returns a list of tags that match specified name. @@ -721,6 +785,7 @@ class LexborHTMLParser: name : str (e.g. div) """ ... + def text(self, deep: bool = True, separator: str = "", strip: bool = False) -> str: """Returns the text of the node including text of all its child nodes. @@ -738,10 +803,12 @@ class LexborHTMLParser: text : str """ ... + @property def html(self) -> str | None: """Return HTML representation of the page.""" ... + def css(self, query: str) -> list[LexborNode]: """A CSS selector. @@ -764,6 +831,7 @@ class LexborHTMLParser: selector : list of `Node` objects """ ... + @overload def css_first( self, query: str, default: Any = ..., strict: Literal[True] = ... @@ -785,6 +853,7 @@ class LexborHTMLParser: selector : `LexborNode` object """ ... + @overload def css_first( self, query: str, default: DefaultT, strict: bool = False @@ -806,6 +875,7 @@ class LexborHTMLParser: selector : `LexborNode` object """ ... + @overload def css_first( self, query: str, default: None = ..., strict: bool = False @@ -827,6 +897,7 @@ class LexborHTMLParser: selector : `LexborNode` object """ ... + def strip_tags(self, tags: list[str], recursive: bool = False) -> None: """Remove specified tags from the node. @@ -847,6 +918,7 @@ class LexborHTMLParser: '
Hello world!
' """ ... + def select(self, query: str | None = None) -> LexborSelector | None: """Select nodes give a CSS selector. @@ -862,9 +934,11 @@ class LexborHTMLParser: selector : The `Selector` class. """ ... + def any_css_matches(self, selectors: tuple[str]) -> bool: """Returns True if any of the specified CSS selectors matches a node.""" ... + def scripts_contain(self, query: str) -> bool: """Returns True if any of the script tags contain specified text. @@ -876,6 +950,7 @@ class LexborHTMLParser: The query to check. """ ... + def script_srcs_contain(self, queries: tuple[str]) -> bool: """Returns True if any of the script SRCs attributes contain on of the specified text. @@ -886,6 +961,7 @@ class LexborHTMLParser: queries : tuple of str """ ... + def css_matches(self, selector: str) -> bool: ... def merge_text_nodes(self) -> None: """Iterates over all text nodes and merges all text nodes that are close to each other. @@ -906,9 +982,11 @@ class LexborHTMLParser: "John Doe" """ ... + def clone(self) -> LexborHTMLParser: """Clone the current tree.""" ... + def unwrap_tags(self, tags: list[str], delete_empty: bool = False) -> None: """Unwraps specified tags from the HTML tree. diff --git a/selectolax/lexbor/node.pxi b/selectolax/lexbor/node.pxi index af95fe1..988889b 100644 --- a/selectolax/lexbor/node.pxi +++ b/selectolax/lexbor/node.pxi @@ -22,9 +22,9 @@ ctypedef fused str_or_bytes: cdef inline bytes to_bytes(str_or_LexborNode value): cdef bytes bytes_val if isinstance(value, unicode): - bytes_val = value.encode("utf-8") + bytes_val = value.encode("utf-8") elif isinstance(value, bytes): - bytes_val = value + bytes_val = value return bytes_val @@ -111,7 +111,7 @@ cdef class LexborNode: status = lxb_html_serialize_tree_str(self.node, lxb_str) if status == 0 and lxb_str.data: html = lxb_str.data.decode(_ENCODING).replace('<-undef>', '') - lexbor_str_destroy(lxb_str, self.node.owner_document.text, True) + lexbor_str_destroy(lxb_str, self.node.owner_document.text, True) return html return None @@ -128,7 +128,7 @@ cdef class LexborNode: cdef lxb_char_t * text text = lxb_dom_node_text_content(self.node, &str_len) - if str_len == 0: + if str_len == 0: raise RuntimeError("Can't extract text") unicode_text = text.decode(_ENCODING) @@ -153,11 +153,11 @@ cdef class LexborNode: """ cdef unsigned char * text - cdef lxb_dom_node_t* node = self.node.first_child + cdef lxb_dom_node_t * node = self.node.first_child if not deep: container = TextContainer(separator, strip) - if self.node != NULL and self.node.type == LXB_DOM_NODE_TYPE_TEXT: + if self._is_node_type(LXB_DOM_NODE_TYPE_TEXT): text = lexbor_str_data_noi(&( self.node).data) if text != NULL: py_text = text.decode(_ENCODING) @@ -173,15 +173,15 @@ cdef class LexborNode: return container.text else: container = TextContainer(separator, strip) - if self.node.type == LXB_DOM_NODE_TYPE_TEXT: + if self._is_node_type(LXB_DOM_NODE_TYPE_TEXT): text = lexbor_str_data_noi(&( self.node).data) if text != NULL: container.append(text.decode(_ENCODING)) lxb_dom_node_simple_walk( self.node, - text_callback, - container + text_callback, + container ) return container.text @@ -241,7 +241,7 @@ cdef class LexborNode: def any_css_matches(self, tuple selectors): """Returns True if any of CSS selectors matches a node""" for selector in selectors: - if self.parser.selector.any_matches(selector, self): + if self.parser.selector.any_matches(selector, self): return True return False @@ -355,7 +355,7 @@ cdef class LexborNode: cdef size_t str_len = 0 attributes = dict() - if self.node.type != LXB_DOM_NODE_TYPE_ELEMENT: + if not self._is_node_type(LXB_DOM_NODE_TYPE_ELEMENT): return attributes while attr != NULL: @@ -398,7 +398,7 @@ cdef class LexborNode: >>> node.html '
' """ - cdef LexborAttributes attributes = LexborAttributes.create(self.node) + cdef LexborAttributes attributes = LexborAttributes.create( self.node) return attributes @property @@ -476,8 +476,8 @@ cdef class LexborNode: if delete_empty: lxb_dom_node_remove( self.node) return - cdef lxb_dom_node_t* next_node - cdef lxb_dom_node_t* current_node + cdef lxb_dom_node_t * next_node + cdef lxb_dom_node_t * current_node if self.node.first_child.next != NULL: current_node = self.node.first_child @@ -548,7 +548,7 @@ cdef class LexborNode: left_text = lxb_dom_node_text_content(node.prev, &left_length) right_text = lxb_dom_node_text_content(node, &right_length) if left_text and right_text: - combined = (left_text[:left_length]) + (right_text[:right_length]) + combined = ( left_text[:left_length]) + ( right_text[:right_length]) lxb_dom_node_text_content_set(node, combined, len(combined)) lxb_dom_node_remove(node.prev) @@ -623,12 +623,12 @@ cdef class LexborNode: if isinstance(value, (str, bytes, unicode)): bytes_val = to_bytes(value) new_node = lxb_dom_document_create_text_node( - &self.parser.document.dom_document, - bytes_val, len(bytes_val) + &self.parser.document.dom_document, + bytes_val, len(bytes_val) ) if new_node == NULL: raise SelectolaxError("Can't create a new node") - lxb_dom_node_insert_before(self.node, new_node) + lxb_dom_node_insert_before(self.node, new_node) lxb_dom_node_remove( self.node) elif isinstance(value, LexborNode): new_node = lxb_dom_document_import_node( @@ -676,12 +676,12 @@ cdef class LexborNode: if isinstance(value, (str, bytes, unicode)): bytes_val = to_bytes(value) new_node = lxb_dom_document_create_text_node( - &self.parser.document.dom_document, - bytes_val, len(bytes_val) + &self.parser.document.dom_document, + bytes_val, len(bytes_val) ) if new_node == NULL: raise SelectolaxError("Can't create a new node") - lxb_dom_node_insert_before(self.node, new_node) + lxb_dom_node_insert_before(self.node, new_node) elif isinstance(value, LexborNode): new_node = lxb_dom_document_import_node( &self.parser.document.dom_document, @@ -727,12 +727,12 @@ cdef class LexborNode: if isinstance(value, (str, bytes, unicode)): bytes_val = to_bytes(value) new_node = lxb_dom_document_create_text_node( - &self.parser.document.dom_document, - bytes_val, len(bytes_val) + &self.parser.document.dom_document, + bytes_val, len(bytes_val) ) if new_node == NULL: raise SelectolaxError("Can't create a new node") - lxb_dom_node_insert_after(self.node, new_node) + lxb_dom_node_insert_after(self.node, new_node) elif isinstance(value, LexborNode): new_node = lxb_dom_document_import_node( &self.parser.document.dom_document, @@ -778,12 +778,12 @@ cdef class LexborNode: if isinstance(value, (str, bytes, unicode)): bytes_val = to_bytes(value) new_node = lxb_dom_document_create_text_node( - &self.parser.document.dom_document, - bytes_val, len(bytes_val) + &self.parser.document.dom_document, + bytes_val, len(bytes_val) ) if new_node == NULL: raise SelectolaxError("Can't create a new node") - lxb_dom_node_insert_child(self.node, new_node) + lxb_dom_node_insert_child(self.node, new_node) elif isinstance(value, LexborNode): new_node = lxb_dom_document_import_node( &self.parser.document.dom_document, @@ -910,9 +910,9 @@ cdef class LexborNode: text : str or None. """ cdef unsigned char * text - cdef lxb_dom_node_t* node = self.node.first_child + cdef lxb_dom_node_t * node = self.node.first_child cdef TextContainer container - if self.node == NULL or self.node.type != LXB_DOM_NODE_TYPE_TEXT: + if not self._is_node_type(LXB_DOM_NODE_TYPE_TEXT): return None text = lexbor_str_data_noi(&( self.node).data) @@ -921,9 +921,10 @@ cdef class LexborNode: py_text = text.decode(_ENCODING) container.append(py_text) return container.text + return None @property - def inner_html(self) -> str: + def inner_html(self) -> str | None: """Return HTML representation of the child nodes. Works similar to innerHTML in JavaScript. @@ -942,12 +943,12 @@ cdef class LexborNode: status = lxb_html_serialize_deep_str(self.node, lxb_str) if status == 0 and lxb_str.data: html = lxb_str.data.decode(_ENCODING).replace('<-undef>', '') - lexbor_str_destroy(lxb_str, self.node.owner_document.text, True) + lexbor_str_destroy(lxb_str, self.node.owner_document.text, True) return html return None @inner_html.setter - def inner_html(self, str html): + def inner_html(self, str html) -> None: """Set inner HTML to the specified HTML. Replaces existing data inside the node. @@ -959,10 +960,10 @@ cdef class LexborNode: """ cdef bytes bytes_val - bytes_val = html.encode("utf-8") + bytes_val = html.encode("utf-8") lxb_html_element_inner_html_set( - self.node, - bytes_val, len(bytes_val) + self.node, + bytes_val, len(bytes_val) ) def clone(self) -> LexborNode: @@ -973,10 +974,33 @@ cdef class LexborNode: It is tied to the current parser instance. Gets destroyed when parser instance is destroyed. """ - cdef lxb_dom_node_t* node + cdef lxb_dom_node_t * node node = lxb_dom_node_clone( self.node, 1) return LexborNode.new(node, self.parser) + cdef inline bint _is_node_type(self, lxb_dom_node_type_t expected_type): + return self.node != NULL and self.node.type == expected_type + + @property + def is_element_node(self) -> bool: + """Return True if the node represents an element node.""" + return self._is_node_type(LXB_DOM_NODE_TYPE_ELEMENT) + + @property + def is_text_node(self) -> bool: + """Return True if the node represents a text node.""" + return self._is_node_type(LXB_DOM_NODE_TYPE_TEXT) + + @property + def is_comment_node(self) -> bool: + """Return True if the node represents a comment node.""" + return self._is_node_type(LXB_DOM_NODE_TYPE_COMMENT) + + @property + def is_document_node(self) -> bool: + """Return True if the node represents a document node.""" + return self._is_node_type(LXB_DOM_NODE_TYPE_DOCUMENT) + @cython.internal @cython.final @@ -1010,14 +1034,13 @@ cdef class TextContainer: self._text = self._text[:-len(self.separator)] return self._text - cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx): cdef unsigned char *text cdef lxb_tag_id_t tag_id = lxb_dom_node_tag_id_noi(node) if tag_id != LXB_TAG__TEXT: return LEXBOR_ACTION_OK - text = lexbor_str_data_noi(&( node).char_data.data) + text = lexbor_str_data_noi(&( node).char_data.data) if not text: return LEXBOR_ACTION_OK diff --git a/tests/test_lexbor.py b/tests/test_lexbor.py index 83d82d3..f29256a 100644 --- a/tests/test_lexbor.py +++ b/tests/test_lexbor.py @@ -57,3 +57,27 @@ def test_unicode_selector_works(): tree = LexborHTMLParser(html) node = tree.css_first('span[data-original-title="Pneu renforcé"]') assert node.tag == "span" + + +def test_node_type_helpers(): + html = "
text
" + parser = LexborHTMLParser(html) + + div_node = parser.css_first("#main") + assert div_node.is_element_node + assert not div_node.is_text_node + + text_node = div_node.first_child + assert text_node is not None + assert text_node.is_text_node + assert not text_node.is_element_node + + comment_node = div_node.last_child + assert comment_node is not None + assert comment_node.is_comment_node + assert not comment_node.is_text_node + + document_node = parser.root.parent + assert document_node is not None + assert document_node.is_document_node + assert not document_node.is_element_node