From bde646fd896a6ad2fe9b9d852fd0a4d831e39060 Mon Sep 17 00:00:00 2001 From: pygarap Date: Sun, 16 Nov 2025 16:16:56 +0200 Subject: [PATCH 1/7] **Add node type helpers and improve consistency across the codebase** (#186) --- Makefile | 3 +- docs/conf.py | 57 ++++++++++++------------- examples/benchmark.py | 85 ++++++++++++++++++++++---------------- examples/simple_example.py | 4 +- examples/walkthrough.ipynb | 82 ++++++++++++++++++------------------ selectolax/lexbor.pxd | 1 + selectolax/lexbor.pyi | 12 ++++++ selectolax/lexbor/node.pxi | 21 ++++++++++ setup.py | 2 +- tests/test_lexbor.py | 24 +++++++++++ 10 files changed, 180 insertions(+), 111 deletions(-) diff --git a/Makefile b/Makefile index f31b8555..702c91ee 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,8 @@ clean-test: ## remove test and coverage artifacts rm -fr htmlcov/ lint: ## check style with ruff - ruff selectolax tests + ruff format selectolax tests + ruff check --fix selectolax tests mypy selectolax tests test: ## run tests quickly with the default Python diff --git a/docs/conf.py b/docs/conf.py index b4161852..62ef0307 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -33,7 +33,7 @@ # This lets us ensure that the source package is imported, and that its # version is used. -if platform.system() == 'Darwin': +if platform.system() == "Darwin": sys.path.insert(0, project_root) import selectolax @@ -46,28 +46,28 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.viewcode', - 'numpydoc', - 'sphinxext.opengraph', - 'sphinx_copybutton' + "sphinx.ext.autodoc", + "sphinx.ext.viewcode", + "numpydoc", + "sphinxext.opengraph", + "sphinx_copybutton", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. # source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = u'selectolax' -copyright = u"2018-2025, Artem Golubin" +project = "selectolax" +copyright = "2018-2025, Artem Golubin" # The version info for the project you're documenting, acts as replacement # for |version| and |release|, also used in various other places throughout @@ -90,7 +90,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['_build'] +exclude_patterns = ["_build"] # The reST default role (used for this markup: `text`) to use for all # documents. @@ -108,7 +108,7 @@ # show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. # modindex_common_prefix = [] @@ -123,7 +123,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = 'default' -html_theme = 'furo' +html_theme = "furo" # Theme options are theme-specific and customize the look and feel of a # theme further. For a list of options available for each theme, see the @@ -204,17 +204,15 @@ # html_file_suffix = None # Output file base name for HTML help builder. -htmlhelp_basename = 'selectolaxdoc' +htmlhelp_basename = "selectolaxdoc" # -- Options for LaTeX output ------------------------------------------ latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. # 'preamble': '', } @@ -223,9 +221,7 @@ # (source start file, target name, title, author, documentclass # [howto/manual]). latex_documents = [ - ('index', 'selectolax.tex', - u'selectolax Documentation', - u'Artem Golubin', 'manual'), + ("index", "selectolax.tex", "selectolax Documentation", "Artem Golubin", "manual"), ] # The name of an image file (relative to this directory) to place at @@ -253,11 +249,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - ('index', 'selectolax', - u'selectolax Documentation', - [u'Artem Golubin'], 1) -] +man_pages = [("index", "selectolax", "selectolax Documentation", ["Artem Golubin"], 1)] # If true, show URL addresses after external links. # man_show_urls = False @@ -269,12 +261,15 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'selectolax', - u'selectolax Documentation', - u'Artem Golubin', - 'selectolax', - 'One line description of project.', - 'Miscellaneous'), + ( + "index", + "selectolax", + "selectolax Documentation", + "Artem Golubin", + "selectolax", + "One line description of project.", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. diff --git a/examples/benchmark.py b/examples/benchmark.py index c6cda2bd..870bd63c 100644 --- a/examples/benchmark.py +++ b/examples/benchmark.py @@ -12,6 +12,7 @@ 4) The content of the Meta description tag """ + import functools import json import time @@ -27,71 +28,71 @@ def bs4_parser(html_content, parser=HTMLParser): - soup = BeautifulSoup(html_content, 'html.parser') + soup = BeautifulSoup(html_content, "html.parser") title_text = soup.title.string assert title_text - a_hrefs = [a.attrs.get('href', '') for a in soup.find_all('a')] - assert len(a_hrefs) >= 5, 'href' + a_hrefs = [a.attrs.get("href", "") for a in soup.find_all("a")] + assert len(a_hrefs) >= 5, "href" - num_script_tags = len(soup.find_all('script')) - assert num_script_tags > 0, 'script' - meta_description = soup.find('meta', attrs={"name": "description"}) + num_script_tags = len(soup.find_all("script")) + assert num_script_tags > 0, "script" + meta_description = soup.find("meta", attrs={"name": "description"}) if meta_description: - meta_content = meta_description.get('content') + meta_content = meta_description.get("content") def selectolax_parser(html_content, parser=HTMLParser): tree = parser(html_content) title_text = "" - title_node = tree.css_first('title') + title_node = tree.css_first("title") if title_node: title_text = title_node.text() assert title_text - a_hrefs = [a.attrs.get('href', '') for a in tree.css('a[href]')] - assert len(a_hrefs) >= 5, 'href' + a_hrefs = [a.attrs.get("href", "") for a in tree.css("a[href]")] + assert len(a_hrefs) >= 5, "href" - num_script_tags = len(tree.css('script')) - assert num_script_tags > 0, 'script' + num_script_tags = len(tree.css("script")) + assert num_script_tags > 0, "script" meta_description = tree.css_first('meta[name="description"]') if meta_description: - meta_content = meta_description.attrs.sget('content', '') + meta_content = meta_description.attrs.sget("content", "") def lxml_parser(html_content): tree = fromstring(html_content) - title_text = tree.xpath('//title/text()') - assert title_text, 'title' + title_text = tree.xpath("//title/text()") + assert title_text, "title" - a_hrefs = [a.attrib.get('href', '') for a in tree.xpath('//a[@href]')] - assert len(a_hrefs) >= 5, 'href' + a_hrefs = [a.attrib.get("href", "") for a in tree.xpath("//a[@href]")] + assert len(a_hrefs) >= 5, "href" - num_script_tags = len(tree.xpath('//script')) - assert num_script_tags > 0, 'script' + num_script_tags = len(tree.xpath("//script")) + assert num_script_tags > 0, "script" meta_description = tree.xpath('meta[@name="description"]') if meta_description: - meta_content = meta_description[0].attrib.get('content', '') + meta_content = meta_description[0].attrib.get("content", "") def html5_parser(html_content): tree = parse(html_content) - title_text = tree.xpath('//title/text()') - assert title_text, 'title' + title_text = tree.xpath("//title/text()") + assert title_text, "title" - a_hrefs = [a.attrib.get('href', '') for a in tree.xpath('//a[@href]')] - assert len(a_hrefs) >= 5, 'href' + a_hrefs = [a.attrib.get("href", "") for a in tree.xpath("//a[@href]")] + assert len(a_hrefs) >= 5, "href" - num_script_tags = len(tree.xpath('//script')) - assert num_script_tags > 0, 'script' + num_script_tags = len(tree.xpath("//script")) + assert num_script_tags > 0, "script" meta_description = tree.xpath('meta[@name="description"]') if meta_description: - meta_content = meta_description[0].attrib.get('content', '') + meta_content = meta_description[0].attrib.get("content", "") def _perform_test(pages, parse_func): for page in pages: - parse_func(page['html']) + parse_func(page["html"]) def main(): @@ -100,19 +101,31 @@ def main(): # That translates to 324MB of HTML data. # Because of potential copyright infringements, I don't publish it. # - html_pages = [json.loads(page) for page in open('pages/pages.json', 'rt')] + html_pages = [json.loads(page) for page in open("pages/pages.json", "rt")] available_parsers = [ - ('bs4', bs4_parser,), - ('lxml', lxml_parser,), - ('html5_parser', html5_parser,), - ('modest', selectolax_parser,), - ('lexbor', functools.partial(selectolax_parser, parser=LexborHTMLParser)), + ( + "bs4", + bs4_parser, + ), + ( + "lxml", + lxml_parser, + ), + ( + "html5_parser", + html5_parser, + ), + ( + "modest", + selectolax_parser, + ), + ("lexbor", functools.partial(selectolax_parser, parser=LexborHTMLParser)), ] for parser_name, parser in available_parsers: start = time.time() _perform_test(html_pages, parser) - print('%r: %s' % (parser_name, time.time() - start)) + print("%r: %s" % (parser_name, time.time() - start)) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/simple_example.py b/examples/simple_example.py index 0e642d11..bffb6aaf 100644 --- a/examples/simple_example.py +++ b/examples/simple_example.py @@ -1,6 +1,8 @@ from selectolax.lexbor import LexborHTMLParser -html = "

link

text

" +html = ( + "

link

text

" +) selector = "div > :nth-child(2n+1):not(:has(a))" for node in LexborHTMLParser(html).css(selector): diff --git a/examples/walkthrough.ipynb b/examples/walkthrough.ipynb index 7de0588b..b36cc92f 100644 --- a/examples/walkthrough.ipynb +++ b/examples/walkthrough.ipynb @@ -1,8 +1,8 @@ { "cells": [ { - "metadata": {}, "cell_type": "markdown", + "metadata": {}, "source": [ "Note: These examples were written for Modest backend.\n", "As of 2025, use lexbor backend for better performance and features.\n", @@ -125,15 +125,15 @@ "selector = \"p.p3\"\n", "\n", "for node in HTMLParser(html).css(selector):\n", - " print('---------------------')\n", - " print('Node: %s' % node.html)\n", - " print('attributes: %s' % node.attributes)\n", - " print('node text: %s' % node.text(deep=True, separator='', strip=False))\n", - " print('tag: %s' % node.tag)\n", - " print('parent tag: %s' % node.parent.tag)\n", + " print(\"---------------------\")\n", + " print(\"Node: %s\" % node.html)\n", + " print(\"attributes: %s\" % node.attributes)\n", + " print(\"node text: %s\" % node.text(deep=True, separator=\"\", strip=False))\n", + " print(\"tag: %s\" % node.tag)\n", + " print(\"parent tag: %s\" % node.parent.tag)\n", " if node.last_child:\n", - " print('last child inside current node: %s' % node.last_child.html)\n", - " print('---------------------\\n')" + " print(\"last child inside current node: %s\" % node.last_child.html)\n", + " print(\"---------------------\\n\")" ] }, { @@ -157,7 +157,7 @@ } ], "source": [ - "print(\"H1: %s\" % HTMLParser(html).css_first('h1').text())" + "print(\"H1: %s\" % HTMLParser(html).css_first(\"h1\").text())" ] }, { @@ -181,7 +181,7 @@ } ], "source": [ - "print(\"Title: %s\" % HTMLParser(html).css_first('title', default='not-found'))" + "print(\"Title: %s\" % HTMLParser(html).css_first(\"title\", default=\"not-found\"))" ] }, { @@ -203,17 +203,17 @@ "evalue": "Expected 1 match, but found 2 matches", "output_type": "error", "traceback": [ - "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[0;31mValueError\u001B[0m Traceback (most recent call last)", - "\u001B[0;32m/var/folders/hz/kldp1y0d4tvd1x56cyg_b9mm0000gn/T/ipykernel_94350/4220316520.py\u001B[0m in \u001B[0;36m\u001B[0;34m\u001B[0m\n\u001B[0;32m----> 1\u001B[0;31m \u001B[0mHTMLParser\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mhtml\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mcss_first\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m\"p.p3\"\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mdefault\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;34m'not-found'\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mstrict\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;32mTrue\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m", - "\u001B[0;32m~/Projects/python/selectolax/selectolax/parser.pyx\u001B[0m in \u001B[0;36mselectolax.parser.HTMLParser.css_first\u001B[0;34m()\u001B[0m\n", - "\u001B[0;32m~/Projects/python/selectolax/selectolax/node.pxi\u001B[0m in \u001B[0;36mselectolax.parser.Node.css_first\u001B[0;34m()\u001B[0m\n", - "\u001B[0;31mValueError\u001B[0m: Expected 1 match, but found 2 matches" + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/var/folders/hz/kldp1y0d4tvd1x56cyg_b9mm0000gn/T/ipykernel_94350/4220316520.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mHTMLParser\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhtml\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcss_first\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"p.p3\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdefault\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'not-found'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstrict\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/Projects/python/selectolax/selectolax/parser.pyx\u001b[0m in \u001b[0;36mselectolax.parser.HTMLParser.css_first\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m~/Projects/python/selectolax/selectolax/node.pxi\u001b[0m in \u001b[0;36mselectolax.parser.Node.css_first\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mValueError\u001b[0m: Expected 1 match, but found 2 matches" ] } ], "source": [ - "HTMLParser(html).css_first(\"p.p3\", default='not-found', strict=True)" + "HTMLParser(html).css_first(\"p.p3\", default=\"not-found\", strict=True)" ] }, { @@ -239,7 +239,7 @@ } ], "source": [ - "print(HTMLParser(html).css_first('p#stext').parent.html)" + "print(HTMLParser(html).css_first(\"p#stext\").parent.html)" ] }, { @@ -266,7 +266,7 @@ } ], "source": [ - "HTMLParser(html).css_first('div#text').css_first('p:nth-child(2)').html" + "HTMLParser(html).css_first(\"div#text\").css_first(\"p:nth-child(2)\").html" ] }, { @@ -293,7 +293,7 @@ "source": [ "for node in HTMLParser(html).css(\"div#text\"):\n", " for cnode in node.iter():\n", - " print(cnode.tag, cnode.html)\n" + " print(cnode.tag, cnode.html)" ] }, { @@ -327,7 +327,7 @@ ], "source": [ "html_parser = HTMLParser(html)\n", - "for node in html_parser.tags('p'):\n", + "for node in html_parser.tags(\"p\"):\n", " node.decompose()\n", "print(html_parser.body.html)" ] @@ -396,7 +396,7 @@ ], "source": [ "html_parser = HTMLParser(html)\n", - "html_parser.unwrap_tags(['p', 'i'])\n", + "html_parser.unwrap_tags([\"p\", \"i\"])\n", "print(html_parser.body.html)" ] }, @@ -429,11 +429,11 @@ ], "source": [ "html_parser = HTMLParser(html)\n", - "node = html_parser.css_first('div#text')\n", - "node.attrs['data'] = 'secrect data'\n", - "node.attrs['id'] = 'new_id'\n", + "node = html_parser.css_first(\"div#text\")\n", + "node.attrs[\"data\"] = \"secrect data\"\n", + "node.attrs[\"id\"] = \"new_id\"\n", "print(node.attributes)\n", - "del node.attrs['id']\n", + "del node.attrs[\"id\"]\n", "print(node.attributes)\n", "print(node.html)" ] @@ -471,8 +471,7 @@ "source": [ "html_parser = HTMLParser(html)\n", "for node in html_parser.root.traverse():\n", - "\n", - " if node.tag == '-text':\n", + " if node.tag == \"-text\":\n", " text = node.text(deep=True).strip()\n", " if text:\n", " print(text)\n", @@ -495,7 +494,7 @@ "source": [ "html = \"
Привет мир!
\"\n", "# Encoding detector works only with raw strings (bytes)\n", - "html_bytes = html.encode('cp1251')" + "html_bytes = html.encode(\"cp1251\")" ] }, { @@ -562,7 +561,7 @@ } ], "source": [ - "html = ''.encode('cp1251')\n", + "html = ''.encode(\"cp1251\")\n", "HTMLParser(html, detect_encoding=True, use_meta_tags=True).input_encoding" ] }, @@ -583,7 +582,7 @@ } ], "source": [ - "html_utf = ''.encode('utf-8')\n", + "html_utf = ''.encode(\"utf-8\")\n", "HTMLParser(html_utf, detect_encoding=True, use_meta_tags=True).input_encoding" ] }, @@ -625,7 +624,7 @@ "\"\"\"\n", "tree = HTMLParser(html)\n", "\n", - "[node.text() for node in tree.select('script').text_contains(\"super\").matches]\n" + "[node.text() for node in tree.select(\"script\").text_contains(\"super\").matches]" ] }, { @@ -661,7 +660,7 @@ "\"\"\"\n", "tree = HTMLParser(html)\n", "\n", - "print([node.html for node in tree.select('div').css(\"span\").css(\".red\").matches])\n" + "print([node.html for node in tree.select(\"div\").css(\"span\").css(\".red\").matches])" ] }, { @@ -688,39 +687,40 @@ "tree = HTMLParser(html)\n", "\n", "# Insert text\n", - "dest_node = html_parser.css_first('.red')\n", + "dest_node = html_parser.css_first(\".red\")\n", "dest_node.insert_before(\"Hello\")\n", "\n", "# Insert nodes\n", "subtree = HTMLParser(\"
Hi
\")\n", - "dest_node = html_parser.css_first('.red')\n", + "dest_node = html_parser.css_first(\".red\")\n", "dest_node.insert_before(subtree)\n", "\n", "# Insert before, after, or append inside\n", "subtree = HTMLParser(\"
Car
\")\n", - "dest_node = html_parser.css_first('.green')\n", + "dest_node = html_parser.css_first(\".green\")\n", "dest_node.insert_before(subtree)\n", "dest_node.insert_after(subtree)\n", "dest_node.insert_child(subtree)" ] }, { - "metadata": {}, "cell_type": "markdown", + "metadata": {}, "source": "### Contains selector in lexbor" }, { - "metadata": {}, "cell_type": "markdown", + "metadata": {}, "source": "The `lexbor` backend supports `:lexbor-contains(text)` pseudo-class for case-sensitive and case-insensitive text search." }, { - "metadata": {}, "cell_type": "code", - "outputs": [], "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from selectolax.lexbor import LexborHTMLParser\n", + "\n", "html = \"

hello

lexbor is AwesOme

\"\n", "parser = LexborHTMLParser(html)\n", "# Case-insensitive search\n", diff --git a/selectolax/lexbor.pxd b/selectolax/lexbor.pxd index 177354c7..9e577b2f 100644 --- a/selectolax/lexbor.pxd +++ b/selectolax/lexbor.pxd @@ -233,6 +233,7 @@ cdef class LexborNode: cdef: lxb_dom_node_t *node public LexborHTMLParser parser + cdef bint _is_node_type(self, lxb_dom_node_type_t expected_type) @staticmethod cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser) diff --git a/selectolax/lexbor.pyi b/selectolax/lexbor.pyi index 8212e29c..a18ae726 100644 --- a/selectolax/lexbor.pyi +++ b/selectolax/lexbor.pyi @@ -261,6 +261,18 @@ class LexborNode: def css_matches(self, selector: str) -> bool: """Returns True if CSS selector matches a node.""" ... + def is_element_node(self) -> bool: + """Return True if the node represents an element node.""" + ... + def is_text_node(self) -> bool: + """Return True if the node represents a text node.""" + ... + def is_comment_node(self) -> bool: + """Return True if the node represents a comment node.""" + ... + def is_document_node(self) -> bool: + """Return True if the node represents a document node.""" + ... @property def tag_id(self) -> int: ... @property diff --git a/selectolax/lexbor/node.pxi b/selectolax/lexbor/node.pxi index af95fe1a..57450c36 100644 --- a/selectolax/lexbor/node.pxi +++ b/selectolax/lexbor/node.pxi @@ -32,6 +32,11 @@ cdef inline bytes to_bytes(str_or_LexborNode value): cdef class LexborNode: """A class that represents HTML node (element).""" + cdef inline bint _is_node_type(self, lxb_dom_node_type_t expected_type): + if self.node == NULL: + return False + return self.node.type == expected_type + @staticmethod cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser): cdef LexborNode lxbnode = LexborNode.__new__(LexborNode) @@ -249,6 +254,22 @@ cdef class LexborNode: """Returns True if CSS selector matches a node.""" return bool(self.parser.selector.any_matches(selector, self)) + def is_element_node(self): + """Return True if the node represents an element node.""" + return self._is_node_type(LXB_DOM_NODE_TYPE_ELEMENT) + + def is_text_node(self): + """Return True if the node represents a text node.""" + return self._is_node_type(LXB_DOM_NODE_TYPE_TEXT) + + def is_comment_node(self): + """Return True if the node represents a comment node.""" + return self._is_node_type(LXB_DOM_NODE_TYPE_COMMENT) + + def is_document_node(self): + """Return True if the node represents a document node.""" + return self._is_node_type(LXB_DOM_NODE_TYPE_DOCUMENT) + def __repr__(self): return '' % self.tag diff --git a/setup.py b/setup.py index 618e9895..42113fdd 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ HAS_CYTHON = True USE_CYTHON = True -except ImportError as err: +except ImportError: HAS_CYTHON = False if "--static" in sys.argv: diff --git a/tests/test_lexbor.py b/tests/test_lexbor.py index 83d82d3a..0b07cdd6 100644 --- a/tests/test_lexbor.py +++ b/tests/test_lexbor.py @@ -57,3 +57,27 @@ def test_unicode_selector_works(): tree = LexborHTMLParser(html) node = tree.css_first('span[data-original-title="Pneu renforcé"]') assert node.tag == "span" + + +def test_node_type_helpers(): + html = "
text
" + parser = LexborHTMLParser(html) + + div_node = parser.css_first("#main") + assert div_node.is_element_node() + assert not div_node.is_text_node() + + text_node = div_node.first_child + assert text_node is not None + assert text_node.is_text_node() + assert not text_node.is_element_node() + + comment_node = div_node.last_child + assert comment_node is not None + assert comment_node.is_comment_node() + assert not comment_node.is_text_node() + + document_node = parser.root.parent + assert document_node is not None + assert document_node.is_document_node() + assert not document_node.is_element_node() From aa1684577fe077fd591425072a3f41b12ff4fd69 Mon Sep 17 00:00:00 2001 From: pygarap Date: Sun, 16 Nov 2025 20:32:38 +0200 Subject: [PATCH 2/7] **Enforce consistent string quoting and minor formatting adjustments across the codebase** (#187) --- docs/conf.py | 57 +++++++++++++------------ examples/benchmark.py | 85 ++++++++++++++++---------------------- examples/simple_example.py | 4 +- examples/walkthrough.ipynb | 82 ++++++++++++++++++------------------ setup.py | 2 +- 5 files changed, 110 insertions(+), 120 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 62ef0307..b4161852 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -33,7 +33,7 @@ # This lets us ensure that the source package is imported, and that its # version is used. -if platform.system() == "Darwin": +if platform.system() == 'Darwin': sys.path.insert(0, project_root) import selectolax @@ -46,28 +46,28 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [ - "sphinx.ext.autodoc", - "sphinx.ext.viewcode", - "numpydoc", - "sphinxext.opengraph", - "sphinx_copybutton", + 'sphinx.ext.autodoc', + 'sphinx.ext.viewcode', + 'numpydoc', + 'sphinxext.opengraph', + 'sphinx_copybutton' ] # Add any paths that contain templates here, relative to this directory. -templates_path = ["_templates"] +templates_path = ['_templates'] # The suffix of source filenames. -source_suffix = ".rst" +source_suffix = '.rst' # The encoding of source files. # source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = "index" +master_doc = 'index' # General information about the project. -project = "selectolax" -copyright = "2018-2025, Artem Golubin" +project = u'selectolax' +copyright = u"2018-2025, Artem Golubin" # The version info for the project you're documenting, acts as replacement # for |version| and |release|, also used in various other places throughout @@ -90,7 +90,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ["_build"] +exclude_patterns = ['_build'] # The reST default role (used for this markup: `text`) to use for all # documents. @@ -108,7 +108,7 @@ # show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = "sphinx" +pygments_style = 'sphinx' # A list of ignored prefixes for module index sorting. # modindex_common_prefix = [] @@ -123,7 +123,7 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = 'default' -html_theme = "furo" +html_theme = 'furo' # Theme options are theme-specific and customize the look and feel of a # theme further. For a list of options available for each theme, see the @@ -204,15 +204,17 @@ # html_file_suffix = None # Output file base name for HTML help builder. -htmlhelp_basename = "selectolaxdoc" +htmlhelp_basename = 'selectolaxdoc' # -- Options for LaTeX output ------------------------------------------ latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # 'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). # 'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. # 'preamble': '', } @@ -221,7 +223,9 @@ # (source start file, target name, title, author, documentclass # [howto/manual]). latex_documents = [ - ("index", "selectolax.tex", "selectolax Documentation", "Artem Golubin", "manual"), + ('index', 'selectolax.tex', + u'selectolax Documentation', + u'Artem Golubin', 'manual'), ] # The name of an image file (relative to this directory) to place at @@ -249,7 +253,11 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [("index", "selectolax", "selectolax Documentation", ["Artem Golubin"], 1)] +man_pages = [ + ('index', 'selectolax', + u'selectolax Documentation', + [u'Artem Golubin'], 1) +] # If true, show URL addresses after external links. # man_show_urls = False @@ -261,15 +269,12 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ( - "index", - "selectolax", - "selectolax Documentation", - "Artem Golubin", - "selectolax", - "One line description of project.", - "Miscellaneous", - ), + ('index', 'selectolax', + u'selectolax Documentation', + u'Artem Golubin', + 'selectolax', + 'One line description of project.', + 'Miscellaneous'), ] # Documents to append as an appendix to all manuals. diff --git a/examples/benchmark.py b/examples/benchmark.py index 870bd63c..c6cda2bd 100644 --- a/examples/benchmark.py +++ b/examples/benchmark.py @@ -12,7 +12,6 @@ 4) The content of the Meta description tag """ - import functools import json import time @@ -28,71 +27,71 @@ def bs4_parser(html_content, parser=HTMLParser): - soup = BeautifulSoup(html_content, "html.parser") + soup = BeautifulSoup(html_content, 'html.parser') title_text = soup.title.string assert title_text - a_hrefs = [a.attrs.get("href", "") for a in soup.find_all("a")] - assert len(a_hrefs) >= 5, "href" + a_hrefs = [a.attrs.get('href', '') for a in soup.find_all('a')] + assert len(a_hrefs) >= 5, 'href' - num_script_tags = len(soup.find_all("script")) - assert num_script_tags > 0, "script" - meta_description = soup.find("meta", attrs={"name": "description"}) + num_script_tags = len(soup.find_all('script')) + assert num_script_tags > 0, 'script' + meta_description = soup.find('meta', attrs={"name": "description"}) if meta_description: - meta_content = meta_description.get("content") + meta_content = meta_description.get('content') def selectolax_parser(html_content, parser=HTMLParser): tree = parser(html_content) title_text = "" - title_node = tree.css_first("title") + title_node = tree.css_first('title') if title_node: title_text = title_node.text() assert title_text - a_hrefs = [a.attrs.get("href", "") for a in tree.css("a[href]")] - assert len(a_hrefs) >= 5, "href" + a_hrefs = [a.attrs.get('href', '') for a in tree.css('a[href]')] + assert len(a_hrefs) >= 5, 'href' - num_script_tags = len(tree.css("script")) - assert num_script_tags > 0, "script" + num_script_tags = len(tree.css('script')) + assert num_script_tags > 0, 'script' meta_description = tree.css_first('meta[name="description"]') if meta_description: - meta_content = meta_description.attrs.sget("content", "") + meta_content = meta_description.attrs.sget('content', '') def lxml_parser(html_content): tree = fromstring(html_content) - title_text = tree.xpath("//title/text()") - assert title_text, "title" + title_text = tree.xpath('//title/text()') + assert title_text, 'title' - a_hrefs = [a.attrib.get("href", "") for a in tree.xpath("//a[@href]")] - assert len(a_hrefs) >= 5, "href" + a_hrefs = [a.attrib.get('href', '') for a in tree.xpath('//a[@href]')] + assert len(a_hrefs) >= 5, 'href' - num_script_tags = len(tree.xpath("//script")) - assert num_script_tags > 0, "script" + num_script_tags = len(tree.xpath('//script')) + assert num_script_tags > 0, 'script' meta_description = tree.xpath('meta[@name="description"]') if meta_description: - meta_content = meta_description[0].attrib.get("content", "") + meta_content = meta_description[0].attrib.get('content', '') def html5_parser(html_content): tree = parse(html_content) - title_text = tree.xpath("//title/text()") - assert title_text, "title" + title_text = tree.xpath('//title/text()') + assert title_text, 'title' - a_hrefs = [a.attrib.get("href", "") for a in tree.xpath("//a[@href]")] - assert len(a_hrefs) >= 5, "href" + a_hrefs = [a.attrib.get('href', '') for a in tree.xpath('//a[@href]')] + assert len(a_hrefs) >= 5, 'href' - num_script_tags = len(tree.xpath("//script")) - assert num_script_tags > 0, "script" + num_script_tags = len(tree.xpath('//script')) + assert num_script_tags > 0, 'script' meta_description = tree.xpath('meta[@name="description"]') if meta_description: - meta_content = meta_description[0].attrib.get("content", "") + meta_content = meta_description[0].attrib.get('content', '') def _perform_test(pages, parse_func): for page in pages: - parse_func(page["html"]) + parse_func(page['html']) def main(): @@ -101,31 +100,19 @@ def main(): # That translates to 324MB of HTML data. # Because of potential copyright infringements, I don't publish it. # - html_pages = [json.loads(page) for page in open("pages/pages.json", "rt")] + html_pages = [json.loads(page) for page in open('pages/pages.json', 'rt')] available_parsers = [ - ( - "bs4", - bs4_parser, - ), - ( - "lxml", - lxml_parser, - ), - ( - "html5_parser", - html5_parser, - ), - ( - "modest", - selectolax_parser, - ), - ("lexbor", functools.partial(selectolax_parser, parser=LexborHTMLParser)), + ('bs4', bs4_parser,), + ('lxml', lxml_parser,), + ('html5_parser', html5_parser,), + ('modest', selectolax_parser,), + ('lexbor', functools.partial(selectolax_parser, parser=LexborHTMLParser)), ] for parser_name, parser in available_parsers: start = time.time() _perform_test(html_pages, parser) - print("%r: %s" % (parser_name, time.time() - start)) + print('%r: %s' % (parser_name, time.time() - start)) -if __name__ == "__main__": +if __name__ == '__main__': main() diff --git a/examples/simple_example.py b/examples/simple_example.py index bffb6aaf..0e642d11 100644 --- a/examples/simple_example.py +++ b/examples/simple_example.py @@ -1,8 +1,6 @@ from selectolax.lexbor import LexborHTMLParser -html = ( - "

link

text

" -) +html = "

link

text

" selector = "div > :nth-child(2n+1):not(:has(a))" for node in LexborHTMLParser(html).css(selector): diff --git a/examples/walkthrough.ipynb b/examples/walkthrough.ipynb index b36cc92f..7de0588b 100644 --- a/examples/walkthrough.ipynb +++ b/examples/walkthrough.ipynb @@ -1,8 +1,8 @@ { "cells": [ { - "cell_type": "markdown", "metadata": {}, + "cell_type": "markdown", "source": [ "Note: These examples were written for Modest backend.\n", "As of 2025, use lexbor backend for better performance and features.\n", @@ -125,15 +125,15 @@ "selector = \"p.p3\"\n", "\n", "for node in HTMLParser(html).css(selector):\n", - " print(\"---------------------\")\n", - " print(\"Node: %s\" % node.html)\n", - " print(\"attributes: %s\" % node.attributes)\n", - " print(\"node text: %s\" % node.text(deep=True, separator=\"\", strip=False))\n", - " print(\"tag: %s\" % node.tag)\n", - " print(\"parent tag: %s\" % node.parent.tag)\n", + " print('---------------------')\n", + " print('Node: %s' % node.html)\n", + " print('attributes: %s' % node.attributes)\n", + " print('node text: %s' % node.text(deep=True, separator='', strip=False))\n", + " print('tag: %s' % node.tag)\n", + " print('parent tag: %s' % node.parent.tag)\n", " if node.last_child:\n", - " print(\"last child inside current node: %s\" % node.last_child.html)\n", - " print(\"---------------------\\n\")" + " print('last child inside current node: %s' % node.last_child.html)\n", + " print('---------------------\\n')" ] }, { @@ -157,7 +157,7 @@ } ], "source": [ - "print(\"H1: %s\" % HTMLParser(html).css_first(\"h1\").text())" + "print(\"H1: %s\" % HTMLParser(html).css_first('h1').text())" ] }, { @@ -181,7 +181,7 @@ } ], "source": [ - "print(\"Title: %s\" % HTMLParser(html).css_first(\"title\", default=\"not-found\"))" + "print(\"Title: %s\" % HTMLParser(html).css_first('title', default='not-found'))" ] }, { @@ -203,17 +203,17 @@ "evalue": "Expected 1 match, but found 2 matches", "output_type": "error", "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/var/folders/hz/kldp1y0d4tvd1x56cyg_b9mm0000gn/T/ipykernel_94350/4220316520.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mHTMLParser\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhtml\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcss_first\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"p.p3\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdefault\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'not-found'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstrict\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/Projects/python/selectolax/selectolax/parser.pyx\u001b[0m in \u001b[0;36mselectolax.parser.HTMLParser.css_first\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m~/Projects/python/selectolax/selectolax/node.pxi\u001b[0m in \u001b[0;36mselectolax.parser.Node.css_first\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mValueError\u001b[0m: Expected 1 match, but found 2 matches" + "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[0;31mValueError\u001B[0m Traceback (most recent call last)", + "\u001B[0;32m/var/folders/hz/kldp1y0d4tvd1x56cyg_b9mm0000gn/T/ipykernel_94350/4220316520.py\u001B[0m in \u001B[0;36m\u001B[0;34m\u001B[0m\n\u001B[0;32m----> 1\u001B[0;31m \u001B[0mHTMLParser\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mhtml\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mcss_first\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34m\"p.p3\"\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mdefault\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;34m'not-found'\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mstrict\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;32mTrue\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m", + "\u001B[0;32m~/Projects/python/selectolax/selectolax/parser.pyx\u001B[0m in \u001B[0;36mselectolax.parser.HTMLParser.css_first\u001B[0;34m()\u001B[0m\n", + "\u001B[0;32m~/Projects/python/selectolax/selectolax/node.pxi\u001B[0m in \u001B[0;36mselectolax.parser.Node.css_first\u001B[0;34m()\u001B[0m\n", + "\u001B[0;31mValueError\u001B[0m: Expected 1 match, but found 2 matches" ] } ], "source": [ - "HTMLParser(html).css_first(\"p.p3\", default=\"not-found\", strict=True)" + "HTMLParser(html).css_first(\"p.p3\", default='not-found', strict=True)" ] }, { @@ -239,7 +239,7 @@ } ], "source": [ - "print(HTMLParser(html).css_first(\"p#stext\").parent.html)" + "print(HTMLParser(html).css_first('p#stext').parent.html)" ] }, { @@ -266,7 +266,7 @@ } ], "source": [ - "HTMLParser(html).css_first(\"div#text\").css_first(\"p:nth-child(2)\").html" + "HTMLParser(html).css_first('div#text').css_first('p:nth-child(2)').html" ] }, { @@ -293,7 +293,7 @@ "source": [ "for node in HTMLParser(html).css(\"div#text\"):\n", " for cnode in node.iter():\n", - " print(cnode.tag, cnode.html)" + " print(cnode.tag, cnode.html)\n" ] }, { @@ -327,7 +327,7 @@ ], "source": [ "html_parser = HTMLParser(html)\n", - "for node in html_parser.tags(\"p\"):\n", + "for node in html_parser.tags('p'):\n", " node.decompose()\n", "print(html_parser.body.html)" ] @@ -396,7 +396,7 @@ ], "source": [ "html_parser = HTMLParser(html)\n", - "html_parser.unwrap_tags([\"p\", \"i\"])\n", + "html_parser.unwrap_tags(['p', 'i'])\n", "print(html_parser.body.html)" ] }, @@ -429,11 +429,11 @@ ], "source": [ "html_parser = HTMLParser(html)\n", - "node = html_parser.css_first(\"div#text\")\n", - "node.attrs[\"data\"] = \"secrect data\"\n", - "node.attrs[\"id\"] = \"new_id\"\n", + "node = html_parser.css_first('div#text')\n", + "node.attrs['data'] = 'secrect data'\n", + "node.attrs['id'] = 'new_id'\n", "print(node.attributes)\n", - "del node.attrs[\"id\"]\n", + "del node.attrs['id']\n", "print(node.attributes)\n", "print(node.html)" ] @@ -471,7 +471,8 @@ "source": [ "html_parser = HTMLParser(html)\n", "for node in html_parser.root.traverse():\n", - " if node.tag == \"-text\":\n", + "\n", + " if node.tag == '-text':\n", " text = node.text(deep=True).strip()\n", " if text:\n", " print(text)\n", @@ -494,7 +495,7 @@ "source": [ "html = \"
Привет мир!
\"\n", "# Encoding detector works only with raw strings (bytes)\n", - "html_bytes = html.encode(\"cp1251\")" + "html_bytes = html.encode('cp1251')" ] }, { @@ -561,7 +562,7 @@ } ], "source": [ - "html = ''.encode(\"cp1251\")\n", + "html = ''.encode('cp1251')\n", "HTMLParser(html, detect_encoding=True, use_meta_tags=True).input_encoding" ] }, @@ -582,7 +583,7 @@ } ], "source": [ - "html_utf = ''.encode(\"utf-8\")\n", + "html_utf = ''.encode('utf-8')\n", "HTMLParser(html_utf, detect_encoding=True, use_meta_tags=True).input_encoding" ] }, @@ -624,7 +625,7 @@ "\"\"\"\n", "tree = HTMLParser(html)\n", "\n", - "[node.text() for node in tree.select(\"script\").text_contains(\"super\").matches]" + "[node.text() for node in tree.select('script').text_contains(\"super\").matches]\n" ] }, { @@ -660,7 +661,7 @@ "\"\"\"\n", "tree = HTMLParser(html)\n", "\n", - "print([node.html for node in tree.select(\"div\").css(\"span\").css(\".red\").matches])" + "print([node.html for node in tree.select('div').css(\"span\").css(\".red\").matches])\n" ] }, { @@ -687,40 +688,39 @@ "tree = HTMLParser(html)\n", "\n", "# Insert text\n", - "dest_node = html_parser.css_first(\".red\")\n", + "dest_node = html_parser.css_first('.red')\n", "dest_node.insert_before(\"Hello\")\n", "\n", "# Insert nodes\n", "subtree = HTMLParser(\"
Hi
\")\n", - "dest_node = html_parser.css_first(\".red\")\n", + "dest_node = html_parser.css_first('.red')\n", "dest_node.insert_before(subtree)\n", "\n", "# Insert before, after, or append inside\n", "subtree = HTMLParser(\"
Car
\")\n", - "dest_node = html_parser.css_first(\".green\")\n", + "dest_node = html_parser.css_first('.green')\n", "dest_node.insert_before(subtree)\n", "dest_node.insert_after(subtree)\n", "dest_node.insert_child(subtree)" ] }, { - "cell_type": "markdown", "metadata": {}, + "cell_type": "markdown", "source": "### Contains selector in lexbor" }, { - "cell_type": "markdown", "metadata": {}, + "cell_type": "markdown", "source": "The `lexbor` backend supports `:lexbor-contains(text)` pseudo-class for case-sensitive and case-insensitive text search." }, { - "cell_type": "code", - "execution_count": null, "metadata": {}, + "cell_type": "code", "outputs": [], + "execution_count": null, "source": [ "from selectolax.lexbor import LexborHTMLParser\n", - "\n", "html = \"

hello

lexbor is AwesOme

\"\n", "parser = LexborHTMLParser(html)\n", "# Case-insensitive search\n", diff --git a/setup.py b/setup.py index 42113fdd..618e9895 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ HAS_CYTHON = True USE_CYTHON = True -except ImportError: +except ImportError as err: HAS_CYTHON = False if "--static" in sys.argv: From bc6dc8d270eb5f28fbf5c5653f12be89b979978e Mon Sep 17 00:00:00 2001 From: pygarap Date: Sun, 16 Nov 2025 21:09:00 +0200 Subject: [PATCH 3/7] **Refactor node type checks and apply formatting improvements** (#188) --- selectolax/lexbor/node.pxi | 120 ++++++++++++++++++------------------- tests/test_lexbor.py | 16 ++--- 2 files changed, 68 insertions(+), 68 deletions(-) diff --git a/selectolax/lexbor/node.pxi b/selectolax/lexbor/node.pxi index 57450c36..aaf31fd9 100644 --- a/selectolax/lexbor/node.pxi +++ b/selectolax/lexbor/node.pxi @@ -22,21 +22,15 @@ ctypedef fused str_or_bytes: cdef inline bytes to_bytes(str_or_LexborNode value): cdef bytes bytes_val if isinstance(value, unicode): - bytes_val = value.encode("utf-8") + bytes_val = value.encode("utf-8") elif isinstance(value, bytes): - bytes_val = value + bytes_val = value return bytes_val - @cython.final cdef class LexborNode: """A class that represents HTML node (element).""" - cdef inline bint _is_node_type(self, lxb_dom_node_type_t expected_type): - if self.node == NULL: - return False - return self.node.type == expected_type - @staticmethod cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser): cdef LexborNode lxbnode = LexborNode.__new__(LexborNode) @@ -116,7 +110,7 @@ cdef class LexborNode: status = lxb_html_serialize_tree_str(self.node, lxb_str) if status == 0 and lxb_str.data: html = lxb_str.data.decode(_ENCODING).replace('<-undef>', '') - lexbor_str_destroy(lxb_str, self.node.owner_document.text, True) + lexbor_str_destroy(lxb_str, self.node.owner_document.text, True) return html return None @@ -133,7 +127,7 @@ cdef class LexborNode: cdef lxb_char_t * text text = lxb_dom_node_text_content(self.node, &str_len) - if str_len == 0: + if str_len == 0: raise RuntimeError("Can't extract text") unicode_text = text.decode(_ENCODING) @@ -158,11 +152,11 @@ cdef class LexborNode: """ cdef unsigned char * text - cdef lxb_dom_node_t* node = self.node.first_child + cdef lxb_dom_node_t * node = self.node.first_child if not deep: container = TextContainer(separator, strip) - if self.node != NULL and self.node.type == LXB_DOM_NODE_TYPE_TEXT: + if self.is_text_node: text = lexbor_str_data_noi(&( self.node).data) if text != NULL: py_text = text.decode(_ENCODING) @@ -178,15 +172,15 @@ cdef class LexborNode: return container.text else: container = TextContainer(separator, strip) - if self.node.type == LXB_DOM_NODE_TYPE_TEXT: + if self.is_text_node: text = lexbor_str_data_noi(&( self.node).data) if text != NULL: container.append(text.decode(_ENCODING)) lxb_dom_node_simple_walk( self.node, - text_callback, - container + text_callback, + container ) return container.text @@ -246,7 +240,7 @@ cdef class LexborNode: def any_css_matches(self, tuple selectors): """Returns True if any of CSS selectors matches a node""" for selector in selectors: - if self.parser.selector.any_matches(selector, self): + if self.parser.selector.any_matches(selector, self): return True return False @@ -254,22 +248,6 @@ cdef class LexborNode: """Returns True if CSS selector matches a node.""" return bool(self.parser.selector.any_matches(selector, self)) - def is_element_node(self): - """Return True if the node represents an element node.""" - return self._is_node_type(LXB_DOM_NODE_TYPE_ELEMENT) - - def is_text_node(self): - """Return True if the node represents a text node.""" - return self._is_node_type(LXB_DOM_NODE_TYPE_TEXT) - - def is_comment_node(self): - """Return True if the node represents a comment node.""" - return self._is_node_type(LXB_DOM_NODE_TYPE_COMMENT) - - def is_document_node(self): - """Return True if the node represents a document node.""" - return self._is_node_type(LXB_DOM_NODE_TYPE_DOCUMENT) - def __repr__(self): return '' % self.tag @@ -376,7 +354,7 @@ cdef class LexborNode: cdef size_t str_len = 0 attributes = dict() - if self.node.type != LXB_DOM_NODE_TYPE_ELEMENT: + if not self.is_element_node: return attributes while attr != NULL: @@ -419,7 +397,7 @@ cdef class LexborNode: >>> node.html '
' """ - cdef LexborAttributes attributes = LexborAttributes.create(self.node) + cdef LexborAttributes attributes = LexborAttributes.create( self.node) return attributes @property @@ -497,8 +475,8 @@ cdef class LexborNode: if delete_empty: lxb_dom_node_remove( self.node) return - cdef lxb_dom_node_t* next_node - cdef lxb_dom_node_t* current_node + cdef lxb_dom_node_t * next_node + cdef lxb_dom_node_t * current_node if self.node.first_child.next != NULL: current_node = self.node.first_child @@ -569,7 +547,7 @@ cdef class LexborNode: left_text = lxb_dom_node_text_content(node.prev, &left_length) right_text = lxb_dom_node_text_content(node, &right_length) if left_text and right_text: - combined = (left_text[:left_length]) + (right_text[:right_length]) + combined = ( left_text[:left_length]) + ( right_text[:right_length]) lxb_dom_node_text_content_set(node, combined, len(combined)) lxb_dom_node_remove(node.prev) @@ -644,12 +622,12 @@ cdef class LexborNode: if isinstance(value, (str, bytes, unicode)): bytes_val = to_bytes(value) new_node = lxb_dom_document_create_text_node( - &self.parser.document.dom_document, - bytes_val, len(bytes_val) + &self.parser.document.dom_document, + bytes_val, len(bytes_val) ) if new_node == NULL: raise SelectolaxError("Can't create a new node") - lxb_dom_node_insert_before(self.node, new_node) + lxb_dom_node_insert_before(self.node, new_node) lxb_dom_node_remove( self.node) elif isinstance(value, LexborNode): new_node = lxb_dom_document_import_node( @@ -697,12 +675,12 @@ cdef class LexborNode: if isinstance(value, (str, bytes, unicode)): bytes_val = to_bytes(value) new_node = lxb_dom_document_create_text_node( - &self.parser.document.dom_document, - bytes_val, len(bytes_val) + &self.parser.document.dom_document, + bytes_val, len(bytes_val) ) if new_node == NULL: raise SelectolaxError("Can't create a new node") - lxb_dom_node_insert_before(self.node, new_node) + lxb_dom_node_insert_before(self.node, new_node) elif isinstance(value, LexborNode): new_node = lxb_dom_document_import_node( &self.parser.document.dom_document, @@ -748,12 +726,12 @@ cdef class LexborNode: if isinstance(value, (str, bytes, unicode)): bytes_val = to_bytes(value) new_node = lxb_dom_document_create_text_node( - &self.parser.document.dom_document, - bytes_val, len(bytes_val) + &self.parser.document.dom_document, + bytes_val, len(bytes_val) ) if new_node == NULL: raise SelectolaxError("Can't create a new node") - lxb_dom_node_insert_after(self.node, new_node) + lxb_dom_node_insert_after(self.node, new_node) elif isinstance(value, LexborNode): new_node = lxb_dom_document_import_node( &self.parser.document.dom_document, @@ -799,12 +777,12 @@ cdef class LexborNode: if isinstance(value, (str, bytes, unicode)): bytes_val = to_bytes(value) new_node = lxb_dom_document_create_text_node( - &self.parser.document.dom_document, - bytes_val, len(bytes_val) + &self.parser.document.dom_document, + bytes_val, len(bytes_val) ) if new_node == NULL: raise SelectolaxError("Can't create a new node") - lxb_dom_node_insert_child(self.node, new_node) + lxb_dom_node_insert_child(self.node, new_node) elif isinstance(value, LexborNode): new_node = lxb_dom_document_import_node( &self.parser.document.dom_document, @@ -931,9 +909,9 @@ cdef class LexborNode: text : str or None. """ cdef unsigned char * text - cdef lxb_dom_node_t* node = self.node.first_child + cdef lxb_dom_node_t * node = self.node.first_child cdef TextContainer container - if self.node == NULL or self.node.type != LXB_DOM_NODE_TYPE_TEXT: + if not self.is_text_node: return None text = lexbor_str_data_noi(&( self.node).data) @@ -942,9 +920,10 @@ cdef class LexborNode: py_text = text.decode(_ENCODING) container.append(py_text) return container.text + return None @property - def inner_html(self) -> str: + def inner_html(self) -> str | None: """Return HTML representation of the child nodes. Works similar to innerHTML in JavaScript. @@ -963,12 +942,12 @@ cdef class LexborNode: status = lxb_html_serialize_deep_str(self.node, lxb_str) if status == 0 and lxb_str.data: html = lxb_str.data.decode(_ENCODING).replace('<-undef>', '') - lexbor_str_destroy(lxb_str, self.node.owner_document.text, True) + lexbor_str_destroy(lxb_str, self.node.owner_document.text, True) return html return None @inner_html.setter - def inner_html(self, str html): + def inner_html(self, str html) -> None: """Set inner HTML to the specified HTML. Replaces existing data inside the node. @@ -980,10 +959,10 @@ cdef class LexborNode: """ cdef bytes bytes_val - bytes_val = html.encode("utf-8") + bytes_val = html.encode("utf-8") lxb_html_element_inner_html_set( - self.node, - bytes_val, len(bytes_val) + self.node, + bytes_val, len(bytes_val) ) def clone(self) -> LexborNode: @@ -994,10 +973,32 @@ cdef class LexborNode: It is tied to the current parser instance. Gets destroyed when parser instance is destroyed. """ - cdef lxb_dom_node_t* node + cdef lxb_dom_node_t * node node = lxb_dom_node_clone( self.node, 1) return LexborNode.new(node, self.parser) + cdef inline bint _is_node_type(self, lxb_dom_node_type_t expected_type): + return self.node != NULL and self.node.type == expected_type + + @property + def is_element_node(self) -> bool: + """Return True if the node represents an element node.""" + return self._is_node_type(LXB_DOM_NODE_TYPE_ELEMENT) + + @property + def is_text_node(self) -> bool: + """Return True if the node represents a text node.""" + return self._is_node_type(LXB_DOM_NODE_TYPE_TEXT) + + @property + def is_comment_node(self) -> bool: + """Return True if the node represents a comment node.""" + return self._is_node_type(LXB_DOM_NODE_TYPE_COMMENT) + + @property + def is_document_node(self) -> bool: + """Return True if the node represents a document node.""" + return self._is_node_type(LXB_DOM_NODE_TYPE_DOCUMENT) @cython.internal @cython.final @@ -1031,14 +1032,13 @@ cdef class TextContainer: self._text = self._text[:-len(self.separator)] return self._text - cdef lexbor_action_t text_callback(lxb_dom_node_t *node, void *ctx): cdef unsigned char *text cdef lxb_tag_id_t tag_id = lxb_dom_node_tag_id_noi(node) if tag_id != LXB_TAG__TEXT: return LEXBOR_ACTION_OK - text = lexbor_str_data_noi(&( node).char_data.data) + text = lexbor_str_data_noi(&( node).char_data.data) if not text: return LEXBOR_ACTION_OK diff --git a/tests/test_lexbor.py b/tests/test_lexbor.py index 0b07cdd6..f29256a1 100644 --- a/tests/test_lexbor.py +++ b/tests/test_lexbor.py @@ -64,20 +64,20 @@ def test_node_type_helpers(): parser = LexborHTMLParser(html) div_node = parser.css_first("#main") - assert div_node.is_element_node() - assert not div_node.is_text_node() + assert div_node.is_element_node + assert not div_node.is_text_node text_node = div_node.first_child assert text_node is not None - assert text_node.is_text_node() - assert not text_node.is_element_node() + assert text_node.is_text_node + assert not text_node.is_element_node comment_node = div_node.last_child assert comment_node is not None - assert comment_node.is_comment_node() - assert not comment_node.is_text_node() + assert comment_node.is_comment_node + assert not comment_node.is_text_node document_node = parser.root.parent assert document_node is not None - assert document_node.is_document_node() - assert not document_node.is_element_node() + assert document_node.is_document_node + assert not document_node.is_element_node From 92773dd597bd35367a749f2e973385fad0f81747 Mon Sep 17 00:00:00 2001 From: pygarap Date: Sun, 16 Nov 2025 21:20:04 +0200 Subject: [PATCH 4/7] **Apply formatting changes and reorganize properties for improved readability** (#189) --- selectolax/lexbor.pyi | 122 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 110 insertions(+), 12 deletions(-) diff --git a/selectolax/lexbor.pyi b/selectolax/lexbor.pyi index a18ae726..f4d2159c 100644 --- a/selectolax/lexbor.pyi +++ b/selectolax/lexbor.pyi @@ -2,30 +2,46 @@ from typing import Any, Iterator, Literal, NoReturn, Optional, TypeVar, overload DefaultT = TypeVar("DefaultT") + class LexborAttributes: """A dict-like object that represents attributes.""" @staticmethod def create(node: LexborAttributes) -> LexborAttributes: ... + def keys(self) -> Iterator[str]: ... + def items(self) -> Iterator[tuple[str, str | None]]: ... + def values(self) -> Iterator[str | None]: ... + def __iter__(self) -> Iterator[str]: ... + def __len__(self) -> int: ... + def __getitem__(self, key: str) -> str | None: ... + def __setitem__(self, key: str, value: Optional[str]) -> None: ... + def __delitem__(self, key: str) -> None: ... + def __contains__(self, key: str) -> bool: ... + def __repr__(self) -> str: ... + @overload def get(self, key: str, default: DefaultT) -> DefaultT | str | None: ... + @overload def get(self, key: str, default: None = ...) -> str | None: ... + @overload def sget(self, key: str, default: str | DefaultT) -> str | DefaultT: ... + @overload def sget(self, key: str, default: str = "") -> str: ... + class LexborSelector: """An advanced CSS selector that supports additional operations. @@ -35,25 +51,31 @@ class LexborSelector: """ def __init__(self, node: LexborNode, query: str): ... + def css(self, query: str) -> NoReturn: ... + @property def matches(self) -> list[LexborNode]: """Returns all possible matches""" ... + @property def any_matches(self) -> bool: """Returns True if there are any matches""" ... + def text_contains( self, text: str, deep: bool = True, separator: str = "", strip: bool = False ) -> LexborSelector: """Filter all current matches given text.""" ... + def any_text_contains( self, text: str, deep: bool = True, separator: str = "", strip: bool = False ) -> bool: """Returns True if any node in the current search scope contains specified text""" ... + def attribute_longer_than( self, attribute: str, length: int, start: str | None = None ) -> LexborSelector: @@ -62,6 +84,7 @@ class LexborSelector: Similar to string-length in XPath. """ ... + def any_attribute_longer_than( self, attribute: str, length: int, start: str | None = None ) -> bool: @@ -99,17 +122,23 @@ class LexborSelector: """ ... + class LexborCSSSelector: def __init__(self): ... + def find(self, query: str, node: LexborNode) -> list[LexborNode]: ... + def any_matches(self, query: str, node: LexborNode) -> bool: ... + class LexborNode: """A class that represents HTML node (element).""" parser: LexborHTMLParser + @property def mem_id(self) -> int: ... + @property def child(self) -> LexborNode | None: """Alias for the `first_child` property. @@ -117,26 +146,32 @@ class LexborNode: **Deprecated**. Please use `first_child` instead. """ ... + @property def first_child(self) -> LexborNode | None: """Return the first child node.""" ... + @property def parent(self) -> LexborNode | None: """Return the parent node.""" ... + @property def next(self) -> LexborNode | None: """Return next node.""" ... + @property def prev(self) -> LexborNode | None: """Return previous node.""" ... + @property def last_child(self) -> LexborNode | None: """Return last child node.""" ... + @property def html(self) -> str | None: """Return HTML representation of the current node including all its child nodes. @@ -146,13 +181,16 @@ class LexborNode: text : str """ ... + def __hash__(self) -> int: ... + def text_lexbor(self) -> str: """Returns the text of the node including text of all its child nodes. Uses builtin method from lexbor. """ ... + def text(self, deep: bool = True, separator: str = "", strip: bool = False) -> str: """Returns the text of the node including text of all its child nodes. @@ -170,6 +208,7 @@ class LexborNode: text : str """ ... + def css(self, query: str) -> list[LexborNode]: """Evaluate CSS selector against current node and its child nodes. @@ -192,6 +231,7 @@ class LexborNode: selector : list of `Node` objects """ ... + @overload def css_first( self, query: str, default: Any = ..., strict: Literal[True] = ... @@ -213,6 +253,7 @@ class LexborNode: selector : `LexborNode` object """ ... + @overload def css_first( self, query: str, default: DefaultT, strict: bool = False @@ -234,6 +275,7 @@ class LexborNode: selector : `LexborNode` object """ ... + @overload def css_first( self, query: str, default: None = ..., strict: bool = False @@ -255,26 +297,18 @@ class LexborNode: selector : `LexborNode` object """ ... + def any_css_matches(self, selectors: tuple[str]) -> bool: """Returns True if any of CSS selectors matches a node""" ... + def css_matches(self, selector: str) -> bool: """Returns True if CSS selector matches a node.""" ... - def is_element_node(self) -> bool: - """Return True if the node represents an element node.""" - ... - def is_text_node(self) -> bool: - """Return True if the node represents a text node.""" - ... - def is_comment_node(self) -> bool: - """Return True if the node represents a comment node.""" - ... - def is_document_node(self) -> bool: - """Return True if the node represents a document node.""" - ... + @property def tag_id(self) -> int: ... + @property def tag(self) -> str | None: """Return the name of the current tag (e.g. div, p, img). @@ -290,6 +324,7 @@ class LexborNode: text : str """ ... + def decompose(self, recursive: bool = True) -> None: """Remove the current node from the tree. @@ -306,6 +341,7 @@ class LexborNode: >>> tag.decompose() """ ... + def strip_tags(self, tags: list[str], recursive: bool = False) -> None: """Remove specified tags from the HTML tree. @@ -326,6 +362,7 @@ class LexborNode: '
Hello world!
' """ ... + @property def attributes(self) -> dict[str, str | None]: """Get all attributes that belong to the current node. @@ -345,6 +382,7 @@ class LexborNode: {'data': None, 'id': 'my_id'} """ ... + @property def attrs(self) -> LexborAttributes: """A dict-like object that is similar to the ``attributes`` property, but operates directly on the Node data. @@ -373,6 +411,7 @@ class LexborNode: '
' """ ... + @property def id(self) -> str | None: """Get the id attribute of the node. @@ -384,6 +423,7 @@ class LexborNode: text : str """ ... + def iter(self, include_text: bool = False) -> Iterator[LexborNode]: """Iterate over nodes on the current level. @@ -397,6 +437,7 @@ class LexborNode: node """ ... + def unwrap(self, delete_empty: bool = False) -> None: """Replace node with whatever is inside this node. @@ -418,6 +459,7 @@ class LexborNode: Note: by default, empty tags are ignored, use "delete_empty" to change this. """ ... + def unwrap_tags(self, tags: list[str], delete_empty: bool = False) -> None: """Unwraps specified tags from the HTML tree. @@ -441,6 +483,7 @@ class LexborNode: Note: by default, empty tags are ignored, use "delete_empty" to change this. """ ... + def merge_text_nodes(self) -> None: """Iterates over all text nodes and merges all text nodes that are close to each other. @@ -460,6 +503,7 @@ class LexborNode: "John Doe" """ ... + def traverse(self, include_text: bool = False) -> Iterator[LexborNode]: """Iterate over all child and next nodes starting from the current level. @@ -473,6 +517,7 @@ class LexborNode: node """ ... + def replace_with(self, value: bytes | str | LexborNode) -> None: """Replace current Node with specified value. @@ -501,6 +546,7 @@ class LexborNode: '
Get
Test
' """ ... + def insert_before(self, value: bytes | str | LexborNode) -> None: """Insert a node before the current Node. @@ -529,6 +575,7 @@ class LexborNode:
Get
Test
' """ ... + def insert_after(self, value: bytes | str | LexborNode) -> None: """Insert a node after the current Node. @@ -557,6 +604,7 @@ class LexborNode:
Get
Test
' """ ... + def insert_child(self, value: bytes | str | LexborNode) -> None: """Insert a node inside (at the end of) the current Node. @@ -585,6 +633,7 @@ class LexborNode:
Get
Laptop
Test
' """ ... + @property def raw_value(self) -> NoReturn: """Return the raw (unparsed, original) value of a node. @@ -607,6 +656,7 @@ class LexborNode: b'<test>' """ ... + def scripts_contain(self, query: str) -> bool: """Returns True if any of the script tags contain specified text. @@ -618,6 +668,7 @@ class LexborNode: The query to check. """ ... + def script_srcs_contain(self, queries: tuple[str]) -> bool: """Returns True if any of the script SRCs attributes contain on of the specified text. @@ -628,9 +679,11 @@ class LexborNode: queries : tuple of str """ ... + def remove(self, recursive: bool = True) -> None: """An alias for the decompose method.""" ... + def select(self, query: str | None = None) -> LexborSelector: """Select nodes given a CSS selector. @@ -646,6 +699,7 @@ class LexborNode: selector : The `Selector` class. """ ... + @property def text_content(self) -> str | None: """Returns the text of the node if it is a text node. @@ -697,6 +751,27 @@ class LexborNode: """ ... + @property + def is_element_node(self) -> bool: + """Return True if the node represents an element node.""" + ... + + @property + def is_text_node(self) -> bool: + """Return True if the node represents a text node.""" + ... + + @property + def is_comment_node(self) -> bool: + """Return True if the node represents a comment node.""" + ... + + @property + def is_document_node(self) -> bool: + """Return True if the node represents a document node.""" + ... + + class LexborHTMLParser: """The lexbor HTML parser. @@ -711,20 +786,25 @@ class LexborHTMLParser: """ def __init__(self, html: str | bytes): ... + @property def selector(self) -> "LexborCSSSelector": ... + @property def root(self) -> LexborNode | None: """Returns root node.""" ... + @property def body(self) -> LexborNode | None: """Returns document body.""" ... + @property def head(self) -> LexborNode | None: """Returns document head.""" ... + def tags(self, name: str) -> list[LexborNode]: """Returns a list of tags that match specified name. @@ -733,6 +813,7 @@ class LexborHTMLParser: name : str (e.g. div) """ ... + def text(self, deep: bool = True, separator: str = "", strip: bool = False) -> str: """Returns the text of the node including text of all its child nodes. @@ -750,10 +831,12 @@ class LexborHTMLParser: text : str """ ... + @property def html(self) -> str | None: """Return HTML representation of the page.""" ... + def css(self, query: str) -> list[LexborNode]: """A CSS selector. @@ -776,6 +859,7 @@ class LexborHTMLParser: selector : list of `Node` objects """ ... + @overload def css_first( self, query: str, default: Any = ..., strict: Literal[True] = ... @@ -797,6 +881,7 @@ class LexborHTMLParser: selector : `LexborNode` object """ ... + @overload def css_first( self, query: str, default: DefaultT, strict: bool = False @@ -818,6 +903,7 @@ class LexborHTMLParser: selector : `LexborNode` object """ ... + @overload def css_first( self, query: str, default: None = ..., strict: bool = False @@ -839,6 +925,7 @@ class LexborHTMLParser: selector : `LexborNode` object """ ... + def strip_tags(self, tags: list[str], recursive: bool = False) -> None: """Remove specified tags from the node. @@ -859,6 +946,7 @@ class LexborHTMLParser: '
Hello world!
' """ ... + def select(self, query: str | None = None) -> LexborSelector | None: """Select nodes give a CSS selector. @@ -874,9 +962,11 @@ class LexborHTMLParser: selector : The `Selector` class. """ ... + def any_css_matches(self, selectors: tuple[str]) -> bool: """Returns True if any of the specified CSS selectors matches a node.""" ... + def scripts_contain(self, query: str) -> bool: """Returns True if any of the script tags contain specified text. @@ -888,6 +978,7 @@ class LexborHTMLParser: The query to check. """ ... + def script_srcs_contain(self, queries: tuple[str]) -> bool: """Returns True if any of the script SRCs attributes contain on of the specified text. @@ -898,7 +989,9 @@ class LexborHTMLParser: queries : tuple of str """ ... + def css_matches(self, selector: str) -> bool: ... + def merge_text_nodes(self) -> None: """Iterates over all text nodes and merges all text nodes that are close to each other. @@ -918,9 +1011,11 @@ class LexborHTMLParser: "John Doe" """ ... + def clone(self) -> LexborHTMLParser: """Clone the current tree.""" ... + def unwrap_tags(self, tags: list[str], delete_empty: bool = False) -> None: """Unwraps specified tags from the HTML tree. @@ -943,6 +1038,7 @@ class LexborHTMLParser: """ ... + def create_tag(tag: str) -> LexborNode: """ Given an HTML tag name, e.g. `"div"`, create a single empty node for that tag, @@ -950,6 +1046,7 @@ def create_tag(tag: str) -> LexborNode: """ ... + def parse_fragment(html: str) -> list[LexborNode]: """ Given HTML, parse it into a list of Nodes, such that the nodes @@ -960,6 +1057,7 @@ def parse_fragment(html: str) -> list[LexborNode]: """ ... + class SelectolaxError(Exception): """An exception that indicates error.""" From b88b653f6090c6c655009f0c1becbf3915937b37 Mon Sep 17 00:00:00 2001 From: pygarap Date: Sun, 16 Nov 2025 21:21:31 +0200 Subject: [PATCH 5/7] Remove excessive blank lines and apply formatting cleanup for consistency --- selectolax/lexbor.pyi | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/selectolax/lexbor.pyi b/selectolax/lexbor.pyi index f4d2159c..0b1114e3 100644 --- a/selectolax/lexbor.pyi +++ b/selectolax/lexbor.pyi @@ -2,46 +2,30 @@ from typing import Any, Iterator, Literal, NoReturn, Optional, TypeVar, overload DefaultT = TypeVar("DefaultT") - class LexborAttributes: """A dict-like object that represents attributes.""" @staticmethod def create(node: LexborAttributes) -> LexborAttributes: ... - def keys(self) -> Iterator[str]: ... - def items(self) -> Iterator[tuple[str, str | None]]: ... - def values(self) -> Iterator[str | None]: ... - def __iter__(self) -> Iterator[str]: ... - def __len__(self) -> int: ... - def __getitem__(self, key: str) -> str | None: ... - def __setitem__(self, key: str, value: Optional[str]) -> None: ... - def __delitem__(self, key: str) -> None: ... - def __contains__(self, key: str) -> bool: ... - def __repr__(self) -> str: ... - @overload def get(self, key: str, default: DefaultT) -> DefaultT | str | None: ... - @overload def get(self, key: str, default: None = ...) -> str | None: ... - @overload def sget(self, key: str, default: str | DefaultT) -> str | DefaultT: ... - @overload def sget(self, key: str, default: str = "") -> str: ... - class LexborSelector: """An advanced CSS selector that supports additional operations. @@ -51,9 +35,7 @@ class LexborSelector: """ def __init__(self, node: LexborNode, query: str): ... - def css(self, query: str) -> NoReturn: ... - @property def matches(self) -> list[LexborNode]: """Returns all possible matches""" @@ -122,15 +104,11 @@ class LexborSelector: """ ... - class LexborCSSSelector: def __init__(self): ... - def find(self, query: str, node: LexborNode) -> list[LexborNode]: ... - def any_matches(self, query: str, node: LexborNode) -> bool: ... - class LexborNode: """A class that represents HTML node (element).""" @@ -138,7 +116,6 @@ class LexborNode: @property def mem_id(self) -> int: ... - @property def child(self) -> LexborNode | None: """Alias for the `first_child` property. @@ -183,7 +160,6 @@ class LexborNode: ... def __hash__(self) -> int: ... - def text_lexbor(self) -> str: """Returns the text of the node including text of all its child nodes. @@ -308,7 +284,6 @@ class LexborNode: @property def tag_id(self) -> int: ... - @property def tag(self) -> str | None: """Return the name of the current tag (e.g. div, p, img). @@ -771,7 +746,6 @@ class LexborNode: """Return True if the node represents a document node.""" ... - class LexborHTMLParser: """The lexbor HTML parser. @@ -786,10 +760,8 @@ class LexborHTMLParser: """ def __init__(self, html: str | bytes): ... - @property def selector(self) -> "LexborCSSSelector": ... - @property def root(self) -> LexborNode | None: """Returns root node.""" @@ -991,7 +963,6 @@ class LexborHTMLParser: ... def css_matches(self, selector: str) -> bool: ... - def merge_text_nodes(self) -> None: """Iterates over all text nodes and merges all text nodes that are close to each other. @@ -1038,7 +1009,6 @@ class LexborHTMLParser: """ ... - def create_tag(tag: str) -> LexborNode: """ Given an HTML tag name, e.g. `"div"`, create a single empty node for that tag, @@ -1046,7 +1016,6 @@ def create_tag(tag: str) -> LexborNode: """ ... - def parse_fragment(html: str) -> list[LexborNode]: """ Given HTML, parse it into a list of Nodes, such that the nodes @@ -1057,7 +1026,6 @@ def parse_fragment(html: str) -> list[LexborNode]: """ ... - class SelectolaxError(Exception): """An exception that indicates error.""" From 88baf49928b665fcc8fd97050b83746da5029f45 Mon Sep 17 00:00:00 2001 From: pygarap Date: Sun, 16 Nov 2025 22:03:15 +0200 Subject: [PATCH 6/7] Remove excessive blank lines for formatting consistency --- selectolax/lexbor/node.pxi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/selectolax/lexbor/node.pxi b/selectolax/lexbor/node.pxi index aaf31fd9..cdefde8e 100644 --- a/selectolax/lexbor/node.pxi +++ b/selectolax/lexbor/node.pxi @@ -27,6 +27,7 @@ cdef inline bytes to_bytes(str_or_LexborNode value): bytes_val = value return bytes_val + @cython.final cdef class LexborNode: """A class that represents HTML node (element).""" @@ -1000,6 +1001,7 @@ cdef class LexborNode: """Return True if the node represents a document node.""" return self._is_node_type(LXB_DOM_NODE_TYPE_DOCUMENT) + @cython.internal @cython.final cdef class TextContainer: From d3fe084547d5eaa382802925ae75289f1f73ba65 Mon Sep 17 00:00:00 2001 From: pygarap Date: Wed, 19 Nov 2025 00:43:43 +0200 Subject: [PATCH 7/7] Refactor node type checks by replacing `is_text_node` and `is_element_node` with `_is_node_type` helper methods for consistency --- selectolax/lexbor/node.pxi | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/selectolax/lexbor/node.pxi b/selectolax/lexbor/node.pxi index cdefde8e..988889b0 100644 --- a/selectolax/lexbor/node.pxi +++ b/selectolax/lexbor/node.pxi @@ -157,7 +157,7 @@ cdef class LexborNode: if not deep: container = TextContainer(separator, strip) - if self.is_text_node: + if self._is_node_type(LXB_DOM_NODE_TYPE_TEXT): text = lexbor_str_data_noi(&( self.node).data) if text != NULL: py_text = text.decode(_ENCODING) @@ -173,7 +173,7 @@ cdef class LexborNode: return container.text else: container = TextContainer(separator, strip) - if self.is_text_node: + if self._is_node_type(LXB_DOM_NODE_TYPE_TEXT): text = lexbor_str_data_noi(&( self.node).data) if text != NULL: container.append(text.decode(_ENCODING)) @@ -355,7 +355,7 @@ cdef class LexborNode: cdef size_t str_len = 0 attributes = dict() - if not self.is_element_node: + if not self._is_node_type(LXB_DOM_NODE_TYPE_ELEMENT): return attributes while attr != NULL: @@ -912,7 +912,7 @@ cdef class LexborNode: cdef unsigned char * text cdef lxb_dom_node_t * node = self.node.first_child cdef TextContainer container - if not self.is_text_node: + if not self._is_node_type(LXB_DOM_NODE_TYPE_TEXT): return None text = lexbor_str_data_noi(&( self.node).data)