Skip to content
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
add47af
Add `skip_empty` parameter to `text()` method in `LexborNode`
pygarap Nov 20, 2025
ed0622b
Add `with_top_level_tags` parameter to `LexborHTMLParser`
pygarap Nov 20, 2025
9d22d77
Refactor `lxb_html_document_parse_fragment` implementation in `Lexbor…
pygarap Nov 20, 2025
d62e03b
Improve fragment error handling in `LexborHTMLParser`
pygarap Nov 20, 2025
fb3dcef
Improve handling of fragment parsing in `LexborHTMLParser`
pygarap Nov 21, 2025
caf04d2
Refactor `_parse_html` by delegating parsing logic to `_parse_with_to…
pygarap Nov 21, 2025
da4d224
Refactor and clean up `LexborHTMLParser` and related files
pygarap Nov 21, 2025
d8abd9d
Remove commented-out `_parse_html` method in `LexborHTMLParser` and a…
pygarap Nov 21, 2025
ecd6bdc
Reorganize `selector` property in `LexborHTMLParser` and improve form…
pygarap Nov 21, 2025
f2d9609
Add detailed docstrings to `LexborHTMLParser` methods and refine type…
pygarap Nov 21, 2025
0c0c784
Clean up `lexbor.pyi` formatting and integrate `cython-lint` in Makefile
pygarap Nov 21, 2025
79b79a1
Add detailed docstrings for HTML parsing methods in `lexbor.pyx`
pygarap Nov 21, 2025
5ed6089
Add detailed docstrings for `__dealloc__` and `from_document` methods…
pygarap Nov 21, 2025
4b03c8d
Improve formatting and spacing consistency across `lexbor.pxd`, `lexb…
pygarap Nov 21, 2025
1f110cf
Clarify docstring for `with_top_level_tags` parameter in HTML parsing…
pygarap Nov 21, 2025
530d53b
Merge branch 'master' into match_top_level_tags
rushter Nov 22, 2025
d95ac2e
Refactor: Rename `with_top_level_tags` to `is_fragment` in `LexborHTM…
pygarap Nov 22, 2025
7fe547b
Refine docstring formatting for HTML parsing methods to improve reada…
pygarap Nov 22, 2025
4506650
Fix `html` method logic by reordering fragment check for consistent b…
pygarap Nov 22, 2025
4e9ce89
Improve formatting consistency in `lexbor.pxd`, `lexbor.pyx`, and `le…
pygarap Nov 22, 2025
b62036e
Add `skip_empty` parameter to `text` method and update docstring for …
pygarap Nov 22, 2025
7c86f32
Minor edits
rushter Nov 23, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ clean-test: ## remove test and coverage artifacts
lint: ## check style with ruff
ruff format selectolax tests
ruff check --fix selectolax tests
cython-lint selectolax/
mypy selectolax tests

test: ## run tests quickly with the default Python
Expand Down
94 changes: 52 additions & 42 deletions selectolax/lexbor.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ cdef extern from "lexbor/core/core.h" nogil:
LXB_STATUS_NEXT
LXB_STATUS_STOP

lexbor_str_t* lexbor_str_destroy(lexbor_str_t *str, lexbor_mraw_t *mraw, bint destroy_obj)
lexbor_str_t* lexbor_str_create()
lexbor_str_t * lexbor_str_destroy(lexbor_str_t *str, lexbor_mraw_t *mraw, bint destroy_obj)
lexbor_str_t * lexbor_str_create()
lxb_char_t * lexbor_str_data_noi(lexbor_str_t *str)

cdef extern from "lexbor/html/html.h" nogil:
Expand Down Expand Up @@ -63,14 +63,14 @@ cdef extern from "lexbor/html/html.h" nogil:
uintptr_t prefix
uintptr_t ns

lxb_dom_document_t *owner_document
lxb_dom_document_t *owner_document

lxb_dom_node_t *next
lxb_dom_node_t *prev
lxb_dom_node_t *parent
lxb_dom_node_t *first_child
lxb_dom_node_t *last_child
void *user
lxb_dom_node_t *next
lxb_dom_node_t *prev
lxb_dom_node_t *parent
lxb_dom_node_t *first_child
lxb_dom_node_t *last_child
void *user

lxb_dom_node_type_t type

Expand Down Expand Up @@ -111,32 +111,32 @@ cdef extern from "lexbor/html/html.h" nogil:
lxb_html_document_opt_t opt

ctypedef enum lxb_html_document_ready_state_t:
LXB_HTML_DOCUMENT_READY_STATE_UNDEF = 0x00
LXB_HTML_DOCUMENT_READY_STATE_LOADING = 0x01
LXB_HTML_DOCUMENT_READY_STATE_UNDEF = 0x00
LXB_HTML_DOCUMENT_READY_STATE_LOADING = 0x01
LXB_HTML_DOCUMENT_READY_STATE_INTERACTIVE = 0x02
LXB_HTML_DOCUMENT_READY_STATE_COMPLETE = 0x03
LXB_HTML_DOCUMENT_READY_STATE_COMPLETE = 0x03

ctypedef enum lxb_html_parser_state_t:
LXB_HTML_PARSER_STATE_BEGIN = 0x00
LXB_HTML_PARSER_STATE_PROCESS = 0x01
LXB_HTML_PARSER_STATE_END = 0x02
LXB_HTML_PARSER_STATE_BEGIN = 0x00
LXB_HTML_PARSER_STATE_PROCESS = 0x01
LXB_HTML_PARSER_STATE_END = 0x02
LXB_HTML_PARSER_STATE_FRAGMENT_PROCESS = 0x03
LXB_HTML_PARSER_STATE_ERROR = 0x04
LXB_HTML_PARSER_STATE_ERROR = 0x04

ctypedef enum lxb_dom_node_type_t:
LXB_DOM_NODE_TYPE_ELEMENT = 0x01
LXB_DOM_NODE_TYPE_ATTRIBUTE = 0x02
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This formatting is intentional. Please avoid reformatting code throughout the file: it makes the diff harder to review and mixes unrelated changes into the same commits.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@rushter Thanks for pointing this out.

I have reverted the formatting changes and kept this PR focused only on the relevant code changes. I will avoid broad formatting edits in future commits to make diffs more straightforward to review.

LXB_DOM_NODE_TYPE_TEXT = 0x03
LXB_DOM_NODE_TYPE_CDATA_SECTION = 0x04
LXB_DOM_NODE_TYPE_ENTITY_REFERENCE = 0x05
LXB_DOM_NODE_TYPE_ENTITY = 0x06
LXB_DOM_NODE_TYPE_ELEMENT = 0x01
LXB_DOM_NODE_TYPE_ATTRIBUTE = 0x02
LXB_DOM_NODE_TYPE_TEXT = 0x03
LXB_DOM_NODE_TYPE_CDATA_SECTION = 0x04
LXB_DOM_NODE_TYPE_ENTITY_REFERENCE = 0x05
LXB_DOM_NODE_TYPE_ENTITY = 0x06
LXB_DOM_NODE_TYPE_PROCESSING_INSTRUCTION = 0x07
LXB_DOM_NODE_TYPE_COMMENT = 0x08
LXB_DOM_NODE_TYPE_DOCUMENT = 0x09
LXB_DOM_NODE_TYPE_DOCUMENT_TYPE = 0x0A
LXB_DOM_NODE_TYPE_DOCUMENT_FRAGMENT = 0x0B
LXB_DOM_NODE_TYPE_NOTATION = 0x0C
LXB_DOM_NODE_TYPE_LAST_ENTRY = 0x0D
LXB_DOM_NODE_TYPE_COMMENT = 0x08
LXB_DOM_NODE_TYPE_DOCUMENT = 0x09
LXB_DOM_NODE_TYPE_DOCUMENT_TYPE = 0x0A
LXB_DOM_NODE_TYPE_DOCUMENT_FRAGMENT = 0x0B
LXB_DOM_NODE_TYPE_NOTATION = 0x0C
LXB_DOM_NODE_TYPE_LAST_ENTRY = 0x0D

ctypedef enum lxb_dom_document_cmode_t:
LXB_DOM_DOCUMENT_CMODE_NO_QUIRKS = 0x00
Expand All @@ -159,7 +159,7 @@ cdef extern from "lexbor/html/html.h" nogil:
LXB_HTML_SERIALIZE_OPT_FULL_DOCTYPE = 0x40

ctypedef struct lexbor_array_t:
void **list
void ** list
size_t size
size_t length

Expand All @@ -174,7 +174,6 @@ cdef extern from "lexbor/html/html.h" nogil:
ctypedef lxb_status_t lxb_html_tree_append_attr_f

ctypedef struct lxb_html_tree_t:

lxb_html_tokenizer_t *tkz_ref

lxb_html_document_t *document
Expand Down Expand Up @@ -219,15 +218,22 @@ cdef extern from "lexbor/html/html.h" nogil:

# Functions
lxb_html_document_t * lxb_html_document_create()
lxb_status_t lxb_html_document_parse(lxb_html_document_t *document, const lxb_char_t *html, size_t size)
lxb_html_element_t * lxb_html_document_create_element(lxb_html_document_t *document,
const lxb_char_t *local_name, size_t lname_len,
void *reserved_for_opt)
lxb_status_t lxb_html_document_parse(lxb_html_document_t *document, const lxb_char_t *html, size_t size)
lxb_dom_node_t * lxb_html_document_parse_fragment(lxb_html_document_t *document,
lxb_dom_element_t *element,
const lxb_char_t *html,
size_t size)
lxb_html_body_element_t * lxb_html_document_body_element_noi(lxb_html_document_t *document)
lxb_html_head_element_t * lxb_html_document_head_element_noi(lxb_html_document_t *document)
lxb_dom_element_t * lxb_dom_document_element(lxb_dom_document_t *document)

lxb_status_t lxb_html_serialize_tree_str(lxb_dom_node_t *node, lexbor_str_t *str)
lxb_status_t lxb_html_serialize_deep_str(lxb_dom_node_t *node, lexbor_str_t *str)
lxb_html_element_t* lxb_html_element_inner_html_set(lxb_html_element_t *element,
const lxb_char_t *html, size_t size)
lxb_html_element_t * lxb_html_element_inner_html_set(lxb_html_element_t *element,
const lxb_char_t *html, size_t size)

cdef class LexborNode:
cdef:
Expand All @@ -239,9 +245,8 @@ cdef class LexborNode:
@staticmethod
cdef LexborNode new(lxb_dom_node_t *node, LexborHTMLParser parser)


cdef class LexborCSSSelector:
cdef lxb_css_parser_t* parser
cdef lxb_css_parser_t * parser
cdef lxb_selectors_t * selectors
cdef lxb_css_selectors_t * css_selectors
cdef public list results
Expand All @@ -253,10 +258,14 @@ cdef class LexborCSSSelector:
cpdef int any_matches(self, str query, LexborNode node) except -1

cdef class LexborHTMLParser:
cdef lxb_html_document_t *document
cdef lxb_html_document_t * document
cdef bint _with_top_level_tags
cdef public bytes raw_html
cdef LexborCSSSelector _selector
cdef int _parse_html(self, char* html, size_t html_len) except -1
cdef inline void _new_html_document(self)
cdef inline lxb_status_t _parse_with_top_level_tags(self, char * html, size_t html_len) nogil
cdef inline lxb_status_t _parse_without_top_level_tags(self, char * html, size_t html_len) nogil
cdef int _parse_html(self, char * html, size_t html_len) except -1
cdef object cached_script_texts
cdef object cached_script_srcs

Expand All @@ -266,9 +275,9 @@ cdef class LexborHTMLParser:

cdef extern from "lexbor/dom/dom.h" nogil:
ctypedef enum lexbor_action_t:
LEXBOR_ACTION_OK = 0x00
LEXBOR_ACTION_STOP = 0x01
LEXBOR_ACTION_NEXT = 0x02
LEXBOR_ACTION_OK = 0x00
LEXBOR_ACTION_STOP = 0x01
LEXBOR_ACTION_NEXT = 0x02

ctypedef lexbor_action_t (*lxb_dom_node_simple_walker_f)(lxb_dom_node_t *node, void *ctx)

Expand Down Expand Up @@ -304,6 +313,7 @@ cdef extern from "lexbor/dom/dom.h" nogil:
void lxb_dom_node_remove(lxb_dom_node_t *node)
void * lxb_dom_document_destroy_text_noi(lxb_dom_document_t *document, lxb_char_t *text)
lxb_dom_node_t * lxb_dom_document_root(lxb_dom_document_t *document)
lxb_dom_element_t * lxb_dom_interface_element(lxb_dom_node_t *node)
lxb_char_t * lxb_dom_element_qualified_name(lxb_dom_element_t *element, size_t *len)
lxb_dom_node_t * lxb_dom_node_destroy(lxb_dom_node_t *node)
lxb_dom_node_t * lxb_dom_node_destroy_deep(lxb_dom_node_t *root)
Expand All @@ -328,7 +338,7 @@ cdef extern from "lexbor/dom/dom.h" nogil:
void lxb_dom_node_insert_after(lxb_dom_node_t *to, lxb_dom_node_t *node)
lxb_dom_text_t * lxb_dom_document_create_text_node(lxb_dom_document_t *document, const lxb_char_t *data, size_t len)
void lxb_dom_node_simple_walk(lxb_dom_node_t *root, lxb_dom_node_simple_walker_f walker_cb, void *ctx)
lxb_dom_node_t* lxb_dom_node_clone(lxb_dom_node_t *node, bint deep)
lxb_dom_node_t * lxb_dom_node_clone(lxb_dom_node_t *node, bint deep)


cdef extern from "lexbor/dom/interfaces/element.h" nogil:
Expand All @@ -349,7 +359,7 @@ cdef extern from "lexbor/dom/collection.h" nogil:

cdef extern from "lexbor/css/css.h" nogil:
ctypedef struct lxb_css_parser_t:
lxb_css_memory_t* memory
lxb_css_memory_t * memory
ctypedef struct lxb_css_syntax_tokenizer_t
ctypedef struct lxb_css_memory_t

Expand Down
Loading