Skip to content

Commit 823859a

Browse files
committed
Minor improvements
1 parent 7fe8aa4 commit 823859a

File tree

3 files changed

+25
-6
lines changed

3 files changed

+25
-6
lines changed

selectolax/lexbor.pyx

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -96,6 +96,12 @@ cdef class LexborHTMLParser:
9696
name : str (e.g. div)
9797
9898
"""
99+
100+
if not name:
101+
raise ValueError("Tag name cannot be empty")
102+
if len(name) > 100: # Reasonable limit for tag names
103+
raise ValueError("Tag name is too long")
104+
99105
cdef lxb_dom_collection_t* collection = NULL
100106
cdef lxb_status_t status
101107
pybyte_name = name.encode('UTF-8')

selectolax/modest/node.pxi

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,8 @@ cdef class Stack:
1414
self.capacity = capacity
1515
self.top = 0
1616
self._stack = <myhtml_tree_node_t**> malloc(capacity * sizeof(myhtml_tree_node_t))
17+
if self._stack == NULL:
18+
raise MemoryError("Failed to allocate memory for stack")
1719

1820
def __dealloc__(self):
1921
free(self._stack)
@@ -595,7 +597,7 @@ cdef class Node:
595597
>>> tree.body.unwrap_tags(['i','a'])
596598
>>> tree.body.html
597599
'<body><div>Hello world!</div></body>'
598-
600+
599601
Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
600602
"""
601603

@@ -783,7 +785,7 @@ cdef class Node:
783785
>>> tree.body.unwrap_tags(['i','a'])
784786
>>> tree.body.html
785787
'<body><div>Hello world!</div></body>'
786-
788+
787789
Note: by default, empty tags are ignored, set "delete_empty" to "True" to change this.
788790
"""
789791

selectolax/parser.pyx

Lines changed: 15 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -124,7 +124,7 @@ cdef class HTMLParser:
124124
status = myhtml_parse(self.html_tree, self._encoding, html, html_len)
125125

126126
if status != 0:
127-
raise RuntimeError("Can't parse HTML:\n%s" % str(html))
127+
raise RuntimeError("Can't parse HTML (status code: %d)" % status)
128128

129129
assert self.html_tree.node_html != NULL
130130

@@ -147,9 +147,13 @@ cdef class HTMLParser:
147147
def root(self):
148148
"""Returns root node."""
149149
if self.html_tree and self.html_tree.node_html:
150-
node = Node()
151-
node._init(self.html_tree.node_html, self)
152-
return node
150+
try:
151+
node = Node()
152+
node._init(self.html_tree.node_html, self)
153+
return node
154+
except Exception:
155+
# If Node creation or initialization fails, return None
156+
return None
153157
return None
154158

155159
@property
@@ -185,6 +189,12 @@ cdef class HTMLParser:
185189
name : str (e.g. div)
186190
187191
"""
192+
# Validate tag name
193+
if not name:
194+
raise ValueError("Tag name cannot be empty")
195+
if len(name) > 100: # Reasonable limit for tag names
196+
raise ValueError("Tag name is too long")
197+
188198
cdef myhtml_collection_t* collection = NULL
189199
pybyte_name = name.encode('UTF-8')
190200
cdef mystatus_t status = 0;
@@ -428,6 +438,7 @@ cdef class HTMLParser:
428438
if self.html_tree != NULL:
429439
myhtml = self.html_tree.myhtml
430440
myhtml_tree_destroy(self.html_tree)
441+
self.html_tree = NULL # Prevent double-free
431442
if myhtml != NULL:
432443
myhtml_destroy(myhtml)
433444

0 commit comments

Comments
 (0)