Skip to content

Commit 0f30e3b

Browse files
committed
Improve tests
1 parent f6bd701 commit 0f30e3b

File tree

3 files changed

+177
-37
lines changed

3 files changed

+177
-37
lines changed

selectolax/lexbor/selection.pxi

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ cdef class LexborCSSSelector:
3838
cdef lxb_char_t* c_selector
3939
cdef lxb_css_selector_list_t * selectors_list
4040

41+
if not isinstance(query, str):
42+
raise TypeError("Query must be a string.")
43+
4144
bytes_query = query.encode(_ENCODING)
4245
selectors_list = lxb_css_selectors_parse(self.parser, <lxb_char_t *> bytes_query, <size_t>len(query))
4346

@@ -59,6 +62,9 @@ cdef class LexborCSSSelector:
5962
cdef lxb_char_t * c_selector
6063
cdef lxb_css_selector_list_t * selectors_list
6164

65+
if not isinstance(query, str):
66+
raise TypeError("Query must be a string.")
67+
6268
bytes_query = query.encode(_ENCODING)
6369
selectors_list = lxb_css_selectors_parse(self.parser, <lxb_char_t *> bytes_query, <size_t> len(query))
6470

tests/test_parser.py

Lines changed: 137 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,54 @@
1-
# coding:utf-8
1+
import threading
22
from difflib import SequenceMatcher
33

44
import pytest
55
from selectolax.parser import HTMLParser, Node
6-
from selectolax.lexbor import LexborHTMLParser, LexborNode
6+
7+
from selectolax.lexbor import LexborHTMLParser, LexborNode, SelectolaxError
78

89
"""
910
We are testing only our own code.
1011
Much of the functionality is already tested in the Modest engine, so there is no reason to test every case.
1112
"""
1213

13-
_PARSERS_PARAMETRIZER = ("parser", (HTMLParser, LexborHTMLParser),)
14+
_PARSERS_PARAMETRIZER = (
15+
"parser",
16+
(HTMLParser, LexborHTMLParser),
17+
)
1418

1519

1620
def test_encoding():
1721
html = "<div><p id=p1><p id=p2><p id=p3><a>link</a><p id=p4><p id=p5>text<p id=p6></div>"
1822
html = HTMLParser(html)
19-
assert html.input_encoding == 'UTF-8'
23+
assert html.input_encoding == "UTF-8"
2024

2125
html = b"<div><p id=p1><p id=p2><p id=p3><a>link</a><p id=p4><p id=p5>text<p id=p6></div>"
2226
html = HTMLParser(html)
23-
assert html.input_encoding == 'UTF-8'
27+
assert html.input_encoding == "UTF-8"
2428

25-
html = "<div>Привет мир!</div>".encode('cp1251')
26-
assert HTMLParser(html, detect_encoding=True).input_encoding == 'WINDOWS-1251'
29+
html = "<div>Привет мир!</div>".encode("cp1251")
30+
assert HTMLParser(html, detect_encoding=True).input_encoding == "WINDOWS-1251"
2731

28-
html_utf = '<head><meta charset="WINDOWS-1251"></head>'.encode('utf-8')
29-
assert HTMLParser(html_utf, detect_encoding=True, use_meta_tags=True).input_encoding == 'WINDOWS-1251'
32+
html_utf = '<head><meta charset="WINDOWS-1251"></head>'.encode("utf-8")
33+
assert (
34+
HTMLParser(html_utf, detect_encoding=True, use_meta_tags=True).input_encoding
35+
== "WINDOWS-1251"
36+
)
3037

3138
# UTF-16 not ASCII-readable
32-
html_utf = '<head><meta charset="WINDOWS-1251"></head>'.encode('utf-16le')
33-
assert HTMLParser(html_utf, detect_encoding=True, use_meta_tags=True).input_encoding == 'UTF-16LE'
39+
html_utf = '<head><meta charset="WINDOWS-1251"></head>'.encode("utf-16le")
40+
assert (
41+
HTMLParser(html_utf, detect_encoding=True, use_meta_tags=True).input_encoding
42+
== "UTF-16LE"
43+
)
3444

3545
# Unencodable characters in string, should not throw an exception by default
36-
html_unencodable = b'<div>Roboto+Condensed</div>'.decode('utf-7', errors='ignore')
37-
assert HTMLParser(html_unencodable).input_encoding == 'UTF-8'
46+
html_unencodable = b"<div>Roboto+Condensed</div>".decode("utf-7", errors="ignore")
47+
assert HTMLParser(html_unencodable).input_encoding == "UTF-8"
3848

3949
# decode_errors='strict' should error out
4050
try:
41-
HTMLParser(html_unencodable, decode_errors='strict')
51+
HTMLParser(html_unencodable, decode_errors="strict")
4252
assert False
4353
except Exception as e:
4454
assert type(e) is UnicodeEncodeError
@@ -56,7 +66,6 @@ def test_parser(parser):
5666
parser("asd").css(123)
5767

5868

59-
6069
@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
6170
def test_malformed_data(parser):
6271
malformed_inputs = [
@@ -80,13 +89,12 @@ def test_malformed_data(parser):
8089
def test_properties(parser):
8190
html_parser = parser("<div><p>test</p></div>")
8291

83-
properties_to_test = ['root', 'head', 'body', 'html']
92+
properties_to_test = ["root", "head", "body", "html"]
8493

8594
for prop_name in properties_to_test:
8695
getattr(html_parser, prop_name)
8796

8897

89-
9098
@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
9199
def test_unicode_handling(parser):
92100
unicode_content = [
@@ -99,7 +107,7 @@ def test_unicode_handling(parser):
99107
html = f"<div>{content}</div>"
100108
try:
101109
html_parser = parser(html)
102-
result = html_parser.css_first('div')
110+
result = html_parser.css_first("div")
103111
if result:
104112
extracted_text = result.text()
105113
assert content in extracted_text
@@ -123,7 +131,6 @@ def test_tag_name_validation(parser):
123131
html_parser.tags(long_tag_name)
124132

125133

126-
127134
@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
128135
def test_nodes(parser):
129136
html = (
@@ -141,16 +148,16 @@ def test_nodes(parser):
141148

142149
@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
143150
def test_root_css(parser):
144-
tree = parser('test')
145-
assert len(tree.root.css('data')) == 0
151+
tree = parser("test")
152+
assert len(tree.root.css("data")) == 0
146153

147154

148155
@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
149156
def test_strip_tags_from_root(parser):
150157
html = "<body><div></div><script></script></body>"
151158
html_parser = parser(html)
152-
html_parser.root.strip_tags(['div', 'script'])
153-
assert html_parser.html == '<html><head></head><body></body></html>'
159+
html_parser.root.strip_tags(["div", "script"])
160+
assert html_parser.html == "<html><head></head><body></body></html>"
154161

155162
with pytest.raises(TypeError):
156163
html_parser.strip_tags(1)
@@ -160,9 +167,9 @@ def test_strip_tags_from_root(parser):
160167
def test_clone(parser):
161168
html_parser = parser("""<h1>Welcome</h1>""")
162169
clone = html_parser.clone()
163-
html_parser.root.css_first('h1').decompose()
170+
html_parser.root.css_first("h1").decompose()
164171
del html_parser
165-
assert clone.html == '<html><head></head><body><h1>Welcome</h1></body></html>'
172+
assert clone.html == "<html><head></head><body><h1>Welcome</h1></body></html>"
166173

167174

168175
@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
@@ -174,7 +181,7 @@ def test_tags(parser):
174181
<span></span>
175182
<div></div>
176183
""")
177-
assert len(html_parser.tags('div')) == 5
184+
assert len(html_parser.tags("div")) == 5
178185

179186

180187
@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
@@ -186,4 +193,107 @@ def test_preserves_doctype(parser):
186193
<body><p>Hello World</p></body>
187194
</html>
188195
""")
189-
assert '<!DOCTYPE html>' in html_parser.html
196+
assert "<!DOCTYPE html>" in html_parser.html
197+
198+
199+
@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
200+
def test_invalid_input_types(parser):
201+
with pytest.raises(TypeError, match="Expected a string"):
202+
parser(123)
203+
204+
with pytest.raises(TypeError, match="Expected a string"):
205+
parser([])
206+
207+
with pytest.raises(TypeError, match="Expected a string"):
208+
parser(None)
209+
210+
211+
@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
212+
def test_clone_handling(parser):
213+
html_parser = parser("<div>test</div>")
214+
215+
cloned = html_parser.clone()
216+
assert cloned.html is not None
217+
218+
assert html_parser.html is not None
219+
220+
221+
@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
222+
def test_concurrent_parsing(parser):
223+
"""Test that concurrent parsing doesn't cause race conditions."""
224+
results = []
225+
errors = []
226+
lock = threading.Lock()
227+
228+
def parse_html(content):
229+
try:
230+
html_parser = parser(content)
231+
result = html_parser.body.text()
232+
if result:
233+
with lock:
234+
results.append(result)
235+
except Exception as e:
236+
with lock:
237+
errors.append(e)
238+
239+
threads = []
240+
test_content = "<div>Content {}</div>"
241+
242+
for i in range(50):
243+
content = test_content.format(i)
244+
t1 = threading.Thread(target=parse_html, args=(content,))
245+
threads.append(t1)
246+
247+
for t in threads:
248+
t.start()
249+
250+
for t in threads:
251+
t.join()
252+
253+
assert len(errors) == 0
254+
assert len(results) == 50
255+
256+
257+
def test_css_selector_error_handling():
258+
html_parser = LexborHTMLParser("<div class='test'>content</div>")
259+
260+
# Invalid selector types should raise TypeError
261+
with pytest.raises(TypeError):
262+
html_parser.css(123)
263+
264+
with pytest.raises(TypeError):
265+
html_parser.css(None)
266+
267+
invalid_selectors = [
268+
":::",
269+
"[[[",
270+
"div{color:red}",
271+
'h3:contains("some substring")',
272+
]
273+
274+
for selector in invalid_selectors:
275+
try:
276+
result = html_parser.css(selector)
277+
# Should return empty list or raise specific exception
278+
assert isinstance(result, list)
279+
except SelectolaxError:
280+
# Specific parsing errors are acceptable
281+
pass
282+
283+
284+
@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
285+
def test_null_pointer_safety(parser):
286+
"""Test that NULL pointer checks prevent crashes."""
287+
# Test edge cases that might result in NULL pointers
288+
edge_cases = [
289+
"", # Empty HTML
290+
"<>", # Empty tag
291+
"<!>", # Empty declaration
292+
"<html></html>", # Minimal valid HTML
293+
]
294+
properties_to_test = ["root", "head", "body", "html"]
295+
for html_content in edge_cases:
296+
html_parser = parser(html_content)
297+
298+
for prop_name in properties_to_test:
299+
getattr(html_parser, prop_name)

tests/test_utils.py

Lines changed: 34 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
Much of the functionality is already tested in the Modest engine, so there is no reason to test every case.
44
"""
55

6-
# coding:utf-8
76
from typing import Callable, NamedTuple, Sequence, Type, Union
87

98
import pytest
@@ -38,7 +37,7 @@ class Impl(NamedTuple):
3837
tag_fn=lexbor_create_tag,
3938
parse_fragment_fn=lexbor_parse_fragment,
4039
),
41-
)
40+
),
4241
)
4342

4443

@@ -55,6 +54,7 @@ def test_create_header_tag(impl: Impl):
5554
assert isinstance(node, impl.node)
5655
assert node.html == "<header></header>"
5756

57+
5858
# Cases to test parse_fragment():
5959
# - <doctype> + <html> only
6060
# - HTML with <head>
@@ -88,7 +88,10 @@ def test_parse_fragment_html_with_head(impl: Impl):
8888
assert len(nodes) == 1
8989
assert nodes[0].tag == "html"
9090
assert nodes[0].html == '<html><head><link href="http://"></head></html>'
91-
assert nodes[0].parser.html == '<!DOCTYPE html><html><head><link href="http://"></head></html>'
91+
assert (
92+
nodes[0].parser.html
93+
== '<!DOCTYPE html><html><head><link href="http://"></head></html>'
94+
)
9295

9396
assert len(nodes[0].parser.css("head")) == 1
9497
assert len(nodes[0].parser.css("body")) == 0
@@ -100,8 +103,14 @@ def test_parse_fragment_html_with_body(impl: Impl):
100103
nodes = impl.parse_fragment_fn(html)
101104
assert len(nodes) == 1
102105
assert nodes[0].tag == "html"
103-
assert nodes[0].html == '<html><body><div><script src="http://"></script></div></body></html>'
104-
assert nodes[0].parser.html == '<!DOCTYPE html><html><body><div><script src="http://"></script></div></body></html>'
106+
assert (
107+
nodes[0].html
108+
== '<html><body><div><script src="http://"></script></div></body></html>'
109+
)
110+
assert (
111+
nodes[0].parser.html
112+
== '<!DOCTYPE html><html><body><div><script src="http://"></script></div></body></html>'
113+
)
105114

106115
assert len(nodes[0].parser.css("head")) == 0
107116
assert len(nodes[0].parser.css("body")) == 1
@@ -113,8 +122,14 @@ def test_parse_fragment_html_with_head_and_body(impl: Impl):
113122
nodes = impl.parse_fragment_fn(html)
114123
assert len(nodes) == 1
115124
assert nodes[0].tag == "html"
116-
assert nodes[0].html == '<html><head><link href="http://"></head><body><div><script src="http://"></script></div></body></html>' # noqa: E501
117-
assert nodes[0].parser.html == '<!DOCTYPE html><html><head><link href="http://"></head><body><div><script src="http://"></script></div></body></html>' # noqa: E501
125+
assert (
126+
nodes[0].html
127+
== '<html><head><link href="http://"></head><body><div><script src="http://"></script></div></body></html>'
128+
) # noqa: E501
129+
assert (
130+
nodes[0].parser.html
131+
== '<!DOCTYPE html><html><head><link href="http://"></head><body><div><script src="http://"></script></div></body></html>'
132+
) # noqa: E501
118133

119134
assert len(nodes[0].parser.css("head")) == 1
120135
assert len(nodes[0].parser.css("body")) == 1
@@ -129,7 +144,10 @@ def test_parse_fragment_head_and_body_no_html(impl: Impl):
129144
assert nodes[1].tag == "body"
130145
assert nodes[0].html == '<head><link href="http://"></head>'
131146
assert nodes[1].html == '<body><div><script src="http://"></script></div></body>'
132-
assert nodes[0].parser.html == '<html><head><link href="http://"></head><body><div><script src="http://"></script></div></body></html>' # noqa: E501
147+
assert (
148+
nodes[0].parser.html
149+
== '<html><head><link href="http://"></head><body><div><script src="http://"></script></div></body></html>'
150+
) # noqa: E501
133151

134152
assert len(nodes[0].parser.css("head")) == 1
135153
assert len(nodes[0].parser.css("body")) == 1
@@ -155,7 +173,10 @@ def test_parse_fragment_body_no_html(impl: Impl):
155173
assert len(nodes) == 1
156174
assert nodes[0].tag == "body"
157175
assert nodes[0].html == '<body><div><script src="http://"></script></div></body>'
158-
assert nodes[0].parser.html == '<html><body><div><script src="http://"></script></div></body></html>'
176+
assert (
177+
nodes[0].parser.html
178+
== '<html><body><div><script src="http://"></script></div></body></html>'
179+
)
159180

160181
assert len(nodes[0].parser.css("head")) == 0
161182
assert len(nodes[0].parser.css("body")) == 1
@@ -174,6 +195,9 @@ def test_parse_fragment_fragment(impl: Impl):
174195
# NOTE: Ideally the full HTML would NOT contain `<html>`, `<head>` and `<body>` in this case,
175196
# but this is a technical limitation of the parser.
176197
# But as long as the user serializes fragment nodes via `Node.html`, they should be fine.
177-
assert nodes[0].parser.html == '<html><head><link href="http://"></head><body><div><script src="http://"></script></div></body></html>' # noqa: E501
198+
assert (
199+
nodes[0].parser.html
200+
== '<html><head><link href="http://"></head><body><div><script src="http://"></script></div></body></html>'
201+
) # noqa: E501
178202
assert len(nodes[0].parser.css("head")) == 1
179203
assert len(nodes[0].parser.css("body")) == 1

0 commit comments

Comments
 (0)