1- # coding:utf-8
1+ import threading
22from difflib import SequenceMatcher
33
44import pytest
55from selectolax .parser import HTMLParser , Node
6- from selectolax .lexbor import LexborHTMLParser , LexborNode
6+
7+ from selectolax .lexbor import LexborHTMLParser , LexborNode , SelectolaxError
78
89"""
910We are testing only our own code.
1011Much of the functionality is already tested in the Modest engine, so there is no reason to test every case.
1112"""
1213
# Positional arguments for ``pytest.mark.parametrize``: the argument name
# followed by the parser classes each parametrized test is run against.
_PARSERS_PARAMETRIZER = ("parser", (HTMLParser, LexborHTMLParser))
1418
1519
def test_encoding():
    """Check the encoding ``HTMLParser`` reports through ``input_encoding``.

    Covers ``str`` and ``bytes`` input, detection from the raw bytes,
    detection with ``use_meta_tags`` enabled, and the ``decode_errors``
    policy for text that cannot be encoded.
    """
    markup = "<div><p id=p1><p id=p2><p id=p3><a>link</a><p id=p4><p id=p5>text<p id=p6></div>"

    # The same document is expected to report UTF-8 as str and as bytes.
    assert HTMLParser(markup).input_encoding == "UTF-8"
    assert HTMLParser(markup.encode("utf-8")).input_encoding == "UTF-8"

    cyrillic = "<div>Привет мир!</div>".encode("cp1251")
    assert HTMLParser(cyrillic, detect_encoding=True).input_encoding == "WINDOWS-1251"

    meta_as_utf8 = '<head><meta charset="WINDOWS-1251"></head>'.encode("utf-8")
    assert (
        HTMLParser(meta_as_utf8, detect_encoding=True, use_meta_tags=True).input_encoding
        == "WINDOWS-1251"
    )

    # UTF-16 is not ASCII-readable: the expected result is the detected
    # UTF-16LE, not the charset named inside the meta tag.
    meta_as_utf16 = '<head><meta charset="WINDOWS-1251"></head>'.encode("utf-16le")
    assert (
        HTMLParser(meta_as_utf16, detect_encoding=True, use_meta_tags=True).input_encoding
        == "UTF-16LE"
    )

    # Unencodable characters in the string should not raise by default.
    html_unencodable = b"<div>Roboto+Condensed</div>".decode("utf-7", errors="ignore")
    assert HTMLParser(html_unencodable).input_encoding == "UTF-8"

    # decode_errors="strict" should error out. pytest.raises reports a clear
    # "DID NOT RAISE" failure instead of routing a sentinel AssertionError
    # through a broad ``except Exception`` handler.
    with pytest.raises(UnicodeEncodeError) as exc_info:
        HTMLParser(html_unencodable, decode_errors="strict")
    # Keep the exact-type requirement: a subclass does not satisfy the test.
    assert exc_info.type is UnicodeEncodeError
@@ -56,7 +66,6 @@ def test_parser(parser):
5666 parser ("asd" ).css (123 )
5767
5868
59-
6069@pytest .mark .parametrize (* _PARSERS_PARAMETRIZER )
6170def test_malformed_data (parser ):
6271 malformed_inputs = [
@@ -80,13 +89,12 @@ def test_malformed_data(parser):
8089def test_properties (parser ):
8190 html_parser = parser ("<div><p>test</p></div>" )
8291
83- properties_to_test = [' root' , ' head' , ' body' , ' html' ]
92+ properties_to_test = [" root" , " head" , " body" , " html" ]
8493
8594 for prop_name in properties_to_test :
8695 getattr (html_parser , prop_name )
8796
8897
89-
9098@pytest .mark .parametrize (* _PARSERS_PARAMETRIZER )
9199def test_unicode_handling (parser ):
92100 unicode_content = [
@@ -99,7 +107,7 @@ def test_unicode_handling(parser):
99107 html = f"<div>{ content } </div>"
100108 try :
101109 html_parser = parser (html )
102- result = html_parser .css_first (' div' )
110+ result = html_parser .css_first (" div" )
103111 if result :
104112 extracted_text = result .text ()
105113 assert content in extracted_text
@@ -123,7 +131,6 @@ def test_tag_name_validation(parser):
123131 html_parser .tags (long_tag_name )
124132
125133
126-
127134@pytest .mark .parametrize (* _PARSERS_PARAMETRIZER )
128135def test_nodes (parser ):
129136 html = (
@@ -141,16 +148,16 @@ def test_nodes(parser):
141148
@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
def test_root_css(parser):
    """Running a CSS query from the root of a plain-text document matches nothing."""
    tree = parser("test")
    # There is no <data> element in the input, so the match list must be empty.
    assert len(tree.root.css("data")) == 0
146153
147154
148155@pytest .mark .parametrize (* _PARSERS_PARAMETRIZER )
149156def test_strip_tags_from_root (parser ):
150157 html = "<body><div></div><script></script></body>"
151158 html_parser = parser (html )
152- html_parser .root .strip_tags ([' div' , ' script' ])
153- assert html_parser .html == ' <html><head></head><body></body></html>'
159+ html_parser .root .strip_tags ([" div" , " script" ])
160+ assert html_parser .html == " <html><head></head><body></body></html>"
154161
155162 with pytest .raises (TypeError ):
156163 html_parser .strip_tags (1 )
@@ -160,9 +167,9 @@ def test_strip_tags_from_root(parser):
160167def test_clone (parser ):
161168 html_parser = parser ("""<h1>Welcome</h1>""" )
162169 clone = html_parser .clone ()
163- html_parser .root .css_first ('h1' ).decompose ()
170+ html_parser .root .css_first ("h1" ).decompose ()
164171 del html_parser
165- assert clone .html == ' <html><head></head><body><h1>Welcome</h1></body></html>'
172+ assert clone .html == " <html><head></head><body><h1>Welcome</h1></body></html>"
166173
167174
168175@pytest .mark .parametrize (* _PARSERS_PARAMETRIZER )
@@ -174,7 +181,7 @@ def test_tags(parser):
174181 <span></span>
175182 <div></div>
176183 """ )
177- assert len (html_parser .tags (' div' )) == 5
184+ assert len (html_parser .tags (" div" )) == 5
178185
179186
180187@pytest .mark .parametrize (* _PARSERS_PARAMETRIZER )
@@ -186,4 +193,107 @@ def test_preserves_doctype(parser):
186193 <body><p>Hello World</p></body>
187194 </html>
188195 """ )
189- assert '<!DOCTYPE html>' in html_parser .html
196+ assert "<!DOCTYPE html>" in html_parser .html
197+
198+
@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
def test_invalid_input_types(parser):
    """Constructing a parser from a non-string value raises ``TypeError``."""
    # An int, a list and None are each expected to be rejected with the
    # same "Expected a string" message.
    for bad_input in (123, [], None):
        with pytest.raises(TypeError, match="Expected a string"):
            parser(bad_input)
209+
210+
@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
def test_clone_handling(parser):
    """Both a clone and the parser it came from still expose ``html``."""
    source_tree = parser("<div>test</div>")
    duplicate = source_tree.clone()

    # The clone is checked first, then the parser it was cloned from.
    for tree in (duplicate, source_tree):
        assert tree.html is not None
219+
220+
@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
def test_concurrent_parsing(parser):
    """Test that concurrent parsing doesn't cause race conditions."""
    collected_texts = []
    failures = []
    guard = threading.Lock()  # serializes appends to the two shared lists

    def worker(markup):
        """Parse *markup* and record either its body text or the exception."""
        try:
            tree = parser(markup)
            body_text = tree.body.text()
        except Exception as exc:
            # Exceptions raised in a thread never reach the test on their
            # own, so they are collected and asserted on after the join.
            with guard:
                failures.append(exc)
        else:
            if body_text:
                with guard:
                    collected_texts.append(body_text)

    template = "<div>Content {}</div>"
    workers = [
        threading.Thread(target=worker, args=(template.format(index),))
        for index in range(50)
    ]

    # Start every thread before joining any of them so the parses can overlap.
    for thread in workers:
        thread.start()
    for thread in workers:
        thread.join()

    assert not failures
    assert len(collected_texts) == 50
255+
256+
def test_css_selector_error_handling():
    """Check how ``LexborHTMLParser.css`` reacts to bad selector arguments.

    Non-string selectors must raise ``TypeError``. Malformed selector strings
    must either return a list or raise ``SelectolaxError``.
    """
    tree = LexborHTMLParser("<div class='test'>content</div>")

    # Invalid selector types should raise TypeError
    for non_string in (123, None):
        with pytest.raises(TypeError):
            tree.css(non_string)

    malformed_selectors = (
        ":::",
        "[[[",
        "div{color:red}",
        'h3:contains("some substring")',
    )

    for selector in malformed_selectors:
        try:
            matches = tree.css(selector)
        except SelectolaxError:
            # Specific parsing errors are acceptable
            continue
        # Otherwise the call must have produced a list.
        assert isinstance(matches, list)
282+
283+
@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
def test_null_pointer_safety(parser):
    """Test that NULL pointer checks prevent crashes."""
    accessors = ("root", "head", "body", "html")
    # Edge cases that might result in NULL pointers
    degenerate_documents = (
        "",  # Empty HTML
        "<>",  # Empty tag
        "<!>",  # Empty declaration
        "<html></html>",  # Minimal valid HTML
    )

    for markup in degenerate_documents:
        tree = parser(markup)
        # Only the attribute access is exercised; the values are not inspected.
        for accessor in accessors:
            getattr(tree, accessor)
0 commit comments