Improve tests

rushter · rushter · commit 0f30e3bb37ba · 2025-05-29T00:50:58.000+04:00
diff --git a/selectolax/lexbor/selection.pxi b/selectolax/lexbor/selection.pxi
@@ -38,6 +38,9 @@ cdef class LexborCSSSelector:
         cdef lxb_char_t* c_selector
         cdef lxb_css_selector_list_t * selectors_list
 
+        if not isinstance(query, str):
+            raise TypeError("Query must be a string.")
+
         bytes_query = query.encode(_ENCODING)
-        selectors_list = lxb_css_selectors_parse(self.parser, <lxb_char_t *> bytes_query, <size_t>len(query))
+        selectors_list = lxb_css_selectors_parse(self.parser, <lxb_char_t *> bytes_query, <size_t>len(bytes_query))  # byte length of the encoded buffer, not the character count
 
@@ -59,6 +62,9 @@ cdef class LexborCSSSelector:
         cdef lxb_char_t * c_selector
         cdef lxb_css_selector_list_t * selectors_list
 
+        if not isinstance(query, str):
+            raise TypeError("Query must be a string.")
+
         bytes_query = query.encode(_ENCODING)
-        selectors_list = lxb_css_selectors_parse(self.parser, <lxb_char_t *> bytes_query, <size_t> len(query))
+        selectors_list = lxb_css_selectors_parse(self.parser, <lxb_char_t *> bytes_query, <size_t> len(bytes_query))  # byte length of the encoded buffer, not the character count
 
diff --git a/tests/test_parser.py b/tests/test_parser.py
@@ -1,44 +1,54 @@
-# coding:utf-8
+import threading
 from difflib import SequenceMatcher
 
 import pytest
 from selectolax.parser import HTMLParser, Node
-from selectolax.lexbor import LexborHTMLParser, LexborNode
+
+from selectolax.lexbor import LexborHTMLParser, LexborNode, SelectolaxError
 
 """
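-We'are testing only our own code.
-Many functionality are already tested in the Modest engine, so there is no reason to test every case.
+We are testing only our own code.
+Much of the functionality is already tested in the Modest engine, so there is no reason to test every case.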
 """
 
-_PARSERS_PARAMETRIZER = ("parser", (HTMLParser, LexborHTMLParser),)
+_PARSERS_PARAMETRIZER = (
+    "parser",
+    (HTMLParser, LexborHTMLParser),
+)
 
 
 def test_encoding():
     html = "<div><p id=p1><p id=p2><p id=p3><a>link</a><p id=p4><p id=p5>text<p id=p6></div>"
     html = HTMLParser(html)
-    assert html.input_encoding == 'UTF-8'
+    assert html.input_encoding == "UTF-8"
 
     html = b"<div><p id=p1><p id=p2><p id=p3><a>link</a><p id=p4><p id=p5>text<p id=p6></div>"
     html = HTMLParser(html)
-    assert html.input_encoding == 'UTF-8'
+    assert html.input_encoding == "UTF-8"
 
-    html = "<div>Привет мир!</div>".encode('cp1251')
-    assert HTMLParser(html, detect_encoding=True).input_encoding == 'WINDOWS-1251'
+    html = "<div>Привет мир!</div>".encode("cp1251")
+    assert HTMLParser(html, detect_encoding=True).input_encoding == "WINDOWS-1251"
 
-    html_utf = '<head><meta charset="WINDOWS-1251"></head>'.encode('utf-8')
-    assert HTMLParser(html_utf, detect_encoding=True, use_meta_tags=True).input_encoding == 'WINDOWS-1251'
+    html_utf = '<head><meta charset="WINDOWS-1251"></head>'.encode("utf-8")
+    assert (
+        HTMLParser(html_utf, detect_encoding=True, use_meta_tags=True).input_encoding
+        == "WINDOWS-1251"
+    )
 
     # UTF-16 not ASCII-readable
-    html_utf = '<head><meta charset="WINDOWS-1251"></head>'.encode('utf-16le')
-    assert HTMLParser(html_utf, detect_encoding=True, use_meta_tags=True).input_encoding == 'UTF-16LE'
+    html_utf = '<head><meta charset="WINDOWS-1251"></head>'.encode("utf-16le")
+    assert (
+        HTMLParser(html_utf, detect_encoding=True, use_meta_tags=True).input_encoding
+        == "UTF-16LE"
+    )
 
     # Unencodable characters in string, should not throw an exception by default
-    html_unencodable = b'<div>Roboto+Condensed</div>'.decode('utf-7', errors='ignore')
-    assert HTMLParser(html_unencodable).input_encoding == 'UTF-8'
+    html_unencodable = b"<div>Roboto+Condensed</div>".decode("utf-7", errors="ignore")
+    assert HTMLParser(html_unencodable).input_encoding == "UTF-8"
 
-    # decode_errrors='strict' should error out
+    # decode_errors='strict' should error out
     try:
-        HTMLParser(html_unencodable, decode_errors='strict')
+        HTMLParser(html_unencodable, decode_errors="strict")
         assert False
     except Exception as e:
         assert type(e) is UnicodeEncodeError
@@ -56,7 +66,6 @@ def test_parser(parser):
         parser("asd").css(123)
 
 
-
 @pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
 def test_malformed_data(parser):
     malformed_inputs = [
@@ -80,13 +89,12 @@ def test_malformed_data(parser):
 def test_properties(parser):
     html_parser = parser("<div><p>test</p></div>")
 
-    properties_to_test = ['root', 'head', 'body', 'html']
+    properties_to_test = ["root", "head", "body", "html"]
 
     for prop_name in properties_to_test:
         getattr(html_parser, prop_name)
 
 
-
 @pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
 def test_unicode_handling(parser):
     unicode_content = [
@@ -99,7 +107,7 @@ def test_unicode_handling(parser):
         html = f"<div>{content}</div>"
         try:
             html_parser = parser(html)
-            result = html_parser.css_first('div')
+            result = html_parser.css_first("div")
             if result:
                 extracted_text = result.text()
                 assert content in extracted_text
@@ -123,7 +131,6 @@ def test_tag_name_validation(parser):
         html_parser.tags(long_tag_name)
 
 
-
 @pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
 def test_nodes(parser):
     html = (
@@ -141,16 +148,16 @@ def test_nodes(parser):
 
 @pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
 def test_root_css(parser):
-    tree = parser('test')
-    assert len(tree.root.css('data')) == 0
+    tree = parser("test")
+    assert len(tree.root.css("data")) == 0
 
 
 @pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
 def test_strip_tags_from_root(parser):
     html = "<body><div></div><script></script></body>"
     html_parser = parser(html)
-    html_parser.root.strip_tags(['div', 'script'])
-    assert html_parser.html == '<html><head></head><body></body></html>'
+    html_parser.root.strip_tags(["div", "script"])
+    assert html_parser.html == "<html><head></head><body></body></html>"
 
     with pytest.raises(TypeError):
         html_parser.strip_tags(1)
@@ -160,9 +167,9 @@ def test_strip_tags_from_root(parser):
 def test_clone(parser):
     html_parser = parser("""<h1>Welcome</h1>""")
     clone = html_parser.clone()
-    html_parser.root.css_first('h1').decompose()
+    html_parser.root.css_first("h1").decompose()
     del html_parser
-    assert clone.html == '<html><head></head><body><h1>Welcome</h1></body></html>'
+    assert clone.html == "<html><head></head><body><h1>Welcome</h1></body></html>"
 
 
 @pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
@@ -174,7 +181,7 @@ def test_tags(parser):
     <span></span>
     <div></div>
     """)
-    assert len(html_parser.tags('div')) == 5
+    assert len(html_parser.tags("div")) == 5
 
 
 @pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
@@ -186,4 +193,107 @@ def test_preserves_doctype(parser):
         <body><p>Hello World</p></body>
     </html>
     """)
-    assert '<!DOCTYPE html>' in html_parser.html
+    assert "<!DOCTYPE html>" in html_parser.html
+
+
+@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
+def test_invalid_input_types(parser):  # int, list and None must raise TypeError
+    with pytest.raises(TypeError, match="Expected a string"):
+        parser(123)
+
+    with pytest.raises(TypeError, match="Expected a string"):
+        parser([])
+
+    with pytest.raises(TypeError, match="Expected a string"):
+        parser(None)
+
+
+@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
+def test_clone_handling(parser):
+    html_parser = parser("<div>test</div>")
+    # The clone must serialize to something other than None.
+    cloned = html_parser.clone()
+    assert cloned.html is not None
+    # The source parser must still serialize after clone() was called on it.
+    assert html_parser.html is not None
+
+
+@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
+def test_concurrent_parsing(parser):
+    """Test that concurrent parsing doesn't cause race conditions."""
+    results = []
+    errors = []
+    lock = threading.Lock()  # serializes appends to results/errors across threads
+
+    def parse_html(content):
+        try:
+            html_parser = parser(content)
+            result = html_parser.body.text()
+            if result:  # empty text is not recorded; the final count check catches it
+                with lock:
+                    results.append(result)
+        except Exception as e:
+            with lock:
+                errors.append(e)  # collected here and asserted empty after join
+
+    threads = []
+    test_content = "<div>Content {}</div>"
+
+    for i in range(50):
+        content = test_content.format(i)
+        t1 = threading.Thread(target=parse_html, args=(content,))
+        threads.append(t1)
+    # Start all threads before joining any of them.
+    for t in threads:
+        t.start()
+
+    for t in threads:
+        t.join()
+
+    assert len(errors) == 0
+    assert len(results) == 50
+
+
+def test_css_selector_error_handling():
+    html_parser = LexborHTMLParser("<div class='test'>content</div>")
+
+    # Invalid selector types should raise TypeError
+    with pytest.raises(TypeError):
+        html_parser.css(123)
+
+    with pytest.raises(TypeError):
+        html_parser.css(None)
+
+    invalid_selectors = [
+        ":::",
+        "[[[",
+        "div{color:red}",
+        'h3:contains("some substring")',
+    ]
+    # Any exception other than SelectolaxError propagates and fails the test.
+    for selector in invalid_selectors:
+        try:
+            result = html_parser.css(selector)
+            # Should return a list (emptiness is not checked) or raise a specific exception
+            assert isinstance(result, list)
+        except SelectolaxError:
+            # Specific parsing errors are acceptable
+            pass
+
+
+@pytest.mark.parametrize(*_PARSERS_PARAMETRIZER)
+def test_null_pointer_safety(parser):
+    """Test that NULL pointer checks prevent crashes."""
+    # Test edge cases that might result in NULL pointers
+    edge_cases = [
+        "",  # Empty HTML
+        "<>",  # Empty tag
+        "<!>",  # Empty declaration
+        "<html></html>",  # Minimal valid HTML
+    ]
+    properties_to_test = ["root", "head", "body", "html"]
+    for html_content in edge_cases:
+        html_parser = parser(html_content)
+
+        for prop_name in properties_to_test:
+            getattr(html_parser, prop_name)  # value is not checked; access must not raise
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -3,7 +3,6 @@
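-Many functionality are already tested in the Modest engine, so there is no reason to test every case.
+Much of the functionality is already tested in the Modest engine, so there is no reason to test every case.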
 """
 
-# coding:utf-8
 from typing import Callable, NamedTuple, Sequence, Type, Union
 
 import pytest
@@ -38,7 +37,7 @@ class Impl(NamedTuple):
             tag_fn=lexbor_create_tag,
             parse_fragment_fn=lexbor_parse_fragment,
         ),
-    )
+    ),
 )
 
 
@@ -55,6 +54,7 @@ def test_create_header_tag(impl: Impl):
     assert isinstance(node, impl.node)
     assert node.html == "<header></header>"
 
+
 # Cases to test parse_fragment():
-# - <doctyle> + <html> only
+# - <doctype> + <html> only
 # - HTML with <head>
@@ -88,7 +88,10 @@ def test_parse_fragment_html_with_head(impl: Impl):
     assert len(nodes) == 1
     assert nodes[0].tag == "html"
     assert nodes[0].html == '<html><head><link href="http://"></head></html>'
-    assert nodes[0].parser.html == '<!DOCTYPE html><html><head><link href="http://"></head></html>'
+    assert (
+        nodes[0].parser.html
+        == '<!DOCTYPE html><html><head><link href="http://"></head></html>'
+    )
 
     assert len(nodes[0].parser.css("head")) == 1
     assert len(nodes[0].parser.css("body")) == 0
@@ -100,8 +103,14 @@ def test_parse_fragment_html_with_body(impl: Impl):
     nodes = impl.parse_fragment_fn(html)
     assert len(nodes) == 1
     assert nodes[0].tag == "html"
-    assert nodes[0].html == '<html><body><div><script src="http://"></script></div></body></html>'
-    assert nodes[0].parser.html == '<!DOCTYPE html><html><body><div><script src="http://"></script></div></body></html>'
+    assert (
+        nodes[0].html
+        == '<html><body><div><script src="http://"></script></div></body></html>'
+    )
+    assert (
+        nodes[0].parser.html
+        == '<!DOCTYPE html><html><body><div><script src="http://"></script></div></body></html>'
+    )
 
     assert len(nodes[0].parser.css("head")) == 0
     assert len(nodes[0].parser.css("body")) == 1
@@ -113,8 +122,14 @@ def test_parse_fragment_html_with_head_and_body(impl: Impl):
     nodes = impl.parse_fragment_fn(html)
     assert len(nodes) == 1
     assert nodes[0].tag == "html"
-    assert nodes[0].html == '<html><head><link href="http://"></head><body><div><script src="http://"></script></div></body></html>'  # noqa: E501
-    assert nodes[0].parser.html == '<!DOCTYPE html><html><head><link href="http://"></head><body><div><script src="http://"></script></div></body></html>'  # noqa: E501
+    assert (
+        nodes[0].html
+        == '<html><head><link href="http://"></head><body><div><script src="http://"></script></div></body></html>'  # noqa: E501
+    )
+    assert (
+        nodes[0].parser.html
+        == '<!DOCTYPE html><html><head><link href="http://"></head><body><div><script src="http://"></script></div></body></html>'  # noqa: E501
+    )
 
     assert len(nodes[0].parser.css("head")) == 1
     assert len(nodes[0].parser.css("body")) == 1
@@ -129,7 +144,10 @@ def test_parse_fragment_head_and_body_no_html(impl: Impl):
     assert nodes[1].tag == "body"
     assert nodes[0].html == '<head><link href="http://"></head>'
     assert nodes[1].html == '<body><div><script src="http://"></script></div></body>'
-    assert nodes[0].parser.html == '<html><head><link href="http://"></head><body><div><script src="http://"></script></div></body></html>'  # noqa: E501
+    assert (
+        nodes[0].parser.html
+        == '<html><head><link href="http://"></head><body><div><script src="http://"></script></div></body></html>'  # noqa: E501
+    )
 
     assert len(nodes[0].parser.css("head")) == 1
     assert len(nodes[0].parser.css("body")) == 1
@@ -155,7 +173,10 @@ def test_parse_fragment_body_no_html(impl: Impl):
     assert len(nodes) == 1
     assert nodes[0].tag == "body"
     assert nodes[0].html == '<body><div><script src="http://"></script></div></body>'
-    assert nodes[0].parser.html == '<html><body><div><script src="http://"></script></div></body></html>'
+    assert (
+        nodes[0].parser.html
+        == '<html><body><div><script src="http://"></script></div></body></html>'
+    )
 
     assert len(nodes[0].parser.css("head")) == 0
     assert len(nodes[0].parser.css("body")) == 1
@@ -174,6 +195,9 @@ def test_parse_fragment_fragment(impl: Impl):
     # NOTE: Ideally the full HTML would NOT contain `<html>`, `<head>` and `<body>` in this case,
     # but this is technical limitation of the parser.
     # But as long as user serializes fragment nodes by as `Node.html`, they should be fine.
-    assert nodes[0].parser.html == '<html><head><link href="http://"></head><body><div><script src="http://"></script></div></body></html>'  # noqa: E501
+    assert (
+        nodes[0].parser.html
+        == '<html><head><link href="http://"></head><body><div><script src="http://"></script></div></body></html>'  # noqa: E501
+    )
     assert len(nodes[0].parser.css("head")) == 1
     assert len(nodes[0].parser.css("body")) == 1