Skip to content

Commit 7ca02eb

Browse files
authored
test: add edge case tests for filter, iterancestors, and find_similar (#200)
2 parents 1dc0b7a + bf9e2da commit 7ca02eb

File tree

3 files changed

+208
-0
lines changed

3 files changed

+208
-0
lines changed
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
"""
2+
Tests for Selector.iterancestors() and Selector.find_ancestor() methods.
3+
Target file: tests/parser/test_general.py (append to TestElementNavigation class)
4+
"""
5+
import pytest
6+
from scrapling import Selector
7+
8+
9+
@pytest.fixture
def nested_page():
    """Return a parsed page holding one deeply nested element chain.

    The markup nests div#level1 > section#level2 > article#level3 >
    p#level4 > span#target inside explicit <html>/<body> tags, so every
    ancestor of ``#target`` has a distinct tag and id.
    """
    html = """
    <html><body>
        <div id="level1">
            <section id="level2" class="wrapper">
                <article id="level3" class="card">
                    <p id="level4"><span id="target">deep text</span></p>
                </article>
            </section>
        </div>
    </body></html>
    """
    return Selector(html, adaptive=False)
23+
24+
25+
class TestAncestorNavigation:
    """Edge-case tests for ``iterancestors()`` and ``find_ancestor()``.

    Every test starts from the ``nested_page`` fixture, whose ``#target`` span
    sits under p#level4 > article#level3 > section#level2 > div#level1.
    """

    def test_iterancestors_returns_all_ancestors(self, nested_page):
        """iterancestors() should yield every ancestor up to <html>"""
        target = nested_page.css("#target")[0]
        ancestor_tags = [a.tag for a in target.iterancestors()]
        # Closest ancestor first: p → article → section → div → body → html.
        # Comparing the whole list pins both completeness and ordering.
        assert ancestor_tags == ["p", "article", "section", "div", "body", "html"]

    def test_iterancestors_order_is_bottom_up(self, nested_page):
        """iterancestors() should start from the immediate parent, not the root"""
        target = nested_page.css("#target")[0]
        first_ancestor = next(target.iterancestors())
        assert first_ancestor.attrib.get("id") == "level4"

    def test_find_ancestor_returns_first_match(self, nested_page):
        """find_ancestor() should return the closest ancestor matching the predicate"""
        target = nested_page.css("#target")[0]
        # Only article#level3 carries the "card" class in the fixture.
        result = target.find_ancestor(lambda el: el.has_class("card"))
        assert result is not None
        assert result.attrib.get("id") == "level3"

    def test_find_ancestor_returns_none_when_not_found(self, nested_page):
        """find_ancestor() should return None if no ancestor matches"""
        target = nested_page.css("#target")[0]
        result = target.find_ancestor(lambda el: el.has_class("nonexistent-class"))
        assert result is None

    def test_iterancestors_on_text_node_is_empty(self, nested_page):
        """iterancestors() on a text node should yield nothing (not raise)"""
        text_node = nested_page.css("#target::text")[0]
        ancestors = list(text_node.iterancestors())
        assert ancestors == []

    def test_find_ancestor_on_root_element_returns_none(self, nested_page):
        """find_ancestor() on the root <html> element should return None gracefully"""
        # <html> has no ancestors, so even an always-true predicate finds nothing.
        html_el = nested_page.css("html")[0]
        result = html_el.find_ancestor(lambda el: True)
        assert result is None
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
"""
2+
Tests for Selector.find_similar() with non-default parameters.
3+
Target file: tests/parser/test_general.py (append to TestSimilarElements class)
4+
"""
5+
import pytest
6+
from scrapling import Selector
7+
8+
9+
@pytest.fixture
def product_page():
    """Return a parsed product listing used by the find_similar() tests.

    The list holds three ``div.product`` entries (Apple, Banana, Carrot) that
    differ in ``data-category``/``data-price``, plus one ``section.product``
    (Grape) with the same attributes but a different tag.
    """
    html = """
    <html><body>
        <div class="product-list">
            <div class="product" data-category="fruit" data-price="10">
                <span class="name">Apple</span>
            </div>
            <div class="product" data-category="fruit" data-price="5">
                <span class="name">Banana</span>
            </div>
            <div class="product" data-category="veggie" data-price="3">
                <span class="name">Carrot</span>
            </div>
            <!-- Structurally similar but different tag — should NOT be found -->
            <section class="product" data-category="fruit" data-price="8">
                <span class="name">Grape</span>
            </section>
        </div>
    </body></html>
    """
    return Selector(html, adaptive=False)
31+
32+
33+
class TestFindSimilarAdvanced:
    """Tests for ``find_similar()`` called with non-default parameters.

    Each test starts from the first ``div.product`` (Apple) of the
    ``product_page`` fixture unless stated otherwise.
    """

    def test_find_similar_default_finds_same_tag_siblings(self, product_page):
        """find_similar() with defaults should find div.product siblings, not the section"""
        first = product_page.css("div.product")[0]
        similar = first.find_similar()
        tags = [el.tag for el in similar]
        assert all(t == "div" for t in tags), "Should only return <div> elements"
        assert len(similar) == 2  # Banana and Carrot, not Grape (section)

    def test_find_similar_high_threshold_filters_more(self, product_page):
        """A higher similarity_threshold should return fewer (or equal) results"""
        first = product_page.css("div.product")[0]
        low_threshold = first.find_similar(similarity_threshold=0.1)
        high_threshold = first.find_similar(similarity_threshold=0.9)
        assert len(high_threshold) <= len(low_threshold)

    def test_find_similar_match_text_excludes_different_text(self, product_page):
        """match_text=True should factor in text content during similarity scoring"""
        first = product_page.css("div.product")[0]  # Apple
        # Same threshold on both calls, so match_text is the only variable.
        # "Apple" vs "Banana"/"Carrot" text can only lower the similarity
        # score, so the result count may drop but should never grow.
        with_text = first.find_similar(similarity_threshold=0.8, match_text=True)
        without_text = first.find_similar(similarity_threshold=0.8, match_text=False)
        assert len(with_text) <= len(without_text)

    def test_find_similar_ignore_attributes_affects_matching(self, product_page):
        """Ignoring the differing data-* attributes should never reduce the matches"""
        first = product_page.css("div.product")[0]
        # Both calls share one threshold so that ignore_attributes is the only
        # variable; comparing different thresholds would let the threshold
        # alone explain the outcome.
        threshold = 0.9
        # Ignore both data-price and data-category → only the shared class is
        # left to compare, so the other div.product elements should qualify.
        ignore_all_data = first.find_similar(
            similarity_threshold=threshold,
            ignore_attributes=["data-price", "data-category"],
        )
        # Ignore nothing → data-price (and, for Carrot, data-category)
        # differences are compared too and may reduce matches.
        ignore_nothing = first.find_similar(
            similarity_threshold=threshold,
            ignore_attributes=[],
        )
        assert len(ignore_all_data) >= len(ignore_nothing)

    def test_find_similar_on_text_node_returns_empty(self, product_page):
        """find_similar() on a text node should return empty Selectors without raising"""
        text_node = product_page.css(".name::text")[0]
        result = text_node.find_similar()
        assert len(result) == 0
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
"""
2+
Tests for Selectors.filter() method edge cases.
3+
Target file: tests/parser/test_parser_advanced.py (append to TestAdvancedSelectors class)
4+
"""
5+
import pytest
6+
from scrapling import Selector, Selectors
7+
8+
9+
@pytest.fixture
def page():
    """Return a parsed page with four ``li.item`` entries for the filter() tests.

    Apple (10), Banana (5) and Cherry (20) are plain items; Durian (0) also
    carries the ``disabled`` class. The numbers are the ``data-value`` attributes.
    """
    html = """
    <html><body>
        <ul>
            <li class="item" data-value="10">Apple</li>
            <li class="item" data-value="5">Banana</li>
            <li class="item" data-value="20">Cherry</li>
            <li class="item disabled" data-value="0">Durian</li>
        </ul>
    </body></html>
    """
    return Selector(html, adaptive=False)
22+
23+
24+
class TestSelectorsFilter:
    """Edge-case tests for ``Selectors.filter()``.

    Tests that take the ``page`` fixture operate on its four ``li.item``
    elements, whose ``data-value`` attributes are 10, 5, 20 and 0.
    """

    @staticmethod
    def _value(el):
        """Return the element's ``data-value`` attribute as an int (0 when absent)."""
        return int(el.attrib.get("data-value", 0))

    def test_filter_basic(self, page):
        """filter() should return only elements matching the predicate"""
        items = page.css("li.item")
        expensive = items.filter(lambda el: self._value(el) >= 10)
        assert len(expensive) == 2
        texts = expensive.getall()
        assert any("Apple" in t for t in texts)
        assert any("Cherry" in t for t in texts)

    def test_filter_returns_empty_selectors_when_no_match(self, page):
        """filter() should return an empty Selectors (not None/exception) when nothing matches"""
        items = page.css("li.item")
        result = items.filter(lambda el: self._value(el) > 9999)
        assert isinstance(result, Selectors)
        assert len(result) == 0
        assert result.first is None

    def test_filter_all_pass(self, page):
        """filter() with always-True predicate should return all elements"""
        items = page.css("li.item")
        result = items.filter(lambda el: True)
        assert len(result) == len(items)

    def test_filter_chained(self, page):
        """filter() should be chainable — apply two filters in sequence"""
        items = page.css("li.item")
        # First: value > 0, then: not disabled
        result = (
            items
            .filter(lambda el: self._value(el) > 0)
            .filter(lambda el: not el.has_class("disabled"))
        )
        assert len(result) == 3  # Apple, Banana, Cherry (Durian is disabled AND value=0)

    def test_filter_on_empty_selectors(self):
        """filter() on an already-empty Selectors should not raise"""
        empty = Selectors()
        result = empty.filter(lambda el: True)
        assert isinstance(result, Selectors)
        assert len(result) == 0

0 commit comments

Comments
 (0)