test: add edge case tests for filter, iterancestors, and find_similar (#200)

D4Vinci · web-flow · commit 7ca02ebb80ce · 2026-03-17T22:16:15.000+02:00
diff --git a/tests/parser/test_ancestor_navigation.py b/tests/parser/test_ancestor_navigation.py
@@ -0,0 +1,66 @@
+"""
+Tests for Selector.iterancestors() and Selector.find_ancestor() methods.
+Target file: tests/parser/test_general.py (append to TestElementNavigation class)
+"""
+import pytest
+from scrapling import Selector
+
+
+@pytest.fixture
+def nested_page():
+    """Selector over div > section > article > p > span#target, built with adaptive=False."""
+    return Selector("""
+    <html><body>
+        <div id="level1">
+            <section id="level2" class="wrapper">
+                <article id="level3" class="card">
+                    <p id="level4"><span id="target">deep text</span></p>
+                </article>
+            </section>
+        </div>
+    </body></html>
+    """, adaptive=False)
+
+
+class TestAncestorNavigation:
+    """Ancestor traversal tests driven by the ``nested_page`` fixture."""
+
+    def test_iterancestors_returns_all_ancestors(self, nested_page):
+        """iterancestors() should yield every ancestor up to <html>"""
+        target = nested_page.css("#target")[0]
+        ancestor_tags = [a.tag for a in target.iterancestors()]
+        # Pin the complete chain, nearest first: p → article → section → div → body → html
+        assert ancestor_tags == [
+            "p", "article", "section", "div", "body", "html",
+        ]
+
+    def test_iterancestors_order_is_bottom_up(self, nested_page):
+        """iterancestors() should start from the immediate parent, not the root"""
+        target = nested_page.css("#target")[0]
+        first_ancestor = next(target.iterancestors())
+        assert first_ancestor.attrib.get("id") == "level4"
+
+    def test_find_ancestor_returns_first_match(self, nested_page):
+        """find_ancestor() should return the closest ancestor matching the predicate"""
+        target = nested_page.css("#target")[0]
+        # Looking for the nearest ancestor with class "card"
+        result = target.find_ancestor(lambda el: el.has_class("card"))
+        assert result is not None
+        assert result.attrib.get("id") == "level3"
+
+    def test_find_ancestor_returns_none_when_not_found(self, nested_page):
+        """find_ancestor() should return None if no ancestor matches"""
+        target = nested_page.css("#target")[0]
+        assert target.find_ancestor(lambda el: el.has_class("nonexistent-class")) is None
+
+    def test_iterancestors_on_text_node_is_empty(self, nested_page):
+        """iterancestors() on a text node should yield nothing (not raise)"""
+        text_node = nested_page.css("#target::text")[0]
+        assert list(text_node.iterancestors()) == []
+
+    def test_find_ancestor_on_root_element_returns_none(self, nested_page):
+        """find_ancestor() on the root <html> element should return None gracefully"""
+        # html element has no ancestors
+        html_el = nested_page.css("html")[0]
+        result = html_el.find_ancestor(lambda el: True)
+        assert result is None
diff --git a/tests/parser/test_find_similar_advanced.py b/tests/parser/test_find_similar_advanced.py
@@ -0,0 +1,78 @@
+"""
+Tests for Selector.find_similar() with non-default parameters.
+Target file: tests/parser/test_general.py (append to TestSimilarElements class)
+"""
+import pytest
+from scrapling import Selector
+
+
+@pytest.fixture
+def product_page():
+    """Selector over a product list: three div.product items plus one section.product."""
+    return Selector("""
+    <html><body>
+        <div class="product-list">
+            <div class="product" data-category="fruit" data-price="10">
+                <span class="name">Apple</span>
+            </div>
+            <div class="product" data-category="fruit" data-price="5">
+                <span class="name">Banana</span>
+            </div>
+            <div class="product" data-category="veggie" data-price="3">
+                <span class="name">Carrot</span>
+            </div>
+            <!-- Structurally similar but different tag — should NOT be found -->
+            <section class="product" data-category="fruit" data-price="8">
+                <span class="name">Grape</span>
+            </section>
+        </div>
+    </body></html>
+    """, adaptive=False)
+
+
+class TestFindSimilarAdvanced:
+    """find_similar() parameter tests driven by the ``product_page`` fixture."""
+
+    def test_find_similar_default_finds_same_tag_siblings(self, product_page):
+        """find_similar() with defaults should find div.product siblings, not the section"""
+        first = product_page.css("div.product")[0]
+        similar = first.find_similar()
+        assert all(el.tag == "div" for el in similar), "Should only return <div> elements"
+        assert len(similar) == 2  # Banana and Carrot, not Grape (section)
+
+    def test_find_similar_high_threshold_filters_more(self, product_page):
+        """A higher similarity_threshold should return fewer (or equal) results"""
+        first = product_page.css("div.product")[0]
+        low_threshold = first.find_similar(similarity_threshold=0.1)
+        high_threshold = first.find_similar(similarity_threshold=0.9)
+        assert len(high_threshold) <= len(low_threshold)
+
+    def test_find_similar_match_text_excludes_different_text(self, product_page):
+        """match_text=True should factor in text content during similarity scoring"""
+        first = product_page.css("div.product")[0]  # Apple
+        # With match_text=True and a high threshold, "Apple" vs "Banana"/"Carrot" text
+        # should reduce similarity scores — result count may drop
+        with_text = first.find_similar(similarity_threshold=0.8, match_text=True)
+        without_text = first.find_similar(similarity_threshold=0.8, match_text=False)
+        # match_text=True is stricter when text differs, so result should be <= without_text
+        assert len(with_text) <= len(without_text)
+
+    def test_find_similar_ignore_attributes_affects_matching(self, product_page):
+        """At one fixed threshold, ignoring the differing data-* attributes should not lose matches"""
+        first = product_page.css("div.product")[0]
+        # Ignore both data-price and data-category → only the shared class is left to compare
+        ignore_all_data = first.find_similar(
+            similarity_threshold=0.9,
+            ignore_attributes=["data-price", "data-category"]
+        )
+        # Ignore nothing → differing data-price/data-category values may reduce matches
+        ignore_nothing = first.find_similar(
+            similarity_threshold=0.9,
+            ignore_attributes=[]
+        )
+        assert len(ignore_all_data) >= len(ignore_nothing)
+
+    def test_find_similar_on_text_node_returns_empty(self, product_page):
+        """find_similar() on a text node should return empty Selectors without raising"""
+        text_node = product_page.css(".name::text")[0]
+        assert len(text_node.find_similar()) == 0
diff --git a/tests/parser/test_selectors_filter.py b/tests/parser/test_selectors_filter.py
@@ -0,0 +1,64 @@
+"""
+Tests for Selectors.filter() method edge cases.
+Target file: tests/parser/test_parser_advanced.py (append to TestAdvancedSelectors class)
+"""
+import pytest
+from scrapling import Selector, Selectors
+
+
+@pytest.fixture
+def page():
+    """Selector over a <ul> of four li.item entries; the last one also has class "disabled"."""
+    return Selector("""
+    <html><body>
+        <ul>
+            <li class="item" data-value="10">Apple</li>
+            <li class="item" data-value="5">Banana</li>
+            <li class="item" data-value="20">Cherry</li>
+            <li class="item disabled" data-value="0">Durian</li>
+        </ul>
+    </body></html>
+    """, adaptive=False)
+
+
+class TestSelectorsFilter:
+    """Selectors.filter() edge-case tests driven by the ``page`` fixture."""
+
+    def test_filter_basic(self, page):
+        """filter() should return only elements matching the predicate"""
+        items = page.css("li.item")
+        expensive = items.filter(lambda el: int(el.attrib.get("data-value", 0)) >= 10)
+        assert len(expensive) == 2
+        texts = expensive.getall()
+        assert any("Apple" in t for t in texts)
+        assert any("Cherry" in t for t in texts)
+
+    def test_filter_returns_empty_selectors_when_no_match(self, page):
+        """filter() should return an empty Selectors (not None/exception) when nothing matches"""
+        items = page.css("li.item")
+        result = items.filter(lambda el: int(el.attrib.get("data-value", 0)) > 9999)
+        assert isinstance(result, Selectors)
+        assert len(result) == 0
+        assert result.first is None
+
+    def test_filter_all_pass(self, page):
+        """filter() with always-True predicate should return all elements"""
+        items = page.css("li.item")
+        assert len(items.filter(lambda el: True)) == len(items)
+
+    def test_filter_chained(self, page):
+        """filter() should be chainable — apply two filters in sequence"""
+        items = page.css("li.item")
+        # First: value < 20 (drops Cherry), then: not disabled (drops Durian)
+        result = (
+            items
+            .filter(lambda el: int(el.attrib.get("data-value", 0)) < 20)
+            .filter(lambda el: not el.has_class("disabled"))
+        )
+        assert len(result) == 2  # Apple, Banana — each stage removes exactly one item
+
+    def test_filter_on_empty_selectors(self):
+        """filter() on an already-empty Selectors should not raise"""
+        result = Selectors().filter(lambda el: True)
+        assert isinstance(result, Selectors)
+        assert len(result) == 0