|
| 1 | +""" |
| 2 | +Tests for Selector.find_similar() with non-default parameters. |
| 3 | +Target file: tests/parser/test_general.py (append to TestSimilarElements class) |
| 4 | +""" |
| 5 | +import pytest |
| 6 | +from scrapling import Selector |
| 7 | + |
| 8 | + |
@pytest.fixture
def product_page():
    """Return a non-adaptive ``Selector`` parsed from a small product listing.

    The markup contains three ``<div class="product">`` entries (Apple,
    Banana, Carrot) and one ``<section class="product">`` entry (Grape),
    all children of the same ``div.product-list`` container.
    """
    markup = """
    <html><body>
        <div class="product-list">
            <div class="product" data-category="fruit" data-price="10">
                <span class="name">Apple</span>
            </div>
            <div class="product" data-category="fruit" data-price="5">
                <span class="name">Banana</span>
            </div>
            <div class="product" data-category="veggie" data-price="3">
                <span class="name">Carrot</span>
            </div>
            <!-- Structurally similar but different tag — should NOT be found -->
            <section class="product" data-category="fruit" data-price="8">
                <span class="name">Grape</span>
            </section>
        </div>
    </body></html>
    """
    return Selector(markup, adaptive=False)
| 31 | + |
| 32 | + |
class TestFindSimilarAdvanced:
    """Exercise ``find_similar()`` with non-default arguments.

    Every test starts from the first ``div.product`` element (Apple) of the
    ``product_page`` fixture, except the text-node test, which starts from
    the first ``.name::text`` result.
    """

    def test_find_similar_default_finds_same_tag_siblings(self, product_page):
        """find_similar() with defaults should find div.product siblings, not the section"""
        first = product_page.css("div.product")[0]
        similar = first.find_similar()
        tags = [el.tag for el in similar]
        assert all(t == "div" for t in tags), "Should only return <div> elements"
        assert len(similar) == 2  # Banana and Carrot, not Grape (section)

    def test_find_similar_high_threshold_filters_more(self, product_page):
        """A higher similarity_threshold should return fewer (or equal) results"""
        first = product_page.css("div.product")[0]
        low_threshold = first.find_similar(similarity_threshold=0.1)
        high_threshold = first.find_similar(similarity_threshold=0.9)
        assert len(high_threshold) <= len(low_threshold)

    def test_find_similar_match_text_excludes_different_text(self, product_page):
        """match_text=True should never return more results than match_text=False"""
        first = product_page.css("div.product")[0]  # Apple
        # Both calls share the same threshold so match_text is the only variable.
        # NOTE(review): whether the element's own text or its descendants' text
        # ("Apple" vs "Banana"/"Carrot") is compared is not visible here; the
        # assertion only requires that enabling match_text is not more permissive.
        with_text = first.find_similar(similarity_threshold=0.8, match_text=True)
        without_text = first.find_similar(similarity_threshold=0.8, match_text=False)
        assert len(with_text) <= len(without_text)

    def test_find_similar_ignore_attributes_affects_matching(self, product_page):
        """Ignoring the differing data-* attributes should not reduce the matches"""
        first = product_page.css("div.product")[0]
        # Use ONE threshold for both calls. Comparing a 0.2-threshold call with
        # a 0.9-threshold call would pass on the threshold difference alone and
        # would not exercise ignore_attributes at all.
        threshold = 0.9
        # Ignore both data-price and data-category → only class is left to compare
        ignore_all_data = first.find_similar(
            similarity_threshold=threshold,
            ignore_attributes=["data-price", "data-category"],
        )
        # Ignore nothing → data-price / data-category differences count against
        # the siblings
        ignore_nothing = first.find_similar(
            similarity_threshold=threshold,
            ignore_attributes=[],
        )
        assert len(ignore_all_data) >= len(ignore_nothing)

    def test_find_similar_on_text_node_returns_empty(self, product_page):
        """find_similar() on a text node should return empty Selectors without raising"""
        text_node = product_page.css(".name::text")[0]
        result = text_node.find_similar()
        assert len(result) == 0
0 commit comments