Skip to content

Commit ecd3c64

Browse files
authored
Merge branch 'master' into only-apply-timeout-to-request-handler
2 parents 3715db2 + ce191ca commit ecd3c64

File tree

6 files changed

+236
-0
lines changed

6 files changed

+236
-0
lines changed

before_scroll.png

4.15 KB
Loading
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
from playwright.async_api import async_playwright
2+
from yarl import URL
3+
4+
from crawlee.crawlers._playwright._utils import block_requests, infinite_scroll
5+
6+
7+
async def test_infinite_scroll_on_dynamic_page(server_url: URL) -> None:
    """Checks that infinite_scroll ends up with the same number of items as scrolling the page by hand."""
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=True)
        scroll_url = str(server_url / 'infinite_scroll')

        # Baseline: scroll to the bottom by hand a few times, counting items before each scroll.
        manual_page = await browser.new_page()
        await manual_page.goto(scroll_url)

        manual_count = 0
        for _ in range(4):
            manual_count = len(await manual_page.query_selector_all('.item'))
            await manual_page.evaluate('window.scrollTo(0, document.body.scrollHeight)')
            # Give the page time to append the next batch before the next measurement.
            await manual_page.wait_for_timeout(1000)

        # Start over on a fresh page so the utility sees the initial state.
        await manual_page.close()
        page = await browser.new_page()
        await page.goto(scroll_url)

        # Sanity check: the fresh page has only the initial batch of items.
        initial_count = len(await page.query_selector_all('.item'))
        assert initial_count != manual_count
        assert initial_count == 10

        await infinite_scroll(page)

        final_count = len(await page.query_selector_all('.item'))

        assert initial_count < final_count
        assert manual_count == final_count

        await browser.close()
43+
44+
45+
async def test_infinite_scroll_no_page_without_scroll(server_url: URL) -> None:
    """Checks that infinite_scroll does not raise an error on a page without infinite scrolling."""
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(str(server_url))

        # The call itself is the check: it must return without raising.
        await infinite_scroll(page)

        # The page must still be usable afterwards.
        assert await page.title() == 'Hello, world!'

        await browser.close()
60+
61+
62+
async def test_double_call_infinite_scroll(server_url: URL) -> None:
    """Checks that a second infinite_scroll call leaves the number of items unchanged."""
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(str(server_url / 'infinite_scroll'))

        # Run the utility twice on the same page, recording the item count after each run.
        counts = []
        for _ in range(2):
            await infinite_scroll(page)
            counts.append(len(await page.query_selector_all('.item')))

        first_count, second_count = counts
        assert first_count == second_count

        await browser.close()
79+
80+
81+
async def test_block_requests_default(server_url: URL) -> None:
    """Checks which resources finish loading with and without the default block_requests patterns."""
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch()
        resource_page_url = str(server_url / 'resource_loading_page')

        # Finished requests are recorded by the last path segment of their URL.
        unblocked_names = []
        blocked_names = []

        # Control run: no blocking installed.
        plain_page = await browser.new_page()
        plain_page.on('requestfinished', lambda request: unblocked_names.append(request.url.rsplit('/', 1)[-1]))
        await plain_page.goto(resource_page_url)
        await plain_page.wait_for_load_state('networkidle')
        await plain_page.close()

        # Same page again, this time with block_requests installed before navigation.
        filtered_page = await browser.new_page()
        filtered_page.on('requestfinished', lambda request: blocked_names.append(request.url.rsplit('/', 1)[-1]))
        await block_requests(filtered_page)
        await filtered_page.goto(resource_page_url)
        await filtered_page.wait_for_load_state('networkidle')
        await filtered_page.close()

        await browser.close()

    # Without blocking, the document, the script and the image all load.
    assert set(unblocked_names) == {'resource_loading_page', 'test.js', 'test.png'}

    # With default blocking, the image is missing; the document and the script still load.
    assert set(blocked_names) == {'resource_loading_page', 'test.js'}
114+
115+
116+
async def test_block_requests_with_extra_patterns(server_url: URL) -> None:
    """Checks which resources finish loading when block_requests is given extra URL patterns."""
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch()
        resource_page_url = str(server_url / 'resource_loading_page')

        # Finished requests are recorded by the last path segment of their URL.
        finished_names = []

        page = await browser.new_page()
        page.on('requestfinished', lambda request: finished_names.append(request.url.rsplit('/', 1)[-1]))
        await block_requests(page, extra_url_patterns=['*.js'])
        await page.goto(resource_page_url)
        await page.wait_for_load_state('networkidle')
        await page.close()

        await browser.close()

    # With the extra '*.js' pattern, only the document itself loads.
    assert set(finished_names) == {'resource_loading_page'}
136+
137+
138+
async def test_block_requests_with_custom_patterns(server_url: URL) -> None:
    """Checks which resources finish loading when block_requests is given custom URL patterns."""
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch()
        resource_page_url = str(server_url / 'resource_loading_page')

        # Finished requests are recorded by the last path segment of their URL.
        finished_names = []

        page = await browser.new_page()
        page.on('requestfinished', lambda request: finished_names.append(request.url.rsplit('/', 1)[-1]))
        await block_requests(page, url_patterns=['*.js'])
        await page.goto(resource_page_url)
        await page.wait_for_load_state('networkidle')
        await page.close()

        await browser.close()

    # With only '*.js' as the pattern, the document and the image load but the script does not.
    assert set(finished_names) == {'resource_loading_page', 'test.png'}

tests/unit/server.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@
1818
GENERIC_RESPONSE,
1919
HELLO_WORLD,
2020
INCAPSULA,
21+
INFINITE_SCROLL,
2122
PROBLEMATIC_LINKS,
23+
RESOURCE_LOADING_PAGE,
2224
ROBOTS_TXT,
2325
SECONDARY_INDEX,
2426
START_ENQUEUE,
@@ -122,6 +124,8 @@ async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None:
122124
'robots.txt': robots_txt,
123125
'get_compressed': get_compressed,
124126
'slow': slow_response,
127+
'infinite_scroll': infinite_scroll_endpoint,
128+
'resource_loading_page': resource_loading_endpoint,
125129
}
126130
path = URL(scope['path']).parts[1]
127131
# Route requests to appropriate handlers
@@ -421,6 +425,22 @@ async def slow_response(scope: dict[str, Any], _receive: Receive, send: Send) ->
421425
await send_html_response(send, HELLO_WORLD)
422426

423427

428+
async def infinite_scroll_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Respond with the `INFINITE_SCROLL` page via `send_html_response`."""
    await send_html_response(send, INFINITE_SCROLL)
434+
435+
436+
async def resource_loading_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Respond with the `RESOURCE_LOADING_PAGE` page via `send_html_response`."""
    await send_html_response(send, RESOURCE_LOADING_PAGE)
442+
443+
424444
class TestServer(Server):
425445
"""A test HTTP server implementation based on Uvicorn Server."""
426446

tests/unit/server_endpoints.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,3 +69,62 @@
6969
7070
sitemap: http://not-exists.com/sitemap_1.xml
7171
sitemap: http://not-exists.com/sitemap_2.xml"""
72+
73+
74+
# HTML fixture for an infinitely scrolling page. Its inline script renders 10 `.item` divs (200px tall)
# into `#content`, then on scroll events near the bottom of the document (within 100px) calls
# `loadMore()`, which waits 100 ms and appends 10 more items. `loadMore()` is a no-op while a load is
# in progress or once 3 extra batches have been added.
INFINITE_SCROLL = b"""\
75+
<!DOCTYPE html>
76+
<html>
77+
<body>
78+
<div id="content"></div>
79+
80+
<script>
81+
let page = 0;
82+
let loading = false;
83+
84+
for (let i = 0; i < 10; i++) {
85+
const div = document.createElement('div');
86+
div.className = 'item';
87+
div.style.height = '200px';
88+
div.textContent = 'Item ' + (i + 1);
89+
document.getElementById('content').appendChild(div);
90+
}
91+
92+
async function loadMore() {
93+
if (loading || page >= 3) return;
94+
loading = true;
95+
page++;
96+
97+
await new Promise(resolve => setTimeout(resolve, 100));
98+
99+
for (let i = 0; i < 10; i++) {
100+
const div = document.createElement('div');
101+
div.className = 'item';
102+
div.style.height = '200px';
103+
div.textContent = 'Item ' + (page * 10 + i + 1);
104+
document.getElementById('content').appendChild(div);
105+
}
106+
107+
loading = false;
108+
}
109+
110+
window.addEventListener('scroll', () => {
111+
if (window.innerHeight + window.scrollY >= document.body.offsetHeight - 100) {
112+
loadMore();
113+
}
114+
});
115+
</script>
116+
</body>
117+
</html>
118+
"""
119+
120+
# HTML fixture that references two sub-resources: a script at `/server_static/test.js` (in <head>)
# and an image at `/server_static/test.png` (in <body>).
RESOURCE_LOADING_PAGE = b"""\
121+
<!DOCTYPE html>
122+
<html>
123+
<head>
124+
<script src="/server_static/test.js"></script>
125+
</head>
126+
<body>
127+
<img src="/server_static/test.png" />
128+
</body>
129+
</html>
130+
"""

tests/unit/server_static/test.js

Whitespace-only changes.

tests/unit/server_static/test.png

Loading

0 commit comments

Comments
 (0)