Skip to content

Commit 694102e

Browse files
authored
fix: convert relative links to absolute in enqueue_links for responses with redirects (#956)
### Description - fix `enqueue_links` for responses with redirects. ### Issues - Closes: #955
1 parent 13ec88e commit 694102e

File tree

5 files changed

+21
-9
lines changed

5 files changed

+21
-9
lines changed

src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,8 @@ async def enqueue_links(
152152
for link in self._parser.find_links(parsed_content, selector=selector):
153153
url = link
154154
if not is_url_absolute(url):
155-
url = convert_to_absolute_url(context.request.url, url)
155+
base_url = context.request.loaded_url or context.request.url
156+
url = convert_to_absolute_url(base_url, url)
156157

157158
request_options = RequestOptions(url=url, user_data={**base_user_data}, label=label)
158159

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,8 @@ async def enqueue_links(
210210
url = url.strip()
211211

212212
if not is_url_absolute(url):
213-
url = convert_to_absolute_url(context.request.url, url)
213+
base_url = context.request.loaded_url or context.request.url
214+
url = convert_to_absolute_url(base_url, url)
214215

215216
request_option = RequestOptions({'url': url, 'user_data': {**base_user_data}, 'label': label})
216217

tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
@pytest.fixture
2020
async def server() -> AsyncGenerator[respx.MockRouter, None]:
2121
with respx.mock(base_url='https://test.io', assert_all_called=False) as mock:
22+
mock.get('https://www.test.io/').return_value = Response(302, headers={'Location': 'https://test.io/'})
23+
2224
mock.get('/', name='index_endpoint').return_value = Response(
2325
200,
2426
text="""<html>
@@ -104,14 +106,15 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
104106
visit(context.request.url)
105107
await context.enqueue_links()
106108

107-
await crawler.run(['https://test.io/'])
109+
await crawler.run(['https://www.test.io/'])
108110

109111
assert server['index_endpoint'].called
110112
assert server['secondary_index_endpoint'].called
111113

112114
visited = {call[0][0] for call in visit.call_args_list}
115+
113116
assert visited == {
114-
'https://test.io/',
117+
'https://www.test.io/',
115118
'https://test.io/asdf',
116119
'https://test.io/hjkl',
117120
'https://test.io/qwer',

tests/unit/crawlers/_parsel/test_parsel_crawler.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
@pytest.fixture
2222
async def server() -> AsyncGenerator[respx.MockRouter, None]:
2323
with respx.mock(base_url='https://test.io', assert_all_called=False) as mock:
24+
mock.get('https://www.test.io/').return_value = Response(302, headers={'Location': 'https://test.io/'})
25+
2426
mock.get('/', name='index_endpoint').return_value = Response(
2527
200,
2628
text="""<html>
@@ -134,14 +136,14 @@ async def request_handler(context: ParselCrawlingContext) -> None:
134136
visit(url)
135137
await context.enqueue_links()
136138

137-
await crawler.run(['https://test.io/'])
139+
await crawler.run(['https://www.test.io/'])
138140

139141
assert server['index_endpoint'].called
140142
assert server['secondary_index_endpoint'].called
141143

142144
visited = {call[0][0] for call in visit.call_args_list}
143145
assert visited == {
144-
'https://test.io/',
146+
'https://www.test.io/',
145147
'https://test.io/asdf',
146148
'https://test.io/hjkl',
147149
'https://test.io/qwer',

tests/unit/crawlers/_playwright/test_playwright_crawler.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,9 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
4848

4949

5050
async def test_enqueue_links() -> None:
51-
requests = ['https://crawlee.dev/docs/examples']
52-
crawler = PlaywrightCrawler()
51+
# www.crawlee.dev redirects to crawlee.dev
52+
requests = ['https://www.crawlee.dev/docs/examples']
53+
crawler = PlaywrightCrawler(max_requests_per_crawl=11)
5354
visit = mock.Mock()
5455

5556
@crawler.router.default_handler
@@ -59,9 +60,13 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
5960

6061
await crawler.run(requests)
6162

62-
visited: set[str] = {call[0][0] for call in visit.call_args_list}
63+
first_visited = visit.call_args_list[0][0][0]
64+
visited: set[str] = {call[0][0] for call in visit.call_args_list[1:]}
6365

66+
# The first link visited uses the original (pre-redirect) domain
67+
assert first_visited == 'https://www.crawlee.dev/docs/examples'
6468
assert len(visited) >= 10
69+
# All subsequently enqueued links must use the post-redirect domain
6570
assert all(url.startswith('https://crawlee.dev/docs/examples') for url in visited)
6671

6772

0 commit comments

Comments
 (0)