Commit 700df91

fix: Respect enqueue_strategy after redirects in enqueue_links (#1607)
### Description

- Sets the `enqueue_strategy` attribute on the `Request` during `enqueue_links` processing, so that requests which have completed a redirect are checked against the correct strategy.

### Issues

- Closes: #1606
1 parent 533b187 commit 700df91
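For context, here is a minimal sketch of the user-facing scenario this commit addresses, assuming a page whose links may redirect off-origin (the start URL is an illustrative placeholder, not from the commit):

```python
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def handler(context: BeautifulSoupCrawlingContext) -> None:
        # Before this fix, a link that redirected to another origin could
        # slip past the 'same-origin' check, because the Request created by
        # enqueue_links did not carry the strategy it was enqueued with.
        await context.enqueue_links(strategy='same-origin')

    await crawler.run(['https://example.com'])


asyncio.run(main())
```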

File tree

4 files changed, +39 −9 lines changed


src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -191,6 +191,7 @@ async def extract_links(
         robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)
 
         kwargs.setdefault('strategy', 'same-hostname')
+        strategy = kwargs.get('strategy', 'same-hostname')
 
         links_iterator: Iterator[str] = iter(self._parser.find_links(parsed_content, selector=selector))
 
@@ -209,7 +210,9 @@ async def extract_links(
             skipped = iter([])
 
         for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
-            request_options = RequestOptions(url=url, user_data={**base_user_data}, label=label)
+            request_options = RequestOptions(
+                url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+            )
 
             if transform_request_function:
                 transform_request_options = transform_request_function(request_options)
```

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 6 additions & 1 deletion
```diff
@@ -1038,7 +1038,12 @@ def _enqueue_links_filter_iterator(
         warning_flag = True
 
         for request in request_iterator:
-            target_url = request.url if isinstance(request, Request) else request
+            if isinstance(request, Request):
+                if request.enqueue_strategy != strategy:
+                    request.enqueue_strategy = strategy
+                target_url = request.url
+            else:
+                target_url = request
             parsed_target_url = urlparse(target_url)
 
             if warning_flag and strategy != 'all' and not parsed_target_url.hostname:
```
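The reason the strategy has to be stamped onto the `Request` itself: once a redirect has resolved, the filter only has the stored `Request` and its final URL to work with. A simplified, self-contained illustration of such a strategy check (not the library's actual implementation; the function name and URLs are made up):

```python
from urllib.parse import urlparse


def passes_strategy(origin_url: str, final_url: str, strategy: str) -> bool:
    # Compare the enqueuing page's URL with the post-redirect URL.
    origin, final = urlparse(origin_url), urlparse(final_url)
    if strategy == 'same-origin':
        return (origin.scheme, origin.hostname, origin.port) == (final.scheme, final.hostname, final.port)
    if strategy == 'same-hostname':
        return origin.hostname == final.hostname
    return True  # 'all' imposes no restriction


# A request enqueued under 'same-origin' that redirects to another host is dropped:
assert not passes_strategy('https://a.test/page', 'https://b.test/landing', 'same-origin')
```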

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 9 additions & 6 deletions
```diff
@@ -399,6 +399,7 @@ async def extract_links(
         robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)
 
         kwargs.setdefault('strategy', 'same-hostname')
+        strategy = kwargs.get('strategy', 'same-hostname')
 
         elements = await context.page.query_selector_all(selector)
         links_iterator: Iterator[str] = iter(
@@ -417,17 +418,19 @@ async def extract_links(
             skipped = iter([])
 
         for url in self._enqueue_links_filter_iterator(links_iterator, context.request.url, **kwargs):
-            request_option = RequestOptions({'url': url, 'user_data': {**base_user_data}, 'label': label})
+            request_options = RequestOptions(
+                url=url, user_data={**base_user_data}, label=label, enqueue_strategy=strategy
+            )
 
             if transform_request_function:
-                transform_request_option = transform_request_function(request_option)
-                if transform_request_option == 'skip':
+                transform_request_options = transform_request_function(request_options)
+                if transform_request_options == 'skip':
                     continue
-                if transform_request_option != 'unchanged':
-                    request_option = transform_request_option
+                if transform_request_options != 'unchanged':
+                    request_options = transform_request_options
 
             try:
-                request = Request.from_url(**request_option)
+                request = Request.from_url(**request_options)
             except ValidationError as exc:
                 context.log.debug(
                     f'Skipping URL "{url}" due to invalid format: {exc}. '
```

tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py

Lines changed: 20 additions & 1 deletion
```diff
@@ -7,7 +7,7 @@
 
 import pytest
 
-from crawlee import ConcurrencySettings, Glob, HttpHeaders, RequestTransformAction, SkippedReason
+from crawlee import ConcurrencySettings, Glob, HttpHeaders, Request, RequestTransformAction, SkippedReason
 from crawlee.crawlers import BasicCrawlingContext, BeautifulSoupCrawler, BeautifulSoupCrawlingContext
 from crawlee.storages import RequestQueue
 
@@ -409,3 +409,22 @@ async def test_slow_navigation_does_not_count_toward_handler_timeout(server_url:
     assert result.requests_failed == 0
     assert result.requests_finished == 1
     assert request_handler.call_count == 1
+
+
+async def test_enqueue_strategy_after_redirect(server_url: URL, redirect_server_url: URL) -> None:
+    crawler = BeautifulSoupCrawler()
+
+    handler_calls = mock.AsyncMock()
+
+    @crawler.router.default_handler
+    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
+        await handler_calls(context.request.url)
+
+        target_url = str(server_url.with_path('redirect').with_query(url=str(redirect_server_url)))
+
+        await context.enqueue_links(requests=[Request.from_url(target_url)], strategy='same-origin')
+
+    await crawler.run([str(server_url)])
+
+    assert handler_calls.called
+    assert handler_calls.call_count == 1
```
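Here `redirect_server_url` resolves to a different origin than `server_url`, so under `strategy='same-origin'` the redirected request is filtered out after the redirect completes; the single recorded handler call is for the start URL itself.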
