Skip to content

Commit 3d018b2

Browse files
authored
fix: Fix same-domain strategy ignoring public suffix (#1572)
### Description

- Use `top_domain_under_public_suffix` instead of just `domain` when comparing domains in the `same-domain` strategy.

### Issues

- Closes: #1571

### Testing

- Added unit test

### Checklist

- [x] CI passed
1 parent 50163c3 commit 3d018b2

File tree

2 files changed

+6
-5
lines changed

2 files changed

+6
-5
lines changed

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1043,8 +1043,8 @@ def _check_enqueue_strategy(
10431043
return target_url.hostname == origin_url.hostname
10441044

10451045
if strategy == 'same-domain':
1046-
origin_domain = self._tld_extractor.extract_str(origin_url.hostname).domain
1047-
target_domain = self._tld_extractor.extract_str(target_url.hostname).domain
1046+
origin_domain = self._tld_extractor.extract_str(origin_url.hostname).top_domain_under_public_suffix
1047+
target_domain = self._tld_extractor.extract_str(target_url.hostname).top_domain_under_public_suffix
10481048
return origin_domain == target_domain
10491049

10501050
if strategy == 'same-origin':

tests/unit/crawlers/_basic/test_basic_crawler.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,7 @@ class AddRequestsTestInput:
347347
'https://blog.someplace.com/index.html',
348348
'https://redirect.someplace.com',
349349
'https://other.place.com/index.html',
350+
'https://someplace.jp/',
350351
)
351352

352353
INCLUDE_TEST_URLS = (
@@ -401,7 +402,7 @@ class AddRequestsTestInput:
401402
AddRequestsTestInput(
402403
start_url=STRATEGY_TEST_URLS[0],
403404
loaded_url=STRATEGY_TEST_URLS[0],
404-
requests=STRATEGY_TEST_URLS[:4],
405+
requests=STRATEGY_TEST_URLS,
405406
kwargs=EnqueueLinksKwargs(strategy='same-domain'),
406407
expected_urls=STRATEGY_TEST_URLS[1:4],
407408
),
@@ -411,7 +412,7 @@ class AddRequestsTestInput:
411412
AddRequestsTestInput(
412413
start_url=STRATEGY_TEST_URLS[0],
413414
loaded_url=STRATEGY_TEST_URLS[0],
414-
requests=STRATEGY_TEST_URLS[:4],
415+
requests=STRATEGY_TEST_URLS,
415416
kwargs=EnqueueLinksKwargs(strategy='same-hostname'),
416417
expected_urls=[STRATEGY_TEST_URLS[1]],
417418
),
@@ -421,7 +422,7 @@ class AddRequestsTestInput:
421422
AddRequestsTestInput(
422423
start_url=STRATEGY_TEST_URLS[0],
423424
loaded_url=STRATEGY_TEST_URLS[0],
424-
requests=STRATEGY_TEST_URLS[:4],
425+
requests=STRATEGY_TEST_URLS,
425426
kwargs=EnqueueLinksKwargs(strategy='same-origin'),
426427
expected_urls=[],
427428
),

0 commit comments

Comments
 (0)