Skip to content

Commit a6c3aab

Browse files
authored
fix: Do not raise an error when checking 'same-domain' if there is no hostname in the URL (#1251)
### Description - This PR is needed because the enqueue strategy check is now also used in `extract_links`, where it can be applied to invalid URLs.
1 parent afbdaa5 commit a6c3aab

File tree

1 file changed

+24
-7
lines changed

1 file changed

+24
-7
lines changed

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -942,12 +942,25 @@ def _enqueue_links_filter_iterator(
942942
"""Filter requests based on the enqueue strategy and URL patterns."""
943943
limit = kwargs.get('limit')
944944
parsed_origin_url = urlparse(origin_url)
945+
strategy = kwargs.get('strategy', 'all')
946+
947+
if strategy == 'all' and not parsed_origin_url.hostname:
948+
self.log.warning(f'Skipping enqueue: Missing hostname in origin_url = {origin_url}.')
949+
return
950+
951+
# Emit a `warning` message to the log, only once per call
952+
warning_flag = True
945953

946954
for request in request_iterator:
947955
target_url = request.url if isinstance(request, Request) else request
956+
parsed_target_url = urlparse(target_url)
957+
958+
if warning_flag and strategy != 'all' and not parsed_target_url.hostname:
959+
self.log.warning(f'Skipping enqueue url: Missing hostname in target_url = {target_url}.')
960+
warning_flag = False
948961

949962
if self._check_enqueue_strategy(
950-
kwargs.get('strategy', 'all'), target_url=urlparse(target_url), origin_url=parsed_origin_url
963+
strategy, target_url=parsed_target_url, origin_url=parsed_origin_url
951964
) and self._check_url_patterns(target_url, kwargs.get('include'), kwargs.get('exclude')):
952965
yield request
953966

@@ -963,13 +976,20 @@ def _check_enqueue_strategy(
963976
origin_url: ParseResult,
964977
) -> bool:
965978
"""Check if a URL matches the enqueue_strategy."""
979+
if strategy == 'all':
980+
return True
981+
982+
if origin_url.hostname is None or target_url.hostname is None:
983+
self.log.debug(
984+
f'Skipping enqueue: Missing hostname in origin_url = {origin_url.geturl()} or '
985+
f'target_url = {target_url.geturl()}'
986+
)
987+
return False
988+
966989
if strategy == 'same-hostname':
967990
return target_url.hostname == origin_url.hostname
968991

969992
if strategy == 'same-domain':
970-
if origin_url.hostname is None or target_url.hostname is None:
971-
raise ValueError('Both origin and target URLs must have a hostname')
972-
973993
origin_domain = self._tld_extractor.extract_str(origin_url.hostname).domain
974994
target_domain = self._tld_extractor.extract_str(target_url.hostname).domain
975995
return origin_domain == target_domain
@@ -981,9 +1001,6 @@ def _check_enqueue_strategy(
9811001
and target_url.port == origin_url.port
9821002
)
9831003

984-
if strategy == 'all':
985-
return True
986-
9871004
assert_never(strategy)
9881005

9891006
def _check_url_patterns(

0 commit comments

Comments
 (0)