@@ -942,12 +942,25 @@ def _enqueue_links_filter_iterator(
942
942
"""Filter requests based on the enqueue strategy and URL patterns."""
943
943
limit = kwargs .get ('limit' )
944
944
parsed_origin_url = urlparse (origin_url )
945
+ strategy = kwargs .get ('strategy' , 'all' )
946
+
947
+ if strategy == 'all' and not parsed_origin_url .hostname :
948
+ self .log .warning (f'Skipping enqueue: Missing hostname in origin_url = { origin_url } .' )
949
+ return
950
+
951
+ # Warn about a target URL with a missing hostname at most once per call; the flag is cleared after the first warning.
952
+ warning_flag = True
945
953
946
954
for request in request_iterator :
947
955
target_url = request .url if isinstance (request , Request ) else request
956
+ parsed_target_url = urlparse (target_url )
957
+
958
+ if warning_flag and strategy != 'all' and not parsed_target_url .hostname :
959
+ self .log .warning (f'Skipping enqueue url: Missing hostname in target_url = { target_url } .' )
960
+ warning_flag = False
948
961
949
962
if self ._check_enqueue_strategy (
950
- kwargs . get ( ' strategy' , 'all' ), target_url = urlparse ( target_url ) , origin_url = parsed_origin_url
963
+ strategy , target_url = parsed_target_url , origin_url = parsed_origin_url
951
964
) and self ._check_url_patterns (target_url , kwargs .get ('include' ), kwargs .get ('exclude' )):
952
965
yield request
953
966
@@ -963,13 +976,20 @@ def _check_enqueue_strategy(
963
976
origin_url : ParseResult ,
964
977
) -> bool :
965
978
"""Check if a URL matches the enqueue_strategy."""
979
+ if strategy == 'all' :
980
+ return True
981
+
982
+ if origin_url .hostname is None or target_url .hostname is None :
983
+ self .log .debug (
984
+ f'Skipping enqueue: Missing hostname in origin_url = { origin_url .geturl ()} or '
985
+ f'target_url = { target_url .geturl ()} '
986
+ )
987
+ return False
988
+
966
989
if strategy == 'same-hostname' :
967
990
return target_url .hostname == origin_url .hostname
968
991
969
992
if strategy == 'same-domain' :
970
- if origin_url .hostname is None or target_url .hostname is None :
971
- raise ValueError ('Both origin and target URLs must have a hostname' )
972
-
973
993
origin_domain = self ._tld_extractor .extract_str (origin_url .hostname ).domain
974
994
target_domain = self ._tld_extractor .extract_str (target_url .hostname ).domain
975
995
return origin_domain == target_domain
@@ -981,9 +1001,6 @@ def _check_enqueue_strategy(
981
1001
and target_url .port == origin_url .port
982
1002
)
983
1003
984
- if strategy == 'all' :
985
- return True
986
-
987
1004
assert_never (strategy )
988
1005
989
1006
def _check_url_patterns (
0 commit comments