Skip to content

Commit 7f12e16

Browse files
committed
Catch exceptions in all callbacks. Treat the www subdomain the same as no
subdomain. Fix possible invalid URL schemes when the <base> element was used.
1 parent 30a24ae commit 7f12e16

File tree

4 files changed

+28
-7
lines changed

4 files changed

+28
-7
lines changed

docs/source/options_crawling_scope.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ Only crawl pages with the same protocol as the startpoint (e.g. only https) if T
4545

4646
Only crawl pages with the same subdomain as the startpoint if True. If the startpoint does not have a subdomain, no subdomains will be crawled. Default is True.
4747

48+
Please note that the `www` subdomain will be treated the same as no subdomain.
49+
4850
.. code:: python
4951
5052
options.scope.subdomain_must_match = True

nyawc/Crawler.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,10 @@ def __crawler_start(self):
139139
140140
"""
141141

142-
self.__options.callbacks.crawler_before_start()
142+
try:
143+
self.__options.callbacks.crawler_before_start()
144+
except Exception as e:
145+
print(e)
143146

144147
self.__spawn_new_requests()
145148

@@ -175,7 +178,10 @@ def __crawler_stop(self):
175178
def __crawler_finish(self):
176179
"""Called when the crawler is finished because there are no queued requests left or it was stopped."""
177180

178-
self.__options.callbacks.crawler_after_finish(self.queue)
181+
try:
182+
self.__options.callbacks.crawler_after_finish(self.queue)
183+
except Exception as e:
184+
print(e)
179185

180186
def __request_start(self, queue_item):
181187
"""Execute the request in given queue item.
@@ -185,7 +191,11 @@ def __request_start(self, queue_item):
185191
186192
"""
187193

188-
action = self.__options.callbacks.request_before_start(self.queue, queue_item)
194+
try:
195+
action = self.__options.callbacks.request_before_start(self.queue, queue_item)
196+
except Exception as e:
197+
action = None
198+
print(e)
189199

190200
if action == CrawlerActions.DO_STOP_CRAWLING:
191201
self.__should_stop = True
@@ -223,7 +233,11 @@ def __request_finish(self, queue_item, new_requests, request_failed=False):
223233
new_queue_items = self.__add_scraped_requests_to_queue(queue_item, new_requests)
224234
self.queue.move(queue_item, QueueItem.STATUS_FINISHED)
225235

226-
action = self.__options.callbacks.request_after_finish(self.queue, queue_item, new_queue_items)
236+
try:
237+
action = self.__options.callbacks.request_after_finish(self.queue, queue_item, new_queue_items)
238+
except Exception as e:
239+
action = None
240+
print(e)
227241

228242
if action == CrawlerActions.DO_STOP_CRAWLING:
229243
self.__should_stop = True

nyawc/helpers/HTTPRequestHelper.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,8 +82,13 @@ def complies_with_scope(queue_item, new_request, scope):
8282
return False
8383

8484
if scope.subdomain_must_match:
85-
if URLHelper.get_subdomain(queue_item.request.url) != URLHelper.get_subdomain(new_request.url):
86-
return False
85+
current_subdomain = URLHelper.get_subdomain(queue_item.request.url)
86+
new_subdomain = URLHelper.get_subdomain(new_request.url)
87+
88+
if current_subdomain != new_subdomain:
89+
if current_subdomain != "www" and new_subdomain != "":
90+
if new_subdomain != "www" and current_subdomain != "":
91+
return False
8792

8893
if scope.hostname_must_match:
8994
if URLHelper.get_hostname(queue_item.request.url) != URLHelper.get_hostname(new_request.url):

nyawc/scrapers/HTMLSoupLinkScraper.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def derived_get_requests(self):
6363
# Always use the URL from the base element if it exists.
6464
# https://www.w3schools.com/tags/tag_base.asp
6565
if base_element:
66-
host = base_element["href"]
66+
host = URLHelper.make_absolute(host, base_element["href"])
6767

6868
found_requests = []
6969

0 commit comments

Comments
 (0)