Skip to content

Commit 951578f

Browse files
authored
Merge pull request #23 from scrapy-plugins/clear-dnscache-on-errors
Clear Scrapy dns cache on connection errors to proxy service
2 parents 80c47db + 301fd60 commit 951578f

File tree

2 files changed

+30
-5
lines changed

2 files changed

+30
-5
lines changed

scrapy_crawlera/middleware.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
1-
from collections import defaultdict
2-
import warnings
31
import os
42
import logging
3+
import warnings
4+
from collections import defaultdict
55

6+
from six.moves.urllib.parse import urlparse
67
from w3lib.http import basic_auth_header
78
from scrapy import signals
9+
from scrapy.resolver import dnscache
810
from scrapy.exceptions import ScrapyDeprecationWarning
9-
from twisted.internet.error import ConnectionRefusedError
11+
from twisted.internet.error import ConnectionRefusedError, ConnectionDone
1012

1113

1214
class CrawleraMiddleware(object):
@@ -48,6 +50,7 @@ def open_spider(self, spider):
4850

4951
for k, type_ in self._settings:
5052
setattr(self, k, self._get_setting_value(spider, k, type_))
53+
5154
if '?noconnect' not in self.url:
5255
self.url += '?noconnect'
5356

@@ -155,10 +158,16 @@ def process_response(self, request, response, spider):
155158
def process_exception(self, request, exception, spider):
156159
if not self._is_enabled_for_request(request):
157160
return
158-
if isinstance(exception, ConnectionRefusedError):
161+
if isinstance(exception, (ConnectionRefusedError, ConnectionDone)):
159162
# Handle crawlera downtime
163+
self._clear_dns_cache()
160164
self._set_custom_delay(request, self.connection_refused_delay)
161165

166+
def _clear_dns_cache(self):
167+
# Scrapy doesn't expire dns records by default, so we force it here,
168+
# so the client can reconnect through DNS failover.
169+
dnscache.pop(urlparse(self.url).hostname, None)
170+
162171
def _is_enabled_for_request(self, request):
163172
return self.enabled and 'dont_proxy' not in request.meta
164173

tests/test_crawlera.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
from scrapy.http import Request, Response
55
from scrapy.spiders import Spider
66
from scrapy.utils.test import get_crawler
7-
from twisted.internet.error import ConnectionRefusedError
7+
from scrapy.resolver import dnscache
8+
from twisted.internet.error import ConnectionRefusedError, ConnectionDone
89

910
from scrapy_crawlera import CrawleraMiddleware
1011
import os
@@ -229,29 +230,44 @@ def test_delay_adjustment(self):
229230
self.assertEqual(slot.delay, retry_after)
230231
self.assertEqual(self.spider.download_delay, delay)
231232

233+
# DNS cache should be cleared in case of errors
234+
dnscache['proxy.crawlera.com'] = '1.1.1.1'
235+
232236
res = Response(url, request=req)
233237
mw.process_response(req, res, self.spider)
234238
self.assertEqual(slot.delay, delay)
235239
self.assertEqual(self.spider.download_delay, delay)
240+
self.assertIn('proxy.crawlera.com', dnscache)
236241

237242
# server failures
238243
mw.process_exception(req, ConnectionRefusedError(), self.spider)
239244
self.assertEqual(slot.delay, mw.connection_refused_delay)
240245
self.assertEqual(self.spider.download_delay, delay)
246+
self.assertNotIn('proxy.crawlera.com', dnscache)
241247

248+
dnscache['proxy.crawlera.com'] = '1.1.1.1'
242249
res = Response(ban_url, request=req)
243250
mw.process_response(req, res, self.spider)
244251
self.assertEqual(slot.delay, delay)
245252
self.assertEqual(self.spider.download_delay, delay)
253+
self.assertIn('proxy.crawlera.com', dnscache)
246254

247255
mw.process_exception(req, ConnectionRefusedError(), self.spider)
248256
self.assertEqual(slot.delay, mw.connection_refused_delay)
249257
self.assertEqual(self.spider.download_delay, delay)
258+
self.assertNotIn('proxy.crawlera.com', dnscache)
250259

260+
dnscache['proxy.crawlera.com'] = '1.1.1.1'
251261
res = Response(ban_url, status=self.bancode, request=req)
252262
mw.process_response(req, res, self.spider)
253263
self.assertEqual(slot.delay, delay)
254264
self.assertEqual(self.spider.download_delay, delay)
265+
self.assertIn('proxy.crawlera.com', dnscache)
266+
267+
mw.process_exception(req, ConnectionDone(), self.spider)
268+
self.assertEqual(slot.delay, mw.connection_refused_delay)
269+
self.assertEqual(self.spider.download_delay, delay)
270+
self.assertNotIn('proxy.crawlera.com', dnscache)
255271

256272
def test_jobid_header(self):
257273
# test without the environment variable 'SCRAPY_JOB'

0 commit comments

Comments
 (0)