Skip to content

Commit cf0f32c

Browse files
authored
Use a different stat prefix for Zyte API, and improve wording overall to minimize confusion (#120)
1 parent ffdb38f commit cf0f32c

File tree

7 files changed

+146
-57
lines changed

7 files changed

+146
-57
lines changed

README.rst

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,14 @@ scrapy-zyte-smartproxy
1414
:target: http://codecov.io/github/scrapy-plugins/scrapy-zyte-smartproxy?branch=master
1515
:alt: Code Coverage
1616

17-
scrapy-zyte-smartproxy provides easy use of `Zyte Smart Proxy Manager
18-
<https://www.zyte.com/smart-proxy-manager/>`_ (formerly Crawlera) with Scrapy.
17+
scrapy-zyte-smartproxy is a `Scrapy downloader middleware`_ for using one of
18+
Zyte’s proxy services: either the `proxy mode`_ of `Zyte API`_ or `Zyte Smart
19+
Proxy Manager`_ (formerly Crawlera).
20+
21+
.. _Scrapy downloader middleware: https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
22+
.. _proxy mode: https://docs.zyte.com/zyte-api/usage/proxy-mode.html
23+
.. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html
24+
.. _Zyte Smart Proxy Manager: https://www.zyte.com/smart-proxy-manager/
1925

2026
Requirements
2127
============

docs/index.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ scrapy-zyte-smartproxy |version| documentation
66
:hidden:
77

88
headers
9+
stats
910
settings
1011
news
1112

@@ -61,6 +62,9 @@ Configuration
6162
6263
ZYTE_SMARTPROXY_URL = "http://api.zyte.com:8011"
6364
65+
.. tip:: This URL is logged, so you can check the crawl logs to see which
66+
value was used.
67+
6468
- To use the default Zyte Smart Proxy Manager endpoint, leave it unset.
6569

6670
- To use a custom Zyte Smart Proxy Manager endpoint, in case you have a

docs/stats.rst

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
Stats
2+
=====
3+
4+
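This Scrapy plugin records stats about the requests it sends through a Zyte proxy service, the responses it receives, and the delays and retries it applies.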
5+
6+
Stats for the `proxy mode`_ of `Zyte API`_ and stats for `Zyte Smart
7+
Proxy Manager`_ (formerly Crawlera) use different prefixes: ``zyte_api_proxy``
8+
and ``zyte_smartproxy`` respectively.
9+
10+
.. _proxy mode: https://docs.zyte.com/zyte-api/usage/proxy-mode.html
11+
.. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html
12+
.. _Zyte Smart Proxy Manager: https://www.zyte.com/smart-proxy-manager/

scrapy_zyte_smartproxy/middleware.py

Lines changed: 41 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ def _make_auth_url(self, spider):
8686
auth = self.get_proxyauth(spider)
8787
if not auth.startswith(b'Basic '):
8888
raise ValueError(
89-
'Zyte Smart Proxy Manager only supports HTTP basic access '
89+
'Zyte proxy services only support HTTP basic access '
9090
'authentication, but %s.%s.get_proxyauth() returned %r'
9191
% (self.__module__, self.__class__.__name__, auth)
9292
)
@@ -111,7 +111,7 @@ def open_spider(self, spider):
111111

112112
if not self.apikey:
113113
logger.warning(
114-
"Zyte Smart Proxy Manager cannot be used without an API key",
114+
"Zyte proxy services cannot be used without an API key",
115115
extra={'spider': spider},
116116
)
117117
return
@@ -120,7 +120,7 @@ def open_spider(self, spider):
120120
self._authless_url = _remove_auth(self._auth_url)
121121

122122
logger.info(
123-
"Using Zyte Smart Proxy Manager at %s (apikey: %s)" % (
123+
"Using Zyte proxy service %s with an API key ending in %s" % (
124124
self.url, self.apikey[:7]
125125
),
126126
extra={'spider': spider},
@@ -131,8 +131,8 @@ def open_spider(self, spider):
131131
spider.download_delay = 0
132132
logger.info(
133133
"ZyteSmartProxyMiddleware: disabling download delays in "
134-
"Scrapy to optimize delays introduced by Zyte Smart Proxy "
135-
"Manager. To avoid this behaviour you can use the "
134+
"Scrapy to optimize delays introduced by Zyte proxy services. "
135+
"To avoid this behaviour you can use the "
136136
"ZYTE_SMARTPROXY_PRESERVE_DELAY setting, but keep in mind "
137137
"that this may slow down the crawl significantly",
138138
extra={'spider': spider},
@@ -196,7 +196,9 @@ def get_proxyauth(self, spider):
196196
return basic_auth_header(self.apikey, '')
197197

198198
def _targets_zyte_api(self, request):
199-
auth_url = request.meta["proxy"]
199+
if self._auth_url is None:
200+
return False
201+
auth_url = request.meta.get("proxy", self._auth_url)
200202
targets_zyte_api = self._targets.get(auth_url, None)
201203
if targets_zyte_api is None:
202204
targets_zyte_api = urlparse(auth_url).hostname == "api.zyte.com"
@@ -220,6 +222,10 @@ def _translate_headers(self, request, targets_zyte_api):
220222
request,
221223
)
222224

225+
def _inc_stat(self, stat, targets_zyte_api, value=1):
226+
prefix = "zyte_api_proxy" if targets_zyte_api else "zyte_smartproxy"
227+
self.crawler.stats.inc_value("{}/{}".format(prefix, stat), value)
228+
223229
def process_request(self, request, spider):
224230
if self._is_enabled_for_request(request):
225231
if 'proxy' not in request.meta:
@@ -246,8 +252,8 @@ def process_request(self, request, spider):
246252
user_agent_header = "Zyte-Client" if targets_zyte_api else "X-Crawlera-Client"
247253
from scrapy_zyte_smartproxy import __version__
248254
request.headers[user_agent_header] = 'scrapy-zyte-smartproxy/%s' % __version__
249-
self.crawler.stats.inc_value('zyte_smartproxy/request')
250-
self.crawler.stats.inc_value('zyte_smartproxy/request/method/%s' % request.method)
255+
self._inc_stat("request", targets_zyte_api=targets_zyte_api)
256+
self._inc_stat("request/method/{}".format(request.method), targets_zyte_api=targets_zyte_api)
251257
self._translate_headers(request, targets_zyte_api=targets_zyte_api)
252258
self._clean_zyte_smartproxy_headers(request, targets_zyte_api=targets_zyte_api)
253259
else:
@@ -285,8 +291,10 @@ def _process_error(self, response):
285291
def process_response(self, request, response, spider):
286292
zyte_smartproxy_error = self._process_error(response)
287293

294+
targets_zyte_api = self._targets_zyte_api(request)
295+
288296
if not self._is_enabled_for_request(request):
289-
return self._handle_not_enabled_response(request, response)
297+
return self._handle_not_enabled_response(request, response, targets_zyte_api=targets_zyte_api)
290298

291299
if not self._is_zyte_smartproxy_or_zapi_response(response):
292300
return response
@@ -299,19 +307,19 @@ def process_response(self, request, response, spider):
299307
reason = 'noslaves'
300308
else:
301309
reason = 'autherror'
302-
self._set_custom_delay(request, next(self.exp_backoff), reason=reason)
310+
self._set_custom_delay(request, next(self.exp_backoff), reason=reason, targets_zyte_api=targets_zyte_api)
303311
else:
304-
self.crawler.stats.inc_value('zyte_smartproxy/delay/reset_backoff')
312+
self._inc_stat("delay/reset_backoff", targets_zyte_api=targets_zyte_api)
305313
self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max)
306314

307315
if self._is_auth_error(response):
308316
# When Zyte Smart Proxy Manager has issues it might not be able to
309317
# authenticate users we must retry
310318
retries = request.meta.get('zyte_smartproxy_auth_retry_times', 0)
311319
if retries < self.max_auth_retry_times:
312-
return self._retry_auth(response, request, spider)
320+
return self._retry_auth(response, request, spider, targets_zyte_api=targets_zyte_api)
313321
else:
314-
self.crawler.stats.inc_value('zyte_smartproxy/retries/auth/max_reached')
322+
self._inc_stat("retries/auth/max_reached", targets_zyte_api=targets_zyte_api)
315323
logger.warning(
316324
"Max retries for authentication issues reached, please check auth"
317325
" information settings",
@@ -325,17 +333,17 @@ def process_response(self, request, response, spider):
325333
else:
326334
after = response.headers.get('retry-after')
327335
if after:
328-
self._set_custom_delay(request, float(after), reason='banned')
329-
self.crawler.stats.inc_value('zyte_smartproxy/response/banned')
336+
self._set_custom_delay(request, float(after), reason='banned', targets_zyte_api=targets_zyte_api)
337+
self._inc_stat("response/banned", targets_zyte_api=targets_zyte_api)
330338
else:
331339
self._bans[key] = 0
332340
# If placed behind `RedirectMiddleware`, it would not count 3xx responses
333-
self.crawler.stats.inc_value('zyte_smartproxy/response')
334-
self.crawler.stats.inc_value('zyte_smartproxy/response/status/%s' % response.status)
341+
self._inc_stat("response", targets_zyte_api=targets_zyte_api)
342+
self._inc_stat("response/status/{}".format(response.status), targets_zyte_api=targets_zyte_api)
335343
if zyte_smartproxy_error:
336-
self.crawler.stats.inc_value('zyte_smartproxy/response/error')
337-
self.crawler.stats.inc_value(
338-
'zyte_smartproxy/response/error/%s' % zyte_smartproxy_error.decode('utf8'))
344+
self._inc_stat("response/error", targets_zyte_api=targets_zyte_api)
345+
error_msg = zyte_smartproxy_error.decode('utf8')
346+
self._inc_stat("response/error/{}".format(error_msg), targets_zyte_api=targets_zyte_api)
339347
return response
340348

341349
def process_exception(self, request, exception, spider):
@@ -344,30 +352,33 @@ def process_exception(self, request, exception, spider):
344352
if isinstance(exception, (ConnectionRefusedError, ConnectionDone)):
345353
# Handle Zyte Smart Proxy Manager downtime
346354
self._clear_dns_cache()
347-
self._set_custom_delay(request, self.connection_refused_delay, reason='conn_refused')
355+
targets_zyte_api = self._targets_zyte_api(request)
356+
self._set_custom_delay(request, self.connection_refused_delay, reason='conn_refused', targets_zyte_api=targets_zyte_api)
348357

349-
def _handle_not_enabled_response(self, request, response):
358+
def _handle_not_enabled_response(self, request, response, targets_zyte_api):
350359
if self._should_enable_for_response(response):
351360
domain = self._get_url_domain(request.url)
352361
self.enabled_for_domain[domain] = True
353362

354363
retryreq = request.copy()
355364
retryreq.dont_filter = True
356-
self.crawler.stats.inc_value('zyte_smartproxy/retries/should_have_been_enabled')
365+
self._inc_stat("retries/should_have_been_enabled", targets_zyte_api=targets_zyte_api)
357366
return retryreq
358367
return response
359368

360-
def _retry_auth(self, response, request, spider):
369+
def _retry_auth(self, response, request, spider, targets_zyte_api):
361370
logger.warning(
362-
"Retrying a Zyte Smart Proxy Manager request due to an "
363-
"authentication issue",
371+
(
372+
"Retrying a request due to an authentication issue with "
373+
"the configured Zyte proxy service"
374+
),
364375
extra={'spider': self.spider},
365376
)
366377
retries = request.meta.get('zyte_smartproxy_auth_retry_times', 0) + 1
367378
retryreq = request.copy()
368379
retryreq.meta['zyte_smartproxy_auth_retry_times'] = retries
369380
retryreq.dont_filter = True
370-
self.crawler.stats.inc_value('zyte_smartproxy/retries/auth')
381+
self._inc_stat("retries/auth", targets_zyte_api=targets_zyte_api)
371382
return retryreq
372383

373384
def _clear_dns_cache(self):
@@ -402,7 +413,7 @@ def _get_slot(self, request):
402413
key = self._get_slot_key(request)
403414
return key, self.crawler.engine.downloader.slots.get(key)
404415

405-
def _set_custom_delay(self, request, delay, reason=None):
416+
def _set_custom_delay(self, request, delay, targets_zyte_api, reason=None):
406417
"""Set custom delay for slot and save original one."""
407418
key, slot = self._get_slot(request)
408419
if not slot:
@@ -411,8 +422,8 @@ def _set_custom_delay(self, request, delay, reason=None):
411422
self._saved_delays[key] = slot.delay
412423
slot.delay = delay
413424
if reason is not None:
414-
self.crawler.stats.inc_value('zyte_smartproxy/delay/%s' % reason)
415-
self.crawler.stats.inc_value('zyte_smartproxy/delay/%s/total' % reason, delay)
425+
self._inc_stat("delay/{}".format(reason), targets_zyte_api=targets_zyte_api)
426+
self._inc_stat("delay/{}/total".format(reason), value=delay, targets_zyte_api=targets_zyte_api)
416427

417428
def _restore_original_delay(self, request):
418429
"""Restore original delay for slot if it was changed."""

setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from setuptools import setup
22

3-
with open("README.rst") as f:
4-
readme = f.read()
3+
with open("README.rst", "rb") as f:
4+
readme = f.read().decode("utf-8")
55

66

77
setup(

0 commit comments

Comments
 (0)