Skip to content

Commit 0fb96a6

Browse files
authored
Improve error handling for Zyte API (#123)
1 parent 3435cd7 commit 0fb96a6

File tree

7 files changed

+114
-38
lines changed

7 files changed

+114
-38
lines changed

.bandit.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
skips:
2+
- B101 # assert_used, needed for mypy
3+
exclude_dirs: ['tests']

.github/workflows/main.yml

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,8 @@ jobs:
1515
- python-version: "3.4"
1616
env:
1717
TOXENV: py34
18-
- python-version: "3.5"
19-
env:
20-
TOXENV: py
18+
# 3.5 cannot be tested in CI
19+
# https://github.com/MatteoH2O1999/setup-python/issues/49#issuecomment-2209940822
2120
- python-version: "3.6"
2221
env:
2322
TOXENV: py
@@ -33,6 +32,9 @@ jobs:
3332
- python-version: "3.10"
3433
env:
3534
TOXENV: py
35+
- python-version: "3.10"
36+
env:
37+
TOXENV: mypy
3638
- python-version: "3.11"
3739
env:
3840
TOXENV: py
@@ -41,16 +43,19 @@ jobs:
4143
TOXENV: py
4244
- python-version: "3.12"
4345
env:
44-
TOXENV: security
46+
TOXENV: pre-commit
4547
- python-version: "3.12"
4648
env:
4749
TOXENV: docs
4850
steps:
4951
- uses: actions/checkout@v2
5052
- name: Set up Python ${{ matrix.python-version }}
51-
uses: MatteoH2O1999/setup-python@v2
53+
uses: MatteoH2O1999/setup-python@v4
5254
with:
5355
python-version: ${{ matrix.python-version }}
56+
allow-build: info
57+
cache-build: true
58+
cache: pip
5459
- name: Install dependencies
5560
run: |
5661
python -m pip install --upgrade pip

.pre-commit-config.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
repos:
2+
- repo: https://github.com/PyCQA/bandit
3+
rev: 1.7.9
4+
hooks:
5+
- id: bandit
6+
args: [-r, -c, .bandit.yml]

scrapy_zyte_smartproxy/middleware.py

Lines changed: 28 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,11 @@
33
import warnings
44
from base64 import urlsafe_b64decode
55
from collections import defaultdict
6+
from typing import Dict, List
67
try:
7-
from urllib.request import _parse_proxy
8+
from urllib.request import _parse_proxy # type: ignore
89
except ImportError:
9-
from urllib2 import _parse_proxy
10+
from urllib2 import _parse_proxy # type: ignore
1011

1112
from six.moves.urllib.parse import urlparse, urlunparse
1213
from w3lib.http import basic_auth_header
@@ -41,9 +42,9 @@ class ZyteSmartProxyMiddleware(object):
4142
backoff_step = 15
4243
backoff_max = 180
4344
exp_backoff = None
44-
force_enable_on_http_codes = []
45+
force_enable_on_http_codes = [] # type: List[int]
4546
max_auth_retry_times = 10
46-
enabled_for_domain = {}
47+
enabled_for_domain = {} # type: Dict[str, bool]
4748
apikey = ""
4849
zyte_api_to_spm_translations = {
4950
b"zyte-device": b"x-crawlera-profile",
@@ -261,14 +262,11 @@ def process_request(self, request, spider):
261262

262263
def _is_banned(self, response):
263264
return (
264-
response.status == self.ban_code and
265-
response.headers.get('X-Crawlera-Error') == b'banned'
266-
)
267-
268-
def _is_no_available_proxies(self, response):
269-
return (
270-
response.status == self.ban_code and
271-
response.headers.get('X-Crawlera-Error') == b'noslaves'
265+
response.status == self.ban_code
266+
and response.headers.get('X-Crawlera-Error') == b'banned'
267+
) or (
268+
response.status in {520, 521}
269+
and response.headers.get('Zyte-Error')
272270
)
273271

274272
def _is_auth_error(self, response):
@@ -277,6 +275,16 @@ def _is_auth_error(self, response):
277275
response.headers.get('X-Crawlera-Error') == b'bad_proxy_auth'
278276
)
279277

278+
def _throttle_error(self, response):
279+
error = response.headers.get('Zyte-Error') or response.headers.get('X-Crawlera-Error')
280+
if (
281+
response.status in {429, 503}
282+
and error
283+
and error != b"banned"
284+
):
285+
return error.decode()
286+
return None
287+
280288
def _process_error(self, response):
281289
if "Zyte-Error" in response.headers:
282290
value = response.headers.get('Zyte-Error')
@@ -302,17 +310,20 @@ def process_response(self, request, response, spider):
302310
key = self._get_slot_key(request)
303311
self._restore_original_delay(request)
304312

305-
if self._is_no_available_proxies(response) or self._is_auth_error(response):
306-
if self._is_no_available_proxies(response):
307-
reason = 'noslaves'
308-
else:
313+
is_auth_error = self._is_auth_error(response)
314+
throttle_error = self._throttle_error(response)
315+
if is_auth_error or throttle_error:
316+
if is_auth_error:
309317
reason = 'autherror'
318+
else:
319+
assert throttle_error
320+
reason = throttle_error.lstrip("/")
310321
self._set_custom_delay(request, next(self.exp_backoff), reason=reason, targets_zyte_api=targets_zyte_api)
311322
else:
312323
self._inc_stat("delay/reset_backoff", targets_zyte_api=targets_zyte_api)
313324
self.exp_backoff = exp_backoff(self.backoff_step, self.backoff_max)
314325

315-
if self._is_auth_error(response):
326+
if is_auth_error:
316327
# When Zyte Smart Proxy Manager has issues it might not be able to
317328
# authenticate users we must retry
318329
retries = request.meta.get('zyte_smartproxy_auth_retry_times', 0)

setup.cfg

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,16 @@
11
[bdist_wheel]
22
universal=1
3+
4+
[mypy]
5+
6+
[mypy-pytest.*]
7+
ignore_missing_imports = True
8+
9+
[mypy-scrapy.*]
10+
ignore_missing_imports = True
11+
12+
[mypy-twisted.*]
13+
ignore_missing_imports = True
14+
15+
[mypy-w3lib.*]
16+
ignore_missing_imports = True

tests/test_all.py

Lines changed: 34 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@
55
from random import choice
66
from unittest import TestCase
77
try:
8-
from unittest.mock import call, patch
8+
from unittest.mock import call, patch # type: ignore
99
except ImportError:
10-
from mock import call, patch
10+
from mock import call, patch # type: ignore
1111

1212
from w3lib.http import basic_auth_header
1313
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
@@ -666,19 +666,24 @@ def test_is_banned(self):
666666
mw = self.mwcls.from_crawler(crawler)
667667
mw.open_spider(self.spider)
668668
req = self._make_fake_request(self.spider, zyte_smartproxy_enabled=True)
669+
669670
res = Response(req.url, status=200)
670671
res = mw.process_response(req, res, self.spider)
671672
self.assertFalse(mw._is_banned(res))
672673
res = Response(req.url, status=503, headers={'X-Crawlera-Error': 'noslaves'})
673674
res = mw.process_response(req, res, self.spider)
674675
self.assertFalse(mw._is_banned(res))
675-
res = Response(req.url, status=503, headers={'Zyte-Error': 'noslaves'})
676+
res = Response(req.url, status=503, headers={'Zyte-Error': '/limits/over-global-limit'})
676677
res = mw.process_response(req, res, self.spider)
677678
self.assertFalse(mw._is_banned(res))
679+
678680
res = Response(req.url, status=503, headers={'X-Crawlera-Error': 'banned'})
679681
res = mw.process_response(req, res, self.spider)
680682
self.assertTrue(mw._is_banned(res))
681-
res = Response(req.url, status=503, headers={'Zyte-Error': 'banned'})
683+
res = Response(req.url, status=520, headers={'Zyte-Error': '/download/temporary-error'})
684+
res = mw.process_response(req, res, self.spider)
685+
self.assertTrue(mw._is_banned(res))
686+
res = Response(req.url, status=521, headers={'Zyte-Error': '/download/internal-error'})
682687
res = mw.process_response(req, res, self.spider)
683688
self.assertTrue(mw._is_banned(res))
684689

@@ -709,24 +714,38 @@ def test_noslaves_delays(self, random_uniform_patch):
709714
noslaves_req = Request(url, meta={'download_slot': slot_key})
710715
assert mw.process_request(noslaves_req, self.spider) is None
711716
assert httpproxy.process_request(noslaves_req, self.spider) is None
712-
headers = {'X-Crawlera-Error': 'noslaves'}
713-
noslaves_res = self._mock_zyte_smartproxy_response(
717+
718+
# delays grow exponentially with any throttling error
719+
noslaves_response = self._mock_zyte_smartproxy_response(
714720
ban_url,
715-
status=self.bancode,
716-
headers=headers,
721+
status=503,
722+
headers={'X-Crawlera-Error': 'noslaves'},
717723
)
718-
719-
# delays grow exponentially
720-
mw.process_response(noslaves_req, noslaves_res, self.spider)
724+
mw.process_response(noslaves_req, noslaves_response, self.spider)
721725
self.assertEqual(slot.delay, backoff_step)
722726

723-
mw.process_response(noslaves_req, noslaves_res, self.spider)
727+
over_use_limit_response = self._mock_zyte_smartproxy_response(
728+
ban_url,
729+
status=429,
730+
headers={'Zyte-Error': '/limits/over-user-limit'},
731+
)
732+
mw.process_response(noslaves_req, over_use_limit_response, self.spider)
724733
self.assertEqual(slot.delay, backoff_step * 2 ** 1)
725734

726-
mw.process_response(noslaves_req, noslaves_res, self.spider)
735+
over_domain_limit_response = self._mock_zyte_smartproxy_response(
736+
ban_url,
737+
status=429,
738+
headers={'Zyte-Error': '/limits/over-domain-limit'},
739+
)
740+
mw.process_response(noslaves_req, over_domain_limit_response, self.spider)
727741
self.assertEqual(slot.delay, backoff_step * 2 ** 2)
728742

729-
mw.process_response(noslaves_req, noslaves_res, self.spider)
743+
over_global_limit_response = self._mock_zyte_smartproxy_response(
744+
ban_url,
745+
status=503,
746+
headers={'Zyte-Error': '/limits/over-global-limit'},
747+
)
748+
mw.process_response(noslaves_req, over_global_limit_response, self.spider)
730749
self.assertEqual(slot.delay, max_delay)
731750

732751
# other responses reset delay
@@ -742,7 +761,7 @@ def test_noslaves_delays(self, random_uniform_patch):
742761
mw.process_response(ban_req, ban_res, self.spider)
743762
self.assertEqual(slot.delay, default_delay)
744763

745-
mw.process_response(noslaves_req, noslaves_res, self.spider)
764+
mw.process_response(noslaves_req, noslaves_response, self.spider)
746765
self.assertEqual(slot.delay, backoff_step)
747766

748767
good_req = Request(url, meta={'download_slot': slot_key})

tox.ini

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# tox.ini
22
[tox]
3-
envlist = min,py27,py34,py35,py36,py37,py38,py39,py310,py311,py312,docs,security
3+
envlist = pre-commit,mypy,min,py27,py34,py35,py36,py37,py38,py39,py310,py311,py312,docs
44

55
[testenv]
66
deps =
@@ -9,6 +9,24 @@ deps =
99
commands =
1010
py.test --doctest-modules --cov=scrapy_zyte_smartproxy {posargs:scrapy_zyte_smartproxy tests}
1111

12+
[testenv:pre-commit]
13+
basepython = python3
14+
deps =
15+
pre-commit
16+
commands =
17+
pre-commit run {posargs:--all-files}
18+
19+
[testenv:mypy]
20+
basepython = python3.10
21+
deps =
22+
mypy[python2]<0.980
23+
pytest<4.7
24+
twisted<=20.3.0
25+
types-six<1.16.12
26+
w3lib<2
27+
commands =
28+
mypy --py2 {posargs:scrapy_zyte_smartproxy tests}
29+
1230
[testenv:min]
1331
basepython = python2.7
1432
deps =

0 commit comments

Comments
 (0)