
Commit 4602bfc

Support Scrapy 2.6.2 (#103)
1 parent eb7946e commit 4602bfc

File tree

8 files changed (+106, -94 lines)


.github/workflows/main.yml

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ jobs:
       include:
       - python-version: 2.7
         env:
-          TOXENV: py27,stack-scrapy-1.0,stack-scrapy-1.1,stack-scrapy-1.2,stack-scrapy-1.3,stack-scrapy-1.4,stack-scrapy-1.5
+          TOXENV: py27,stack-scrapy-1.4,stack-scrapy-1.5
       - python-version: 3.5
         env:
           TOXENV: py35,stack-scrapy-1.8-py3,stack-scrapy-2.0-py3,stack-scrapy-2.1-py3,stack-scrapy-2.2-py3,stack-scrapy-2.3-py3

README.rst

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ Requirements
 ============
 
 * Python 2.7 or Python 3.4+
-* Scrapy
+* Scrapy 1.4+
 
 Installation
 ============

docs/conf.py

Lines changed: 0 additions & 7 deletions
@@ -71,13 +71,6 @@
 version = ''
 release = ''
 
-# The language for content autogenerated by Sphinx. Refer to documentation
-# for a list of supported languages.
-#
-# This is also used if you do content translation via gettext catalogs.
-# Usually you set "language" from the command line for these cases.
-language = None
-
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This patterns also effect to html_static_path and html_extra_path

requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -1,3 +1,3 @@
-scrapy>=1.0
+scrapy>=1.4
 six
 w3lib

scrapy_zyte_smartproxy/middleware.py

Lines changed: 19 additions & 4 deletions
@@ -1,9 +1,10 @@
 import os
 import logging
 import warnings
+from base64 import urlsafe_b64decode
 from collections import defaultdict
 
-from six.moves.urllib.parse import urlparse
+from six.moves.urllib.parse import urlparse, urlunparse
 from w3lib.http import basic_auth_header
 from scrapy import signals
 from scrapy.resolver import dnscache
@@ -52,13 +53,28 @@ def __init__(self, crawler):
         self.spider = None
         self._bans = defaultdict(int)
         self._saved_delays = defaultdict(lambda: None)
+        self._auth_url = None
 
     @classmethod
     def from_crawler(cls, crawler):
         o = cls(crawler)
         crawler.signals.connect(o.open_spider, signals.spider_opened)
         return o
 
+    def _make_auth_url(self, spider):
+        parsed_url = urlparse(self.url)
+        auth = self.get_proxyauth(spider)
+        if not auth.startswith(b'Basic '):
+            raise ValueError(
+                'Zyte Smart Proxy Manager only supports HTTP basic access '
+                'authentication, but %s.%s.get_proxyauth() returned %r'
+                % (self.__module__, self.__class__.__name__, auth)
+            )
+        user_and_colon = urlsafe_b64decode(auth[6:].strip()).decode('utf-8')
+        netloc = user_and_colon + '@' + parsed_url.netloc.split('@')[-1]
+        parsed_url = parsed_url._replace(netloc=netloc)
+        return urlunparse(parsed_url)
+
     def open_spider(self, spider):
         self.enabled = self.is_enabled(spider)
         self.spider = spider
@@ -80,7 +96,7 @@ def open_spider(self, spider):
             )
             return
 
-        self._proxyauth = self.get_proxyauth(spider)
+        self._auth_url = self._make_auth_url(spider)
 
         logger.info(
             "Using Zyte Smart Proxy Manager at %s (apikey: %s)" % (
@@ -162,9 +178,8 @@ def process_request(self, request, spider):
         from scrapy_zyte_smartproxy import __version__
         if self._is_enabled_for_request(request):
             self._set_zyte_smartproxy_default_headers(request)
-            request.meta['proxy'] = self.url
+            request.meta['proxy'] = self._auth_url
             request.meta['download_timeout'] = self.download_timeout
-            request.headers['Proxy-Authorization'] = self._proxyauth
             if self.job_id:
                 request.headers['X-Crawlera-Jobid'] = self.job_id
             request.headers['X-Crawlera-Client'] = 'scrapy-zyte-smartproxy/%s' % __version__
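
The new _make_auth_url helper is the substance of this commit: instead of attaching a Proxy-Authorization header itself, the middleware now decodes the "Basic <base64>" value returned by get_proxyauth() and embeds the credentials in the proxy URL, leaving header management to Scrapy's stock HttpProxyMiddleware. A minimal standalone sketch of the same transformation, using Python 3's urllib.parse directly rather than six (the make_auth_url name here is illustrative, not part of the package API):

    from base64 import urlsafe_b64decode
    from urllib.parse import urlparse, urlunparse

    def make_auth_url(proxy_url, auth_header):
        # Only 'Basic' credentials can be embedded in a URL as userinfo.
        if not auth_header.startswith(b'Basic '):
            raise ValueError('only HTTP basic access authentication is supported')
        # The base64 payload decodes to 'user:password'.
        user_and_colon = urlsafe_b64decode(auth_header[6:].strip()).decode('utf-8')
        parsed = urlparse(proxy_url)
        # Keep host:port, replacing any userinfo already present in the netloc.
        netloc = user_and_colon + '@' + parsed.netloc.split('@')[-1]
        return urlunparse(parsed._replace(netloc=netloc))

    # b'Basic YXBpa2V5Og==' is basic_auth_header('apikey', '')
    print(make_auth_url('http://proxy.zyte.com:8011', b'Basic YXBpa2V5Og=='))
    # -> http://apikey:@proxy.zyte.com:8011

Using urlsafe_b64decode rather than b64decode makes the decode tolerant of both base64 alphabets: it remaps '-' and '_' before decoding and leaves '+' and '/' alone, which is why the tests below expect b'Basic YWF+Og==' and b'Basic YWF-Og==' to yield the same 'aa~:' credentials.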

setup.py

Lines changed: 1 addition & 1 deletion
@@ -35,5 +35,5 @@
         'Topic :: Software Development :: Libraries :: Application Frameworks',
         'Topic :: Software Development :: Libraries :: Python Modules',
     ],
-    install_requires=['scrapy>=1.0.0', 'six', 'w3lib'],
+    install_requires=['scrapy>=1.4.0', 'six', 'w3lib'],
 )

tests/test_all.py

Lines changed: 83 additions & 10 deletions
@@ -1,3 +1,4 @@
+import binascii
 import os
 import pytest
 from random import choice
@@ -8,6 +9,7 @@
 from mock import call, patch
 
 from w3lib.http import basic_auth_header
+from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
 from scrapy.http import Request, Response
 from scrapy.spiders import Spider
 from scrapy.utils.test import get_crawler
@@ -84,17 +86,19 @@ def _assert_disabled(self, spider, settings=None):
     def _assert_enabled(self, spider,
                         settings=None,
                         proxyurl='http://proxy.zyte.com:8011',
+                        proxyurlcreds='http://apikey:@proxy.zyte.com:8011',
                         proxyauth=basic_auth_header('apikey', ''),
                         maxbans=400,
                         download_timeout=190):
         crawler = self._mock_crawler(spider, settings)
         mw = self.mwcls.from_crawler(crawler)
         mw.open_spider(spider)
+        assert mw.url == proxyurl
         req = Request('http://www.scrapytest.org')
         assert mw.process_request(req, spider) is None
-        self.assertEqual(req.meta.get('proxy'), proxyurl)
+        self.assertEqual(req.meta.get('proxy'), proxyurlcreds)
         self.assertEqual(req.meta.get('download_timeout'), download_timeout)
-        self.assertEqual(req.headers.get('Proxy-Authorization'), proxyauth)
+        self.assertNotIn(b'Proxy-Authorization', req.headers)
         res = self._mock_zyte_smartproxy_response(req.url)
         assert mw.process_response(req, res, spider) is res
 
@@ -169,31 +173,31 @@ def test_apikey(self):
         self.spider.zyte_smartproxy_enabled = True
         self.settings['ZYTE_SMARTPROXY_APIKEY'] = apikey = 'apikey'
         proxyauth = basic_auth_header(apikey, '')
-        self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)
+        self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth, proxyurlcreds='http://apikey:@proxy.zyte.com:8011')
 
         self.spider.zyte_smartproxy_apikey = apikey = 'notfromsettings'
         proxyauth = basic_auth_header(apikey, '')
-        self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)
+        self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth, proxyurlcreds='http://notfromsettings:@proxy.zyte.com:8011')
 
     def test_proxyurl(self):
         self.spider.zyte_smartproxy_enabled = True
         self.settings['ZYTE_SMARTPROXY_URL'] = 'http://localhost:8011'
-        self._assert_enabled(self.spider, self.settings, proxyurl='http://localhost:8011')
+        self._assert_enabled(self.spider, self.settings, proxyurl='http://localhost:8011', proxyurlcreds='http://apikey:@localhost:8011')
 
     def test_proxyurl_no_protocol(self):
         self.spider.zyte_smartproxy_enabled = True
         self.settings['ZYTE_SMARTPROXY_URL'] = 'localhost:8011'
-        self._assert_enabled(self.spider, self.settings, proxyurl='http://localhost:8011')
+        self._assert_enabled(self.spider, self.settings, proxyurl='http://localhost:8011', proxyurlcreds='http://apikey:@localhost:8011')
 
     def test_proxyurl_https(self):
         self.spider.zyte_smartproxy_enabled = True
         self.settings['ZYTE_SMARTPROXY_URL'] = 'https://localhost:8011'
-        self._assert_enabled(self.spider, self.settings, proxyurl='https://localhost:8011')
+        self._assert_enabled(self.spider, self.settings, proxyurl='https://localhost:8011', proxyurlcreds='https://apikey:@localhost:8011')
 
     def test_proxyurl_including_noconnect(self):
         self.spider.zyte_smartproxy_enabled = True
         self.settings['ZYTE_SMARTPROXY_URL'] = 'http://localhost:8011?noconnect'
-        self._assert_enabled(self.spider, self.settings, proxyurl='http://localhost:8011?noconnect')
+        self._assert_enabled(self.spider, self.settings, proxyurl='http://localhost:8011?noconnect', proxyurlcreds='http://apikey:@localhost:8011?noconnect')
 
     def test_maxbans(self):
         self.spider.zyte_smartproxy_enabled = True
@@ -218,7 +222,7 @@ def test_download_timeout(self):
         self._assert_enabled(self.spider, self.settings, download_timeout=120)
 
     def test_hooks(self):
-        proxyauth = b'Basic Foo'
+        proxyauth = basic_auth_header('foo', '')
 
         class _ECLS(self.mwcls):
             def is_enabled(self, spider):
@@ -241,7 +245,7 @@ def get_proxyauth(self, spider):
         wascalled[:] = [] # reset
         enabled = True
         self.spider.zyte_smartproxy_enabled = False
-        self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth)
+        self._assert_enabled(self.spider, self.settings, proxyauth=proxyauth, proxyurlcreds='http://foo:@proxy.zyte.com:8011')
         self.assertEqual(wascalled, ['is_enabled', 'get_proxyauth'])
 
     def test_delay_adjustment(self):
@@ -909,3 +913,72 @@ def test_client_header(self):
             req.headers.get('X-Crawlera-Client').decode('utf-8'),
             'scrapy-zyte-smartproxy/%s' % __version__
         )
+
+    def test_scrapy_httpproxy_integration(self):
+        self.spider.zyte_smartproxy_enabled = True
+        crawler = self._mock_crawler(self.spider, self.settings)
+        smartproxy = self.mwcls.from_crawler(crawler)
+        smartproxy.open_spider(self.spider)
+        httpproxy = HttpProxyMiddleware.from_crawler(crawler)
+        request = Request('https://example.com')
+        auth_header = basic_auth_header('apikey', '')
+
+        # 1st pass
+        self.assertEqual(smartproxy.process_request(request, self.spider), None)
+        self.assertEqual(httpproxy.process_request(request, self.spider), None)
+        self.assertEqual(request.meta['proxy'], 'http://proxy.zyte.com:8011')
+        self.assertEqual(request.headers[b'Proxy-Authorization'], auth_header)
+
+        # 2nd pass (e.g. retry or redirect)
+        self.assertEqual(smartproxy.process_request(request, self.spider), None)
+        self.assertEqual(httpproxy.process_request(request, self.spider), None)
+        self.assertEqual(request.meta['proxy'], 'http://proxy.zyte.com:8011')
+        self.assertEqual(request.headers[b'Proxy-Authorization'], auth_header)
+
+    def test_subclass_non_basic_header(self):
+
+        class Subclass(self.mwcls):
+            def get_proxyauth(self, spider):
+                return b'Non-Basic foo'
+
+        self.spider.zyte_smartproxy_enabled = True
+        crawler = self._mock_crawler(self.spider, self.settings)
+        smartproxy = Subclass.from_crawler(crawler)
+        with pytest.raises(ValueError):
+            smartproxy.open_spider(self.spider)
+
+    def test_subclass_basic_header_non_base64(self):
+
+        class Subclass(self.mwcls):
+            def get_proxyauth(self, spider):
+                return b'Basic foo'
+
+        self.spider.zyte_smartproxy_enabled = True
+        crawler = self._mock_crawler(self.spider, self.settings)
+        smartproxy = Subclass.from_crawler(crawler)
+        with pytest.raises((TypeError, binascii.Error)):
+            smartproxy.open_spider(self.spider)
+
+    def test_subclass_basic_header_nonurlsafe_base64(self):
+
+        class Subclass(self.mwcls):
+            def get_proxyauth(self, spider):
+                return b'Basic YWF+Og=='
+
+        self.spider.zyte_smartproxy_enabled = True
+        crawler = self._mock_crawler(self.spider, self.settings)
+        smartproxy = Subclass.from_crawler(crawler)
+        smartproxy.open_spider(self.spider)
+        self.assertEqual(smartproxy._auth_url, "http://aa~:@proxy.zyte.com:8011")
+
+    def test_subclass_basic_header_urlsafe_base64(self):
+
+        class Subclass(self.mwcls):
+            def get_proxyauth(self, spider):
+                return b'Basic YWF-Og=='
+
+        self.spider.zyte_smartproxy_enabled = True
+        crawler = self._mock_crawler(self.spider, self.settings)
+        smartproxy = Subclass.from_crawler(crawler)
+        smartproxy.open_spider(self.spider)
+        self.assertEqual(smartproxy._auth_url, "http://aa~:@proxy.zyte.com:8011")
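
test_scrapy_httpproxy_integration pins down the intended division of labour: this middleware only writes a credentialed URL into request.meta['proxy'], and Scrapy's stock HttpProxyMiddleware (constructible through from_crawler, one reason the minimum supported Scrapy is now 1.4) extracts the userinfo and emits the Proxy-Authorization header on every pass, including retries and redirects, which is what Scrapy 2.6.2's hardened proxy-credential handling expects. A rough sketch of that second half, without importing Scrapy (the function name and details are illustrative approximations, not Scrapy's actual code):

    from base64 import b64encode
    from urllib.parse import unquote, urlparse, urlunparse

    def split_proxy_creds(proxy_url):
        # Return (bare proxy URL, Proxy-Authorization value), roughly as
        # Scrapy's HttpProxyMiddleware does when the URL carries userinfo.
        parsed = urlparse(proxy_url)
        if '@' not in parsed.netloc:
            return proxy_url, None
        userinfo, host = parsed.netloc.rsplit('@', 1)
        auth = b'Basic ' + b64encode(unquote(userinfo).encode('latin-1'))
        return urlunparse(parsed._replace(netloc=host)), auth

    proxy, auth = split_proxy_creds('http://apikey:@proxy.zyte.com:8011')
    print(proxy)  # http://proxy.zyte.com:8011
    print(auth)   # b'Basic YXBpa2V5Og==', i.e. basic_auth_header('apikey', '')

The four test_subclass_* tests then cover the edge cases of _make_auth_url itself: a non-Basic header is rejected with ValueError, malformed base64 raises during open_spider, and both base64 alphabets decode to the same credentials.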

tox.ini

Lines changed: 0 additions & 69 deletions
@@ -15,75 +15,6 @@ deps =
 commands =
     bandit -r {posargs:scrapy_zyte_smartproxy setup.py}
 
-[testenv:stack-scrapy-1.0]
-basepython = python2.7
-deps =
-    Scrapy==1.0.6
-    six==1.10.0
-    Twisted==16.1.1
-    w3lib==1.14.2
-    -rtests/requirements.txt
-
-[testenv:stack-scrapy-1.1]
-basepython = python2.7
-deps =
-    Scrapy==1.1.3
-    Parsel==1.0.3
-    six==1.10.0
-    Twisted==16.1.1
-    w3lib==1.14.2
-    -rtests/requirements.txt
-
-[testenv:stack-scrapy-1.1-py3]
-basepython = python3.5
-deps =
-    Scrapy==1.1.3
-    Parsel==1.0.3
-    six==1.10.0
-    Twisted==16.3.0
-    w3lib==1.14.3
-    -rtests/requirements.txt
-
-[testenv:stack-scrapy-1.2]
-basepython = python2.7
-deps =
-    Scrapy==1.2.2
-    Parsel==1.1.0
-    six==1.10.0
-    Twisted==16.1.1
-    w3lib==1.15.0
-    -rtests/requirements.txt
-
-[testenv:stack-scrapy-1.2-py3]
-basepython = python3.5
-deps =
-    Scrapy==1.2.2
-    Parsel==1.1.0
-    six==1.10.0
-    Twisted==16.3.0
-    w3lib==1.15.0
-    -rtests/requirements.txt
-
-[testenv:stack-scrapy-1.3]
-basepython = python2.7
-deps =
-    Scrapy==1.3.3
-    Parsel==1.3.1
-    six==1.10.0
-    Twisted==16.6.0
-    w3lib==1.16.0
-    -rtests/requirements.txt
-
-[testenv:stack-scrapy-1.3-py3]
-basepython = python3.6
-deps =
-    Scrapy==1.3.3
-    Parsel==1.3.1
-    six==1.10.0
-    Twisted==16.6.0
-    w3lib==1.16.0
-    -rtests/requirements.txt
-
 [testenv:stack-scrapy-1.4]
 basepython = python2.7
 deps =
