Skip to content

Commit 7979eda

Browse files
authored
Support Scrapy 2.12+ (#317)
1 parent 6a9eb9c commit 7979eda

File tree

10 files changed

+199
-30
lines changed

10 files changed

+199
-30
lines changed

.github/workflows/tests.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,11 @@ jobs:
99
fail-fast: false
1010
matrix:
1111
include:
12-
- python-version: '3.7'
13-
- python-version: '3.8'
1412
- python-version: '3.9'
1513
- python-version: '3.10'
1614
- python-version: '3.11'
15+
- python-version: '3.12'
16+
- python-version: '3.13'
1717

1818
steps:
1919
- uses: actions/checkout@v2

README.rst

Lines changed: 3 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -75,25 +75,9 @@ Configuration
7575
also allows to save network traffic by not sending these duplicate
7676
arguments to Splash server multiple times.
7777

78-
4. Set a custom ``DUPEFILTER_CLASS``::
78+
4. Set a custom ``REQUEST_FINGERPRINTER_CLASS``::
7979

80-
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
81-
82-
5. If you use Scrapy HTTP cache then a custom cache storage backend
83-
is required. scrapy-splash provides a subclass of
84-
``scrapy.contrib.httpcache.FilesystemCacheStorage``::
85-
86-
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
87-
88-
If you use other cache storage then it is necessary to subclass it and
89-
replace all ``scrapy.util.request.request_fingerprint`` calls with
90-
``scrapy_splash.splash_request_fingerprint``.
91-
92-
.. note::
93-
94-
Steps (4) and (5) are necessary because Scrapy doesn't provide a way
95-
to override request fingerprints calculation algorithm globally; this
96-
could change in future.
80+
REQUEST_FINGERPRINTER_CLASS = 'scrapy_splash.SplashRequestFingerprinter'
9781

9882

9983
There are also some additional options available.
@@ -111,6 +95,7 @@ Put them into your ``settings.py`` if you want to change the defaults:
11195
It specifies how concurrency & politeness are maintained for Splash requests,
11296
and specify the default value for ``slot_policy`` argument for
11397
``SplashRequest``, which is described below.
98+
* ``SCRAPY_SPLASH_REQUEST_FINGERPRINTER_BASE_CLASS`` is ``scrapy.settings.default_settings.REQUEST_FINGERPRINTER_CLASS`` by default. This changes the base class the Fingerprinter uses to get a fingerprint.
11499

115100

116101
Usage

scrapy_splash/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,4 @@
1010
from .dupefilter import SplashAwareDupeFilter, splash_request_fingerprint
1111
from .cache import SplashAwareFSCacheStorage
1212
from .response import SplashResponse, SplashTextResponse, SplashJsonResponse
13-
from .request import SplashRequest, SplashFormRequest
13+
from .request import SplashRequest, SplashFormRequest, SplashRequestFingerprinter

scrapy_splash/cache.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,26 @@
77
"""
88
from __future__ import absolute_import
99
import os
10+
from warnings import warn
1011

1112
from scrapy.extensions.httpcache import FilesystemCacheStorage
1213

1314
from .dupefilter import splash_request_fingerprint
1415

1516

1617
class SplashAwareFSCacheStorage(FilesystemCacheStorage):
18+
def __init__(self, settings):
19+
warn(
20+
(
21+
"scrapy_splash.SplashAwareFSCacheStorage is deprecated. Set "
22+
"the REQUEST_FINGERPRINTER_CLASS Scrapy setting to "
23+
"\"scrapy_splash.SplashRequestFingerprinter\" instead."
24+
),
25+
DeprecationWarning,
26+
stacklevel=2,
27+
)
28+
super().__init__(settings)
29+
1730
def _get_request_path(self, spider, request):
1831
key = splash_request_fingerprint(request)
1932
return os.path.join(self.cachedir, spider.name, key[0:2], key)

scrapy_splash/dupefilter.py

Lines changed: 96 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,100 @@
55
"""
66
from __future__ import absolute_import
77
from copy import deepcopy
8+
import hashlib
9+
from weakref import WeakKeyDictionary
10+
from warnings import warn
811

912
from scrapy.dupefilters import RFPDupeFilter
1013

14+
from scrapy.utils.python import to_bytes
1115
from scrapy.utils.url import canonicalize_url
12-
from scrapy.utils.request import request_fingerprint
1316

1417
from .utils import dict_hash
1518

1619

20+
_deprecated_fingerprint_cache = WeakKeyDictionary()
21+
22+
23+
def _serialize_headers(
24+
headers, request
25+
):
26+
for header in headers:
27+
if header in request.headers:
28+
yield header
29+
for value in request.headers.getlist(header):
30+
yield value
31+
32+
33+
# From https://docs.scrapy.org/en/2.11/_modules/scrapy/utils/request.html
34+
# Needs to be added here since it was deleted in Scrapy 2.12
35+
def request_fingerprint(
36+
request,
37+
include_headers=None,
38+
keep_fragments=False,
39+
):
40+
"""
41+
Return the request fingerprint as an hexadecimal string.
42+
43+
The request fingerprint is a hash that uniquely identifies the resource the
44+
request points to. For example, take the following two urls:
45+
46+
http://www.example.com/query?id=111&cat=222
47+
http://www.example.com/query?cat=222&id=111
48+
49+
Even though those are two different URLs both point to the same resource
50+
and are equivalent (i.e. they should return the same response).
51+
52+
Another example are cookies used to store session ids. Suppose the
53+
following page is only accessible to authenticated users:
54+
55+
http://www.example.com/members/offers.html
56+
57+
Lots of sites use a cookie to store the session id, which adds a random
58+
component to the HTTP Request and thus should be ignored when calculating
59+
the fingerprint.
60+
61+
For this reason, request headers are ignored by default when calculating
62+
the fingerprint. If you want to include specific headers use the
63+
include_headers argument, which is a list of Request headers to include.
64+
65+
Also, servers usually ignore fragments in urls when handling requests,
66+
so they are also ignored by default when calculating the fingerprint.
67+
If you want to include them, set the keep_fragments argument to True
68+
(for instance when handling requests with a headless browser).
69+
"""
70+
processed_include_headers = None
71+
if include_headers:
72+
processed_include_headers = tuple(
73+
to_bytes(h.lower()) for h in sorted(include_headers)
74+
)
75+
cache = _deprecated_fingerprint_cache.setdefault(request, {})
76+
cache_key = (processed_include_headers, keep_fragments)
77+
if cache_key not in cache:
78+
fp = hashlib.sha1()
79+
fp.update(to_bytes(request.method))
80+
fp.update(
81+
to_bytes(canonicalize_url(request.url, keep_fragments=keep_fragments))
82+
)
83+
fp.update(request.body or b"")
84+
if processed_include_headers:
85+
for part in _serialize_headers(processed_include_headers, request):
86+
fp.update(part)
87+
cache[cache_key] = fp.hexdigest()
88+
return cache[cache_key]
89+
90+
1791
def splash_request_fingerprint(request, include_headers=None):
1892
""" Request fingerprint which takes 'splash' meta key into account """
93+
warn(
94+
(
95+
"scrapy_splash.splash_request_fingerprint is deprecated. Set "
96+
"the REQUEST_FINGERPRINTER_CLASS Scrapy setting to "
97+
"\"scrapy_splash.SplashRequestFingerprinter\" instead."
98+
),
99+
DeprecationWarning,
100+
stacklevel=2,
101+
)
19102

20103
fp = request_fingerprint(request, include_headers=include_headers)
21104
if 'splash' not in request.meta:
@@ -35,5 +118,17 @@ class SplashAwareDupeFilter(RFPDupeFilter):
35118
DupeFilter that takes 'splash' meta key into account.
36119
It should be used with SplashMiddleware.
37120
"""
121+
122+
def __init__(self):
123+
warn(
124+
(
125+
"SplashAwareDupeFilter is deprecated. Set "
126+
"the REQUEST_FINGERPRINTER_CLASS Scrapy setting to "
127+
"\"scrapy_splash.SplashRequestFingerprinter\" instead."
128+
),
129+
DeprecationWarning,
130+
stacklevel=2,
131+
)
132+
38133
def request_fingerprint(self, request):
39134
return splash_request_fingerprint(request)

scrapy_splash/request.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,20 @@
33
import copy
44
import scrapy
55
from scrapy.http import FormRequest
6+
from scrapy.utils.url import canonicalize_url
67

78
from scrapy_splash import SlotPolicy
8-
from scrapy_splash.utils import to_unicode
9+
from scrapy_splash.utils import to_unicode, dict_hash
10+
from scrapy.settings.default_settings import REQUEST_FINGERPRINTER_CLASS
11+
from scrapy.utils.misc import load_object
12+
13+
try:
14+
from scrapy.utils.misc import build_from_crawler
15+
except ImportError: # Scrapy < 2.12
16+
from scrapy.utils.misc import create_instance
17+
18+
def build_from_crawler(objcls, crawler, /, *args, **kwargs):
19+
return create_instance(objcls, None, crawler, *args, **kwargs)
920

1021
# XXX: we can't implement SplashRequest without middleware support
1122
# because there is no way to set Splash URL based on settings
@@ -115,3 +126,35 @@ def __init__(self, url=None, callback=None, method=None, formdata=None,
115126
SplashRequest.__init__(
116127
self, url=url, callback=callback, method=method, body=body,
117128
**kwargs)
129+
130+
131+
class SplashRequestFingerprinter:
132+
@classmethod
133+
def from_crawler(cls, crawler):
134+
return cls(crawler)
135+
136+
def __init__(self, crawler):
137+
self._base_request_fingerprinter = build_from_crawler(
138+
load_object(
139+
crawler.settings.get(
140+
"SCRAPY_SPLASH_REQUEST_FINGERPRINTER_BASE_CLASS",
141+
REQUEST_FINGERPRINTER_CLASS,
142+
)
143+
),
144+
crawler,
145+
)
146+
147+
def fingerprint(self, request):
148+
""" Request fingerprint which takes 'splash' meta key into account """
149+
150+
fp = self._base_request_fingerprinter.fingerprint(request)
151+
if 'splash' not in request.meta:
152+
return fp
153+
154+
splash_options = copy.deepcopy(request.meta['splash'])
155+
args = splash_options.setdefault('args', {})
156+
157+
if 'url' in args:
158+
args['url'] = canonicalize_url(args['url'], keep_fragments=True)
159+
160+
return dict_hash(splash_options, fp).encode()

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,5 +30,5 @@
3030
'Topic :: Software Development :: Libraries :: Application Frameworks',
3131
'Topic :: Software Development :: Libraries :: Python Modules',
3232
],
33-
requires=['scrapy', 'six'],
33+
install_requires=['scrapy', 'six'],
3434
)

tests/conftest.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,7 @@ def settings():
2626
SPIDER_MIDDLEWARES={
2727
'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
2828
},
29-
DUPEFILTER_CLASS='scrapy_splash.SplashAwareDupeFilter',
30-
HTTPCACHE_STORAGE='scrapy_splash.SplashAwareFSCacheStorage',
29+
REQUEST_FINGERPRINTER_CLASS='scrapy_splash.SplashRequestFingerprinter',
3130
)
3231
return s
3332

tests/test_fingerprints.py

Lines changed: 36 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,14 @@
44

55
import pytest
66
import scrapy
7-
from scrapy.utils.request import request_fingerprint
87

98
from scrapy_splash import SplashRequest
10-
from scrapy_splash.dupefilter import splash_request_fingerprint
9+
from scrapy_splash.dupefilter import request_fingerprint, splash_request_fingerprint
1110
from scrapy_splash.utils import dict_hash
1211

1312
from .test_middleware import _get_mw
13+
from .utils import make_crawler
14+
from scrapy_splash.request import SplashRequestFingerprinter
1415

1516

1617
def test_dict_hash():
@@ -79,6 +80,39 @@ def test_request_fingerprint_splash():
7980
assert_fingerprints_match(r2, r4)
8081

8182

83+
def assert_fingerprints_match_fingerprinter(fingerprinter, r1, r2):
84+
assert fingerprinter.fingerprint(r1) == fingerprinter.fingerprint(r2)
85+
86+
87+
def assert_fingerprints_dont_match_fingerprinter(fingerprinter, r1, r2):
88+
assert fingerprinter.fingerprint(r1) != fingerprinter.fingerprint(r2)
89+
90+
91+
class TestSpider(scrapy.Spider):
92+
name = 'test_spider'
93+
94+
95+
def test_splash_request_fingerprinter():
96+
crawler = make_crawler(TestSpider, {})
97+
fingerprinter = SplashRequestFingerprinter(crawler)
98+
99+
r1 = scrapy.Request("http://example.com")
100+
r2 = scrapy.Request("http://example.com", meta={"splash": {"args": {"html": 1}}})
101+
r3 = scrapy.Request("http://example.com", meta={"splash": {"args": {"png": 1}}})
102+
r4 = scrapy.Request("http://example.com", meta={"foo": "bar", "splash": {"args": {"html": 1}}})
103+
r5 = scrapy.Request("http://example.com", meta={"splash": {"args": {"html": 1, "wait": 1.0}}})
104+
105+
assert request_fingerprint(r1) == request_fingerprint(r2)
106+
assert_fingerprints_dont_match_fingerprinter(fingerprinter, r1, r2)
107+
assert_fingerprints_dont_match_fingerprinter(fingerprinter, r1, r3)
108+
assert_fingerprints_dont_match_fingerprinter(fingerprinter, r1, r4)
109+
assert_fingerprints_dont_match_fingerprinter(fingerprinter, r1, r5)
110+
assert_fingerprints_dont_match_fingerprinter(fingerprinter, r2, r3)
111+
112+
# only "splash" content is taken into account
113+
assert_fingerprints_match_fingerprinter(fingerprinter, r2, r4)
114+
115+
82116
@pytest.fixture()
83117
def splash_middleware():
84118
return _get_mw()

tests/test_middleware.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import scrapy
88
from scrapy.core.engine import ExecutionEngine
99
from scrapy.utils.test import get_crawler
10-
from scrapy.http import Response, TextResponse
10+
from scrapy.http import Response, TextResponse, JsonResponse
1111
from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware
1212

1313
import scrapy_splash
@@ -505,7 +505,7 @@ def _get_req():
505505
cached_resp = cache_mw.process_request(req, spider) or req
506506

507507
# response should be from cache:
508-
assert cached_resp.__class__ is TextResponse
508+
assert cached_resp.__class__ is JsonResponse
509509
assert cached_resp.body == resp_body
510510
resp2_1 = cache_mw.process_response(req, cached_resp, spider)
511511
resp3_1 = mw.process_response(req, resp2_1, spider)

0 commit comments

Comments
 (0)