
Commit e0cead1

Authored by Gallaecio and kmike

HubstorageDownloaderMiddleware: centralized request fingerprinting (#87)

* HubstorageDownloaderMiddleware: use centralized request fingerprinting when available; support Scrapy 2.12+
* Update sh_scrapy/middlewares.py

Co-authored-by: Mikhail Korobov <[email protected]>

1 parent 46c7a4d commit e0cead1

2 files changed: +42 −6 lines
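
For context on the change (this snippet is not part of the commit): Scrapy 2.7 introduced a crawler-level request fingerprinter, and the commit message implies the legacy `scrapy.utils.request.request_fingerprint` helper is no longer usable on Scrapy 2.12+, which is why the middleware now resolves the fingerprint function at construction time. A minimal sketch of the two code paths, assuming a throwaway crawler from `scrapy.utils.test.get_crawler` (as the updated test fixture uses):

```python
# Minimal sketch (not from this commit) of the two fingerprinting APIs bridged below.
from scrapy import Request
from scrapy.utils.test import get_crawler

crawler = get_crawler()
request = Request("https://example.com")

if hasattr(crawler, "request_fingerprinter"):
    # Centralized fingerprinting (Scrapy >= 2.7), configurable via the
    # REQUEST_FINGERPRINTER_CLASS setting; fingerprint() returns bytes, so
    # .hex() yields a string fingerprint.
    fp = crawler.request_fingerprinter.fingerprint(request).hex()
else:
    # Legacy module-level helper used by older Scrapy versions.
    from scrapy.utils.request import request_fingerprint
    fp = request_fingerprint(request)

print(fp)
```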

sh_scrapy/middlewares.py (30 additions, 4 deletions)

```diff
@@ -3,7 +3,6 @@
 from weakref import WeakKeyDictionary
 
 from scrapy import Request
-from scrapy.utils.request import request_fingerprint
 
 from sh_scrapy.writer import pipe_writer
 
@@ -35,7 +34,7 @@ def process_spider_output(self, response, result, spider):
             yield x
 
 
-class HubstorageDownloaderMiddleware(object):
+class HubstorageDownloaderMiddleware:
     """Hubstorage dowloader middleware.
 
     What it does:
@@ -46,10 +45,37 @@ class HubstorageDownloaderMiddleware(object):
 
     """
 
-    def __init__(self):
+    @classmethod
+    def from_crawler(cls, crawler):
+        try:
+            result = cls(crawler)
+        except TypeError:
+            warn(
+                (
+                    "Subclasses of HubstorageDownloaderMiddleware must now "
+                    "accept a crawler parameter in their __init__ method. "
+                    "This will become an error in the future."
+                ),
+                DeprecationWarning,
+            )
+            result = cls()
+            result._crawler = crawler
+            result._load_fingerprinter()
+        return result
+
+    def __init__(self, crawler):
+        self._crawler = crawler
         self._seen_requests = seen_requests
         self.pipe_writer = pipe_writer
         self.request_id_sequence = request_id_sequence
+        self._load_fingerprinter()
+
+    def _load_fingerprinter(self):
+        if hasattr(self._crawler, "request_fingerprinter"):
+            self._fingerprint = lambda request: self._crawler.request_fingerprinter.fingerprint(request).hex()
+        else:
+            from scrapy.utils.request import request_fingerprint
+            self._fingerprint = request_fingerprint
 
     def process_request(self, request, spider):
         # Check if request id is set, which usually happens for retries or redirects because
@@ -72,7 +98,7 @@ def process_response(self, request, response, spider):
             rs=len(response.body),
             duration=request.meta.get('download_latency', 0) * 1000,
             parent=request.meta.setdefault(HS_PARENT_ID_KEY),
-            fp=request_fingerprint(request),
+            fp=self._fingerprint(response.request),
         )
         # Generate and set request id.
         request_id = next(self.request_id_sequence)
```
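
The `from_crawler` hook above keeps existing subclasses working: if `cls(crawler)` raises `TypeError`, the middleware falls back to the old zero-argument constructor, attaches the crawler and fingerprinter afterwards, and emits a `DeprecationWarning`. A hedged sketch of how a subclass would move to the new signature (the subclass name and the extra setting are hypothetical, not from the repository):

```python
from sh_scrapy.middlewares import HubstorageDownloaderMiddleware

class MyDownloaderMiddleware(HubstorageDownloaderMiddleware):
    """Hypothetical subclass updated for the new constructor signature."""

    def __init__(self, crawler):
        # Accept and forward the crawler so from_crawler() no longer hits the
        # TypeError fallback and no DeprecationWarning is emitted.
        super().__init__(crawler)
        self.my_setting = crawler.settings.get("MY_SETTING")  # hypothetical setting
```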

tests/test_middlewares.py (12 additions, 2 deletions)

```diff
@@ -5,6 +5,7 @@
 import sys
 from scrapy import Spider, Request, Item
 from scrapy.http import Response
+from scrapy.utils.test import get_crawler
 from typing import Optional
 
 from sh_scrapy.middlewares import (
@@ -26,7 +27,8 @@ def hs_spider_middleware(monkeypatch_globals):
 
 @pytest.fixture()
 def hs_downloader_middleware(monkeypatch_globals):
-    return HubstorageDownloaderMiddleware()
+    crawler = get_crawler()
+    return HubstorageDownloaderMiddleware.from_crawler(crawler)
 
 
 def test_hs_middlewares(hs_downloader_middleware, hs_spider_middleware):
@@ -46,13 +48,13 @@ def test_hs_middlewares(hs_downloader_middleware, hs_spider_middleware):
     assert len(hs_spider_middleware._seen_requests) == 0
     assert len(hs_downloader_middleware._seen_requests) == 0
 
+    response_0.request = request_0
     hs_downloader_middleware.process_response(request_0, response_0, spider)
 
     assert request_0.meta[HS_REQUEST_ID_KEY] == 0
     assert request_0.meta[HS_PARENT_ID_KEY] is None
     assert hs_spider_middleware._seen_requests[request_0] == 0
 
-    response_0.request = request_0
     request_1 = Request(url)
     request_2 = Request(url)
     item1 = {}
@@ -69,12 +71,14 @@ def test_hs_middlewares(hs_downloader_middleware, hs_spider_middleware):
 
     response_1 = Response(url)
     hs_downloader_middleware.process_request(request_1, spider)
+    response_1.request = request_1
     hs_downloader_middleware.process_response(request_1, response_1, spider)
     assert request_1.meta[HS_REQUEST_ID_KEY] == 1
     assert request_1.meta[HS_PARENT_ID_KEY] == 0
 
     response_2 = Response(url)
     hs_downloader_middleware.process_request(request_2, spider)
+    response_2.request = request_2
     hs_downloader_middleware.process_response(request_2, response_2, spider)
     assert request_2.meta[HS_REQUEST_ID_KEY] == 2
     assert request_2.meta[HS_PARENT_ID_KEY] == 0
@@ -101,12 +105,14 @@ def __init__(self, url: str, request: Optional[Request] = None):
     response_1 = DummyResponse(url, request)
     response_2 = Response(url)
     hs_downloader_middleware.process_request(request, spider)
+    response_1.request = request
     hs_downloader_middleware.process_response(request, response_1, spider)
 
     with open(hs_downloader_middleware.pipe_writer.path, 'r') as tmp_file:
         assert tmp_file.readline() == ""
     assert request.meta == {}
 
+    response_2.request = request
     hs_downloader_middleware.process_response(request, response_2, spider)
     with open(hs_downloader_middleware.pipe_writer.path, 'r') as tmp_file:
         assert tmp_file.readline().startswith('REQ')
@@ -138,6 +144,7 @@ def __init__(self, url: str, request: Optional[Request] = None):
     assert len(hs_spider_middleware._seen_requests) == 0
     assert len(hs_downloader_middleware._seen_requests) == 0
 
+    response_0.request = request_0
     hs_downloader_middleware.process_response(request_0, response_0, spider)
 
     assert request_0.meta[HS_REQUEST_ID_KEY] == 0
@@ -154,6 +161,7 @@ def __init__(self, url: str, request: Optional[Request] = None):
     assert HS_REQUEST_ID_KEY not in request_1.meta
     assert request_1.meta[HS_PARENT_ID_KEY] == 0
 
+    response_1.request = request_1
     hs_downloader_middleware.process_response(request_1, response_1, spider)
 
     assert request_1.meta[HS_REQUEST_ID_KEY] == 1
@@ -163,11 +171,13 @@ def __init__(self, url: str, request: Optional[Request] = None):
     response_2_1 = DummyResponse(url, request_2)
     response_2_2 = Response(url)
 
+    response_2_1.request = request_2
     hs_downloader_middleware.process_response(request_2, response_2_1, spider)
 
     assert request_2.meta[HS_REQUEST_ID_KEY] == 1
     assert request_2.meta[HS_PARENT_ID_KEY] == 0
 
+    response_2_2.request = request_2
     hs_downloader_middleware.process_response(request_2, response_2_2, spider)
 
     assert request_2.meta[HS_REQUEST_ID_KEY] == 2
```
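
Because `fp` is now computed from `response.request` rather than from the `request` argument, the tests attach the originating request to each hand-built `Response` before calling `process_response`. A minimal sketch of that pattern (variable names are illustrative, not copied from the test module):

```python
from scrapy import Request
from scrapy.http import Response

request = Request("https://example.com")
response = Response("https://example.com")
# Required now: process_response() fingerprints response.request, which is not
# populated automatically on a Response constructed directly in a test.
response.request = request
# middleware.process_response(request, response, spider)
```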
