11# -*- coding: utf-8 -*-
2+ from __future__ import annotations
3+
24import itertools
5+ from typing import AsyncIterable , AsyncGenerator , Iterable
36from warnings import warn
47from weakref import WeakKeyDictionary
58
6- from scrapy import Request
9+ from scrapy import Spider
10+ from scrapy .crawler import Crawler
11+ from scrapy .http import Request , Response
712
13+ from sh_scrapy import _SCRAPY_NO_SPIDER_ARG
814from sh_scrapy .writer import pipe_writer
915
16+
1017HS_REQUEST_ID_KEY = '_hsid'
1118HS_PARENT_ID_KEY = '_hsparent'
1219request_id_sequence = itertools .count ()
1320seen_requests = WeakKeyDictionary ()
1421
1522
class HubstorageSpiderMiddleware:
    """Hubstorage spider middleware.

    Attaches the parent request id (looked up from the response that
    produced them) to every request yielded by the spider.
    """

    def __init__(self) -> None:
        # Module-level WeakKeyDictionary shared with the downloader middleware.
        self._seen_requests = seen_requests

    if not _SCRAPY_NO_SPIDER_ARG:

        def process_spider_output(
            self, response: Response, result: Iterable, spider: Spider
        ) -> Iterable:
            return self._process_spider_output(response, result)

        async def process_spider_output_async(
            self, response: Response, result: Iterable, spider: Spider
        ) -> AsyncGenerator:
            async for entry in self._process_spider_output_async(response, result):
                yield entry

    else:

        def process_spider_output(self, response: Response, result: Iterable) -> Iterable:
            return self._process_spider_output(response, result)

        async def process_spider_output_async(
            self, response: Response, result: Iterable
        ) -> AsyncGenerator:
            async for entry in self._process_spider_output_async(response, result):
                yield entry

    def _process_spider_output(self, response: Response, result: Iterable) -> Iterable:
        """Tag outgoing requests with the parent id; pass everything else through."""
        parent_id = self._seen_requests.pop(response.request, None)
        for entry in result:
            if isinstance(entry, Request):
                self._process_request(entry, parent_id)
            yield entry

    async def _process_spider_output_async(
        self, response: Response, result: AsyncIterable
    ) -> AsyncGenerator:
        """Async twin of :meth:`_process_spider_output` for async callbacks."""
        parent_id = self._seen_requests.pop(response.request, None)
        async for entry in result:
            if isinstance(entry, Request):
                self._process_request(entry, parent_id)
            yield entry

    def _process_request(self, request: Request, parent: int | None) -> None:
        request.meta[HS_PARENT_ID_KEY] = parent
        # Remove request id if it was for some reason set in the request coming from Spider.
        request.meta.pop(HS_REQUEST_ID_KEY, None)
4679
4780
4881class HubstorageDownloaderMiddleware :
4982 """Hubstorage dowloader middleware.
50-
83+
5184 What it does:
52-
85+
5386 - Generates request ids for all downloaded requests.
5487 - Sets parent request ids for requests generated in downloader middlewares.
5588 - Stores all downloaded requests into Hubstorage.
56-
89+
5790 """
5891
5992 @classmethod
60- def from_crawler (cls , crawler ) :
93+ def from_crawler (cls , crawler : Crawler ) -> HubstorageDownloaderMiddleware :
6194 try :
6295 result = cls (crawler )
6396 except TypeError :
@@ -74,29 +107,45 @@ def from_crawler(cls, crawler):
74107 result ._load_fingerprinter ()
75108 return result
76109
77- def __init__ (self , crawler ):
110+ def __init__ (self , crawler : Crawler ):
78111 self ._crawler = crawler
79112 self ._seen_requests = seen_requests
80113 self .pipe_writer = pipe_writer
81114 self .request_id_sequence = request_id_sequence
82115 self ._load_fingerprinter ()
83116
84- def _load_fingerprinter (self ):
117+ def _load_fingerprinter (self ) -> None :
85118 if hasattr (self ._crawler , "request_fingerprinter" ):
86119 self ._fingerprint = lambda request : self ._crawler .request_fingerprinter .fingerprint (request ).hex ()
87120 else :
88121 from scrapy .utils .request import request_fingerprint
89122 self ._fingerprint = request_fingerprint
90123
91- def process_request (self , request , spider ):
124+ if _SCRAPY_NO_SPIDER_ARG :
125+
126+ def process_request (self , request : Request ) -> None :
127+ return self ._process_request (request )
128+
129+ def process_response (self , request : Request , response : Response ) -> Response :
130+ return self ._process_response (request , response )
131+
132+ else :
133+
134+ def process_request (self , request : Request , spider : Spider ) -> None :
135+ return self ._process_request (request )
136+
137+ def process_response (self , request : Request , response : Response , spider : Spider ) -> Response :
138+ return self ._process_response (request , response )
139+
140+ def _process_request (self , request : Request ) -> None :
92141 # Check if request id is set, which usually happens for retries or redirects because
93142 # those requests are usually copied from the original one.
94143 request_id = request .meta .pop (HS_REQUEST_ID_KEY , None )
95144 if request_id is not None :
96145 # Set original request id or None as a parent request id.
97146 request .meta [HS_PARENT_ID_KEY ] = request_id
98147
99- def process_response (self , request , response , spider ) :
148+ def _process_response (self , request : Request , response : Response ) -> Response :
100149 # This class of response check is intended to fix the bug described here
101150 # https://github.com/scrapy-plugins/scrapy-zyte-api/issues/112
102151 if type (response ).__name__ == "DummyResponse" and type (response ).__module__ .startswith ("scrapy_poet" ):