55"""
66from __future__ import absolute_import
77from copy import deepcopy
8+ import hashlib
9+ from weakref import WeakKeyDictionary
10+ from warnings import warn
811
912from scrapy .dupefilters import RFPDupeFilter
1013
14+ from scrapy .utils .python import to_bytes
1115from scrapy .utils .url import canonicalize_url
12- from scrapy .utils .request import request_fingerprint
1316
1417from .utils import dict_hash
1518
1619
# Per-request memo used by request_fingerprint(): maps a request object to a
# dict of {(include_headers tuple or None, keep_fragments): hex digest}.
# Keys are held weakly, so an entry does not keep its request alive.
20+ _deprecated_fingerprint_cache = WeakKeyDictionary ()
21+
22+
23+ def _serialize_headers (
24+ headers , request
25+ ):
26+ for header in headers :
27+ if header in request .headers :
28+ yield header
29+ for value in request .headers .getlist (header ):
30+ yield value
31+
32+
33+ # From https://docs.scrapy.org/en/2.11/_modules/scrapy/utils/request.html
34+ # Needs to be added here since it was deleted in Scrapy 2.12
def request_fingerprint(
    request,
    include_headers=None,
    keep_fragments=False,
):
    """
    Compute a hexadecimal SHA-1 fingerprint for ``request``.

    The fingerprint identifies the resource a request points to rather than
    the literal request. The URL is canonicalized before hashing, so these
    two URLs produce the same fingerprint:

    http://www.example.com/query?id=111&cat=222
    http://www.example.com/query?cat=222&id=111

    The hash covers the request method, the canonicalized URL and the body.

    Headers are left out by default because they often carry per-session
    noise (for example a session-id cookie) that does not change which
    resource is requested. Pass ``include_headers`` (an iterable of header
    names) to mix those headers and their values into the fingerprint;
    names are lower-cased and sorted first, so their order and letter case
    do not matter.

    URL fragments are likewise dropped by default. Set ``keep_fragments``
    to True to keep them, e.g. when requests are rendered by a headless
    browser.

    Results are memoized per request object and per
    ``(include_headers, keep_fragments)`` combination.
    """
    if include_headers:
        header_names = tuple(
            to_bytes(name.lower()) for name in sorted(include_headers)
        )
    else:
        header_names = None

    per_request = _deprecated_fingerprint_cache.setdefault(request, {})
    memo_key = (header_names, keep_fragments)
    if memo_key in per_request:
        return per_request[memo_key]

    hasher = hashlib.sha1()
    hasher.update(to_bytes(request.method))
    canonical_url = canonicalize_url(request.url, keep_fragments=keep_fragments)
    hasher.update(to_bytes(canonical_url))
    hasher.update(request.body or b"")
    if header_names:
        for chunk in _serialize_headers(header_names, request):
            hasher.update(chunk)

    digest = hasher.hexdigest()
    per_request[memo_key] = digest
    return digest
89+
90+
1791def splash_request_fingerprint (request , include_headers = None ):
1892 """ Request fingerprint which takes 'splash' meta key into account """
93+ warn (
94+ (
95+ "scrapy_splash.splash_request_fingerprint is deprecated. Set "
96+ "the REQUEST_FINGERPRINTER_CLASS Scrapy setting to "
97+ "\" scrapy_splash.SplashRequestFingerprinter\" instead."
98+ ),
99+ DeprecationWarning ,
100+ stacklevel = 2 ,
101+ )
19102
20103 fp = request_fingerprint (request , include_headers = include_headers )
21104 if 'splash' not in request .meta :
@@ -35,5 +118,17 @@ class SplashAwareDupeFilter(RFPDupeFilter):
35118 DupeFilter that takes 'splash' meta key in account.
36119 It should be used with SplashMiddleware.
37120 """
121+
122+ def __init__ (self ):
123+ warn (
124+ (
125+ "SplashAwareDupeFilter is deprecated. Set "
126+ "the REQUEST_FINGERPRINTER_CLASS Scrapy setting to "
127+ "\" scrapy_splash.SplashRequestFingerprinter\" instead."
128+ ),
129+ DeprecationWarning ,
130+ stacklevel = 2 ,
131+ )
132+
38133 def request_fingerprint (self , request ):
39134 return splash_request_fingerprint (request )
0 commit comments