File tree Expand file tree Collapse file tree 2 files changed +32
-6
lines changed Expand file tree Collapse file tree 2 files changed +32
-6
lines changed Original file line number Diff line number Diff line change 1
1
import logging
2
+ import hashlib
3
+ import json
2
4
import time
3
5
4
6
from scrapy .dupefilters import BaseDupeFilter
5
7
from scrapy .utils .request import request_fingerprint
8
+ from scrapy .utils .python import to_unicode
9
+ from w3lib .url import canonicalize_url
6
10
7
11
from . import defaults
8
12
from .connection import get_redis_from_settings
@@ -112,8 +116,14 @@ def request_fingerprint(self, request):
112
116
str
113
117
114
118
"""
115
- return request_fingerprint (request )
116
-
119
+ fingerprint_data = {
120
+ "method" : to_unicode (request .method ),
121
+ "url" : canonicalize_url (request .url ),
122
+ "body" : (request .body or b"" ).hex (),
123
+ }
124
+ fingerprint_json = json .dumps (fingerprint_data , sort_keys = True )
125
+ return hashlib .sha1 (fingerprint_json .encode ()).hexdigest ()
126
+
117
127
@classmethod
118
128
def from_spider (cls , spider ):
119
129
settings = spider .settings
Original file line number Diff line number Diff line change @@ -11,8 +11,8 @@ def get_redis_mock():
11
11
12
12
def sadd (key , fp , added = 0 , db = {}):
13
13
fingerprints = db .setdefault (key , set ())
14
- if key not in fingerprints :
15
- fingerprints .add (key )
14
+ if fp not in fingerprints :
15
+ fingerprints .add (fp )
16
16
added += 1
17
17
return added
18
18
@@ -30,8 +30,24 @@ def setup(self):
30
30
31
31
def test_request_seen (self ):
32
32
req = Request ('http://example.com' )
33
- assert not self .df .request_seen (req )
34
- assert self .df .request_seen (req )
33
+
34
+ def same_request ():
35
+ assert not self .df .request_seen (req )
36
+ assert self .df .request_seen (req )
37
+
38
+ def diff_method ():
39
+ diff_method = Request ('http://example.com' , method = 'POST' )
40
+ assert self .df .request_seen (req )
41
+ assert not self .df .request_seen (diff_method )
42
+
43
+ def diff_url ():
44
+ diff_url = Request ('http://example2.com' )
45
+ assert self .df .request_seen (req )
46
+ assert not self .df .request_seen (diff_url )
47
+
48
+ same_request ()
49
+ diff_method ()
50
+ diff_url ()
35
51
36
52
def test_overridable_request_fingerprinter (self ):
37
53
req = Request ('http://example.com' )
You can’t perform that action at this time.
0 commit comments