Skip to content

Commit 937c537

Browse files
authored
[dev] add customize fingerprint (#280)
[dev] add customize fingerprint
1 parent ce30a1d commit 937c537

File tree

2 files changed

+32
-6
lines changed

2 files changed

+32
-6
lines changed

src/scrapy_redis/dupefilter.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
11
import logging
2+
import hashlib
3+
import json
24
import time
35

46
from scrapy.dupefilters import BaseDupeFilter
57
from scrapy.utils.request import request_fingerprint
8+
from scrapy.utils.python import to_unicode
9+
from w3lib.url import canonicalize_url
610

711
from . import defaults
812
from .connection import get_redis_from_settings
@@ -112,8 +116,14 @@ def request_fingerprint(self, request):
112116
str
113117
114118
"""
115-
return request_fingerprint(request)
116-
119+
fingerprint_data = {
120+
"method": to_unicode(request.method),
121+
"url": canonicalize_url(request.url),
122+
"body": (request.body or b"").hex(),
123+
}
124+
fingerprint_json = json.dumps(fingerprint_data, sort_keys=True)
125+
return hashlib.sha1(fingerprint_json.encode()).hexdigest()
126+
117127
@classmethod
118128
def from_spider(cls, spider):
119129
settings = spider.settings

tests/test_dupefilter.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ def get_redis_mock():
1111

1212
def sadd(key, fp, added=0, db={}):
1313
fingerprints = db.setdefault(key, set())
14-
if key not in fingerprints:
15-
fingerprints.add(key)
14+
if fp not in fingerprints:
15+
fingerprints.add(fp)
1616
added += 1
1717
return added
1818

@@ -30,8 +30,24 @@ def setup(self):
3030

3131
def test_request_seen(self):
3232
req = Request('http://example.com')
33-
assert not self.df.request_seen(req)
34-
assert self.df.request_seen(req)
33+
34+
def same_request():
35+
assert not self.df.request_seen(req)
36+
assert self.df.request_seen(req)
37+
38+
def diff_method():
39+
diff_method = Request('http://example.com', method='POST')
40+
assert self.df.request_seen(req)
41+
assert not self.df.request_seen(diff_method)
42+
43+
def diff_url():
44+
diff_url = Request('http://example2.com')
45+
assert self.df.request_seen(req)
46+
assert not self.df.request_seen(diff_url)
47+
48+
same_request()
49+
diff_method()
50+
diff_url()
3551

3652
def test_overridable_request_fingerprinter(self):
3753
req = Request('http://example.com')

0 commit comments

Comments
 (0)