+import dbm
 import logging
-import os
 import time
-import dbm
+from pathlib import Path
 
+from scrapy import signals
+from scrapy.exceptions import NotConfigured
 from scrapy.http import Request
 from scrapy.item import Item
-from scrapy.utils.request import request_fingerprint
 from scrapy.utils.project import data_path
 from scrapy.utils.python import to_bytes
-from scrapy.exceptions import NotConfigured
-from scrapy import signals
-
+from scrapy.utils.request import request_fingerprint
 
 logger = logging.getLogger(__name__)
 
 
-class DeltaFetch(object):
-    """
-    This is a spider middleware to ignore requests to pages containing items
-    seen in previous crawls of the same spider, thus producing a "delta crawl"
+class DeltaFetch:
+    """Spider middleware to ignore requests to pages containing items seen in
+    previous crawls of the same spider, thus producing a "delta crawl"
     containing only new items.
 
     This also speeds up the crawl, by reducing the number of requests that need
@@ -32,56 +30,57 @@ def __init__(self, dir, reset=False, stats=None):
         self.stats = stats
 
     @classmethod
-    def from_crawler(cls, crawler):
+    def from_crawler(cls, crawler):  # noqa: D102
         s = crawler.settings
-        if not s.getbool('DELTAFETCH_ENABLED'):
+        if not s.getbool("DELTAFETCH_ENABLED"):
             raise NotConfigured
-        dir = data_path(s.get('DELTAFETCH_DIR', 'deltafetch'))
-        reset = s.getbool('DELTAFETCH_RESET')
+        dir = data_path(s.get("DELTAFETCH_DIR", "deltafetch"))
+        reset = s.getbool("DELTAFETCH_RESET")
         o = cls(dir, reset, crawler.stats)
         crawler.signals.connect(o.spider_opened, signal=signals.spider_opened)
         crawler.signals.connect(o.spider_closed, signal=signals.spider_closed)
         return o
 
-    def spider_opened(self, spider):
-        if not os.path.exists(self.dir):
-            os.makedirs(self.dir)
+    def spider_opened(self, spider):  # noqa: D102
+        dir = Path(self.dir)
+        dir.mkdir(parents=True, exist_ok=True)
         # TODO may be tricky, as there may be different paths on systems
-        dbpath = os.path.join(self.dir, '%s.db' % spider.name)
-        reset = self.reset or getattr(spider, 'deltafetch_reset', False)
-        flag = 'n' if reset else 'c'
+        dbpath = dir / f"{spider.name}.db"
+        reset = self.reset or getattr(spider, "deltafetch_reset", False)
+        flag = "n" if reset else "c"
         try:
-            self.db = dbm.open(dbpath, flag=flag)
+            self.db = dbm.open(dbpath, flag=flag)  # noqa: SIM115
         except Exception:
-            logger.warning("Failed to open DeltaFetch database at %s, "
-                           "trying to recreate it" % dbpath)
-            if os.path.exists(dbpath):
-                os.remove(dbpath)
-            self.db = dbm.open(dbpath, 'c')
+            logger.warning(
+                f"Failed to open DeltaFetch database at {dbpath}, trying to recreate it"
+            )
+            if dbpath.exists():
+                dbpath.unlink()
+            self.db = dbm.open(dbpath, "c")  # noqa: SIM115
 
-    def spider_closed(self, spider):
+    def spider_closed(self, spider):  # noqa: D102
         self.db.close()
 
-    def process_spider_output(self, response, result, spider):
+    def process_spider_output(self, response, result, spider):  # noqa: D102
         for r in result:
             if isinstance(r, Request):
                 key = self._get_key(r)
                 if key in self.db and self._is_enabled_for_request(r):
-                    logger.info("Ignoring already visited: %s" % r)
+                    logger.info(f"Ignoring already visited: {r}")
                     if self.stats:
-                        self.stats.inc_value('deltafetch/skipped', spider=spider)
+                        self.stats.inc_value("deltafetch/skipped", spider=spider)
                     continue
             elif isinstance(r, (Item, dict)):
                 key = self._get_key(response.request)
                 self.db[key] = str(time.time())
                 if self.stats:
-                    self.stats.inc_value('deltafetch/stored', spider=spider)
+                    self.stats.inc_value("deltafetch/stored", spider=spider)
             yield r
 
     def _get_key(self, request):
-        key = request.meta.get('deltafetch_key') or request_fingerprint(request)
+        key = request.meta.get("deltafetch_key") or request_fingerprint(request)
         return to_bytes(key)
 
     def _is_enabled_for_request(self, request):
         # Gives you option to disable deltafetch for some requests
-        return request.meta.get('deltafetch_enabled', True)
+        return request.meta.get("deltafetch_enabled", True)
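For reference, wiring this middleware into a project is unchanged by the refactor. A minimal settings.py sketch, assuming the distribution's usual scrapy_deltafetch module path and a middleware order of 100 (both are assumptions, not part of this diff):

# settings.py -- sketch; the module path and order value are assumptions
SPIDER_MIDDLEWARES = {
    "scrapy_deltafetch.DeltaFetch": 100,
}
DELTAFETCH_ENABLED = True        # required, otherwise from_crawler raises NotConfigured
DELTAFETCH_DIR = "deltafetch"    # resolved through data_path() to locate the dbm file
DELTAFETCH_RESET = False         # True opens the db with flag "n", discarding previous state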
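Per-request behaviour can be tuned through the two meta keys the middleware reads: deltafetch_key (used in _get_key in place of request_fingerprint) and deltafetch_enabled (checked in _is_enabled_for_request). A short sketch inside a hypothetical spider callback, with an illustrative URL and key value:

def parse(self, response):
    # "deltafetch_key" replaces request_fingerprint(request) as the db key;
    # "deltafetch_enabled": False would bypass the already-visited check.
    yield Request(
        response.urljoin("/item/1234"),  # illustrative URL, not from this diff
        callback=self.parse_item,
        meta={"deltafetch_key": "item-1234", "deltafetch_enabled": True},
    )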