@@ -37,6 +37,7 @@ def __init__(
3737 flush_on_start = False ,
3838 queue_key = defaults .SCHEDULER_QUEUE_KEY ,
3939 queue_cls = defaults .SCHEDULER_QUEUE_CLASS ,
40+ dupefilter = None ,
4041 dupefilter_key = defaults .SCHEDULER_DUPEFILTER_KEY ,
4142 dupefilter_cls = defaults .SCHEDULER_DUPEFILTER_CLASS ,
4243 idle_before_close = 0 ,
@@ -56,6 +57,8 @@ def __init__(
5657 Requests queue key.
5758 queue_cls : str
5859 Importable path to the queue class.
60+ dupefilter : Dupefilter
61+ Custom dupefilter instance.
5962 dupefilter_key : str
6063 Duplicates filter key.
6164 dupefilter_cls : str
@@ -72,6 +75,7 @@ def __init__(
7275 self .flush_on_start = flush_on_start
7376 self .queue_key = queue_key
7477 self .queue_cls = queue_cls
78+ self .df = dupefilter
7579 self .dupefilter_cls = dupefilter_cls
7680 self .dupefilter_key = dupefilter_key
7781 self .idle_before_close = idle_before_close
@@ -105,6 +109,10 @@ def from_settings(cls, settings):
105109 if val :
106110 kwargs [name ] = val
107111
112+ dupefilter_cls = load_object (kwargs ["dupefilter_cls" ])
113+ if not hasattr (dupefilter_cls , "from_spider" ):
114+ kwargs ["dupefilter" ] = dupefilter_cls .from_settings (settings )
115+
108116 # Support serializer as a path to a module.
109117 if isinstance (kwargs .get ("serializer" ), str ):
110118 kwargs ["serializer" ] = importlib .import_module (kwargs ["serializer" ])
@@ -137,7 +145,8 @@ def open(self, spider):
137145 f"Failed to instantiate queue class '{ self .queue_cls } ': { e } "
138146 )
139147
140- self .df = load_object (self .dupefilter_cls ).from_spider (spider )
148+ if not self .df :
149+ self .df = load_object (self .dupefilter_cls ).from_spider (spider )
141150
142151 if self .flush_on_start :
143152 self .flush ()
0 commit comments