77from frontera .core .components import Metadata , Queue , States
88from frontera .core import OverusedBuffer
99from frontera .utils .heap import Heap
10- from frontera .contrib .backends .partitioners import Crc32NamePartitioner
1110from frontera .utils .url import parse_domain_from_url_fast
11+ from frontera .utils .misc import load_object
1212import six
1313from six .moves import map
1414from six .moves import range
@@ -52,12 +52,11 @@ def update_score(self, batch):
5252
5353
5454class MemoryQueue (Queue ):
55- def __init__ (self , partitions ):
56- self .partitions = [i for i in range (0 , partitions )]
57- self .partitioner = Crc32NamePartitioner (self .partitions )
55+ def __init__ (self , partitioner ):
56+ self .partitioner = partitioner
5857 self .logger = logging .getLogger ("memory.queue" )
5958 self .heap = {}
60- for partition in self .partitions :
59+ for partition in self .partitioner . partitions :
6160 self .heap [partition ] = Heap (self ._compare_pages )
6261
6362 def count (self ):
@@ -70,31 +69,26 @@ def schedule(self, batch):
7069 for fprint , score , request , schedule in batch :
7170 if schedule :
7271 request .meta [b'_scr' ] = score
73- _ , hostname , _ , _ , _ , _ = parse_domain_from_url_fast (request .url )
74- if not hostname :
75- self .logger .error ("Can't get hostname for URL %s, fingerprint %s" , request .url , fprint )
76- partition_id = self .partitions [0 ]
77- else :
78- partition_id = self .partitioner .partition (hostname , self .partitions )
72+ key = self .partitioner .get_key (request )
73+ partition_id = self .partitioner .partition (key )
7974 self .heap [partition_id ].push (request )
8075
8176 def _compare_pages (self , first , second ):
8277 return cmp (first .meta [b'_scr' ], second .meta [b'_scr' ])
8378
8479
8580class MemoryDequeQueue (Queue ):
86- def __init__ (self , partitions , is_fifo = True ):
81+ def __init__ (self , partitioner , is_fifo = True ):
8782 """
8883 Deque-based queue (see collections module). Efficient queue for LIFO and FIFO strategies.
89- :param partitions: int count of partitions
84+ :param partitioner: Partitioner
9085 :param type: bool, True for FIFO, False for LIFO
9186 """
92- self .partitions = [i for i in range (0 , partitions )]
93- self .partitioner = Crc32NamePartitioner (self .partitions )
87+ self .partitioner = partitioner
9488 self .logger = logging .getLogger ("memory.dequequeue" )
9589 self .queues = {}
9690 self .is_fifo = is_fifo
97- for partition in self .partitions :
91+ for partition in self .partitioner . partitions :
9892 self .queues [partition ] = deque ()
9993
10094 def count (self ):
@@ -112,12 +106,8 @@ def schedule(self, batch):
112106 for fprint , score , request , schedule in batch :
113107 if schedule :
114108 request .meta [b'_scr' ] = score
115- _ , hostname , _ , _ , _ , _ = parse_domain_from_url_fast (request .url )
116- if not hostname :
117- self .logger .error ("Can't get hostname for URL %s, fingerprint %s" , request .url , fprint )
118- partition_id = self .partitions [0 ]
119- else :
120- partition_id = self .partitioner .partition (hostname , self .partitions )
109+ key = self .partitioner .get_key (request )
110+ partition_id = self .partitioner .partition (key )
121111 self .queues [partition_id ].append (request )
122112
123113
@@ -165,6 +155,9 @@ def __init__(self, manager):
165155 settings = manager .settings
166156 self ._metadata = MemoryMetadata ()
167157 self ._states = MemoryStates (settings .get ("STATE_CACHE_SIZE" ))
158+ partitions = list (range (settings .get ('SPIDER_FEED_PARTITIONS' )))
159+ partitioner_cls = load_object (settings .get ('SPIDER_FEED_PARTITIONER' ))
160+ self ._partitioner = partitioner_cls (partitions )
168161 self ._queue = self ._create_queue (settings )
169162 self ._id = 0
170163
@@ -222,27 +215,27 @@ def _compare_pages(self, first, second):
222215
223216class MemoryFIFOBackend (MemoryBaseBackend ):
224217 def _create_queue (self , settings ):
225- return MemoryDequeQueue (settings . get ( 'SPIDER_FEED_PARTITIONS' ) )
218+ return MemoryDequeQueue (self . _partitioner )
226219
227220
228221class MemoryLIFOBackend (MemoryBaseBackend ):
229222 def _create_queue (self , settings ):
230- return MemoryDequeQueue (settings . get ( 'SPIDER_FEED_PARTITIONS' ) , is_fifo = False )
223+ return MemoryDequeQueue (self . _partitioner , is_fifo = False )
231224
232225
233226class MemoryDFSBackend (MemoryBaseBackend ):
234227 def _create_queue (self , settings ):
235- return MemoryDFSQueue (settings . get ( 'SPIDER_FEED_PARTITIONS' ) )
228+ return MemoryDFSQueue (self . _partitioner )
236229
237230
238231class MemoryBFSBackend (MemoryBaseBackend ):
239232 def _create_queue (self , settings ):
240- return MemoryBFSQueue (settings . get ( 'SPIDER_FEED_PARTITIONS' ) )
233+ return MemoryBFSQueue (self . _partitioner )
241234
242235
243236class MemoryRandomBackend (MemoryBaseBackend ):
244237 def _create_queue (self , settings ):
245- return MemoryRandomQueue (settings . get ( 'SPIDER_FEED_PARTITIONS' ) )
238+ return MemoryRandomQueue (self . _partitioner )
246239
247240
248241class MemoryDFSOverusedBackend (MemoryDFSBackend ):
0 commit comments