@@ -207,6 +207,11 @@ def _do_main_work(self):
207207 else :
208208 wanted = available
209209
210+ # Make sure to archive the front pages and some common pages.
211+
212+ inform ('Extracting URLs under /view pages ...' )
213+ urls = self ._eprints_general_urls (server )
214+
210215 # The basic URLs for EPrint pages can be constructed without doing
211216 # any record lookups -- all you need is id numbers from the index.
212217 # Some sites like Caltech use an additional field, <official_url>,
@@ -223,7 +228,7 @@ def _do_main_work(self):
223228 records = []
224229 if not self .lastmod and not self .status :
225230 official_url = lambda r : server .eprint_field_value (r , 'official_url' )
226- urls = self ._eprints_values (official_url , wanted , server , "<official_url>'s" )
231+ urls + = self ._eprints_values (official_url , wanted , server , "<official_url>'s" )
227232 else :
228233 skipped = []
229234 for r in self ._eprints_values (server .eprint_xml , wanted , server , "record materials" ):
@@ -246,19 +251,20 @@ def _do_main_work(self):
246251 if len (records ) == 0 :
247252 warn ('Filtering left 0 records -- nothing left to do' )
248253 return
249- urls = [server .eprint_field_value (r , 'official_url' ) for r in records ]
254+ urls + = [server .eprint_field_value (r , 'official_url' ) for r in records ]
250255
251256 # Next, construct "standard" URLs and check that they exist. Do this
252257 # AFTER the steps above, because if we did any filtering, we may have
253258 # a much shorter list of records now than what we started with.
254259
255- urls += self ._eprints_basic_urls (server , records or wanted )
260+ urls += self ._eprints_record_urls (server , records or wanted )
256261
257262 # Good time to check if the parent thread sent an interrupt.
258263 raise_for_interrupts ()
259264
260265 # Clean up any None's and make sure we have something left to do.
261- urls = list (filter (None , urls ))
266+ # Also make sure the URLs are unique (that's the dict.fromkeys bit).
267+ urls = list (dict .fromkeys (filter (None , urls )))
262268 if not urls :
263269 alert ('List of URLs is empty -- nothing to archive' )
264270 return
@@ -293,7 +299,7 @@ def record_values(items, update_progress):
293299 else :
294300 warn (failure )
295301 continue
296- if data :
302+ if data is not None :
297303 results .append (data )
298304 elif self .quit_on_error :
299305 alert (f'Received no data for { item } ' )
@@ -307,18 +313,23 @@ def record_values(items, update_progress):
307313 return self ._gathered (record_values , items_list , header )
308314
309315
310- def _eprints_basic_urls (self , server , records_list ):
316+ def _eprints_general_urls (self , server ):
317+ '''Return a list of commonly-available, high-level EPrints URLs.'''
318+ return [server .front_page_url ()] + server .view_urls ()
319+
320+
321+ def _eprints_record_urls (self , server , records_list ):
311322 '''Get the normal EPrints URLS for the items in "records_list".'''
312323 # Helper function: body of loop that is executed in all cases.
def eprints_urls(item_list, update_progress):
    '''Collect the id URL and page URL for every record in "item_list".

    Records the server reports as unavailable (NoContent or
    AuthenticationFailure) are skipped silently.  Returns the list of
    URLs gathered; checks for a parent-thread interrupt after each
    successful record.
    '''
    urls = []
    for r in item_list:
        # Note: don't use log() here b/c r could be an xml etree.
        try:
            urls.append(server.eprint_id_url(r))
            urls.append(server.eprint_page_url(r))
        except (NoContent, AuthenticationFailure):
            # The exception value was unused once logging was removed,
            # so don't bind it.
            # NOTE(review): `continue` also skips update_progress(),
            # so failed records never advance the progress display --
            # confirm that is intended.
            continue
        update_progress()
        raise_for_interrupts()
    return urls
@@ -332,7 +343,7 @@ def _send(self, urls_to_send):
332343 '''Send the list of URLs to each web archiving service in parallel.'''
333344 num_urls = len (urls_to_send )
334345 num_dest = len (self .dest )
335- self ._report (f'{ num_urls } URLs to be sent to { num_dest } { plural ("service" , num_dest )} .' )
346+ self ._report (f'Will send { num_urls } URLs to { num_dest } { plural ("service" , num_dest )} .' )
336347 if self .force :
337348 inform ('Force option given ⟹ adding URLs even if archives have copies.' )
338349
@@ -364,7 +375,8 @@ def send_to_service(dest, pbar):
364375 else :
365376 num_threads = min (num_dest , self .threads )
366377 if __debug__ : log (f'using { num_threads } threads to send records' )
367- self ._executor = ThreadPoolExecutor (max_workers = num_threads )
378+ self ._executor = ThreadPoolExecutor (max_workers = num_threads ,
379+ thread_name_prefix = 'SendThread' )
368380 self ._futures = []
369381 for service in self .dest :
370382 future = self ._executor .submit (send_to_service , service , pbar )
@@ -398,7 +410,8 @@ def _gathered(self, loop, items_list, header):
398410 # If we didn't return above, we're going parallel.
399411 num_threads = min (num_items , self .threads )
400412 if __debug__ : log (f'using { num_threads } threads to gather records' )
401- self ._executor = ThreadPoolExecutor (max_workers = num_threads )
413+ self ._executor = ThreadPoolExecutor (max_workers = num_threads ,
414+ thread_name_prefix = 'GatherThread' )
402415 self ._futures = []
403416 for sublist in slice (items_list , num_threads ):
404417 future = self ._executor .submit (loop , sublist , update_progress )
0 commit comments