caltechlibrary
diff --git a/‎CHANGES.md‎
Lines changed: 9 additions & 0 deletions b/‎CHANGES.md‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 36 additions & 8 deletions b/‎README.md‎
Lines changed: 36 additions & 8 deletions
diff --git a/‎eprints2archives/debug.py‎
Lines changed: 13 additions & 6 deletions b/‎eprints2archives/debug.py‎
Lines changed: 13 additions & 6 deletions
diff --git a/‎eprints2archives/eprints.py‎
Lines changed: 36 additions & 1 deletion b/‎eprints2archives/eprints.py‎
Lines changed: 36 additions & 1 deletion
diff --git a/‎eprints2archives/main_body.py‎
Lines changed: 24 additions & 11 deletions b/‎eprints2archives/main_body.py‎
Lines changed: 24 additions & 11 deletions
@@ -1,6 +1,15 @@
 Change log for REPOSITORY
 =========================
 
+Version 1.1.0
+--------------
+
+* Include the top-level server URL among the URLs sent to archives, as well as `/view` and two levels of pages under `/view`.
+* Make sure the set of URLs sent to archives is unique.
+* Improve debug logging from low-level network module.
+* Clarify some things in the README file.
+
+
 Version 1.0.0
 -------------
 
 
@@ -53,7 +53,7 @@ def set_debug(enabled, dest = '-'):
         # Set the appropriate output destination if we haven't already.
         if enabled:
             logger    = logging.getLogger(__package__)
-            formatter = logging.Formatter('%(name)s %(message)s')
+            formatter = logging.Formatter('%(threadName)s %(message)s')
             # We only allow one active destination.
             for h in logger.handlers:
                 logger.removeHandler(h)
@@ -72,6 +72,12 @@ def set_debug(enabled, dest = '-'):
             logger.setLevel(WARNING)
 
 
+# You might think that the way to get the current caller info when the log
+# function is called would be to use logger.findCaller(). I tried that, and it
+# produced very different information, even when using various values of
+# stacklevel as the argument. The code below instead uses the Python inspect
+# module to get the correct stack frame at run time.
+
 def log(s, *other_args):
     '''Logs a debug message. 's' can contain format directive, and the
     remaining arguments are the arguments to the format string.
@@ -81,8 +87,9 @@ def log(s, *other_args):
         # the string format from always being performed if logging is not
         # turned on and the user isn't running Python with -O.
         if getattr(sys.modules[__package__], '_debugging'):
-            func = inspect.currentframe().f_back.f_code.co_name
-            file_path = inspect.currentframe().f_back.f_code.co_filename
-            filename = path.basename(file_path)
-            logging.getLogger(__package__).debug(f'{filename} {func}(): '
-                                                 + s.format(*other_args))
+            frame  = inspect.currentframe().f_back
+            func   = frame.f_code.co_name
+            lineno = frame.f_lineno
+            file   = path.basename(frame.f_code.co_filename)
+            logger = logging.getLogger(__package__)
+            logger.debug(f'{file}:{lineno} {func}() -- ' + s.format(*other_args))
@@ -16,7 +16,7 @@
 
 import codecs
 from   collections import defaultdict
-from   lxml import etree
+from   lxml import etree, html
 import os
 from   os import path
 import shutil
@@ -46,6 +46,7 @@ def __init__(self, api_url, user, password):
         self._protocol   = scheme(self._api_url)
         self._netloc     = netloc(self._api_url)
         self._hostname   = hostname(self._api_url)
+        self._base_url   = self._protocol + '://' + self._netloc
         self._user       = user
         self._password   = password
         # List of all record identifiers known to the server:
@@ -103,6 +104,40 @@ def index(self, as_int = False):
             return self._index
 
 
+    def front_page_url(self):
+        '''Return the public front page URL of this EPrints server.
+
+        The value is the base URL stored by the constructor, i.e., the
+        scheme and network location of the API URL joined by "://".
+        '''
+        return self._base_url
+
+
+    def view_urls(self):
+        '''Return a list of URLs corresponding to pages under /view.
+
+        The top-level /view/ page is fetched and scraped for the links in its
+        browse list; each of those pages is then fetched in turn and scraped
+        for the links in its menu.  The result is the first-level links
+        followed by the second-level links, in the order found, without any
+        de-duplication.  If the top-level page cannot be retrieved, an empty
+        list is returned; second-level pages that cannot be retrieved are
+        skipped.
+        '''
+        # Start with the top-level one
+        view_base = self._base_url + '/view/'
+        (response, error) = net('get', view_base, timeout = 10)
+        if error:
+            if __debug__: log(f'got {type(error)} error for {view_base}')
+            # Nothing has been collected at this point, so hand back an empty
+            # list (callers concatenate the result with other lists).
+            return []
+        # Scrape the HTML to find the block of links to pages under /view.
+        doc = html.fromstring(response.text)
+        doc.make_links_absolute(view_base)
+        view_urls = [x.get('href') for x in doc.cssselect('div.ep_view_browse_list li a')]
+        if __debug__: log(f'found {len(view_urls)} URLs under /view')
+        # Iterate over each page found to get the links to its subpages.
+        # There will be many under /view/ids, one for each record, but they're
+        # separate pages from the individual EPrint record pages.
+        subpage_urls = []
+        for view_subpage in view_urls:
+            (response, error) = net('get', view_subpage, timeout = 10)
+            if error:
+                # A failure on one subpage should not discard the others.
+                if __debug__: log(f'got {type(error)} error for {view_subpage}')
+                continue
+            doc = html.fromstring(response.text)
+            doc.make_links_absolute(view_subpage)
+            subpage_urls += [x.get('href') for x in doc.cssselect('div.ep_view_menu li a')]
+        if __debug__: log(f'collected {len(subpage_urls)} /view subpage URLs')
+        return view_urls + subpage_urls
+
+
     def eprint_id_url(self, id_or_record, verify = True):
         '''Return the main URL of the web page for the record on this server.
 
 
@@ -207,6 +207,11 @@ def _do_main_work(self):
         else:
             wanted = available
 
+        # Make sure to archive the front pages and some common pages.
+
+        inform('Extracting URLs under /view pages ...')
+        urls = self._eprints_general_urls(server)
+
         # The basic URLs for EPrint pages can be constructed without doing
         # any record lookups -- all you need is id numbers from the index.
         # Some sites like Caltech use an additional field, <official_url>,
@@ -223,7 +228,7 @@ def _do_main_work(self):
         records = []
         if not self.lastmod and not self.status:
             official_url = lambda r: server.eprint_field_value(r, 'official_url')
-            urls = self._eprints_values(official_url, wanted, server, "<official_url>'s")
+            urls += self._eprints_values(official_url, wanted, server, "<official_url>'s")
         else:
             skipped = []
             for r in self._eprints_values(server.eprint_xml, wanted, server, "record materials"):
@@ -246,19 +251,20 @@ def _do_main_work(self):
             if len(records) == 0:
                 warn('Filtering left 0 records -- nothing left to do')
                 return
-            urls = [server.eprint_field_value(r, 'official_url') for r in records]
+            urls += [server.eprint_field_value(r, 'official_url') for r in records]
 
         # Next, construct "standard" URLs and check that they exist.  Do this
         # AFTER the steps above, because if we did any filtering, we may have
         # a much shorter list of records now than what we started with.
 
-        urls += self._eprints_basic_urls(server, records or wanted)
+        urls += self._eprints_record_urls(server, records or wanted)
 
         # Good time to check if the parent thread sent an interrupt.
         raise_for_interrupts()
 
         # Clean up any None's and make sure we have something left to do.
-        urls = list(filter(None, urls))
+        # Also make sure the URLs are unique (that's the dict.fromkeys bit).
+        urls = list(dict.fromkeys(filter(None, urls)))
         if not urls:
             alert('List of URLs is empty -- nothing to archive')
             return
@@ -293,7 +299,7 @@ def record_values(items, update_progress):
                     else:
                         warn(failure)
                         continue
-                if data:
+                if data is not None:
                     results.append(data)
                 elif self.quit_on_error:
                     alert(f'Received no data for {item}')
@@ -307,18 +313,23 @@ def record_values(items, update_progress):
         return self._gathered(record_values, items_list, header)
 
 
-    def _eprints_basic_urls(self, server, records_list):
+    def _eprints_general_urls(self, server):
+        '''Return a list of commonly-available, high-level EPrints URLs.
+
+        The list consists of the front page URL of "server" followed by the
+        URLs that "server" reports for pages under /view.
+        '''
+        return [server.front_page_url()] + server.view_urls()
+
+
+    def _eprints_record_urls(self, server, records_list):
         '''Get the normal EPrints URLS for the items in "records_list".'''
         # Helper function: body of loop that is executed in all cases.
         def eprints_urls(item_list, update_progress):
+            # Collects the results of server.eprint_id_url() and
+            # server.eprint_page_url() for every item in item_list, calling
+            # update_progress() once per item.
             urls = []
             for r in item_list:
-                if __debug__: log(f'getting URLs for {r}')
+                # Note: don't use log() here b/c r could be an xml etree.
                 try:
                     urls.append(server.eprint_id_url(r))
                     urls.append(server.eprint_page_url(r))
                 except (NoContent, AuthenticationFailure) as ex:
-                    if __debug__: log(f'got exception {str(ex)} for {r} -- moving on')
+                    # Skip the URLs of this item, but fall through so that
+                    # progress is still reported and interrupts are still
+                    # checked, as was the case before the logging was removed.
+                    pass
                 update_progress()
                 raise_for_interrupts()
             return urls
@@ -332,7 +343,7 @@ def _send(self, urls_to_send):
         '''Send the list of URLs to each web archiving service in parallel.'''
         num_urls = len(urls_to_send)
         num_dest = len(self.dest)
-        self._report(f'{num_urls} URLs to be sent to {num_dest} {plural("service", num_dest)}.')
+        self._report(f'Will send {num_urls} URLs to {num_dest} {plural("service", num_dest)}.')
         if self.force:
             inform('Force option given ⟹  adding URLs even if archives have copies.')
 
@@ -364,7 +375,8 @@ def send_to_service(dest, pbar):
             else:
                 num_threads = min(num_dest, self.threads)
                 if __debug__: log(f'using {num_threads} threads to send records')
-                self._executor = ThreadPoolExecutor(max_workers = num_threads)
+                self._executor = ThreadPoolExecutor(max_workers = num_threads,
+                                                    thread_name_prefix = 'SendThread')
                 self._futures = []
                 for service in self.dest:
                     future = self._executor.submit(send_to_service, service, pbar)
@@ -398,7 +410,8 @@ def _gathered(self, loop, items_list, header):
             # If we didn't return above, we're going parallel.
             num_threads = min(num_items, self.threads)
             if __debug__: log(f'using {num_threads} threads to gather records')
-            self._executor = ThreadPoolExecutor(max_workers = num_threads)
+            self._executor = ThreadPoolExecutor(max_workers = num_threads,
+                                                thread_name_prefix = 'GatherThread')
             self._futures = []
             for sublist in slice(items_list, num_threads):
                 future = self._executor.submit(loop, sublist, update_progress)