Skip to content

Commit 48abc7b

Browse files
committed
Merge branch 'develop' into main
2 parents f7d03f6 + f17fafd commit 48abc7b

File tree

8 files changed

+166
-64
lines changed

8 files changed

+166
-64
lines changed

CHANGES.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,15 @@
11
Change log for REPOSITORY
22
=========================
33

4+
Version 1.1.0
5+
--------------
6+
7+
* Include the top-level server URL among the URLs sent to archives, as well as `/view` and two levels of pages under `/view`.
8+
* Make sure the list of URLs sent to archives contains no duplicates.
9+
* Improve debug logging from the low-level network module.
10+
* Clarify some things in the README file.
11+
12+
413
Version 1.0.0
514
-------------
615

README.md

Lines changed: 36 additions & 8 deletions
Large diffs are not rendered by default.

eprints2archives/debug.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def set_debug(enabled, dest = '-'):
5353
# Set the appropriate output destination if we haven't already.
5454
if enabled:
5555
logger = logging.getLogger(__package__)
56-
formatter = logging.Formatter('%(name)s %(message)s')
56+
formatter = logging.Formatter('%(threadName)s %(message)s')
5757
# We only allow one active destination.
5858
for h in logger.handlers:
5959
logger.removeHandler(h)
@@ -72,6 +72,12 @@ def set_debug(enabled, dest = '-'):
7272
logger.setLevel(WARNING)
7373

7474

75+
# You might think that the way to get the current caller info when the log
76+
# function is called would be to use logger.findCaller(). I tried that, and it
77+
# produced very different information, even when using various values of
78+
# stacklevel as the argument. The code below instead uses the Python inspect
79+
# module to get the correct stack frame at run time.
80+
7581
def log(s, *other_args):
7682
'''Logs a debug message. 's' can contain format directive, and the
7783
remaining arguments are the arguments to the format string.
@@ -81,8 +87,9 @@ def log(s, *other_args):
8187
# the string format from always being performed if logging is not
8288
# turned on and the user isn't running Python with -O.
8389
if getattr(sys.modules[__package__], '_debugging'):
84-
func = inspect.currentframe().f_back.f_code.co_name
85-
file_path = inspect.currentframe().f_back.f_code.co_filename
86-
filename = path.basename(file_path)
87-
logging.getLogger(__package__).debug(f'{filename} {func}(): '
88-
+ s.format(*other_args))
90+
frame = inspect.currentframe().f_back
91+
func = frame.f_code.co_name
92+
lineno = frame.f_lineno
93+
file = path.basename(frame.f_code.co_filename)
94+
logger = logging.getLogger(__package__)
95+
logger.debug(f'{file}:{lineno} {func}() -- ' + s.format(*other_args))

eprints2archives/eprints.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
import codecs
1818
from collections import defaultdict
19-
from lxml import etree
19+
from lxml import etree, html
2020
import os
2121
from os import path
2222
import shutil
@@ -46,6 +46,7 @@ def __init__(self, api_url, user, password):
4646
self._protocol = scheme(self._api_url)
4747
self._netloc = netloc(self._api_url)
4848
self._hostname = hostname(self._api_url)
49+
self._base_url = self._protocol + '://' + self._netloc
4950
self._user = user
5051
self._password = password
5152
# List of all record identifiers known to the server:
@@ -103,6 +104,40 @@ def index(self, as_int = False):
103104
return self._index
104105

105106

107+
def front_page_url(self):
    '''Return the public front page URL of this EPrints server.

    The value is the base URL stored on the instance as "_base_url".
    '''
    return self._base_url
110+
111+
112+
def view_urls(self):
    '''Return a list of URLs corresponding to pages under /view.

    The top-level /view/ page is fetched and scraped for the links found in
    its "ep_view_browse_list" block; each of those pages is then fetched and
    scraped for the links in its "ep_view_menu" block.  The result is the
    list of first-level links followed by the second-level links.

    If the /view/ page itself cannot be retrieved, an empty list is
    returned.  A first-level page that cannot be retrieved is skipped.
    '''
    # NOTE(review): the /view/ URL is only used as the starting point for
    # scraping and is not itself part of the returned list -- confirm that
    # this is the intended behavior.
    view_base = self._base_url + '/view/'
    (response, error) = net('get', view_base, timeout = 10)
    if error:
        if __debug__: log(f'got {type(error)} error for {view_base}')
        # Nothing could be scraped, so there are no URLs to report.
        return []
    # Scrape the HTML to find the block of links to pages under /view.
    doc = html.fromstring(response.text)
    doc.make_links_absolute(view_base)
    anchors = doc.cssselect('div.ep_view_browse_list li a')
    # An anchor without an href attribute yields None; drop those so that we
    # never try to fetch None below.
    view_urls = [a.get('href') for a in anchors if a.get('href')]
    if __debug__: log(f'found {len(view_urls)} URLs under /view')
    # Iterate over each page found to get the links to its subpages.
    # There will be many under /view/ids, one for each record, but they're
    # separate pages from the individual EPrint record pages.
    subpage_urls = []
    for view_subpage in view_urls:
        (response, error) = net('get', view_subpage, timeout = 10)
        if error:
            if __debug__: log(f'got {type(error)} error for {view_subpage}')
            continue
        doc = html.fromstring(response.text)
        doc.make_links_absolute(view_subpage)
        anchors = doc.cssselect('div.ep_view_menu li a')
        subpage_urls.extend(a.get('href') for a in anchors if a.get('href'))
    if __debug__: log(f'collected {len(subpage_urls)} /view subpage URLs')
    return view_urls + subpage_urls
139+
140+
106141
def eprint_id_url(self, id_or_record, verify = True):
107142
'''Return the main URL of the web page for the record on this server.
108143

eprints2archives/main_body.py

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,11 @@ def _do_main_work(self):
207207
else:
208208
wanted = available
209209

210+
# Make sure to archive the front pages and some common pages.
211+
212+
inform('Extracting URLs under /view pages ...')
213+
urls = self._eprints_general_urls(server)
214+
210215
# The basic URLs for EPrint pages can be constructed without doing
211216
# any record lookups -- all you need is id numbers from the index.
212217
# Some sites like Caltech use an additional field, <official_url>,
@@ -223,7 +228,7 @@ def _do_main_work(self):
223228
records = []
224229
if not self.lastmod and not self.status:
225230
official_url = lambda r: server.eprint_field_value(r, 'official_url')
226-
urls = self._eprints_values(official_url, wanted, server, "<official_url>'s")
231+
urls += self._eprints_values(official_url, wanted, server, "<official_url>'s")
227232
else:
228233
skipped = []
229234
for r in self._eprints_values(server.eprint_xml, wanted, server, "record materials"):
@@ -246,19 +251,20 @@ def _do_main_work(self):
246251
if len(records) == 0:
247252
warn('Filtering left 0 records -- nothing left to do')
248253
return
249-
urls = [server.eprint_field_value(r, 'official_url') for r in records]
254+
urls += [server.eprint_field_value(r, 'official_url') for r in records]
250255

251256
# Next, construct "standard" URLs and check that they exist. Do this
252257
# AFTER the steps above, because if we did any filtering, we may have
253258
# a much shorter list of records now than what we started with.
254259

255-
urls += self._eprints_basic_urls(server, records or wanted)
260+
urls += self._eprints_record_urls(server, records or wanted)
256261

257262
# Good time to check if the parent thread sent an interrupt.
258263
raise_for_interrupts()
259264

260265
# Clean up any None's and make sure we have something left to do.
261-
urls = list(filter(None, urls))
266+
# Also make sure the URLs are unique (that's the dict.fromkeys bit).
267+
urls = list(dict.fromkeys(filter(None, urls)))
262268
if not urls:
263269
alert('List of URLs is empty -- nothing to archive')
264270
return
@@ -293,7 +299,7 @@ def record_values(items, update_progress):
293299
else:
294300
warn(failure)
295301
continue
296-
if data:
302+
if data is not None:
297303
results.append(data)
298304
elif self.quit_on_error:
299305
alert(f'Received no data for {item}')
@@ -307,18 +313,23 @@ def record_values(items, update_progress):
307313
return self._gathered(record_values, items_list, header)
308314

309315

310-
def _eprints_basic_urls(self, server, records_list):
316+
def _eprints_general_urls(self, server):
317+
'''Return a list of commonly-available, high-level EPrints URLs.'''
318+
return [server.front_page_url()] + server.view_urls()
319+
320+
321+
def _eprints_record_urls(self, server, records_list):
311322
'''Get the normal EPrints URLS for the items in "records_list".'''
312323
# Helper function: body of loop that is executed in all cases.
313324
def eprints_urls(item_list, update_progress):
314325
urls = []
315326
for r in item_list:
316-
if __debug__: log(f'getting URLs for {r}')
327+
# Note: don't use log() here b/c r could be an xml etree.
317328
try:
318329
urls.append(server.eprint_id_url(r))
319330
urls.append(server.eprint_page_url(r))
320331
except (NoContent, AuthenticationFailure) as ex:
321-
if __debug__: log(f'got exception {str(ex)} for {r} -- moving on')
332+
continue
322333
update_progress()
323334
raise_for_interrupts()
324335
return urls
@@ -332,7 +343,7 @@ def _send(self, urls_to_send):
332343
'''Send the list of URLs to each web archiving service in parallel.'''
333344
num_urls = len(urls_to_send)
334345
num_dest = len(self.dest)
335-
self._report(f'{num_urls} URLs to be sent to {num_dest} {plural("service", num_dest)}.')
346+
self._report(f'Will send {num_urls} URLs to {num_dest} {plural("service", num_dest)}.')
336347
if self.force:
337348
inform('Force option given ⟹ adding URLs even if archives have copies.')
338349

@@ -364,7 +375,8 @@ def send_to_service(dest, pbar):
364375
else:
365376
num_threads = min(num_dest, self.threads)
366377
if __debug__: log(f'using {num_threads} threads to send records')
367-
self._executor = ThreadPoolExecutor(max_workers = num_threads)
378+
self._executor = ThreadPoolExecutor(max_workers = num_threads,
379+
thread_name_prefix = 'SendThread')
368380
self._futures = []
369381
for service in self.dest:
370382
future = self._executor.submit(send_to_service, service, pbar)
@@ -398,7 +410,8 @@ def _gathered(self, loop, items_list, header):
398410
# If we didn't return above, we're going parallel.
399411
num_threads = min(num_items, self.threads)
400412
if __debug__: log(f'using {num_threads} threads to gather records')
401-
self._executor = ThreadPoolExecutor(max_workers = num_threads)
413+
self._executor = ThreadPoolExecutor(max_workers = num_threads,
414+
thread_name_prefix = 'GatherThread')
402415
self._futures = []
403416
for sublist in slice(items_list, num_threads):
404417
future = self._executor.submit(loop, sublist, update_progress)

0 commit comments

Comments
 (0)