Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 14 additions & 7 deletions doab_check/check.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,27 +30,34 @@ def __init__(self):
def content_type(self, url):
try:
try:
r = requests.get(url, allow_redirects=True, headers=HEADERS)
r = requests.get(url, allow_redirects=True, headers=HEADERS, timeout=(5, 60))
except UnicodeDecodeError as ude:
# fallback for non-ascii, non-utf8 bytes in redirect location
if 'utf-8' in str(ude):
(scheme, netloc, path, query, fragment) = urlsplit(url)
newpath = quote(unquote(path), encoding='latin1')
url = urlunsplit((scheme, netloc, newpath, query, fragment))
r = requests.get(url, allow_redirects=True, headers=HEADERS)
r = requests.get(url, allow_redirects=True, headers=HEADERS, timeout=(5, 60))
if r.status_code == 200:
r.status_code = 214 # unofficial status code where url is changed
if r.status_code == 405:
r = requests.get(url, headers=HEADERS)
r = requests.get(url, headers=HEADERS, timeout=(5, 60))
return r
except requests.exceptions.SSLError:
r = requests.get(url, verify=False)
r.status_code = 511
return r
try:
r = requests.get(url, verify=False, timeout=(5, 60))
r.status_code = 511
return r
except requests.exceptions.Timeout:
return (524, '', '')
except Exception:
return (511, '', '')
except requests.exceptions.Timeout:
return (524, '', '')
except requests.exceptions.ConnectionError as ce:
if '[Errno 8]' in str(ce) or '[Errno -2]' in str(ce):
try:
r = requests.get(url, allow_redirects=False, headers=HEADERS)
r = requests.get(url, allow_redirects=False, headers=HEADERS, timeout=(5, 60))
return r
except Exception as e:
pass
Expand Down
19 changes: 17 additions & 2 deletions doab_check/doab_oai.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import datetime
import logging
import re
import urllib.error

import pytz
from dateutil.parser import isoparse
Expand Down Expand Up @@ -152,16 +153,30 @@ def load_doab_oai(from_date, until_date, limit=100):
doab = getdoab(ident)
if doab:
num_doabs += 1
item = add_by_doab(doab, record=record)
try:
item = add_by_doab(doab, record=record)
except Exception as ex:
logger.exception('unexpected error processing doab #%s: %s', doab, ex)
continue
if not item:
logger.error('error for doab #%s', doab)
continue
if item.created > start:
new_doabs += 1
title = item.title
logger.info(u'updated:\t%s\t%s', doab, title)
if num_doabs >= limit:
if limit is not None and num_doabs >= limit:
break
except NoRecordsMatchError:
pass
except urllib.error.HTTPError as e:
if e.code == 429:
retry_after = e.headers.get('Retry-After', 'unknown')
logger.error(
'DOAB OAI rate-limited (HTTP 429). '
'Retry-After: %s seconds. Harvest stopped after %s records.',
retry_after, num_doabs
)
else:
raise
return num_doabs, new_doabs, lasttime
6 changes: 5 additions & 1 deletion doab_check/doab_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,11 @@
def get_streamdata(handle):
url = STREAM_QUERY.format(handle)
try:
response = requests.get(url, headers={"User-Agent": settings.USER_AGENT})
response = requests.get(url, headers={"User-Agent": settings.USER_AGENT}, timeout=(5, 60))
if response.status_code == 429:
retry_after = response.headers.get('Retry-After', 'unknown')
logger.error('DOAB bitstream API rate-limited (HTTP 429) for %s. Retry-After: %s', handle, retry_after)
return None
items = response.json()
if items:
for stream in items[0]['bitstreams']:
Expand Down