Skip to content

Commit eb930dd

Browse files
authored
Separate HTTP request from feedparser.parse (#136)
1 parent 43c907f commit eb930dd

File tree

7 files changed

+123
-88
lines changed

7 files changed

+123
-88
lines changed

.github/workflows/python-package.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ jobs:
1111
test:
1212
runs-on: ubuntu-latest
1313
strategy:
14+
fail-fast: false
15+
max-parallel: 1
1416
matrix:
1517
python-version: ["3.7", "3.10", "3.11"]
1618
steps:

arxiv/arxiv.py

Lines changed: 55 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@
44
import logging
55
import time
66
import feedparser
7-
import re
87
import os
8+
import re
9+
import requests
910
import warnings
1011

1112
from urllib.parse import urlencode
@@ -514,7 +515,9 @@ class Client(object):
514515
"""Number of seconds to wait between API requests."""
515516
num_retries: int
516517
"""Number of times to retry a failing API request."""
518+
517519
_last_request_dt: datetime
520+
_session: requests.Session
518521

519522
def __init__(
520523
self, page_size: int = 100, delay_seconds: int = 3, num_retries: int = 3
@@ -531,6 +534,7 @@ def __init__(
531534
self.delay_seconds = delay_seconds
532535
self.num_retries = num_retries
533536
self._last_request_dt = None
537+
self._session = requests.Session()
534538

535539
def __str__(self) -> str:
536540
# TODO: develop a more informative string representation.
@@ -579,7 +583,7 @@ def results(self, search: Search, offset: int = 0) -> Generator[Result, None, No
579583
page_size = min(self.page_size, search.max_results - offset)
580584
logger.info("Requesting %d results at offset %d", page_size, offset)
581585
page_url = self._format_url(search, offset, page_size)
582-
feed = self._parse_feed(page_url, first_page)
586+
feed = self._parse_feed(page_url, first_page=first_page)
583587
if first_page:
584588
# NOTE: this is an ugly fix for a known bug. The totalresults
585589
# value is set to 1 for results with zero entries. If that API
@@ -626,32 +630,42 @@ def _format_url(self, search: Search, start: int, page_size: int) -> str:
626630
return self.query_url_format.format(urlencode(url_args))
627631

628632
def _parse_feed(
629-
self, url: str, first_page: bool = True
633+
self, url: str, first_page: bool = True, _try_index: int = 0
630634
) -> feedparser.FeedParserDict:
631635
"""
632636
Fetches the specified URL and parses it with feedparser.
633637
634638
If a request fails or is unexpectedly empty, retries the request up to
635639
`self.num_retries` times.
636640
"""
637-
# Invoke the recursive helper with initial available retries.
638-
return self.__try_parse_feed(
639-
url, first_page=first_page, retries_left=self.num_retries
640-
)
641+
try:
642+
return self.__try_parse_feed(
643+
url, first_page=first_page, try_index=_try_index
644+
)
645+
except (
646+
HTTPError,
647+
UnexpectedEmptyPageError,
648+
requests.exceptions.ConnectionError,
649+
) as err:
650+
if _try_index < self.num_retries:
651+
logger.debug("Got error (try %d): %s", _try_index, err)
652+
return self._parse_feed(
653+
url, first_page=first_page, _try_index=_try_index + 1
654+
)
655+
logger.debug("Giving up (try %d): %s", _try_index, err)
656+
raise err
641657

642658
def __try_parse_feed(
643659
self,
644660
url: str,
645661
first_page: bool,
646-
retries_left: int,
647-
last_err: Exception = None,
662+
try_index: int,
648663
) -> feedparser.FeedParserDict:
649664
"""
650665
Single-attempt helper for _parse_feed. Enforces `self.delay_seconds`: if that
651666
number of seconds has not passed since `_parse_feed` was last called,
652667
sleeps until delay_seconds seconds have passed.
653668
"""
654-
retry = self.num_retries - retries_left
655669
# If this call would violate the rate limit, sleep until it doesn't.
656670
if self._last_request_dt is not None:
657671
required = timedelta(seconds=self.delay_seconds)
@@ -660,34 +674,26 @@ def __try_parse_feed(
660674
to_sleep = (required - since_last_request).total_seconds()
661675
logger.info("Sleeping: %f seconds", to_sleep)
662676
time.sleep(to_sleep)
677+
663678
logger.info(
664-
"Requesting page (try %d): %s",
665-
retry,
666-
url,
667-
extra={
668-
"first_page": first_page,
669-
"last_err": last_err.message if last_err is not None else None,
670-
},
679+
"Requesting page (first: %r, try: %d): %s", first_page, try_index, url
671680
)
672-
feed = feedparser.parse(url)
681+
682+
resp = self._session.get(url, headers={"user-agent": "arxiv.py/1.4.8"})
673683
self._last_request_dt = datetime.now()
674-
err = None
675-
if feed.status != 200:
676-
err = HTTPError(url, retry, feed)
677-
elif len(feed.entries) == 0 and not first_page:
678-
err = UnexpectedEmptyPageError(url, retry)
679-
if err is not None:
680-
logger.debug("Got error (try %d): %s", retry, err)
681-
if retries_left > 0:
682-
return self.__try_parse_feed(
683-
url,
684-
first_page=first_page,
685-
retries_left=retries_left - 1,
686-
last_err=err,
687-
)
688-
# Feed was never returned in self.num_retries tries. Raise the last
689-
# exception encountered.
690-
raise err
684+
if resp.status_code != requests.codes.OK:
685+
raise HTTPError(url, try_index, resp.status_code)
686+
687+
feed = feedparser.parse(resp.content)
688+
if len(feed.entries) == 0 and not first_page:
689+
raise UnexpectedEmptyPageError(url, try_index, feed)
690+
691+
if feed.bozo:
692+
logger.warning(
693+
"Bozo feed; consider handling: %s",
694+
feed.bozo_exception if "bozo_exception" in feed else None,
695+
)
696+
691697
return feed
692698

693699

@@ -727,16 +733,25 @@ class UnexpectedEmptyPageError(ArxivError):
727733
See `Client.results` for usage.
728734
"""
729735

730-
def __init__(self, url: str, retry: int):
736+
raw_feed: feedparser.FeedParserDict
737+
"""
738+
The raw output of `feedparser.parse`. Sometimes this contains useful
739+
diagnostic information, e.g. in 'bozo_exception'.
740+
"""
741+
742+
def __init__(self, url: str, retry: int, raw_feed: feedparser.FeedParserDict):
731743
"""
732744
Constructs an `UnexpectedEmptyPageError` encountered for the specified
733745
API URL after `retry` tries.
734746
"""
735747
self.url = url
748+
self.raw_feed = raw_feed
736749
super().__init__(url, retry, "Page of results was unexpectedly empty")
737750

738751
def __repr__(self) -> str:
739-
return "{}({}, {})".format(_classname(self), repr(self.url), repr(self.retry))
752+
return "{}({}, {}, {})".format(
753+
_classname(self), repr(self.url), repr(self.retry), repr(self.raw_feed)
754+
)
740755

741756

742757
class HTTPError(ArxivError):
@@ -748,29 +763,18 @@ class HTTPError(ArxivError):
748763

749764
status: int
750765
"""The HTTP status code of the API response."""
751-
entry: feedparser.FeedParserDict
752-
"""The feed entry describing the error, if present."""
753766

754-
def __init__(self, url: str, retry: int, feed: feedparser.FeedParserDict):
767+
def __init__(self, url: str, retry: int, status: int):
755768
"""
756769
Constructs an `HTTPError` for the specified status code, encountered for
757770
the specified API URL after `retry` tries.
758771
"""
759772
self.url = url
760-
self.status = feed.status
761-
# If the feed is valid and includes a single entry, trust it's an
762-
# explanation.
763-
if not feed.bozo and len(feed.entries) == 1:
764-
self.entry = feed.entries[0]
765-
else:
766-
self.entry = None
773+
self.status = status
767774
super().__init__(
768775
url,
769776
retry,
770-
"Page request resulted in HTTP {}: {}".format(
771-
self.status,
772-
self.entry.summary if self.entry else None,
773-
),
777+
"Page request resulted in HTTP {}".format(self.status),
774778
)
775779

776780
def __repr__(self) -> str:

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
feedparser==6.0.6
1+
feedparser==6.0.10
2+
requests==2.31.0
23

34
# Development dependencies
45
pytest>=6.2.2

setup.cfg

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,4 @@ description_file = README.md
44
[tool:pytest]
55
addopts = --verbose
66
log_cli = True
7-
log_cli_level = INFO
8-
7+
log_cli_level = DEBUG

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
packages=["arxiv"],
1212
# dependencies
1313
python_requires=">=3.7",
14-
install_requires=["feedparser==6.0.6"],
14+
install_requires=["feedparser==6.0.10", "requests==2.31.0"],
1515
tests_require=["pytest", "pdoc", "ruff"],
1616
# metadata for upload to PyPI
1717
author="Lukas Schwab",

0 commit comments

Comments
 (0)