44import logging
55import time
66import feedparser
7- import re
87import os
8+ import re
9+ import requests
910import warnings
1011
1112from urllib .parse import urlencode
@@ -514,7 +515,9 @@ class Client(object):
514515 """Number of seconds to wait between API requests."""
515516 num_retries : int
516517 """Number of times to retry a failing API request."""
518+
517519 _last_request_dt : datetime
520+ _session : requests .Session
518521
519522 def __init__ (
520523 self , page_size : int = 100 , delay_seconds : int = 3 , num_retries : int = 3
@@ -531,6 +534,7 @@ def __init__(
531534 self .delay_seconds = delay_seconds
532535 self .num_retries = num_retries
533536 self ._last_request_dt = None
537+ self ._session = requests .Session ()
534538
535539 def __str__ (self ) -> str :
536540 # TODO: develop a more informative string representation.
@@ -579,7 +583,7 @@ def results(self, search: Search, offset: int = 0) -> Generator[Result, None, No
579583 page_size = min (self .page_size , search .max_results - offset )
580584 logger .info ("Requesting %d results at offset %d" , page_size , offset )
581585 page_url = self ._format_url (search , offset , page_size )
582- feed = self ._parse_feed (page_url , first_page )
586+ feed = self ._parse_feed (page_url , first_page = first_page )
583587 if first_page :
584588 # NOTE: this is an ugly fix for a known bug. The totalresults
585589 # value is set to 1 for results with zero entries. If that API
@@ -626,32 +630,42 @@ def _format_url(self, search: Search, start: int, page_size: int) -> str:
626630 return self .query_url_format .format (urlencode (url_args ))
627631
628632 def _parse_feed (
629- self , url : str , first_page : bool = True
633+ self , url : str , first_page : bool = True , _try_index : int = 0
630634 ) -> feedparser .FeedParserDict :
631635 """
632636 Fetches the specified URL and parses it with feedparser.
633637
634638 If a request fails or is unexpectedly empty, retries the request up to
635639 `self.num_retries` times.
636640 """
637- # Invoke the recursive helper with initial available retries.
638- return self .__try_parse_feed (
639- url , first_page = first_page , retries_left = self .num_retries
640- )
641+ try :
642+ return self .__try_parse_feed (
643+ url , first_page = first_page , try_index = _try_index
644+ )
645+ except (
646+ HTTPError ,
647+ UnexpectedEmptyPageError ,
648+ requests .exceptions .ConnectionError ,
649+ ) as err :
650+ if _try_index < self .num_retries :
651+ logger .debug ("Got error (try %d): %s" , _try_index , err )
652+ return self ._parse_feed (
653+ url , first_page = first_page , _try_index = _try_index + 1
654+ )
655+ logger .debug ("Giving up (try %d): %s" , _try_index , err )
656+ raise err
641657
642658 def __try_parse_feed (
643659 self ,
644660 url : str ,
645661 first_page : bool ,
646- retries_left : int ,
647- last_err : Exception = None ,
662+ try_index : int ,
648663 ) -> feedparser .FeedParserDict :
649664 """
650665 Recursive helper for _parse_feed. Enforces `self.delay_seconds`: if that
651666 number of seconds has not passed since `_parse_feed` was last called,
652667 sleeps until delay_seconds seconds have passed.
653668 """
654- retry = self .num_retries - retries_left
655669 # If this call would violate the rate limit, sleep until it doesn't.
656670 if self ._last_request_dt is not None :
657671 required = timedelta (seconds = self .delay_seconds )
@@ -660,34 +674,26 @@ def __try_parse_feed(
660674 to_sleep = (required - since_last_request ).total_seconds ()
661675 logger .info ("Sleeping: %f seconds" , to_sleep )
662676 time .sleep (to_sleep )
677+
663678 logger .info (
664- "Requesting page (try %d): %s" ,
665- retry ,
666- url ,
667- extra = {
668- "first_page" : first_page ,
669- "last_err" : last_err .message if last_err is not None else None ,
670- },
679+ "Requesting page (first: %r, try: %d): %s" , first_page , try_index , url
671680 )
672- feed = feedparser .parse (url )
681+
682+ resp = self ._session .get (url , headers = {"user-agent" : "arxiv.py/1.4.8" })
673683 self ._last_request_dt = datetime .now ()
674- err = None
675- if feed .status != 200 :
676- err = HTTPError (url , retry , feed )
677- elif len (feed .entries ) == 0 and not first_page :
678- err = UnexpectedEmptyPageError (url , retry )
679- if err is not None :
680- logger .debug ("Got error (try %d): %s" , retry , err )
681- if retries_left > 0 :
682- return self .__try_parse_feed (
683- url ,
684- first_page = first_page ,
685- retries_left = retries_left - 1 ,
686- last_err = err ,
687- )
688- # Feed was never returned in self.num_retries tries. Raise the last
689- # exception encountered.
690- raise err
684+ if resp .status_code != requests .codes .OK :
685+ raise HTTPError (url , try_index , resp .status_code )
686+
687+ feed = feedparser .parse (resp .content )
688+ if len (feed .entries ) == 0 and not first_page :
689+ raise UnexpectedEmptyPageError (url , try_index , feed )
690+
691+ if feed .bozo :
692+ logger .warning (
693+ "Bozo feed; consider handling: %s" ,
694+ feed .bozo_exception if "bozo_exception" in feed else None ,
695+ )
696+
691697 return feed
692698
693699
@@ -727,16 +733,25 @@ class UnexpectedEmptyPageError(ArxivError):
727733 See `Client.results` for usage.
728734 """
729735
730- def __init__ (self , url : str , retry : int ):
736+ raw_feed : feedparser .FeedParserDict
737+ """
738+ The raw output of `feedparser.parse`. Sometimes this contains useful
739+ diagnostic information, e.g. in 'bozo_exception'.
740+ """
741+
742+ def __init__ (self , url : str , retry : int , raw_feed : feedparser .FeedParserDict ):
731743 """
732744 Constructs an `UnexpectedEmptyPageError` encountered for the specified
733745 API URL after `retry` tries.
734746 """
735747 self .url = url
748+ self .raw_feed = raw_feed
736749 super ().__init__ (url , retry , "Page of results was unexpectedly empty" )
737750
738751 def __repr__ (self ) -> str :
739- return "{}({}, {})" .format (_classname (self ), repr (self .url ), repr (self .retry ))
752+ return "{}({}, {}, {})" .format (
753+ _classname (self ), repr (self .url ), repr (self .retry ), repr (self .raw_feed )
754+ )
740755
741756
742757class HTTPError (ArxivError ):
@@ -748,29 +763,18 @@ class HTTPError(ArxivError):
748763
749764 status : int
750765 """The HTTP status reported by feedparser."""
751- entry : feedparser .FeedParserDict
752- """The feed entry describing the error, if present."""
753766
754- def __init__ (self , url : str , retry : int , feed : feedparser . FeedParserDict ):
767+ def __init__ (self , url : str , retry : int , status : int ):
755768 """
756769 Constructs an `HTTPError` for the specified status code, encountered for
757770 the specified API URL after `retry` tries.
758771 """
759772 self .url = url
760- self .status = feed .status
761- # If the feed is valid and includes a single entry, trust it's an
762- # explanation.
763- if not feed .bozo and len (feed .entries ) == 1 :
764- self .entry = feed .entries [0 ]
765- else :
766- self .entry = None
773+ self .status = status
767774 super ().__init__ (
768775 url ,
769776 retry ,
770- "Page request resulted in HTTP {}: {}" .format (
771- self .status ,
772- self .entry .summary if self .entry else None ,
773- ),
777+ "Page request resulted in HTTP {}" .format (self .status ),
774778 )
775779
776780 def __repr__ (self ) -> str :
0 commit comments