Skip to content

Commit 7d8e800

Browse files
committed
Move Session into _http.py
1 parent 0d9517a commit 7d8e800

File tree

4 files changed

+334
-329
lines changed

4 files changed

+334
-329
lines changed

src/wayback/__init__.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,5 +12,6 @@
1212

1313
from ._client import ( # noqa
1414
Mode,
15-
WaybackClient,
16-
WaybackSession)
15+
WaybackClient)
16+
17+
from ._http import WaybackSession # noqa

src/wayback/_client.py

Lines changed: 4 additions & 319 deletions
Original file line numberDiff line numberDiff line change
@@ -19,29 +19,18 @@
1919
import hashlib
2020
import logging
2121
import re
22-
import time
23-
from urllib.parse import urljoin, urlparse
24-
from urllib3 import PoolManager, Timeout as Urllib3Timeout
25-
from urllib3.exceptions import (ConnectTimeoutError,
26-
MaxRetryError,
27-
ProtocolError,
28-
ReadTimeoutError,
29-
ProxyError,
30-
TimeoutError)
22+
from urllib.parse import urljoin
3123

3224
from warnings import warn
33-
from . import _utils, __version__
34-
from ._http import InternalHttpResponse, serialize_querystring # noqa
25+
from . import _utils
26+
from ._http import is_memento_response, WaybackSession # noqa
3527
from ._models import CdxRecord, Memento
3628
from .exceptions import (WaybackException,
3729
UnexpectedResponseFormat,
3830
BlockedByRobotsError,
3931
BlockedSiteError,
4032
MementoPlaybackError,
41-
NoMementoError,
42-
WaybackRetryError,
43-
RateLimitError,
44-
SessionClosedError)
33+
NoMementoError)
4534

4635

4736
logger = logging.getLogger(__name__)
@@ -69,16 +58,6 @@
6958
# Make sure it roughly starts with a valid protocol + domain + port?
7059
URL_ISH = re.compile(r'^[\w+\-]+://[^/?=&]+\.\w\w+(:\d+)?(/|$)')
7160

72-
# Global default rate limits for various endpoints. Internet Archive folks have
73-
# asked us to set the defaults at 80% of the hard limits.
74-
DEFAULT_CDX_RATE_LIMIT = _utils.RateLimit(0.8 * 60 / 60)
75-
DEFAULT_TIMEMAP_RATE_LIMIT = _utils.RateLimit(0.8 * 100 / 60)
76-
DEFAULT_MEMENTO_RATE_LIMIT = _utils.RateLimit(0.8 * 600 / 60)
77-
78-
# If a rate limit response (i.e. a response with status == 429) does not
79-
# include a `Retry-After` header, recommend pausing for this long.
80-
DEFAULT_RATE_LIMIT_DELAY = 60
81-
8261

8362
class Mode(Enum):
8463
"""
@@ -147,10 +126,6 @@ def is_malformed_url(url):
147126
return False
148127

149128

150-
def is_memento_response(response):
151-
return 'Memento-Datetime' in response.headers
152-
153-
154129
def cdx_hash(content):
155130
if isinstance(content, str):
156131
content = content.encode()
@@ -258,296 +233,6 @@ def clean_memento_links(links, mode):
258233
return result
259234

260235

261-
class WaybackSession:
262-
"""
263-
Manages HTTP requests to Wayback Machine servers, handling things like
264-
retries, rate limiting, connection pooling, timeouts, etc.
265-
266-
Parameters
267-
----------
268-
retries : int, default: 6
269-
The maximum number of retries for requests.
270-
backoff : int or float, default: 2
271-
Number of seconds from which to calculate how long to back off and wait
272-
when retrying requests. The first retry is always immediate, but
273-
subsequent retries increase by powers of 2:
274-
275-
seconds = backoff * 2 ^ (retry number - 1)
276-
277-
So if this was `4`, retries would happen after the following delays:
278-
0 seconds, 4 seconds, 8 seconds, 16 seconds, ...
279-
timeout : int or float or tuple of (int or float, int or float), default: 60
280-
A timeout to use for all requests.
281-
See the Requests docs for more:
282-
https://docs.python-requests.org/en/master/user/advanced/#timeouts
283-
user_agent : str, optional
284-
A custom user-agent string to use in all requests. Defaults to:
285-
`wayback/{version} (+https://github.com/edgi-govdata-archiving/wayback)`
286-
search_calls_per_second : wayback.RateLimit or int or float, default: 0.8
287-
The maximum number of calls per second made to the CDX search API.
288-
To disable the rate limit, set this to 0.
289-
290-
To have multiple sessions share a rate limit (so requests made by one
291-
session count towards the limit of the other session), use a
292-
single :class:`wayback.RateLimit` instance and pass it to each
293-
``WaybackSession`` instance. If you do not set a limit, the default
294-
limit is shared globally across all sessions.
295-
memento_calls_per_second : wayback.RateLimit or int or float, default: 8
296-
The maximum number of calls per second made to the memento API.
297-
To disable the rate limit, set this to 0.
298-
299-
To have multiple sessions share a rate limit (so requests made by one
300-
session count towards the limit of the other session), use a
301-
single :class:`wayback.RateLimit` instance and pass it to each
302-
``WaybackSession`` instance. If you do not set a limit, the default
303-
limit is shared globally across all sessions.
304-
timemap_calls_per_second : wayback.RateLimit or int or float, default: 1.33
305-
The maximum number of calls per second made to the timemap API (the
306-
Wayback Machine's new, beta CDX search is part of the timemap API).
307-
To disable the rate limit, set this to 0.
308-
309-
To have multiple sessions share a rate limit (so requests made by one
310-
session count towards the limit of the other session), use a
311-
single :class:`wayback.RateLimit` instance and pass it to each
312-
``WaybackSession`` instance. If you do not set a limit, the default
313-
limit is shared globally across all sessions.
314-
"""
315-
316-
# It seems Wayback sometimes produces 500 errors for transient issues, so
317-
# they make sense to retry here. Usually not in other contexts, though.
318-
retryable_statuses = frozenset((413, 421, 500, 502, 503, 504, 599))
319-
320-
# XXX: TimeoutError should be a base class for both ConnectTimeoutError
321-
# and ReadTimeoutError, so we don't need them here...?
322-
retryable_errors = (ConnectTimeoutError, MaxRetryError, ReadTimeoutError,
323-
ProxyError, TimeoutError,
324-
# XXX: These used to be wrapped with
325-
# requests.ConnectionError, which we would then have to
326-
# inspect to see if it needed retrying. Need to make
327-
# sure/think through whether these should be retried.
328-
ProtocolError, OSError)
329-
# Handleable errors *may* be retryable, but need additional logic beyond
330-
# just the error type. See `should_retry_error()`.
331-
#
332-
# XXX: see notes above about what should get retried; which things need to
333-
# be caught but then more deeply inspected, blah blah blah:
334-
# handleable_errors = (ConnectionError,) + retryable_errors
335-
handleable_errors = () + retryable_errors
336-
337-
def __init__(self, retries=6, backoff=2, timeout=60, user_agent=None,
338-
search_calls_per_second=DEFAULT_CDX_RATE_LIMIT,
339-
memento_calls_per_second=DEFAULT_MEMENTO_RATE_LIMIT,
340-
timemap_calls_per_second=DEFAULT_TIMEMAP_RATE_LIMIT):
341-
super().__init__()
342-
self.retries = retries
343-
self.backoff = backoff
344-
self.timeout = timeout
345-
self.headers = {
346-
'User-Agent': (user_agent or
347-
f'wayback/{__version__} (+https://github.com/edgi-govdata-archiving/wayback)'),
348-
'Accept-Encoding': 'gzip, deflate'
349-
}
350-
self.rate_limts = {
351-
'/web/timemap': _utils.RateLimit.make_limit(timemap_calls_per_second),
352-
'/cdx': _utils.RateLimit.make_limit(search_calls_per_second),
353-
# The memento limit is actually a generic Wayback limit.
354-
'/': _utils.RateLimit.make_limit(memento_calls_per_second),
355-
}
356-
# XXX: These parameters are the same as requests, but we have had at
357-
# least one user reach in and change the adapters we used with requests
358-
# to modify these. We should consider whether different values are
359-
# appropriate (e.g. block=True) or if these need to be exposed somehow.
360-
#
361-
# XXX: Consider using a HTTPSConnectionPool instead of a PoolManager.
362-
# We can make some code simpler if we are always assuming the same host.
363-
# (At present, we only use one host, so this is feasible.)
364-
#
365-
# XXX: Do we need a cookie jar? urllib3 doesn't do any cookie management
366-
# for us, and the Wayback Machine may set some cookies we should retain
367-
# in subsequent requests. (In practice, it doesn't appear the CDX,
368-
# Memento, or Timemap APIs do by default, but not sure what happens if
369-
# you send S3-style credentials or use other endpoints.)
370-
self._pool_manager = PoolManager(
371-
num_pools=10,
372-
maxsize=10,
373-
block=False,
374-
)
375-
# NOTE: the nice way to accomplish retry/backoff is with a urllib3 `Retry`:
376-
# adapter = requests.adapters.HTTPAdapter(
377-
# max_retries=Retry(total=5, backoff_factor=2,
378-
# status_forcelist=(503, 504)))
379-
# self.mount('http://', adapter)
380-
# But Wayback mementos can have errors, which complicates things. See:
381-
# https://github.com/urllib3/urllib3/issues/1445#issuecomment-422950868
382-
#
383-
# Also note that, if we are ever able to switch to that, we may need to
384-
# get more fancy with log filtering, since we *expect* lots of retries
385-
# with Wayback's APIs, but urllib3 logs a warning on every retry:
386-
# https://github.com/urllib3/urllib3/blob/5b047b645f5f93900d5e2fc31230848c25eb1f5f/src/urllib3/connectionpool.py#L730-L737
387-
388-
def request(self, method, url, *, params=None, allow_redirects=True, timeout=-1) -> InternalHttpResponse:
389-
if not self._pool_manager:
390-
raise SessionClosedError('This session has already been closed '
391-
'and cannot send new HTTP requests.')
392-
393-
start_time = time.time()
394-
maximum = self.retries
395-
retries = 0
396-
397-
timeout = self.timeout if timeout is -1 else timeout
398-
# XXX: grabbed from requests. Clean up for our use case.
399-
if isinstance(timeout, tuple):
400-
try:
401-
connect, read = timeout
402-
timeout = Urllib3Timeout(connect=connect, read=read)
403-
except ValueError:
404-
raise ValueError(
405-
f"Invalid timeout {timeout}. Pass a (connect, read) timeout tuple, "
406-
f"or a single float to set both timeouts to the same value."
407-
)
408-
elif isinstance(timeout, Urllib3Timeout):
409-
pass
410-
else:
411-
timeout = Urllib3Timeout(connect=timeout, read=timeout)
412-
413-
parsed = urlparse(url)
414-
for path, limit in self.rate_limts.items():
415-
if parsed.path.startswith(path):
416-
rate_limit = limit
417-
break
418-
else:
419-
rate_limit = DEFAULT_MEMENTO_RATE_LIMIT
420-
421-
# Do our own querystring work since urllib3 serializes lists poorly.
422-
if params:
423-
serialized = serialize_querystring(params)
424-
if parsed.query:
425-
url += f'&{serialized}'
426-
else:
427-
url += f'?{serialized}'
428-
429-
while True:
430-
retry_delay = 0
431-
try:
432-
# XXX: should be `debug()`. Set to warning for testing.
433-
logger.warning('sending HTTP request %s "%s", %s', method, url, params)
434-
rate_limit.wait()
435-
response = InternalHttpResponse(self._pool_manager.request(
436-
method=method,
437-
url=url,
438-
# fields=serialize_querystring(params),
439-
headers=self.headers,
440-
# XXX: is allow_redirects safe for preload_content == False?
441-
# XXX: it is, BUT THAT SKIPS OUR RATE LIMITING, which also
442-
# is obviously already a problem today, but we ought to get
443-
# it fixed now. Leaving this on for the moment, but it
444-
# must be addressed before merging.
445-
redirect=allow_redirects,
446-
preload_content=False,
447-
timeout=timeout
448-
), url)
449-
450-
retry_delay = self.get_retry_delay(retries, response)
451-
452-
if retries >= maximum or not self.should_retry(response):
453-
if response.status_code == 429:
454-
response.close()
455-
raise RateLimitError(response, retry_delay)
456-
return response
457-
else:
458-
logger.debug('Received error response (status: %s), will retry', response.status_code)
459-
response.close(cache=False)
460-
# XXX: urllib3's MaxRetryError can wrap all the other errors, so
461-
# we should probably be checking `error.reason` on it. See how
462-
# requests handles this:
463-
# https://github.com/psf/requests/blob/a25fde6989f8df5c3d823bc9f2e2fc24aa71f375/src/requests/adapters.py#L502-L537
464-
#
465-
# XXX: requests.RetryError used to be in our list of handleable
466-
# errors; it gets raised when urllib3 raises a MaxRetryError with a
467-
# ResponseError for its `reason` attribute. Need to test the
468-
# situation here...
469-
#
470-
# XXX: Consider how read-related exceptions need to be handled (or
471-
# not). In requests:
472-
# https://github.com/psf/requests/blob/a25fde6989f8df5c3d823bc9f2e2fc24aa71f375/src/requests/models.py#L794-L839
473-
except WaybackSession.handleable_errors as error:
474-
response = getattr(error, 'response', None)
475-
if response is not None:
476-
response.close()
477-
478-
if retries >= maximum:
479-
raise WaybackRetryError(retries, time.time() - start_time, error) from error
480-
elif self.should_retry_error(error):
481-
retry_delay = self.get_retry_delay(retries, response)
482-
logger.info('Caught exception during request, will retry: %s', error)
483-
else:
484-
raise
485-
486-
logger.debug('Will retry after sleeping for %s seconds...', retry_delay)
487-
time.sleep(retry_delay)
488-
retries += 1
489-
490-
def should_retry(self, response: InternalHttpResponse):
491-
# A memento may actually be a capture of an error, so don't retry it :P
492-
if is_memento_response(response):
493-
return False
494-
495-
return response.status_code in self.retryable_statuses
496-
497-
def should_retry_error(self, error):
498-
if isinstance(error, WaybackSession.retryable_errors):
499-
return True
500-
# XXX: ConnectionError was a broad wrapper from requests; there are more
501-
# narrow errors in urllib3 we can catch, so this is probably (???) no
502-
# longer relevant. But urllib3 has some other wrapper exceptions that we
503-
# might need to dig into more, see:
504-
# https://github.com/psf/requests/blob/a25fde6989f8df5c3d823bc9f2e2fc24aa71f375/src/requests/adapters.py#L502-L537
505-
#
506-
# elif isinstance(error, ConnectionError):
507-
# # ConnectionErrors from requests actually wrap a whole family of
508-
# # more detailed errors from urllib3, so we need to do some string
509-
# # checking to determine whether the error is retryable.
510-
# text = str(error)
511-
# # NOTE: we have also seen this, which may warrant retrying:
512-
# # `requests.exceptions.ConnectionError: ('Connection aborted.',
513-
# # RemoteDisconnected('Remote end closed connection without
514-
# # response'))`
515-
# if 'NewConnectionError' in text or 'Max retries' in text:
516-
# return True
517-
518-
return False
519-
520-
def get_retry_delay(self, retries, response: InternalHttpResponse = None):
521-
delay = 0
522-
523-
# As of 2023-11-27, the Wayback Machine does not set a `Retry-After`
524-
# header, so this parsing is just future-proofing.
525-
if response is not None:
526-
delay = _utils.parse_retry_after(response.headers.get('Retry-After')) or delay
527-
if response.status_code == 429 and delay == 0:
528-
delay = DEFAULT_RATE_LIMIT_DELAY
529-
530-
# No default backoff on the first retry.
531-
if retries > 0:
532-
delay = max(self.backoff * 2 ** (retries - 1), delay)
533-
534-
return delay
535-
536-
# XXX: Needs to do the right thing. Requests sessions closed all their
537-
# adapters, which does:
538-
# self.poolmanager.clear()
539-
# for proxy in self.proxy_manager.values():
540-
# proxy.clear()
541-
def reset(self):
542-
"Reset any network connections the session is using."
543-
self._pool_manager.clear()
544-
545-
def close(self) -> None:
546-
if self._pool_manager:
547-
self._pool_manager.clear()
548-
self._pool_manager = None
549-
550-
551236
# TODO: add retry, backoff, cross_thread_backoff, and rate_limit options that
552237
# create a custom instance of urllib3.util.Retry
553238
class WaybackClient(_utils.DepthCountedContext):

0 commit comments

Comments
 (0)