|
19 | 19 | import hashlib |
20 | 20 | import logging |
21 | 21 | import re |
22 | | -import time |
23 | | -from urllib.parse import urljoin, urlparse |
24 | | -from urllib3 import PoolManager, Timeout as Urllib3Timeout |
25 | | -from urllib3.exceptions import (ConnectTimeoutError, |
26 | | - MaxRetryError, |
27 | | - ProtocolError, |
28 | | - ReadTimeoutError, |
29 | | - ProxyError, |
30 | | - TimeoutError) |
| 22 | +from urllib.parse import urljoin |
31 | 23 |
|
32 | 24 | from warnings import warn |
33 | | -from . import _utils, __version__ |
34 | | -from ._http import InternalHttpResponse, serialize_querystring # noqa |
| 25 | +from . import _utils |
| 26 | +from ._http import is_memento_response, WaybackSession # noqa |
35 | 27 | from ._models import CdxRecord, Memento |
36 | 28 | from .exceptions import (WaybackException, |
37 | 29 | UnexpectedResponseFormat, |
38 | 30 | BlockedByRobotsError, |
39 | 31 | BlockedSiteError, |
40 | 32 | MementoPlaybackError, |
41 | | - NoMementoError, |
42 | | - WaybackRetryError, |
43 | | - RateLimitError, |
44 | | - SessionClosedError) |
| 33 | + NoMementoError) |
45 | 34 |
|
46 | 35 |
|
47 | 36 | logger = logging.getLogger(__name__) |
|
69 | 58 | # Make sure it roughly starts with a valid protocol + domain + port? |
70 | 59 | URL_ISH = re.compile(r'^[\w+\-]+://[^/?=&]+\.\w\w+(:\d+)?(/|$)') |
71 | 60 |
|
72 | | -# Global default rate limits for various endpoints. Internet Archive folks have |
73 | | -# asked us to set the defaults at 80% of the hard limits. |
74 | | -DEFAULT_CDX_RATE_LIMIT = _utils.RateLimit(0.8 * 60 / 60) |
75 | | -DEFAULT_TIMEMAP_RATE_LIMIT = _utils.RateLimit(0.8 * 100 / 60) |
76 | | -DEFAULT_MEMENTO_RATE_LIMIT = _utils.RateLimit(0.8 * 600 / 60) |
77 | | - |
78 | | -# If a rate limit response (i.e. a response with status == 429) does not |
79 | | -# include a `Retry-After` header, recommend pausing for this long. |
80 | | -DEFAULT_RATE_LIMIT_DELAY = 60 |
81 | | - |
82 | 61 |
|
83 | 62 | class Mode(Enum): |
84 | 63 | """ |
@@ -147,10 +126,6 @@ def is_malformed_url(url): |
147 | 126 | return False |
148 | 127 |
|
149 | 128 |
|
150 | | -def is_memento_response(response): |
151 | | - return 'Memento-Datetime' in response.headers |
152 | | - |
153 | | - |
154 | 129 | def cdx_hash(content): |
155 | 130 | if isinstance(content, str): |
156 | 131 | content = content.encode() |
@@ -258,296 +233,6 @@ def clean_memento_links(links, mode): |
258 | 233 | return result |
259 | 234 |
|
260 | 235 |
|
261 | | -class WaybackSession: |
262 | | - """ |
263 | | - Manages HTTP requests to Wayback Machine servers, handling things like |
264 | | - retries, rate limiting, connection pooling, timeouts, etc. |
265 | | -
|
266 | | - Parameters |
267 | | - ---------- |
268 | | - retries : int, default: 6 |
269 | | - The maximum number of retries for requests. |
270 | | - backoff : int or float, default: 2 |
271 | | - Number of seconds from which to calculate how long to back off and wait |
272 | | - when retrying requests. The first retry is always immediate, but |
273 | | - subsequent retries increase by powers of 2: |
274 | | -
|
275 | | - seconds = backoff * 2 ^ (retry number - 1) |
276 | | -
|
277 | | - So if this was `4`, retries would happen after the following delays: |
278 | | - 0 seconds, 4 seconds, 8 seconds, 16 seconds, ... |
279 | | - timeout : int or float or tuple of (int or float, int or float), default: 60 |
280 | | - A timeout to use for all requests. |
281 | | - See the Requests docs for more: |
282 | | - https://docs.python-requests.org/en/master/user/advanced/#timeouts |
283 | | - user_agent : str, optional |
284 | | - A custom user-agent string to use in all requests. Defaults to: |
285 | | - `wayback/{version} (+https://github.com/edgi-govdata-archiving/wayback)` |
286 | | - search_calls_per_second : wayback.RateLimit or int or float, default: 0.8 |
287 | | - The maximum number of calls per second made to the CDX search API. |
288 | | - To disable the rate limit, set this to 0. |
289 | | -
|
290 | | - To have multiple sessions share a rate limit (so requests made by one |
291 | | - session count towards the limit of the other session), use a |
292 | | - single :class:`wayback.RateLimit` instance and pass it to each |
293 | | - ``WaybackSession`` instance. If you do not set a limit, the default |
294 | | - limit is shared globally across all sessions. |
295 | | - memento_calls_per_second : wayback.RateLimit or int or float, default: 8 |
296 | | - The maximum number of calls per second made to the memento API. |
297 | | - To disable the rate limit, set this to 0. |
298 | | -
|
299 | | - To have multiple sessions share a rate limit (so requests made by one |
300 | | - session count towards the limit of the other session), use a |
301 | | - single :class:`wayback.RateLimit` instance and pass it to each |
302 | | - ``WaybackSession`` instance. If you do not set a limit, the default |
303 | | - limit is shared globally across all sessions. |
304 | | - timemap_calls_per_second : wayback.RateLimit or int or float, default: 1.33 |
305 | | - The maximum number of calls per second made to the timemap API (the |
306 | | - Wayback Machine's new, beta CDX search is part of the timemap API). |
307 | | - To disable the rate limit, set this to 0. |
308 | | -
|
309 | | - To have multiple sessions share a rate limit (so requests made by one |
310 | | - session count towards the limit of the other session), use a |
311 | | - single :class:`wayback.RateLimit` instance and pass it to each |
312 | | - ``WaybackSession`` instance. If you do not set a limit, the default |
313 | | - limit is shared globally across all sessions. |
314 | | - """ |
315 | | - |
316 | | - # It seems Wayback sometimes produces 500 errors for transient issues, so |
317 | | - # they make sense to retry here. Usually not in other contexts, though. |
318 | | - retryable_statuses = frozenset((413, 421, 500, 502, 503, 504, 599)) |
319 | | - |
320 | | - # XXX: TimeoutError should be a base class for both ConnectTimeoutError |
321 | | - # and ReadTimeoutError, so we don't need them here...? |
322 | | - retryable_errors = (ConnectTimeoutError, MaxRetryError, ReadTimeoutError, |
323 | | - ProxyError, TimeoutError, |
324 | | - # XXX: These used to be wrapped with |
325 | | - # requests.ConnectionError, which we would then have to |
326 | | - # inspect to see if it needed retrying. Need to make |
327 | | - # sure/think through whether these should be retried. |
328 | | - ProtocolError, OSError) |
329 | | - # Handleable errors *may* be retryable, but need additional logic beyond |
330 | | - # just the error type. See `should_retry_error()`. |
331 | | - # |
332 | | - # XXX: see notes above about what should get retried; which things need to |
333 | | - # be caught but then more deeply inspected, blah blah blah: |
334 | | - # handleable_errors = (ConnectionError,) + retryable_errors |
335 | | - handleable_errors = () + retryable_errors |
336 | | - |
337 | | - def __init__(self, retries=6, backoff=2, timeout=60, user_agent=None, |
338 | | - search_calls_per_second=DEFAULT_CDX_RATE_LIMIT, |
339 | | - memento_calls_per_second=DEFAULT_MEMENTO_RATE_LIMIT, |
340 | | - timemap_calls_per_second=DEFAULT_TIMEMAP_RATE_LIMIT): |
341 | | - super().__init__() |
342 | | - self.retries = retries |
343 | | - self.backoff = backoff |
344 | | - self.timeout = timeout |
345 | | - self.headers = { |
346 | | - 'User-Agent': (user_agent or |
347 | | - f'wayback/{__version__} (+https://github.com/edgi-govdata-archiving/wayback)'), |
348 | | - 'Accept-Encoding': 'gzip, deflate' |
349 | | - } |
350 | | - self.rate_limts = { |
351 | | - '/web/timemap': _utils.RateLimit.make_limit(timemap_calls_per_second), |
352 | | - '/cdx': _utils.RateLimit.make_limit(search_calls_per_second), |
353 | | - # The memento limit is actually a generic Wayback limit. |
354 | | - '/': _utils.RateLimit.make_limit(memento_calls_per_second), |
355 | | - } |
356 | | - # XXX: These parameters are the same as requests, but we have had at |
357 | | - # least one user reach in and change the adapters we used with requests |
358 | | - # to modify these. We should consider whether different values are |
359 | | - # appropriate (e.g. block=True) or if these need to be exposed somehow. |
360 | | - # |
361 | | - # XXX: Consider using a HTTPSConnectionPool instead of a PoolManager. |
362 | | - # We can make some code simpler if we are always assuming the same host. |
363 | | - # (At present, we only use one host, so this is feasible.) |
364 | | - # |
365 | | - # XXX: Do we need a cookie jar? urllib3 doesn't do any cookie management |
366 | | - # for us, and the Wayback Machine may set some cookies we should retain |
367 | | - # in subsequent requests. (In practice, it doesn't appear the CDX, |
368 | | - # Memento, or Timemap APIs do by default, but not sure what happens if |
369 | | - # you send S3-style credentials or use other endpoints.) |
370 | | - self._pool_manager = PoolManager( |
371 | | - num_pools=10, |
372 | | - maxsize=10, |
373 | | - block=False, |
374 | | - ) |
375 | | - # NOTE: the nice way to accomplish retry/backoff is with a urllib3 `Retry`: |
376 | | - # adapter = requests.adapters.HTTPAdapter( |
377 | | - # max_retries=Retry(total=5, backoff_factor=2, |
378 | | - # status_forcelist=(503, 504))) |
379 | | - # self.mount('http://', adapter) |
380 | | - # But Wayback mementos can have errors, which complicates things. See: |
381 | | - # https://github.com/urllib3/urllib3/issues/1445#issuecomment-422950868 |
382 | | - # |
383 | | - # Also note that, if we are ever able to switch to that, we may need to |
384 | | - # get more fancy with log filtering, since we *expect* lots of retries |
385 | | - # with Wayback's APIs, but urllib3 logs a warning on every retry: |
386 | | - # https://github.com/urllib3/urllib3/blob/5b047b645f5f93900d5e2fc31230848c25eb1f5f/src/urllib3/connectionpool.py#L730-L737 |
387 | | - |
388 | | - def request(self, method, url, *, params=None, allow_redirects=True, timeout=-1) -> InternalHttpResponse: |
389 | | - if not self._pool_manager: |
390 | | - raise SessionClosedError('This session has already been closed ' |
391 | | - 'and cannot send new HTTP requests.') |
392 | | - |
393 | | - start_time = time.time() |
394 | | - maximum = self.retries |
395 | | - retries = 0 |
396 | | - |
397 | | - timeout = self.timeout if timeout is -1 else timeout |
398 | | - # XXX: grabbed from requests. Clean up for our use case. |
399 | | - if isinstance(timeout, tuple): |
400 | | - try: |
401 | | - connect, read = timeout |
402 | | - timeout = Urllib3Timeout(connect=connect, read=read) |
403 | | - except ValueError: |
404 | | - raise ValueError( |
405 | | - f"Invalid timeout {timeout}. Pass a (connect, read) timeout tuple, " |
406 | | - f"or a single float to set both timeouts to the same value." |
407 | | - ) |
408 | | - elif isinstance(timeout, Urllib3Timeout): |
409 | | - pass |
410 | | - else: |
411 | | - timeout = Urllib3Timeout(connect=timeout, read=timeout) |
412 | | - |
413 | | - parsed = urlparse(url) |
414 | | - for path, limit in self.rate_limts.items(): |
415 | | - if parsed.path.startswith(path): |
416 | | - rate_limit = limit |
417 | | - break |
418 | | - else: |
419 | | - rate_limit = DEFAULT_MEMENTO_RATE_LIMIT |
420 | | - |
421 | | - # Do our own querystring work since urllib3 serializes lists poorly. |
422 | | - if params: |
423 | | - serialized = serialize_querystring(params) |
424 | | - if parsed.query: |
425 | | - url += f'&{serialized}' |
426 | | - else: |
427 | | - url += f'?{serialized}' |
428 | | - |
429 | | - while True: |
430 | | - retry_delay = 0 |
431 | | - try: |
432 | | - # XXX: should be `debug()`. Set to warning for testing. |
433 | | - logger.warning('sending HTTP request %s "%s", %s', method, url, params) |
434 | | - rate_limit.wait() |
435 | | - response = InternalHttpResponse(self._pool_manager.request( |
436 | | - method=method, |
437 | | - url=url, |
438 | | - # fields=serialize_querystring(params), |
439 | | - headers=self.headers, |
440 | | - # XXX: is allow_redirects safe for preload_content == False? |
441 | | - # XXX: it is, BUT THAT SKIPS OUR RATE LIMITING, which also |
442 | | - # is obviously already a problem today, but we ought to get |
443 | | - # it fixed now. Leaving this on for the moment, but it |
444 | | - # must be addressed before merging. |
445 | | - redirect=allow_redirects, |
446 | | - preload_content=False, |
447 | | - timeout=timeout |
448 | | - ), url) |
449 | | - |
450 | | - retry_delay = self.get_retry_delay(retries, response) |
451 | | - |
452 | | - if retries >= maximum or not self.should_retry(response): |
453 | | - if response.status_code == 429: |
454 | | - response.close() |
455 | | - raise RateLimitError(response, retry_delay) |
456 | | - return response |
457 | | - else: |
458 | | - logger.debug('Received error response (status: %s), will retry', response.status_code) |
459 | | - response.close(cache=False) |
460 | | - # XXX: urllib3's MaxRetryError can wrap all the other errors, so |
461 | | - # we should probably be checking `error.reason` on it. See how |
462 | | - # requests handles this: |
463 | | - # https://github.com/psf/requests/blob/a25fde6989f8df5c3d823bc9f2e2fc24aa71f375/src/requests/adapters.py#L502-L537 |
464 | | - # |
465 | | - # XXX: requests.RetryError used to be in our list of handleable |
466 | | - # errors; it gets raised when urllib3 raises a MaxRetryError with a |
467 | | - # ResponseError for its `reason` attribute. Need to test the |
468 | | - # situation here... |
469 | | - # |
470 | | - # XXX: Consider how read-related exceptions need to be handled (or |
471 | | - # not). In requests: |
472 | | - # https://github.com/psf/requests/blob/a25fde6989f8df5c3d823bc9f2e2fc24aa71f375/src/requests/models.py#L794-L839 |
473 | | - except WaybackSession.handleable_errors as error: |
474 | | - response = getattr(error, 'response', None) |
475 | | - if response is not None: |
476 | | - response.close() |
477 | | - |
478 | | - if retries >= maximum: |
479 | | - raise WaybackRetryError(retries, time.time() - start_time, error) from error |
480 | | - elif self.should_retry_error(error): |
481 | | - retry_delay = self.get_retry_delay(retries, response) |
482 | | - logger.info('Caught exception during request, will retry: %s', error) |
483 | | - else: |
484 | | - raise |
485 | | - |
486 | | - logger.debug('Will retry after sleeping for %s seconds...', retry_delay) |
487 | | - time.sleep(retry_delay) |
488 | | - retries += 1 |
489 | | - |
490 | | - def should_retry(self, response: InternalHttpResponse): |
491 | | - # A memento may actually be a capture of an error, so don't retry it :P |
492 | | - if is_memento_response(response): |
493 | | - return False |
494 | | - |
495 | | - return response.status_code in self.retryable_statuses |
496 | | - |
497 | | - def should_retry_error(self, error): |
498 | | - if isinstance(error, WaybackSession.retryable_errors): |
499 | | - return True |
500 | | - # XXX: ConnectionError was a broad wrapper from requests; there are more |
501 | | - # narrow errors in urllib3 we can catch, so this is probably (???) no |
502 | | - # longer relevant. But urllib3 has some other wrapper exceptions that we |
503 | | - # might need to dig into more, see: |
504 | | - # https://github.com/psf/requests/blob/a25fde6989f8df5c3d823bc9f2e2fc24aa71f375/src/requests/adapters.py#L502-L537 |
505 | | - # |
506 | | - # elif isinstance(error, ConnectionError): |
507 | | - # # ConnectionErrors from requests actually wrap a whole family of |
508 | | - # # more detailed errors from urllib3, so we need to do some string |
509 | | - # # checking to determine whether the error is retryable. |
510 | | - # text = str(error) |
511 | | - # # NOTE: we have also seen this, which may warrant retrying: |
512 | | - # # `requests.exceptions.ConnectionError: ('Connection aborted.', |
513 | | - # # RemoteDisconnected('Remote end closed connection without |
514 | | - # # response'))` |
515 | | - # if 'NewConnectionError' in text or 'Max retries' in text: |
516 | | - # return True |
517 | | - |
518 | | - return False |
519 | | - |
520 | | - def get_retry_delay(self, retries, response: InternalHttpResponse = None): |
521 | | - delay = 0 |
522 | | - |
523 | | - # As of 2023-11-27, the Wayback Machine does not set a `Retry-After` |
524 | | - # header, so this parsing is just future-proofing. |
525 | | - if response is not None: |
526 | | - delay = _utils.parse_retry_after(response.headers.get('Retry-After')) or delay |
527 | | - if response.status_code == 429 and delay == 0: |
528 | | - delay = DEFAULT_RATE_LIMIT_DELAY |
529 | | - |
530 | | - # No default backoff on the first retry. |
531 | | - if retries > 0: |
532 | | - delay = max(self.backoff * 2 ** (retries - 1), delay) |
533 | | - |
534 | | - return delay |
535 | | - |
536 | | - # XXX: Needs to do the right thing. Requests sessions closed all their |
537 | | - # adapters, which does: |
538 | | - # self.poolmanager.clear() |
539 | | - # for proxy in self.proxy_manager.values(): |
540 | | - # proxy.clear() |
541 | | - def reset(self): |
542 | | - "Reset any network connections the session is using." |
543 | | - self._pool_manager.clear() |
544 | | - |
545 | | - def close(self) -> None: |
546 | | - if self._pool_manager: |
547 | | - self._pool_manager.clear() |
548 | | - self._pool_manager = None |
549 | | - |
550 | | - |
551 | 236 | # TODO: add retry, backoff, cross_thread_backoff, and rate_limit options that |
553 | 238 | # create a custom instance of urllib3.util.Retry |
553 | 238 | class WaybackClient(_utils.DepthCountedContext): |
|
0 commit comments