Skip to content

Commit d8c075e

Browse files
authored
S2 also retrying 403's (#1017)
1 parent eeb148b commit d8c075e

File tree

2 files changed

+54
-20
lines changed

2 files changed

+54
-20
lines changed

paperqa/clients/semantic_scholar.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from collections.abc import Collection
66
from datetime import datetime
77
from enum import IntEnum, auto
8+
from functools import partial
89
from http import HTTPStatus
910
from itertools import starmap
1011
from typing import Any
@@ -17,6 +18,7 @@
1718
from paperqa.utils import (
1819
_get_with_retrying,
1920
clean_upbibtex,
21+
is_retryable,
2022
strings_similarity,
2123
union_collections_to_ordered_list,
2224
)
@@ -125,6 +127,12 @@ async def _s2_get_with_retrying(url: str, **get_kwargs) -> dict[str, Any]:
125127
get_kwargs.get("timeout")
126128
or aiohttp.ClientTimeout(SEMANTIC_SCHOLAR_API_REQUEST_TIMEOUT)
127129
),
130+
# On 7/21/2025, flaky ClientResponseError was seen with 'citations' traversals on
131+
# paper ID 3516396ffa1fd32d4327e199d9b97ec67dc0439a with DOI 10.1126/science.2821624
132+
# > aiohttp.client_exceptions.ClientResponseError: 403, message='Forbidden'
133+
retry_predicate=partial(
134+
is_retryable, additional_status_codes={HTTPStatus.FORBIDDEN}
135+
),
128136
**get_kwargs,
129137
)
130138

paperqa/utils.py

Lines changed: 46 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import re
1111
import string
1212
import unicodedata
13-
from collections.abc import Awaitable, Collection, Iterable, Iterator, Mapping
13+
from collections.abc import Awaitable, Callable, Collection, Iterable, Iterator, Mapping
1414
from datetime import datetime
1515
from functools import reduce
1616
from http import HTTPStatus
@@ -26,8 +26,8 @@
2626
from pybtex.style.formatting import unsrtalpha
2727
from pybtex.style.template import FieldIsMissing
2828
from tenacity import (
29+
AsyncRetrying,
2930
before_sleep_log,
30-
retry,
3131
retry_if_exception,
3232
stop_after_attempt,
3333
wait_incrementing,
@@ -402,7 +402,10 @@ def create_bibtex_key(author: list[str], year: str | int, title: str) -> str:
402402
return remove_substrings(key, FORBIDDEN_KEY_CHARACTERS)
403403

404404

405-
def is_retryable(exc: BaseException) -> bool:
405+
def is_retryable(
406+
exc: BaseException,
407+
additional_status_codes: Collection[HTTPStatus | int] | None = None,
408+
) -> bool:
406409
"""Check if an exception is known to be a retryable HTTP issue."""
407410
if isinstance(
408411
exc, aiohttp.ServerDisconnectedError | aiohttp.ClientConnectionResetError
@@ -411,33 +414,56 @@ def is_retryable(exc: BaseException) -> bool:
411414
# > aiohttp.client_exceptions.ClientConnectionResetError:
412415
# > Cannot write to closing transport
413416
return True
414-
return isinstance(exc, aiohttp.ClientResponseError) and exc.status in {
417+
retry_status_codes: set[int] = {
415418
httpx.codes.INTERNAL_SERVER_ERROR.value,
416419
httpx.codes.GATEWAY_TIMEOUT.value,
417420
}
421+
if additional_status_codes:
422+
retry_status_codes.update(
423+
status_code.value if isinstance(status_code, HTTPStatus) else status_code
424+
for status_code in additional_status_codes
425+
)
426+
return (
427+
isinstance(exc, aiohttp.ClientResponseError)
428+
and exc.status in retry_status_codes
429+
)
418430

419431

420-
@retry(
421-
retry=retry_if_exception(is_retryable),
422-
before_sleep=before_sleep_log(logger, logging.WARNING),
423-
stop=stop_after_attempt(5),
424-
wait=wait_incrementing(0.1, 0.1),
425-
)
426-
async def _get_with_retrying(
432+
async def _get_with_retrying( # type: ignore[return] # noqa: RET503
427433
url: str,
428434
session: aiohttp.ClientSession,
429435
http_exception_mappings: dict[HTTPStatus | int, Exception] | None = None,
436+
retry_predicate: Callable[[BaseException], bool] = is_retryable,
430437
**get_kwargs,
431438
) -> dict[str, Any]:
432-
"""Get from a URL with retrying protection."""
433-
try:
434-
async with session.get(url, **get_kwargs) as response:
435-
response.raise_for_status()
436-
return await response.json()
437-
except aiohttp.ClientResponseError as e:
438-
if http_exception_mappings and e.status in http_exception_mappings:
439-
raise http_exception_mappings[e.status] from e
440-
raise
439+
"""Get from a URL with retrying protection.
440+
441+
Args:
442+
url: Target URL for the GET request.
443+
session: Session for the GET request.
444+
http_exception_mappings: Optional mapping of HTTP status codes to
445+
custom Exceptions to be thrown.
446+
retry_predicate: Optional predicate to dictate when to retry.
447+
**get_kwargs: Optional additional keyword arguments for the GET request.
448+
449+
Returns:
450+
JSON result from the GET request.
451+
"""
452+
async for attempt in AsyncRetrying(
453+
retry=retry_if_exception(retry_predicate),
454+
before_sleep=before_sleep_log(logger, logging.WARNING),
455+
stop=stop_after_attempt(5),
456+
wait=wait_incrementing(0.1, 0.1),
457+
):
458+
with attempt:
459+
try:
460+
async with session.get(url, **get_kwargs) as response:
461+
response.raise_for_status()
462+
return await response.json()
463+
except aiohttp.ClientResponseError as e:
464+
if http_exception_mappings and e.status in http_exception_mappings:
465+
raise http_exception_mappings[e.status] from e
466+
raise
441467

442468

443469
def union_collections_to_ordered_list(collections: Iterable) -> list:

0 commit comments

Comments
 (0)