Skip to content

Commit f30cb90

Browse files
committed
Cache responses from Unpaywall for a week
1 parent dc526f8 commit f30cb90

File tree

3 files changed

+12
-1
lines changed

3 files changed

+12
-1
lines changed

.github/workflows/deploy.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@ jobs:
1313
runs-on: ubuntu-latest
1414
steps:
1515
- uses: actions/checkout@v4
16+
- name: Persist requests-cache's cache file
17+
uses: actions/cache@v4
18+
with:
19+
key: http_cache
20+
path: http_cache.sqlite
1621
- uses: actions/setup-python@v5
1722
with:
1823
python-version: '3.13'

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ authors = [
88
]
99
dependencies = [
1010
"requests",
11+
"requests-cache",
1112
"PyYAML",
1213
]
1314
description = "Parse publications from ResearchFish API and produces the files needed to update Earlham Institute's website and CKAN."

rfparser/__init__.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
Response,
2626
Session,
2727
)
28+
from requests_cache import CachedSession
2829

2930
from .util import (
3031
extend_list_to_size,
@@ -42,6 +43,8 @@
4243
REQUEST_TIMEOUT = 5.0
4344
REQUEST_RETRIES = 3
4445
REQUEST_RETRIES_BACKOFF_FACTOR = 1.0
46+
# How long (in seconds; one week) to keep a cached response when the response has no Cache-Control headers
47+
CACHED_RESPONSE_EXPIRE_AFTER = 7 * 24 * 60 * 60
4548
BASE_CR_URL = "https://api.crossref.org"
4649
BASE_DC_URL = "https://api.datacite.org"
4750
BASE_DOI_URL = "https://doi.org"
@@ -248,6 +251,8 @@ def get_url(
248251
sleep(backoff_time)
249252
else:
250253
raise Exception(f"Failed too many times to get URL {url}")
254+
if isinstance(s, CachedSession):
255+
log.debug("URL %s retrieved from cache: %s", url, r.from_cache) # type:ignore[attr-defined]
251256
return r
252257

253258

@@ -567,7 +572,7 @@ def main() -> None:
567572
cr_headers = {
568573
"User-Agent": f"rfparser/{__version__} (https://github.com/TGAC/rfparser; mailto:{config['email']})",
569574
}
570-
unpaywall_session = Session()
575+
unpaywall_session = CachedSession(expire_after=CACHED_RESPONSE_EXPIRE_AFTER, cache_control=True)
571576
for doi, pub in pubs_with_doi.items():
572577
pub["metadata_ok"] = False
573578
if doi in BROKEN_DOI_TO_REASON:

0 commit comments

Comments
 (0)