Skip to content

Commit c35038b

Browse files
committed
Update zenodo_fetch.py: fix API pagination limits and query handling
- Change MAX_RECORDS_PER_REQUEST from 300 to 25 for unauthenticated calls
- Fix query parameter handling so that `q` is only included when the query is non-empty
- Change default sort from 'bestmatch' to 'newest'
- Change base_query from '*' to an empty string so that all records are returned
1 parent 67fe51c commit c35038b

File tree

1 file changed

+9
-6
lines changed

1 file changed

+9
-6
lines changed

scripts/1-fetch/zenodo_fetch.py

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -44,8 +44,8 @@
4444
DEFAULT_FETCH_LIMIT = (
4545
100000 # Increased to capture more CC licenses from 5.5M+ records
4646
)
47-
# API limitation: Zenodo supports 1000+ records per request, 300 chosen
48-
MAX_RECORDS_PER_REQUEST = 300
47+
# API limitation: Zenodo supports 25 records per request for unauthenticated calls
48+
MAX_RECORDS_PER_REQUEST = 25
4949
ZENODO_API_BASE_URL = "https://zenodo.org/api/records"
5050

5151
# CSV Headers
@@ -265,7 +265,7 @@ def classify_license(license_data):
265265
return "Unknown"
266266

267267

268-
def fetch_zenodo_records(session, page=1, size=100, query="*"):
268+
def fetch_zenodo_records(session, page=1, size=100, query=""):
269269
"""
270270
Fetch Zenodo records using REST API.
271271
@@ -274,11 +274,14 @@ def fetch_zenodo_records(session, page=1, size=100, query="*"):
274274
- metadata.resource_type (structured type information)
275275
"""
276276
params = {
277-
"q": query,
278277
"size": min(size, MAX_RECORDS_PER_REQUEST),
279278
"page": page,
280-
"sort": "bestmatch",
279+
"sort": "newest",
281280
}
281+
282+
# Only add query parameter if not empty
283+
if query:
284+
params["q"] = query
282285

283286
try:
284287
response = session.get(ZENODO_API_BASE_URL, params=params, timeout=15)
@@ -389,7 +392,7 @@ def main():
389392

390393
# Build query for all records - CC filtering happens during processing
391394
# Note: Zenodo's search API doesn't support reliable license field queries
392-
base_query = "*"
395+
base_query = "" # Empty query returns all records
393396

394397
if args.dates_back:
395398
from_year = datetime.now().year - args.dates_back

0 commit comments

Comments
 (0)