
Commit e402be3

rely on session for retry instead of manually managing it. reduce timeout to 30s
1 parent dc4191f commit e402be3

File tree

1 file changed: +11 -30 lines

scripts/1-fetch/internetarchive_fetch.py

Lines changed: 11 additions & 30 deletions
@@ -13,7 +13,6 @@
 import traceback
 import unicodedata
 from collections import Counter
-from time import sleep
 from urllib.parse import urlparse

 # Third-party
@@ -309,43 +308,25 @@ def query_internet_archive(args):
     rows = 1000000
     total_rows = 0
     total_processed = 0
-    max_retries = 3

     session = shared.get_session(
         accept_header="application/json", session=ArchiveSession()
     )
     while True:
         # Loop until no more results are returned by the API
         LOGGER.info(f"Fetching {rows} items starting at {total_rows}...")
-        results = None
-
-        for attempt in range(max_retries):
-            try:
-                # Use search_items for simpler pagination management
-                search = session.search_items(
-                    query,
-                    fields=fields,
-                    params={"rows": rows, "start": total_rows},
-                    request_kwargs={"timeout": 120},
-                )

-                # Convert to list to iterate over
-                results = list(search)
-                total_rows += len(results)
-                break
-
-            except Exception as e:
-                wait_time = 2**attempt
-                LOGGER.warning(
-                    f"API request failed (Attempt {attempt+1}/{max_retries}). "
-                    f"Waiting {wait_time}s.Error: {e}"
-                    f"\n{traceback.format_exc()}"
-                )
-                sleep(wait_time)
-        else:
-            raise shared.QuantifyingException(
-                f"Failed to fetch data after {max_retries} attempts.", 1
-            )
+        # Use search_items for simpler pagination management
+        response = session.search_items(
+            query,
+            fields=fields,
+            params={"rows": rows, "start": total_rows},
+            request_kwargs={"timeout": 30},
+        )
+
+        # Convert to list to iterate over
+        results = list(response)
+        total_rows += len(results)

         if not results:
             LOGGER.info("No more results. Ending pagination.")
