|
13 | 13 | import traceback |
14 | 14 | import unicodedata |
15 | 15 | from collections import Counter |
16 | | -from time import sleep |
17 | 16 | from urllib.parse import urlparse |
18 | 17 |
|
19 | 18 | # Third-party |
@@ -309,43 +308,25 @@ def query_internet_archive(args): |
309 | 308 | rows = 1000000 |
310 | 309 | total_rows = 0 |
311 | 310 | total_processed = 0 |
312 | | - max_retries = 3 |
313 | 311 |
|
314 | 312 | session = shared.get_session( |
315 | 313 | accept_header="application/json", session=ArchiveSession() |
316 | 314 | ) |
317 | 315 | while True: |
318 | 316 | # Loop until no more results are returned by the API |
319 | 317 | LOGGER.info(f"Fetching {rows} items starting at {total_rows}...") |
320 | | - results = None |
321 | | - |
322 | | - for attempt in range(max_retries): |
323 | | - try: |
324 | | - # Use search_items for simpler pagination management |
325 | | - search = session.search_items( |
326 | | - query, |
327 | | - fields=fields, |
328 | | - params={"rows": rows, "start": total_rows}, |
329 | | - request_kwargs={"timeout": 120}, |
330 | | - ) |
331 | 318 |
|
332 | | - # Convert to list to iterate over |
333 | | - results = list(search) |
334 | | - total_rows += len(results) |
335 | | - break |
336 | | - |
337 | | - except Exception as e: |
338 | | - wait_time = 2**attempt |
339 | | - LOGGER.warning( |
340 | | - f"API request failed (Attempt {attempt+1}/{max_retries}). " |
341 | | - f"Waiting {wait_time}s.Error: {e}" |
342 | | - f"\n{traceback.format_exc()}" |
343 | | - ) |
344 | | - sleep(wait_time) |
345 | | - else: |
346 | | - raise shared.QuantifyingException( |
347 | | - f"Failed to fetch data after {max_retries} attempts.", 1 |
348 | | - ) |
| 319 | + # Use search_items for simpler pagination management |
| 320 | + response = session.search_items( |
| 321 | + query, |
| 322 | + fields=fields, |
| 323 | + params={"rows": rows, "start": total_rows}, |
| 324 | + request_kwargs={"timeout": 30}, |
| 325 | + ) |
| 326 | + |
| 327 | + # Convert to list to iterate over |
| 328 | + results = list(response) |
| 329 | + total_rows += len(results) |
349 | 330 |
|
350 | 331 | if not results: |
351 | 332 | LOGGER.info("No more results. Ending pagination.") |
|
0 commit comments