Commit 19e1618

Rewrite the retrying of failures logic
1 parent aaf280f commit 19e1618

File tree

1 file changed (+84, -66 lines)

scripts/portal-fetcher/openshift-docs-downloader.py

Lines changed: 84 additions & 66 deletions
@@ -495,7 +495,7 @@ async def check_if_modified(session, url, db_path, semaphore):
         # If we can't check, assume we need to download
         return True, None, None
 
-async def download_page(session, url, output_dir, db_path, semaphore, force=False):
+async def download_page(session, url, output_dir, db_path, semaphore, force=False, max_retries=3):
     """
     Download a page and save it to the local file system
 
@@ -505,6 +505,7 @@ async def download_page(session, url, output_dir, db_path, semaphore, force=False):
         output_dir (Path): Base output directory
         db_path (Path): Path to SQLite database
         semaphore (asyncio.Semaphore): Semaphore for limiting concurrent requests
+        max_retries (int): Maximum number of retry attempts
         force (bool): Force download even if not modified
 
     Returns:
@@ -525,51 +526,68 @@ async def download_page(session, url, output_dir, db_path, semaphore, force=False):
         logger.info(f"Skipping {url} (not modified)")
         record_download(db_path, url, local_path, status="success", etag=etag, last_modified=last_modified, change_type="unchanged")
         return (url, True, set())
-
-    try:
-        async with semaphore:
-            logger.info(f"Downloading {url}")
-            async with session.get(url, timeout=30) as response:
-                if response.status == 200:
-                    content = await response.text()
-
-                    # Get ETag and Last-Modified
-                    etag = response.headers.get('ETag')
-                    last_modified = response.headers.get('Last-Modified')
-
-                    # Determine change type
-                    if local_path.exists():
-                        change_type = "updated"
-                    else:
-                        change_type = "new"
-
-                    # Save the content
-                    with open(local_path, "w", encoding="utf-8") as f:
-                        f.write(content)
-
-                    # Record in database
-                    record_download(db_path, url, local_path, etag=etag, last_modified=last_modified, change_type=change_type)
-                    logger.info(f"Downloaded {url} -> {local_path} ({change_type})")
-                    return (url, True, set())
-                elif response.status == 404:
-                    # Check if this might be an expected 404 for an external link
-                    if is_likely_external_link(url):
-                        logger.info(f"Skipping 404 for likely external URL: {url}")
-                        record_download(db_path, url, "N/A", status="skipped", change_type="external_404")
+    for attempt in range(max_retries):
+        try:
+            async with semaphore:
+                retry_msg = f" (attempt {attempt+1}/{max_retries})" if attempt > 0 else ""
+                logger.info(f"Downloading {url}{retry_msg}")
+                async with session.get(url, timeout=30) as response:
+                    if response.status == 200:
+                        content = await response.text()
+
+                        # Get ETag and Last-Modified
+                        etag = response.headers.get('ETag')
+                        last_modified = response.headers.get('Last-Modified')
+
+                        # Determine change type
+                        if local_path.exists():
+                            change_type = "updated"
+                        else:
+                            change_type = "new"
+
+                        # Save the content
+                        with open(local_path, "w", encoding="utf-8") as f:
+                            f.write(content)
+
+                        # Record in database
+                        record_download(db_path, url, local_path, etag=etag, last_modified=last_modified, change_type=change_type)
+                        logger.info(f"Downloaded {url} -> {local_path} ({change_type})")
                         return (url, True, set())
+                    elif response.status == 404:
+                        # Check if this might be an expected 404 for an external link
+                        if is_likely_external_link(url):
+                            logger.info(f"Skipping 404 for likely external URL: {url}")
+                            record_download(db_path, url, "N/A", status="skipped", change_type="external_404")
+                            return (url, True, set())
+                        elif attempt == max_retries - 1:
+                            # It's a real 404 for a document that should exist, and we've tried max times
+                            logger.warning(f"Failed to download {url}: HTTP 404")
+                            record_download(db_path, url, local_path, status="failed_404", change_type="error")
+                            return (url, False, {url})
                     else:
-                        # It's a real 404 for a document that should exist
-                        logger.warning(f"Failed to download {url}: HTTP 404")
-                        record_download(db_path, url, local_path, status="failed_404", change_type="error")
-                        return (url, False, {url})
-                else:
-                    logger.warning(f"Failed to download {url}: HTTP {response.status}")
-                    record_download(db_path, url, local_path, status="failed", change_type="error")
-                    return (url, False, {url})
-    except Exception as e:
-        logger.error(f"Error downloading {url}: {e}")
-        record_download(db_path, url, local_path, status="error", change_type="error")
-        return (url, False, {url})
+                        if attempt == max_retries - 1:
+                            # Final attempt failed with a non-404 error
+                            logger.warning(f"Failed to download {url}: HTTP {response.status}")
+                            record_download(db_path, url, local_path, status="failed", change_type="error")
+                            return (url, False, {url})
+
+            # If we got here and it's not the last attempt, continue to next try
+            if attempt < max_retries - 1:
+                logger.warning(f"Retrying download for {url} after unsuccessful attempt")
+                await asyncio.sleep(1)  # Small delay before retry
+
+        except Exception as e:
+            if attempt < max_retries - 1:
+                logger.warning(f"Error downloading {url}: {e}. Retrying...")
+                await asyncio.sleep(1)  # Small delay before retry
+            else:
+                logger.error(f"Error downloading {url}: {e}. Max retries reached.")
+                record_download(db_path, url, local_path, status="error", change_type="error")
+                return (url, False, {url})
+
+    # Fallback case - this should rarely happen but just in case
+    record_download(db_path, url, local_path, status="error", change_type="error")
+    return (url, False, {url})
 
 async def extract_links(session, url, base_url, visited_urls, semaphore):
     """
@@ -707,7 +725,7 @@ async def crawl(session, start_url, base_url, semaphore):
 
     return (visited_urls, html_single_urls)
 
-async def download_all(session, urls, output_dir, db_path, semaphore, force=False):
+async def download_all(session, urls, output_dir, db_path, semaphore, force=False, max_retries=3):
     """
     Download all discovered html-single pages
 
@@ -718,6 +736,7 @@ async def download_all(session, urls, output_dir, db_path, semaphore, force=False):
         db_path (Path): Path to SQLite database
         semaphore (asyncio.Semaphore): Semaphore for limiting concurrent requests
         force (bool): Force download even if files haven't changed
+        max_retries (int): Maximum number of retry attempts for failed downloads
 
     Returns:
         tuple: (successes, failures, failed_downloads)
@@ -726,10 +745,10 @@ async def download_all(session, urls, output_dir, db_path, semaphore, force=False):
         logger.warning("No html-single pages found to download.")
         return (0, 0, set())
 
-    logger.info(f"Starting download of {len(urls)} pages (force={force})...")
+    logger.info(f"Starting download of {len(urls)} pages (force={force}, max_retries={max_retries})...")
 
     # Create tasks for all downloads
-    tasks = [download_page(session, url, output_dir, db_path, semaphore, force) for url in urls]
+    tasks = [download_page(session, url, output_dir, db_path, semaphore, force, max_retries) for url in urls]
     results = await asyncio.gather(*tasks)
 
     # Count successes and failures, and collect failed downloads
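The counting code referenced by that comment sits below this hunk and is untouched by the commit, so it is not shown. Given the (url, ok, failed_set) tuples that download_page returns, a plausible sketch of that tally (an illustration, not the script's actual implementation):

def tally_results(results):
    """Summarize (url, ok, failed_set) tuples into (successes, failures, failed_downloads)."""
    successes = sum(1 for _, ok, _ in results if ok)
    failures = len(results) - successes
    failed_downloads = set()
    for _, _, failed in results:
        failed_downloads |= failed  # URLs that exhausted their retries
    return successes, failures, failed_downloads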
@@ -931,7 +950,7 @@ def export_change_report(db_path, output_dir):
 
     return report
 
-async def run_downloader(base_url, output_dir, concurrency=5, force=False, skip_toc=False):
+async def run_downloader(base_url, output_dir, concurrency=5, force=False, skip_toc=False, max_retries=3):
     """
     Run the complete download process
 
@@ -941,13 +960,15 @@ async def run_downloader(base_url, output_dir, concurrency=5, force=False, skip_toc=False):
         concurrency (int): Number of concurrent downloads
         force (bool): Force download even if files haven't changed
         skip_toc (bool): Skip TOC verification
+        max_retries (int): Maximum number of retry attempts for failed downloads
 
     Returns:
         tuple: (verification_passed, toc_verification_passed, elapsed_time)
     """
     logger.info(f"Starting OpenShift Documentation download for {base_url}")
     logger.info(f"Output directory: {output_dir}")
     logger.info(f"Force download: {force}")
+    logger.info(f"Max retries: {max_retries}")
 
     # Normalize base URL
     if base_url.endswith('/'):
@@ -970,26 +991,16 @@ async def run_downloader(base_url, output_dir, concurrency=5, force=False, skip_toc=False):
         # Step 1: Crawl to discover all html-single pages
         visited_urls, html_single_urls = await crawl(session, base_url, base_url, semaphore)
 
-        # Step 2: Download discovered pages
+        # Step 2: Download discovered pages with built-in retry
         successes, failures, failed_downloads = await download_all(
-            session, html_single_urls, output_dir_path, db_path, semaphore, force
+            session, html_single_urls, output_dir_path, db_path, semaphore, force, max_retries
         )
 
-        # Step 3: Retry failed downloads
-        if failed_downloads:
-            logger.info(f"Retrying {len(failed_downloads)} failed downloads...")
-            retry_successes, retry_failures, still_failed = await download_all(
-                session, failed_downloads, output_dir_path, db_path, semaphore, True
-            )
-
-            logger.info(f"Retry completed: {retry_successes} successful, {retry_failures} still failed.")
-
-            if retry_failures > 0:
-                logger.warning(f"Failed to download {retry_failures} pages after retry.")
-                # Update failed downloads list
-                with open(output_dir_path / "failed_downloads.json", "w") as f:
-                    json.dump(list(still_failed), f, indent=2)
-
+        # Record any failed downloads
+        if failures > 0:
+            with open(output_dir_path / "failed_downloads.json", "w") as f:
+                json.dump(list(failed_downloads), f, indent=2)
+
         # Step 4: Verify downloads
         verification_passed = await verify_downloads(html_single_urls, db_path, output_dir_path)
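With the per-page retries now handled inside download_page, run_downloader no longer makes a second download pass; it simply dumps whatever is still failing to failed_downloads.json. Reading that file back, for example to inspect or re-queue the stragglers, could look like this (a hypothetical snippet, not part of the script):

import json
from pathlib import Path

output_dir_path = Path("downloads")  # assumed output directory
failed_file = output_dir_path / "failed_downloads.json"
if failed_file.exists():
    with failed_file.open() as f:
        failed_urls = json.load(f)
    print(f"{len(failed_urls)} URLs still failing after retries:")
    for url in failed_urls:
        print(f"  {url}")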

@@ -1095,11 +1106,18 @@ async def main_async(args):
     # Run the downloader
     verification_passed, toc_verification_passed, elapsed_time = await run_downloader(
         base_url,
-        output_dir,
+        output_dir,
+        max_retries=args.max_retries,
         concurrency=args.concurrency,
         force=args.force,
         skip_toc=args.skip_toc_verification
     )
+    parser.add_argument(
+        "--max-retries",
+        type=check_positive_int,
+        default=3,
+        help="Maximum number of retry attempts for failed downloads (default: 3)"
+    )
 
     return verification_passed and toc_verification_passed
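The hunk above both wires the new value into the run_downloader call and adds the --max-retries flag definition. For reference, a standalone sketch of how such a flag behaves with a positive-integer validator; check_positive_int is named in the diff, but its body and the surrounding parser setup are outside this commit, so the details here are assumptions:

import argparse

def check_positive_int(value):
    """argparse type: accept only integers >= 1 (assumed behaviour)."""
    ivalue = int(value)
    if ivalue < 1:
        raise argparse.ArgumentTypeError(f"{value} is not a positive integer")
    return ivalue

parser = argparse.ArgumentParser(description="OpenShift docs downloader (sketch)")
parser.add_argument(
    "--max-retries",
    type=check_positive_int,
    default=3,
    help="Maximum number of retry attempts for failed downloads (default: 3)"
)

args = parser.parse_args(["--max-retries", "5"])
print(args.max_retries)  # 5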

0 commit comments
