@@ -495,7 +495,7 @@ async def check_if_modified(session, url, db_path, semaphore):
        # If we can't check, assume we need to download
        return True, None, None

-async def download_page(session, url, output_dir, db_path, semaphore, force=False):
+async def download_page(session, url, output_dir, db_path, semaphore, force=False, max_retries=3):
    """
    Download a page and save it to the local file system
@@ -505,6 +505,7 @@ async def download_page(session, url, output_dir, db_path, semaphore, force=Fals
        output_dir (Path): Base output directory
        db_path (Path): Path to SQLite database
        semaphore (asyncio.Semaphore): Semaphore for limiting concurrent requests
+        max_retries (int): Maximum number of retry attempts
        force (bool): Force download even if not modified

    Returns:
@@ -525,51 +526,68 @@ async def download_page(session, url, output_dir, db_path, semaphore, force=Fals
        logger.info(f"Skipping {url} (not modified)")
        record_download(db_path, url, local_path, status="success", etag=etag, last_modified=last_modified, change_type="unchanged")
        return (url, True, set())
-
-    try:
-        async with semaphore:
-            logger.info(f"Downloading {url}")
-            async with session.get(url, timeout=30) as response:
-                if response.status == 200:
-                    content = await response.text()
-
-                    # Get ETag and Last-Modified
-                    etag = response.headers.get('ETag')
-                    last_modified = response.headers.get('Last-Modified')
-
-                    # Determine change type
-                    if local_path.exists():
-                        change_type = "updated"
-                    else:
-                        change_type = "new"
-
-                    # Save the content
-                    with open(local_path, "w", encoding="utf-8") as f:
-                        f.write(content)
-
-                    # Record in database
-                    record_download(db_path, url, local_path, etag=etag, last_modified=last_modified, change_type=change_type)
-                    logger.info(f"Downloaded {url} -> {local_path} ({change_type})")
-                    return (url, True, set())
-                elif response.status == 404:
-                    # Check if this might be an expected 404 for an external link
-                    if is_likely_external_link(url):
-                        logger.info(f"Skipping 404 for likely external URL: {url}")
-                        record_download(db_path, url, "N/A", status="skipped", change_type="external_404")
+    for attempt in range(max_retries):
+        try:
+            async with semaphore:
+                retry_msg = f" (attempt {attempt + 1}/{max_retries})" if attempt > 0 else ""
+                logger.info(f"Downloading {url}{retry_msg}")
+                async with session.get(url, timeout=30) as response:
+                    if response.status == 200:
+                        content = await response.text()
+
+                        # Get ETag and Last-Modified
+                        etag = response.headers.get('ETag')
+                        last_modified = response.headers.get('Last-Modified')
+
+                        # Determine change type
+                        if local_path.exists():
+                            change_type = "updated"
+                        else:
+                            change_type = "new"
+
+                        # Save the content
+                        with open(local_path, "w", encoding="utf-8") as f:
+                            f.write(content)
+
+                        # Record in database
+                        record_download(db_path, url, local_path, etag=etag, last_modified=last_modified, change_type=change_type)
+                        logger.info(f"Downloaded {url} -> {local_path} ({change_type})")
                        return (url, True, set())
+                    elif response.status == 404:
+                        # Check if this might be an expected 404 for an external link
+                        if is_likely_external_link(url):
+                            logger.info(f"Skipping 404 for likely external URL: {url}")
+                            record_download(db_path, url, "N/A", status="skipped", change_type="external_404")
+                            return (url, True, set())
+                        elif attempt == max_retries - 1:
+                            # It's a real 404 for a document that should exist, and we've tried max times
+                            logger.warning(f"Failed to download {url}: HTTP 404")
+                            record_download(db_path, url, local_path, status="failed_404", change_type="error")
+                            return (url, False, {url})
                    else:
-                        # It's a real 404 for a document that should exist
-                        logger.warning(f"Failed to download {url}: HTTP 404")
-                        record_download(db_path, url, local_path, status="failed_404", change_type="error")
-                        return (url, False, {url})
-                else:
-                    logger.warning(f"Failed to download {url}: HTTP {response.status}")
-                    record_download(db_path, url, local_path, status="failed", change_type="error")
-                    return (url, False, {url})
-    except Exception as e:
-        logger.error(f"Error downloading {url}: {e}")
-        record_download(db_path, url, local_path, status="error", change_type="error")
-        return (url, False, {url})
+                        if attempt == max_retries - 1:
+                            # Final attempt failed with a non-404 error
+                            logger.warning(f"Failed to download {url}: HTTP {response.status}")
+                            record_download(db_path, url, local_path, status="failed", change_type="error")
+                            return (url, False, {url})
+
+            # If we got here and it's not the last attempt, continue to next try
+            if attempt < max_retries - 1:
+                logger.warning(f"Retrying download for {url} after unsuccessful attempt")
+                await asyncio.sleep(1)  # Small delay before retry
+
+        except Exception as e:
+            if attempt < max_retries - 1:
+                logger.warning(f"Error downloading {url}: {e}. Retrying...")
+                await asyncio.sleep(1)  # Small delay before retry
+            else:
+                logger.error(f"Error downloading {url}: {e}. Max retries reached.")
+                record_download(db_path, url, local_path, status="error", change_type="error")
+                return (url, False, {url})
+
+    # Fallback case - this should rarely happen but just in case
+    record_download(db_path, url, local_path, status="error", change_type="error")
+    return (url, False, {url})

async def extract_links(session, url, base_url, visited_urls, semaphore):
    """
@@ -707,7 +725,7 @@ async def crawl(session, start_url, base_url, semaphore):

    return (visited_urls, html_single_urls)

-async def download_all(session, urls, output_dir, db_path, semaphore, force=False):
+async def download_all(session, urls, output_dir, db_path, semaphore, force=False, max_retries=3):
    """
    Download all discovered html-single pages

@@ -718,6 +736,7 @@ async def download_all(session, urls, output_dir, db_path, semaphore, force=Fals
        db_path (Path): Path to SQLite database
        semaphore (asyncio.Semaphore): Semaphore for limiting concurrent requests
        force (bool): Force download even if files haven't changed
+        max_retries (int): Maximum number of retry attempts for failed downloads

    Returns:
        tuple: (successes, failures, failed_downloads)
@@ -726,10 +745,10 @@ async def download_all(session, urls, output_dir, db_path, semaphore, force=Fals
        logger.warning("No html-single pages found to download.")
        return (0, 0, set())

-    logger.info(f"Starting download of {len(urls)} pages (force={force})...")
+    logger.info(f"Starting download of {len(urls)} pages (force={force}, max_retries={max_retries})...")

    # Create tasks for all downloads
-    tasks = [download_page(session, url, output_dir, db_path, semaphore, force) for url in urls]
+    tasks = [download_page(session, url, output_dir, db_path, semaphore, force, max_retries) for url in urls]
    results = await asyncio.gather(*tasks)

    # Count successes and failures, and collect failed downloads
@@ -931,7 +950,7 @@ def export_change_report(db_path, output_dir):

    return report

-async def run_downloader(base_url, output_dir, concurrency=5, force=False, skip_toc=False):
+async def run_downloader(base_url, output_dir, concurrency=5, force=False, skip_toc=False, max_retries=3):
    """
    Run the complete download process

@@ -941,13 +960,15 @@ async def run_downloader(base_url, output_dir, concurrency=5, force=False, skip_
        concurrency (int): Number of concurrent downloads
        force (bool): Force download even if files haven't changed
        skip_toc (bool): Skip TOC verification
+        max_retries (int): Maximum number of retry attempts for failed downloads

    Returns:
        tuple: (verification_passed, toc_verification_passed, elapsed_time)
    """
    logger.info(f"Starting OpenShift Documentation download for {base_url}")
    logger.info(f"Output directory: {output_dir}")
    logger.info(f"Force download: {force}")
+    logger.info(f"Max retries: {max_retries}")

    # Normalize base URL
    if base_url.endswith('/'):
@@ -970,26 +991,16 @@ async def run_downloader(base_url, output_dir, concurrency=5, force=False, skip_
        # Step 1: Crawl to discover all html-single pages
        visited_urls, html_single_urls = await crawl(session, base_url, base_url, semaphore)

-        # Step 2: Download discovered pages
+        # Step 2: Download discovered pages with built-in retry
        successes, failures, failed_downloads = await download_all(
-            session, html_single_urls, output_dir_path, db_path, semaphore, force
+            session, html_single_urls, output_dir_path, db_path, semaphore, force, max_retries
        )

-        # Step 3: Retry failed downloads
-        if failed_downloads:
-            logger.info(f"Retrying {len(failed_downloads)} failed downloads...")
-            retry_successes, retry_failures, still_failed = await download_all(
-                session, failed_downloads, output_dir_path, db_path, semaphore, True
-            )
-
-            logger.info(f"Retry completed: {retry_successes} successful, {retry_failures} still failed.")
-
-            if retry_failures > 0:
-                logger.warning(f"Failed to download {retry_failures} pages after retry.")
-                # Update failed downloads list
-                with open(output_dir_path / "failed_downloads.json", "w") as f:
-                    json.dump(list(still_failed), f, indent=2)
-
+        # Record any failed downloads
+        if failures > 0:
+            with open(output_dir_path / "failed_downloads.json", "w") as f:
+                json.dump(list(failed_downloads), f, indent=2)
+

        # Step 4: Verify downloads
        verification_passed = await verify_downloads(html_single_urls, db_path, output_dir_path)
@@ -1095,11 +1106,18 @@ async def main_async(args):
    # Run the downloader
    verification_passed, toc_verification_passed, elapsed_time = await run_downloader(
        base_url,
-        output_dir,
+        output_dir,
+        max_retries=args.max_retries,
        concurrency=args.concurrency,
        force=args.force,
        skip_toc=args.skip_toc_verification
    )
+    parser.add_argument(
+        "--max-retries",
+        type=check_positive_int,
+        default=3,
+        help="Maximum number of retry attempts for failed downloads (default: 3)"
+    )

    return verification_passed and toc_verification_passed

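The new --max-retries option is validated with check_positive_int, whose definition is not part of this diff. A validator of that name would typically look like the sketch below; treat it as an assumption about its shape, not the commit's actual code.

# Plausible sketch of the check_positive_int validator referenced by --max-retries;
# its definition is not shown in this diff, so this is an assumption, not the real code.
import argparse


def check_positive_int(value):
    """argparse type= callback that accepts only strictly positive integers."""
    try:
        ivalue = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"{value!r} is not an integer")
    if ivalue <= 0:
        raise argparse.ArgumentTypeError(f"{value!r} must be a positive integer")
    return ivalue

On the command line the flag would then be passed as, for example, --max-retries 5, and the value flows from args.max_retries into run_downloader, download_all, and download_page.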