@@ -893,27 +893,6 @@ async def verify_against_toc(session, html_single_urls, visited_urls, base_url,
     logger.info("TOC verification completed successfully.")
     return True
 
-async def proxy_setup(proxy_url=None):
-    """
-    Set up proxy configuration for requests
-
-    Args:
-        proxy_url (str): Proxy URL if provided
-
-    Returns:
-        dict: Proxy configuration for aiohttp
-    """
-    if not proxy_url:
-        # Check environment variables
-        proxy_url = os.environ.get('HTTP_PROXY') or os.environ.get('http_proxy')
-
-    if proxy_url:
-        logger.info(f"Using proxy: {proxy_url}")
-        return {
-            'proxy': proxy_url
-        }
-    return None
-
 def export_url_mapping(db_path, output_dir):
     """
     Export a mapping of local file paths to their source URLs
@@ -952,7 +931,7 @@ def export_change_report(db_path, output_dir):
 
     return report
 
-async def run_downloader(base_url, output_dir, concurrency=5, force=False, proxy_url=None, skip_toc=False):
+async def run_downloader(base_url, output_dir, concurrency=5, force=False, skip_toc=False):
     """
     Run the complete download process
 
@@ -961,7 +940,6 @@ async def run_downloader(base_url, output_dir, concurrency=5, force=False, proxy
        output_dir (str): Directory where documentation will be saved
        concurrency (int): Number of concurrent downloads
        force (bool): Force download even if files haven't changed
-        proxy_url (str): Proxy URL to use for requests
        skip_toc (bool): Skip TOC verification
 
    Returns:
@@ -986,12 +964,9 @@ async def run_downloader(base_url, output_dir, concurrency=5, force=False, proxy
     # Create semaphore for limiting concurrent requests
     semaphore = asyncio.Semaphore(concurrency)
 
-    # Set up proxy
-    proxy_config = await proxy_setup(proxy_url)
-
     start_time = time.time()
 
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession(trust_env=True) as session:
         # Step 1: Crawl to discover all html-single pages
         visited_urls, html_single_urls = await crawl(session, base_url, base_url, semaphore)
 