1- """
2- Chromium module
1+ """"
2+ chromium module
33"""
44import asyncio
55from typing import Any , AsyncIterator , Iterator , List , Optional
66from langchain_community .document_loaders .base import BaseLoader
77from langchain_core .documents import Document
8+ import aiohttp
9+ import async_timeout
810from ..utils import Proxy , dynamic_import , get_logger , parse_or_search_proxy
911
1012logger = get_logger ("web-loader" )
@@ -21,6 +23,9 @@ class ChromiumLoader(BaseLoader):
         urls: A list of URLs to scrape content from.
     """
 
+    RETRY_LIMIT = 3
+    TIMEOUT = 10
+
     def __init__(
         self,
         urls: List[str],
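Since `RETRY_LIMIT` and `TIMEOUT` land as plain class attributes rather than constructor arguments, callers should be able to tune them by subclassing. A minimal sketch under that assumption — the `PatientChromiumLoader` name is hypothetical, the import path is assumed, and the remaining constructor arguments are assumed to keep their defaults:

```python
# Hypothetical subclass: loosen the new retry knobs for slow sites.
from scrapegraphai.docloaders.chromium import ChromiumLoader  # path assumed

class PatientChromiumLoader(ChromiumLoader):
    RETRY_LIMIT = 5  # retry each URL up to five times instead of three
    TIMEOUT = 30     # allow 30 s per attempt instead of 10

loader = PatientChromiumLoader(["https://example.com"])
```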
@@ -66,17 +71,29 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:
 
         Returns:
             str: The scraped HTML content or an error message if an exception occurs.
-
         """
         import undetected_chromedriver as uc
 
         logger.info(f"Starting scraping with {self.backend}...")
         results = ""
-        try:
-            driver = uc.Chrome(headless=self.headless)
-            results = driver.get(url).page_content
-        except Exception as e:
-            results = f"Error: {e}"
+        attempt = 0
+
+        while attempt < self.RETRY_LIMIT:
+            driver = None
+            try:
+                async with async_timeout.timeout(self.TIMEOUT):
+                    driver = uc.Chrome(headless=self.headless)
+                    driver.get(url)
+                    results = driver.page_source
+                    logger.info(f"Successfully scraped {url}")
+                    break
+            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
+                attempt += 1
+                logger.error(f"Attempt {attempt} failed: {e}")
+                if attempt == self.RETRY_LIMIT:
+                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+            finally:
+                if driver:
+                    driver.quit()
+
         return results
 
     async def ascrape_playwright(self, url: str) -> str:
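A caveat on this hunk: `async_timeout.timeout` can only interrupt the task at an `await` point, and `uc.Chrome()` / `driver.get()` are blocking Selenium calls, so a hung page load will not actually be cancelled; Selenium failures also surface as `WebDriverException`, which the `except` tuple above does not match. One way to make the timeout effective is to push the blocking work onto a worker thread — a sketch only, not the committed code:

```python
# Sketch: give the event loop an await point so a timeout can fire.
import asyncio
import undetected_chromedriver as uc

def _blocking_fetch(url: str, headless: bool) -> str:
    driver = uc.Chrome(headless=headless)
    try:
        driver.get(url)
        return driver.page_source
    finally:
        driver.quit()

async def fetch_with_timeout(url: str, headless: bool = True, timeout: float = 10) -> str:
    # The await on the worker thread is cancellable; note the thread itself
    # keeps running until Selenium returns, it is merely no longer awaited.
    return await asyncio.wait_for(
        asyncio.to_thread(_blocking_fetch, url, headless), timeout
    )
```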
@@ -88,28 +105,36 @@ async def ascrape_playwright(self, url: str) -> str:
 
         Returns:
             str: The scraped HTML content or an error message if an exception occurs.
-
         """
         from playwright.async_api import async_playwright
         from undetected_playwright import Malenia
 
         logger.info(f"Starting scraping with {self.backend}...")
         results = ""
-        async with async_playwright() as p:
-            browser = await p.chromium.launch(
-                headless=self.headless, proxy=self.proxy, **self.browser_config
-            )
+        attempt = 0
+
+        while attempt < self.RETRY_LIMIT:
             try:
-                context = await browser.new_context()
-                await Malenia.apply_stealth(context)
-                page = await context.new_page()
-                await page.goto(url, wait_until="domcontentloaded")
-                await page.wait_for_load_state(self.load_state)
-                results = await page.content()  # Simply get the HTML content
-                logger.info("Content scraped")
-            except Exception as e:
-                results = f"Error: {e}"
-            await browser.close()
+                async with async_playwright() as p, async_timeout.timeout(self.TIMEOUT):
+                    browser = await p.chromium.launch(
+                        headless=self.headless, proxy=self.proxy, **self.browser_config
+                    )
+                    context = await browser.new_context()
+                    await Malenia.apply_stealth(context)
+                    page = await context.new_page()
+                    await page.goto(url, wait_until="domcontentloaded")
+                    await page.wait_for_load_state(self.load_state)
+                    results = await page.content()
+                    logger.info("Content scraped")
+                    await browser.close()
+                    break
+            except Exception as e:
+                attempt += 1
+                logger.error(f"Attempt {attempt} failed: {e}")
+                if attempt == self.RETRY_LIMIT:
+                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+
         return results
 
     def lazy_load(self) -> Iterator[Document]:
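To exercise the playwright path on its own, something like the following should work, assuming the constructor defaults (the class docstring elsewhere in the file describes `playwright` as the default backend) and the import path shown:

```python
# Sketch: calling the playwright scraper directly.
import asyncio
from scrapegraphai.docloaders.chromium import ChromiumLoader  # path assumed

async def main() -> None:
    loader = ChromiumLoader(["https://example.com"])
    html = await loader.ascrape_playwright("https://example.com")
    # On repeated failure the method returns an "Error: ..." string
    # instead of raising, so check the prefix before parsing.
    print(html[:200])

asyncio.run(main())
```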
@@ -121,7 +146,6 @@ def lazy_load(self) -> Iterator[Document]:
 
         Yields:
             Document: The scraped content encapsulated within a Document object.
-
         """
         scraping_fn = getattr(self, f"ascrape_{self.backend}")
 
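`lazy_load` dispatches to `ascrape_{self.backend}` via `getattr`, so consuming the loader is plain synchronous iteration; a short usage sketch under the same import-path assumption:

```python
# Sketch: one Document per URL, scraped lazily as you iterate.
from scrapegraphai.docloaders.chromium import ChromiumLoader  # path assumed

loader = ChromiumLoader(["https://example.com", "https://example.org"])
for doc in loader.lazy_load():
    # page_content holds the scraped HTML; metadata presumably records
    # the source URL, as in other langchain document loaders.
    print(doc.metadata, len(doc.page_content))
```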