@@ -9,6 +9,7 @@
 
 logger = get_logger("web-loader")
 
+
 class ChromiumLoader(BaseLoader):
     """Scrapes HTML pages from URLs using a (headless) instance of the
     Chromium web driver with proxy protection.
@@ -34,6 +35,7 @@ def __init__(
         proxy: Optional[Proxy] = None,
         load_state: str = "domcontentloaded",
         requires_js_support: bool = False,
+        storage_state: Optional[str] = None,
         **kwargs: Any,
     ):
         """Initialize the loader with a list of URL paths.
@@ -63,6 +65,7 @@ def __init__(
         self.urls = urls
         self.load_state = load_state
         self.requires_js_support = requires_js_support
+        self.storage_state = storage_state
 
     async def ascrape_undetected_chromedriver(self, url: str) -> str:
         """
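For reference, Playwright's `browser.new_context(storage_state=...)` accepts either a path to a JSON file or a dict previously captured with `context.storage_state()`, so the new `storage_state` attribute can carry a logged-in session into every scrape. Below is a minimal sketch of capturing such a file; the login URL and the `state.json` name are illustrative assumptions, not part of this change:

```python
# Minimal sketch: capture a Playwright storage state (cookies + localStorage)
# so it can later be passed to ChromiumLoader via storage_state="state.json".
# The URL and file name below are illustrative assumptions.
import asyncio

from playwright.async_api import async_playwright

async def save_state() -> None:
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False)
        context = await browser.new_context()
        page = await context.new_page()
        await page.goto("https://example.com/login")
        await page.pause()  # log in interactively, then resume from the inspector
        await context.storage_state(path="state.json")  # persist the session
        await browser.close()

asyncio.run(save_state())
```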
@@ -92,7 +95,9 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:
                 attempt += 1
                 logger.error(f"Attempt {attempt} failed: {e}")
                 if attempt == self.RETRY_LIMIT:
-                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+                    results = (
+                        f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+                    )
             finally:
                 driver.quit()
 
@@ -244,7 +249,9 @@ async def ascrape_playwright(self, url: str) -> str:
                     browser = await p.chromium.launch(
                         headless=self.headless, proxy=self.proxy, **self.browser_config
                     )
-                    context = await browser.new_context()
+                    context = await browser.new_context(
+                        storage_state=self.storage_state
+                    )
                     await Malenia.apply_stealth(context)
                     page = await context.new_page()
                     await page.goto(url, wait_until="domcontentloaded")
@@ -262,6 +269,7 @@ async def ascrape_playwright(self, url: str) -> str:
 
         return results
 
+
     async def ascrape_with_js_support(self, url: str) -> str:
         """
         Asynchronously scrape the content of a given URL by rendering JavaScript using Playwright.
@@ -270,7 +278,7 @@ async def ascrape_with_js_support(self, url: str) -> str:
             url (str): The URL to scrape.
 
         Returns:
-            str: The fully rendered HTML content after JavaScript execution, 
+            str: The fully rendered HTML content after JavaScript execution,
             or an error message if an exception occurs.
         """
         from playwright.async_api import async_playwright
@@ -285,7 +293,9 @@ async def ascrape_with_js_support(self, url: str) -> str:
                     browser = await p.chromium.launch(
                         headless=self.headless, proxy=self.proxy, **self.browser_config
                     )
-                    context = await browser.new_context()
+                    context = await browser.new_context(
+                        storage_state=self.storage_state
+                    )
                     page = await context.new_page()
                     await page.goto(url, wait_until="networkidle")
                     results = await page.content()
@@ -295,7 +305,9 @@ async def ascrape_with_js_support(self, url: str) -> str:
                 attempt += 1
                 logger.error(f"Attempt {attempt} failed: {e}")
                 if attempt == self.RETRY_LIMIT:
-                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+                    results = (
+                        f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+                    )
             finally:
                 await browser.close()
 
@@ -312,7 +324,9 @@ def lazy_load(self) -> Iterator[Document]:
             Document: The scraped content encapsulated within a Document object.
         """
         scraping_fn = (
-            self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+            self.ascrape_with_js_support
+            if self.requires_js_support
+            else getattr(self, f"ascrape_{self.backend}")
         )
 
         for url in self.urls:
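The ternary above resolves the scraping coroutine by name, so any `backend` value must correspond to an `ascrape_<backend>` method on the loader. A hedged sketch of the equivalent lookup with an explicit failure mode; the `ValueError` guard is an assumption, since the code above would surface an `AttributeError` instead:

```python
# Sketch of the dispatch used by lazy_load/alazy_load: JS-rendering support
# takes priority, otherwise the backend name selects an ascrape_* coroutine.
def resolve_scraping_fn(loader: "ChromiumLoader"):
    if loader.requires_js_support:
        return loader.ascrape_with_js_support
    try:
        return getattr(loader, f"ascrape_{loader.backend}")
    except AttributeError:
        # Assumed guard, not in the diff: fail loudly on unknown backends.
        raise ValueError(f"Unsupported backend: {loader.backend!r}") from None
```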
@@ -334,7 +348,9 @@ async def alazy_load(self) -> AsyncIterator[Document]:
             source URL as metadata.
         """
         scraping_fn = (
-            self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+            self.ascrape_with_js_support
+            if self.requires_js_support
+            else getattr(self, f"ascrape_{self.backend}")
         )
 
         tasks = [scraping_fn(url) for url in self.urls]
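Putting the change together, a hedged usage sketch: the keyword names below come from this diff, while the other constructor defaults and the `Document` fields are assumptions about the surrounding file:

```python
# Hedged usage sketch: reuse a saved session (see state.json above) so pages
# behind a login render as an authenticated user. Defaults are assumptions.
loader = ChromiumLoader(
    urls=["https://example.com/dashboard"],
    backend="playwright",          # resolved to ascrape_playwright via getattr
    storage_state="state.json",    # path or dict accepted by new_context()
)

for doc in loader.lazy_load():
    print(doc.metadata["source"], len(doc.page_content))
```

The async path behaves the same, except that `alazy_load` schedules one task per URL and awaits them together, as the `tasks` list above shows.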