logger = get_logger("web-loader")

+
class ChromiumLoader(BaseLoader):
    """Scrapes HTML pages from URLs using a (headless) instance of the
    Chromium web driver with proxy protection.
@@ -33,6 +34,7 @@ def __init__(
        proxy: Optional[Proxy] = None,
        load_state: str = "domcontentloaded",
        requires_js_support: bool = False,
+        storage_state: Optional[str] = None,
        **kwargs: Any,
    ):
        """Initialize the loader with a list of URL paths.
@@ -62,6 +64,7 @@ def __init__(
        self.urls = urls
        self.load_state = load_state
        self.requires_js_support = requires_js_support
+        self.storage_state = storage_state

    async def ascrape_undetected_chromedriver(self, url: str) -> str:
        """
@@ -91,7 +94,9 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:
                attempt += 1
                logger.error(f"Attempt {attempt} failed: {e}")
                if attempt == self.RETRY_LIMIT:
-                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+                    results = (
+                        f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+                    )
            finally:
                driver.quit()

@@ -113,7 +118,9 @@ async def ascrape_playwright(self, url: str) -> str:
                    browser = await p.chromium.launch(
                        headless=self.headless, proxy=self.proxy, **self.browser_config
                    )
-                    context = await browser.new_context()
+                    context = await browser.new_context(
+                        storage_state=self.storage_state
+                    )
                    await Malenia.apply_stealth(context)
                    page = await context.new_page()
                    await page.goto(url, wait_until="domcontentloaded")
@@ -125,10 +132,12 @@ async def ascrape_playwright(self, url: str) -> str:
                attempt += 1
                logger.error(f"Attempt {attempt} failed: {e}")
                if attempt == self.RETRY_LIMIT:
-                    raise RuntimeError(f"Failed to fetch {url} after {self.RETRY_LIMIT} attempts: {e}")
+                    raise RuntimeError(
+                        f"Failed to fetch {url} after {self.RETRY_LIMIT} attempts: {e}"
+                    )
            finally:
-                if 'browser' in locals():
-                    await browser.close()
+                if "browser" in locals():
+                    await browser.close()

    async def ascrape_with_js_support(self, url: str) -> str:
        """
@@ -138,7 +147,7 @@ async def ascrape_with_js_support(self, url: str) -> str:
            url (str): The URL to scrape.

        Returns:
-            str: The fully rendered HTML content after JavaScript execution, 
+            str: The fully rendered HTML content after JavaScript execution,
            or an error message if an exception occurs.
        """
        from playwright.async_api import async_playwright
@@ -153,7 +162,9 @@ async def ascrape_with_js_support(self, url: str) -> str:
                    browser = await p.chromium.launch(
                        headless=self.headless, proxy=self.proxy, **self.browser_config
                    )
-                    context = await browser.new_context()
+                    context = await browser.new_context(
+                        storage_state=self.storage_state
+                    )
                    page = await context.new_page()
                    await page.goto(url, wait_until="networkidle")
                    results = await page.content()
@@ -163,7 +174,9 @@ async def ascrape_with_js_support(self, url: str) -> str:
                attempt += 1
                logger.error(f"Attempt {attempt} failed: {e}")
                if attempt == self.RETRY_LIMIT:
-                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+                    results = (
+                        f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+                    )
            finally:
                await browser.close()

@@ -180,7 +193,9 @@ def lazy_load(self) -> Iterator[Document]:
            Document: The scraped content encapsulated within a Document object.
        """
        scraping_fn = (
-            self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+            self.ascrape_with_js_support
+            if self.requires_js_support
+            else getattr(self, f"ascrape_{self.backend}")
        )

        for url in self.urls:
@@ -202,7 +217,9 @@ async def alazy_load(self) -> AsyncIterator[Document]:
            source URL as metadata.
        """
        scraping_fn = (
-            self.ascrape_with_js_support if self.requires_js_support else getattr(self, f"ascrape_{self.backend}")
+            self.ascrape_with_js_support
+            if self.requires_js_support
+            else getattr(self, f"ascrape_{self.backend}")
        )

        tasks = [scraping_fn(url) for url in self.urls]
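
For context, a minimal usage sketch of the new storage_state parameter follows. It is an illustration only: the import path, the auth_state.json file, and the backend/headless keywords are assumptions not taken from this diff; only urls, requires_js_support, and storage_state appear in the changed lines. The value is forwarded unchanged to Playwright's browser.new_context(), which accepts a path to a previously saved storage-state JSON (cookies and local storage), so a saved login session can be reused across scrapes.

# Minimal usage sketch (assumptions: import path, "auth_state.json", and the
# backend/headless keywords are illustrative, not confirmed by this diff).
import asyncio

from scrapegraphai.docloaders.chromium import ChromiumLoader  # hypothetical import path

loader = ChromiumLoader(
    ["https://example.com/dashboard"],
    backend="playwright",             # dispatched via getattr(self, f"ascrape_{self.backend}")
    headless=True,
    requires_js_support=False,
    storage_state="auth_state.json",  # Playwright storage-state file (cookies/localStorage)
)

# Synchronous path: lazy_load() scrapes each URL in turn and yields Documents
# whose metadata carries the source URL.
for doc in loader.lazy_load():
    print(doc.metadata["source"], len(doc.page_content))

# Asynchronous path: alazy_load() schedules one scraping task per URL.
async def main() -> None:
    async for doc in loader.alazy_load():
        print(doc.metadata["source"], len(doc.page_content))

asyncio.run(main())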