@@ -4,6 +4,7 @@
 from langchain_core.documents import Document
 import aiohttp
 import async_timeout
+from typing import Union
 from ..utils import Proxy, dynamic_import, get_logger, parse_or_search_proxy

 logger = get_logger("web-loader")
@@ -111,14 +112,144 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:

         return results

+    async def ascrape_playwright_scroll(
+        self,
+        url: str,
+        timeout: Union[int, None] = 30,
+        scroll: int = 15000,
+        sleep: float = 2,
+        scroll_to_bottom: bool = False,
+    ) -> str:
123+ """
124+ Asynchronously scrape the content of a given URL using Playwright's sync API and scrolling.
125+
126+ Notes:
127+ - The user gets to decide between scrolling to the bottom of the page or scrolling by a finite amount of time.
128+ - If the user chooses to scroll to the bottom, the scraper will stop when the page height stops changing or when
129+ the timeout is reached. In this case, the user should opt for an appropriate timeout value i.e. larger than usual.
130+ - Sleep needs to be set to a value greater than 0 to allow lazy-loaded content to load.
131+ Additionally, if used with scroll_to_bottom=True, the sleep value should be set to a higher value, to
132+ make sure that the scrolling actually happens, thereby allowing the page height to change.
133+ - Probably the best website to test this is https://www.reddit.com/ as it has infinite scrolling.
134+
135+ Args:
136+ - url (str): The URL to scrape.
137+ - timeout (Union[int, None]): The maximum time to spend scrolling. This is separate from the global timeout. If set, must be greater than 0.
138+ Can also be set to None, in which case the scraper will only stop when the page height stops changing.
139+ - scroll (float): The number of pixels to scroll down by. Defaults to 15000. Cannot be less than 5000 pixels.
140+ Less than this and we don't scroll enough to see any content change.
141+ - sleep (int): The number of seconds to sleep after each scroll, to allow the page to load.
142+ Defaults to 2. Must be greater than 0.
143+
144+ Returns:
145+ str: The scraped HTML content
146+
147+ Raises:
148+ - ValueError: If the timeout value is less than or equal to 0.
149+ - ValueError: If the sleep value is less than or equal to 0.
150+ - ValueError: If the scroll value is less than 5000.
151+ """
+        # NB: I have tested using scrollHeight to determine when to stop scrolling,
+        # but it doesn't always work as expected. The page height doesn't change on some
+        # sites like https://www.steelwood.amsterdam/: the site does not scroll to the bottom.
+        # In my browser I can scroll vertically, but in Chromium it scrolls horizontally?!?
+
+        if timeout and timeout <= 0:
+            raise ValueError("If set, timeout value for scrolling scraper must be greater than 0.")
+
+        if sleep <= 0:
+            raise ValueError("Sleep value for scrolling scraper must be greater than 0.")
+
+        if scroll < 5000:
+            raise ValueError("Scroll value for scrolling scraper must be greater than or equal to 5000.")
+
+        from playwright.async_api import async_playwright
+        from undetected_playwright import Malenia
+        import time
+
+        logger.info(f"Starting scraping with scrolling support for {url}...")
+
+        results = ""
+        attempt = 0
+
+        while attempt < self.RETRY_LIMIT:
+            try:
+                async with async_playwright() as p:
+                    browser = await p.chromium.launch(
+                        headless=self.headless, proxy=self.proxy, **self.browser_config
+                    )
+                    context = await browser.new_context()
+                    await Malenia.apply_stealth(context)
+                    page = await context.new_page()
+                    await page.goto(url, wait_until="domcontentloaded")
+                    await page.wait_for_load_state(self.load_state)
+
+                    previous_height = None
+                    start_time = time.time()
+
+                    # Store the heights of the page after each scroll.
+                    # This is useful in case we scroll with a timer and want to stop shortly after reaching the bottom,
+                    # or simply when the page stops changing for some reason.
+                    heights = []
+
+                    while True:
+                        current_height = await page.evaluate("document.body.scrollHeight")
+                        heights.append(current_height)
+                        heights = heights[-5:]  # Keep only the last 5 heights so the list doesn't grow unboundedly
+
+                        # Break if we've reached the bottom of the page, i.e. if scrolling makes no more progress.
+                        # Attention!!! This is not always reliable. Sometimes the page might not change due to lazy loading
+                        # or other reasons. In such cases, the user should set scroll_to_bottom=False and set a timeout.
+                        if scroll_to_bottom and previous_height == current_height:
+                            logger.info(f"Reached bottom of page for url {url}")
+                            break
+
+                        previous_height = current_height
+
+                        await page.mouse.wheel(0, scroll)
+                        logger.debug(f"Scrolled {url} to current height {current_height}px...")
+                        await asyncio.sleep(sleep)  # Non-blocking sleep so lazy-loaded content can load
+
+                        current_time = time.time()
+                        elapsed_time = current_time - start_time
+                        logger.debug(f"Elapsed time: {elapsed_time} seconds")
+
+                        if timeout and elapsed_time >= timeout:
+                            logger.info(f"Reached timeout of {timeout} seconds for url {url}")
+                            break
+
+                        # Check height stagnation outside the timeout branch, so that a
+                        # timeout=None run still stops once the page height stops changing.
+                        if len(heights) == 5 and len(set(heights)) == 1:
+                            logger.info(f"Page height has not changed for url {url} for the last 5 scrolls. Stopping.")
+                            break
+
+                    results = await page.content()
+                    break
+
+            except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
+                attempt += 1
+                logger.error(f"Attempt {attempt} failed: {e}")
+                if attempt == self.RETRY_LIMIT:
+                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+            finally:
+                if "browser" in locals():
+                    await browser.close()
+
+        return results
+
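For reference, here is a minimal usage sketch of the new scrolling scraper (an editor's illustration, not part of this commit). The import path, the ChromiumLoader class name, and its constructor arguments are assumptions about the surrounding repo; only ascrape_playwright_scroll and its parameters come from the diff above.

    # Hedged sketch: assumes the method above lives on scrapegraphai's
    # ChromiumLoader and that the loader takes a list of URLs plus a backend
    # name; adjust the import to the actual repo layout.
    import asyncio
    from scrapegraphai.docloaders.chromium import ChromiumLoader  # assumed path

    async def main() -> None:
        loader = ChromiumLoader(["https://www.reddit.com/"], backend="playwright")
        # Scroll in 15000 px steps, pausing 2 s per step, for at most 60 s.
        html = await loader.ascrape_playwright_scroll(
            "https://www.reddit.com/", timeout=60, scroll=15000, sleep=2
        )
        print(html[:200])

    asyncio.run(main())

As a design note, the manual heights = heights[-5:] trim could equally be a collections.deque(maxlen=5), which bounds memory without re-slicing on every iteration.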
     async def ascrape_playwright(self, url: str) -> str:
         """
         Asynchronously scrape the content of a given URL using Playwright's async API.
+
+        Args:
+            url (str): The URL to scrape.
+
+        Returns:
+            str: The scraped HTML content or an error message if an exception occurs.
         """
         from playwright.async_api import async_playwright
         from undetected_playwright import Malenia

         logger.info(f"Starting scraping with {self.backend}...")
+        results = ""
         attempt = 0

         while attempt < self.RETRY_LIMIT:
@@ -136,19 +267,21 @@ async def ascrape_playwright(self, url: str) -> str:
                     await page.wait_for_load_state(self.load_state)
                     results = await page.content()
                     logger.info("Content scraped")
-                    return results
+                    break
             except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
                 attempt += 1
                 logger.error(f"Attempt {attempt} failed: {e}")
                 if attempt == self.RETRY_LIMIT:
-                    raise RuntimeError(
-                        f"Failed to fetch {url} after {self.RETRY_LIMIT} attempts: {e}"
-                    )
+                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
             finally:
                 if "browser" in locals():
                     await browser.close()


+        return results
+
+
+
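Note the behavioural change in this hunk: once RETRY_LIMIT is exhausted, ascrape_playwright now returns an error string instead of raising RuntimeError. A minimal caller-side sketch of the new contract follows (the helper name is hypothetical):

    # Hypothetical helper: under the new contract a failed scrape yields a
    # string starting with "Error:" rather than an exception.
    def scrape_succeeded(results: str) -> bool:
        return not results.startswith("Error:")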
     async def ascrape_with_js_support(self, url: str) -> str:
         """
         Asynchronously scrape the content of a given URL by rendering JavaScript using Playwright.