 from langchain_core.documents import Document
 import aiohttp
 import async_timeout
+from typing import Union
 from ..utils import Proxy, dynamic_import, get_logger, parse_or_search_proxy
 
 logger = get_logger("web-loader")
@@ -102,14 +103,144 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:
 
         return results
 
+    async def ascrape_playwright_scroll(
+        self,
+        url: str,
+        timeout: Union[int, None] = 30,
+        scroll: int = 15000,
+        sleep: float = 2,
+        scroll_to_bottom: bool = False,
+    ) -> str:
114+ """
115+ Asynchronously scrape the content of a given URL using Playwright's sync API and scrolling.
116+
117+ Notes:
118+ - The user gets to decide between scrolling to the bottom of the page or scrolling by a finite amount of time.
119+ - If the user chooses to scroll to the bottom, the scraper will stop when the page height stops changing or when
120+ the timeout is reached. In this case, the user should opt for an appropriate timeout value i.e. larger than usual.
121+ - Sleep needs to be set to a value greater than 0 to allow lazy-loaded content to load.
122+ Additionally, if used with scroll_to_bottom=True, the sleep value should be set to a higher value, to
123+ make sure that the scrolling actually happens, thereby allowing the page height to change.
124+ - Probably the best website to test this is https://www.reddit.com/ as it has infinite scrolling.
125+
126+ Args:
127+ - url (str): The URL to scrape.
128+ - timeout (Union[int, None]): The maximum time to spend scrolling. This is separate from the global timeout. If set, must be greater than 0.
129+ Can also be set to None, in which case the scraper will only stop when the page height stops changing.
130+ - scroll (float): The number of pixels to scroll down by. Defaults to 15000. Cannot be less than 5000 pixels.
131+ Less than this and we don't scroll enough to see any content change.
132+ - sleep (int): The number of seconds to sleep after each scroll, to allow the page to load.
133+ Defaults to 2. Must be greater than 0.
134+
135+ Returns:
136+ str: The scraped HTML content
137+
138+ Raises:
139+ - ValueError: If the timeout value is less than or equal to 0.
140+ - ValueError: If the sleep value is less than or equal to 0.
141+ - ValueError: If the scroll value is less than 5000.
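+
+        Example:
+            A minimal usage sketch (``loader`` stands in for an initialized instance
+            of this class; the URL and argument values are illustrative):
+
+            >>> html = await loader.ascrape_playwright_scroll(
+            ...     "https://www.reddit.com/", timeout=60, scroll_to_bottom=False
+            ... )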
142+ """
+        # NB: I have tested using scrollHeight to determine when to stop scrolling,
+        # but it doesn't always work as expected. The page height doesn't change on some sites,
+        # e.g. https://www.steelwood.amsterdam/, which does not scroll to the bottom.
+        # In a regular browser I can scroll vertically, but in Chromium it scrolls horizontally?!?
+
+        if timeout is not None and timeout <= 0:
+            raise ValueError("If set, the timeout value for the scrolling scraper must be greater than 0.")
+
+        if sleep <= 0:
+            raise ValueError("The sleep value for the scrolling scraper must be greater than 0.")
+
+        if scroll < 5000:
+            raise ValueError("The scroll value for the scrolling scraper must be greater than or equal to 5000.")
+
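+        # Imported lazily so the Playwright stack is only required when this
+        # scraping backend is actually used.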
+        from playwright.async_api import async_playwright
+        from undetected_playwright import Malenia
+        import time
+
+        logger.info(f"Starting scraping with scrolling support for {url}...")
+
+        results = ""
+        attempt = 0
+
+        while attempt < self.RETRY_LIMIT:
+            try:
+                async with async_playwright() as p:
+                    browser = await p.chromium.launch(
+                        headless=self.headless, proxy=self.proxy, **self.browser_config
+                    )
+                    context = await browser.new_context()
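+                    # Stealth patches from undetected_playwright, to reduce common
+                    # automation fingerprints and basic bot detection.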
+                    await Malenia.apply_stealth(context)
+                    page = await context.new_page()
+                    await page.goto(url, wait_until="domcontentloaded")
+                    await page.wait_for_load_state(self.load_state)
+
+                    previous_height = None
+                    start_time = time.time()
+
+                    # Store the heights of the page after each scroll.
+                    # This is useful when scrolling with a timer, so we can stop shortly after
+                    # reaching the bottom, or simply when the page stops changing for some reason.
+                    heights = []
+
+                    while True:
+                        current_height = await page.evaluate("document.body.scrollHeight")
+                        heights.append(current_height)
+                        heights = heights[-5:]  # Keep only the last 5 heights, to bound memory use
+
+                        # Break if we've reached the bottom of the page, i.e. if scrolling makes
+                        # no more progress. Attention: this is not always reliable. The page height
+                        # might not change due to lazy loading or other reasons; in such cases, the
+                        # user should set scroll_to_bottom=False and set a timeout instead.
+                        if scroll_to_bottom and previous_height == current_height:
+                            logger.info(f"Reached bottom of page for url {url}")
+                            break
+
+                        previous_height = current_height
+
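+                        # mouse.wheel dispatches a real wheel event, which is what triggers
+                        # scroll-based lazy loading on most pages.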
+                        await page.mouse.wheel(0, scroll)
+                        logger.debug(f"Scrolled {url} to current height {current_height}px...")
+                        # asyncio.sleep (not time.sleep), so the event loop is not blocked
+                        # while lazy-loaded content loads.
+                        await asyncio.sleep(sleep)
+
+                        current_time = time.time()
+                        elapsed_time = current_time - start_time
+                        logger.debug(f"Elapsed time: {elapsed_time} seconds")
+
+                        if timeout:
+                            if elapsed_time >= timeout:
+                                logger.info(f"Reached timeout of {timeout} seconds for url {url}")
+                                break
+                        elif len(heights) == 5 and len(set(heights)) == 1:
+                            logger.info(f"Page height has not changed for url {url} for the last 5 scrolls. Stopping.")
+                            break
+
+                    results = await page.content()
+                    break
+
+            except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
+                attempt += 1
+                logger.error(f"Attempt {attempt} failed: {e}")
+                if attempt == self.RETRY_LIMIT:
+                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+            finally:
+                # Close the browser only if it was actually launched; if launching
+                # failed, the name is unbound and closing it would raise.
+                if "browser" in locals():
+                    await browser.close()
+
+        return results
+
     async def ascrape_playwright(self, url: str) -> str:
         """
         Asynchronously scrape the content of a given URL using Playwright's async API.
+
+        Args:
+            url (str): The URL to scrape.
+
+        Returns:
+            str: The scraped HTML content or an error message if an exception occurs.
         """
         from playwright.async_api import async_playwright
         from undetected_playwright import Malenia
 
         logger.info(f"Starting scraping with {self.backend}...")
+        results = ""
         attempt = 0
 
         while attempt < self.RETRY_LIMIT:
@@ -127,16 +258,16 @@ async def ascrape_playwright(self, url: str) -> str:
                     await page.wait_for_load_state(self.load_state)
                     results = await page.content()
                     logger.info("Content scraped")
-                    return results
+                    break
             except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
                 attempt += 1
                 logger.error(f"Attempt {attempt} failed: {e}")
                 if attempt == self.RETRY_LIMIT:
-                    raise RuntimeError(
-                        f"Failed to fetch {url} after {self.RETRY_LIMIT} attempts: {e}"
-                    )
+                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
             finally:
                 if "browser" in locals():
+                    await browser.close()
+
+        return results
 
 
     async def ascrape_with_js_support(self, url: str) -> str: