from langchain_core.documents import Document
import aiohttp
import async_timeout
+ from typing import Union
from ..utils import Proxy, dynamic_import, get_logger, parse_or_search_proxy

logger = get_logger("web-loader")
+ logger.setLevel("INFO")

class ChromiumLoader(BaseLoader):
    """Scrapes HTML pages from URLs using a (headless) instance of the
@@ -97,14 +99,144 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:

        return results

+     async def ascrape_playwright_scroll(
+         self,
+         url: str,
+         timeout: Union[int, None] = 30,
+         scroll: int = 15000,
+         sleep: float = 2,
+         scroll_to_bottom: bool = False,
+     ) -> str:
110+ """
111+ Asynchronously scrape the content of a given URL using Playwright's sync API and scrolling.
112+
113+ Notes:
114+ - The user gets to decide between scrolling to the bottom of the page or scrolling by a finite amount of time.
115+ - If the user chooses to scroll to the bottom, the scraper will stop when the page height stops changing or when
116+ the timeout is reached. In this case, the user should opt for an appropriate timeout value i.e. larger than usual.
117+ - Sleep needs to be set to a value greater than 0 to allow lazy-loaded content to load.
118+ Additionally, if used with scroll_to_bottom=True, the sleep value should be set to a higher value, to
119+ make sure that the scrolling actually happens, thereby allowing the page height to change.
120+ - Probably the best website to test this is https://www.reddit.com/ as it has infinite scrolling.
121+
122+ Args:
123+ - url (str): The URL to scrape.
124+ - timeout (Union[int, None]): The maximum time to spend scrolling. This is separate from the global timeout. If set, must be greater than 0.
125+ Can also be set to None, in which case the scraper will only stop when the page height stops changing.
126+ - scroll (float): The number of pixels to scroll down by. Defaults to 15000. Cannot be less than 5000 pixels.
127+ Less than this and we don't scroll enough to see any content change.
128+ - sleep (int): The number of seconds to sleep after each scroll, to allow the page to load.
129+ Defaults to 2. Must be greater than 0.
130+
131+ Returns:
132+ str: The scraped HTML content
133+
134+ Raises:
135+ - ValueError: If the timeout value is less than or equal to 0.
136+ - ValueError: If the sleep value is less than or equal to 0.
137+ - ValueError: If the scroll value is less than 5000.
138+ """
+         # NB: I have tested using scrollHeight to determine when to stop scrolling,
+         # but it doesn't always work as expected. The page height doesn't change on some sites such as
+         # https://www.steelwood.amsterdam/, and the site does not scroll to the bottom.
+         # In my browser I can scroll vertically, but in Chromium it scrolls horizontally?!?
+
+         if timeout and timeout <= 0:
+             raise ValueError("If set, timeout value for scrolling scraper must be greater than 0.")
+
+         if sleep <= 0:
+             raise ValueError("Sleep value for scrolling scraper must be greater than 0.")
+
+         if scroll < 5000:
+             raise ValueError("Scroll value for scrolling scraper must be greater than or equal to 5000.")
+
+         from playwright.async_api import async_playwright
+         from undetected_playwright import Malenia
+         import time
+
+         logger.info(f"Starting scraping with scrolling support for {url}...")
+
+         results = ""
+         attempt = 0
+
+         while attempt < self.RETRY_LIMIT:
+             try:
+                 async with async_playwright() as p:
+                     browser = await p.chromium.launch(
+                         headless=self.headless, proxy=self.proxy, **self.browser_config
+                     )
+                     context = await browser.new_context()
+                     await Malenia.apply_stealth(context)
+                     page = await context.new_page()
+                     await page.goto(url, wait_until="domcontentloaded")
+                     await page.wait_for_load_state(self.load_state)
+
+                     previous_height = None
+                     start_time = time.time()
+
+                     # Store the heights of the page after each scroll.
+                     # This is useful in case we scroll with a timer and want to stop shortly after reaching the bottom,
+                     # or simply when the page stops changing for some reason.
+                     heights = []
+
+                     while True:
+                         current_height = await page.evaluate("document.body.scrollHeight")
+                         heights.append(current_height)
+                         heights = heights[-5:]  # Keep only the last 5 heights, to bound memory use
+
+                         # Break if we've reached the bottom of the page, i.e. if scrolling makes no more progress.
+                         # Attention!!! This is not always reliable. Sometimes the page might not change due to lazy loading
+                         # or other reasons. In such cases, the user should set scroll_to_bottom=False and set a timeout.
+                         if scroll_to_bottom and previous_height == current_height:
+                             logger.info(f"Reached bottom of page for url {url}")
+                             break
+
+                         previous_height = current_height
+
+                         await page.mouse.wheel(0, scroll)
+                         logger.debug(f"Scrolled {url} to current height {current_height}px...")
+                         await asyncio.sleep(sleep)  # Allow some time for any lazy-loaded content to load (non-blocking, so the event loop isn't stalled)
+
+                         current_time = time.time()
+                         elapsed_time = current_time - start_time
+                         logger.debug(f"Elapsed time: {elapsed_time} seconds")
+
+                         if timeout:
+                             if elapsed_time >= timeout:
+                                 logger.info(f"Reached timeout of {timeout} seconds for url {url}")
+                                 break
+                         elif len(heights) == 5 and len(set(heights)) == 1:
+                             logger.info(f"Page height has not changed for url {url} for the last 5 scrolls. Stopping.")
+                             break
+
+                     results = await page.content()
+                     break
+
+             except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
+                 attempt += 1
+                 logger.error(f"Attempt {attempt} failed: {e}")
+                 if attempt == self.RETRY_LIMIT:
+                     results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+             finally:
+                 await browser.close()
+
+         return results
+
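For context (not part of this commit): a minimal usage sketch of the new scrolling scraper under both stopping modes. The import path and constructor arguments are assumptions based on how ChromiumLoader appears to be used elsewhere in this repository.

import asyncio
from scrapegraphai.docloaders.chromium import ChromiumLoader  # import path assumed

async def main():
    # Constructor arguments assumed; adjust to the actual ChromiumLoader signature.
    loader = ChromiumLoader(["https://www.reddit.com/"], backend="playwright")

    # Timed mode: scroll for at most 30 seconds, stopping early if the page
    # height is unchanged for 5 consecutive scrolls.
    html = await loader.ascrape_playwright_scroll(
        "https://www.reddit.com/", timeout=30, scroll=15000, sleep=2
    )

    # Bottom mode: scroll until the page height stops changing; a larger
    # timeout and sleep give lazy-loaded content time to appear.
    html = await loader.ascrape_playwright_scroll(
        "https://www.reddit.com/", timeout=120, sleep=5, scroll_to_bottom=True
    )
    print(len(html))

asyncio.run(main())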
    async def ascrape_playwright(self, url: str) -> str:
        """
        Asynchronously scrape the content of a given URL using Playwright's async API.
+
+         Args:
+             url (str): The URL to scrape.
+
+         Returns:
+             str: The scraped HTML content or an error message if an exception occurs.
        """
        from playwright.async_api import async_playwright
        from undetected_playwright import Malenia

        logger.info(f"Starting scraping with {self.backend}...")
+         results = ""
        attempt = 0

        while attempt < self.RETRY_LIMIT:
@@ -120,15 +252,16 @@ async def ascrape_playwright(self, url: str) -> str:
                    await page.wait_for_load_state(self.load_state)
                    results = await page.content()
                    logger.info("Content scraped")
-                     return results
+                     break
            except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
                attempt += 1
                logger.error(f"Attempt {attempt} failed: {e}")
                if attempt == self.RETRY_LIMIT:
-                         raise RuntimeError(f"Failed to fetch {url} after {self.RETRY_LIMIT} attempts: {e}")
+                         results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
            finally:
-                 if 'browser' in locals():
-                     await browser.close()
+                 await browser.close()
+
+         return results
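Not part of this commit: a hedged caller-side sketch of what the changed error handling implies. Since ascrape_playwright now returns an "Error: ..." string instead of raising after exhausting its retries, a caller (the loader variable here is hypothetical) would check the prefix before wrapping the result in a Document.

html = await loader.ascrape_playwright("https://example.com")  # loader is hypothetical
if html.startswith("Error:"):
    logger.warning(f"Scrape failed after retries: {html}")
else:
    doc = Document(page_content=html, metadata={"source": "https://example.com"})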
    async def ascrape_with_js_support(self, url: str) -> str:
        """
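One more illustrative sketch (not part of this commit): the stop condition from the scrolling loop above, isolated on plain data. The scraper keeps only the last 5 recorded page heights and stops once all 5 are identical.

heights = []
for current_height in [1000, 2500, 4000, 4800, 4800, 4800, 4800, 4800]:
    heights.append(current_height)
    heights = heights[-5:]  # same bounded window as in ascrape_playwright_scroll
    if len(heights) == 5 and len(set(heights)) == 1:
        print("Page height has not changed for the last 5 scrolls. Stopping.")
        break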