44from langchain_core .documents import Document
55import aiohttp
66import async_timeout
7+ from selenium import webdriver
8+ from selenium .webdriver .chrome .options import Options as ChromeOptions
79from typing import Union
810from ..utils import Proxy , dynamic_import , get_logger , parse_or_search_proxy
911
@@ -36,6 +38,7 @@ def __init__(
3638 load_state : str = "domcontentloaded" ,
3739 requires_js_support : bool = False ,
3840 storage_state : Optional [str ] = None ,
41+ browser_name : str = "chromium" , #default chromium
3942 ** kwargs : Any ,
4043 ):
4144 """Initialize the loader with a list of URL paths.
@@ -66,6 +69,7 @@ def __init__(
6669 self .load_state = load_state
6770 self .requires_js_support = requires_js_support
6871 self .storage_state = storage_state
72+ self .browser_name = browser_name
6973
7074 async def scrape (self , url :str ) -> str :
7175 if self .backend == "playwright" :
@@ -95,11 +99,35 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:
9599 while attempt < self .RETRY_LIMIT :
96100 try :
97101 async with async_timeout .timeout (self .TIMEOUT ):
98- driver = uc .Chrome (headless = self .headless )
99- driver .get (url )
100- results = driver .page_source
101- logger .info (f"Successfully scraped { url } " )
102- break
102+ # Handling browser selection
103+ if self .backend == "selenium" :
104+ if self .browser_name == "chromium" :
105+ options = ChromeOptions ()
106+ options .headless = self .headless
107+ # Initialize undetected chromedriver for Selenium
108+ driver = uc .Chrome (options = options )
109+ driver .get (url )
110+ results = driver .page_source
111+ logger .info (f"Successfully scraped { url } with { self .browser_name } " )
112+ break
113+ elif self .browser_name == "firefox" :
114+ from selenium .webdriver .firefox .options import Options as FirefoxOptions
115+ options = FirefoxOptions ()
116+ options .headless = self .headless
117+ # Initialize a standard Selenium Firefox driver (not an undetected one)
118+ driver = webdriver .Firefox (options = options )
119+ driver .get (url )
120+ results = driver .page_source
121+ logger .info (f"Successfully scraped { url } with { self .browser_name } " )
122+ break
123+ else :
124+ logger .error (f"Unsupported browser { self .browser_name } for Selenium." )
125+ results = f"Error: Unsupported browser { self .browser_name } ."
126+ break
127+ else :
128+ logger .error (f"Unsupported backend { self .backend } ." )
129+ results = f"Error: Unsupported backend { self .backend } ."
130+ break
103131 except (aiohttp .ClientError , asyncio .TimeoutError ) as e :
104132 attempt += 1
105133 logger .error (f"Attempt { attempt } failed: { e } " )
@@ -118,7 +146,8 @@ async def ascrape_playwright_scroll(
118146 timeout : Union [int , None ]= 30 ,
119147 scroll : int = 15000 ,
120148 sleep : float = 2 ,
121- scroll_to_bottom : bool = False
149+ scroll_to_bottom : bool = False ,
150+ browser_name : str = "chromium" # browser to launch; defaults to "chromium"
122151 ) -> str :
123152 """
124153 Asynchronously scrape the content of a given URL using Playwright's sync API and scrolling.
@@ -175,9 +204,17 @@ async def ascrape_playwright_scroll(
175204 while attempt < self .RETRY_LIMIT :
176205 try :
177206 async with async_playwright () as p :
178- browser = await p .chromium .launch (
207+ browser = None
208+ if browser_name == "chromium" :
209+ browser = await p .chromium .launch (
179210 headless = self .headless , proxy = self .proxy , ** self .browser_config
180211 )
212+ elif browser_name == "firefox" :
213+ browser = await p .firefox .launch (
214+ headless = self .headless , proxy = self .proxy , ** self .browser_config
215+ )
216+ else :
217+ raise ValueError (f"Invalid browser name: { browser_name } " )
181218 context = await browser .new_context ()
182219 await Malenia .apply_stealth (context )
183220 page = await context .new_page ()
@@ -235,7 +272,7 @@ async def ascrape_playwright_scroll(
235272
236273 return results
237274
238- async def ascrape_playwright (self , url : str ) -> str :
275+ async def ascrape_playwright (self , url : str , browser_name : str = "chromium" ) -> str :
239276 """
240277 Asynchronously scrape the content of a given URL using Playwright's async API.
241278
@@ -255,9 +292,17 @@ async def ascrape_playwright(self, url: str) -> str:
255292 while attempt < self .RETRY_LIMIT :
256293 try :
257294 async with async_playwright () as p , async_timeout .timeout (self .TIMEOUT ):
258- browser = await p .chromium .launch (
295+ browser = None
296+ if browser_name == "chromium" :
297+ browser = await p .chromium .launch (
259298 headless = self .headless , proxy = self .proxy , ** self .browser_config
260299 )
300+ elif browser_name == "firefox" :
301+ browser = await p .firefox .launch (
302+ headless = self .headless , proxy = self .proxy , ** self .browser_config
303+ )
304+ else :
305+ raise ValueError (f"Invalid browser name: { browser_name } " )
261306 context = await browser .new_context (
262307 storage_state = self .storage_state
263308 )
@@ -282,7 +327,7 @@ async def ascrape_playwright(self, url: str) -> str:
282327
283328
284329
285- async def ascrape_with_js_support (self , url : str ) -> str :
330+ async def ascrape_with_js_support (self , url : str , browser_name : str = "chromium" ) -> str :
286331 """
287332 Asynchronously scrape the content of a given URL by rendering JavaScript using Playwright.
288333
@@ -302,9 +347,17 @@ async def ascrape_with_js_support(self, url: str) -> str:
302347 while attempt < self .RETRY_LIMIT :
303348 try :
304349 async with async_playwright () as p , async_timeout .timeout (self .TIMEOUT ):
305- browser = await p .chromium .launch (
350+ browser = None
351+ if browser_name == "chromium" :
352+ browser = await p .chromium .launch (
306353 headless = self .headless , proxy = self .proxy , ** self .browser_config
307354 )
355+ elif browser_name == "firefox" :
356+ browser = await p .firefox .launch (
357+ headless = self .headless , proxy = self .proxy , ** self .browser_config
358+ )
359+ else :
360+ raise ValueError (f"Invalid browser name: { browser_name } " )
308361 context = await browser .new_context (
309362 storage_state = self .storage_state
310363 )
0 commit comments