@@ -15,15 +15,12 @@
     Optional,
     Sequence,
     Union,
-    Literal
+    Literal,
 )
 import aiohttp
 import certifi
 import validators
-from langchain_community.document_loaders import (
-    PlaywrightURLLoader,
-    WebBaseLoader
-)
+from langchain_community.document_loaders import PlaywrightURLLoader, WebBaseLoader
 from langchain_community.document_loaders.firecrawl import FireCrawlLoader
 from langchain_community.document_loaders.base import BaseLoader
 from langchain_core.documents import Document
@@ -33,7 +30,7 @@
     PLAYWRIGHT_WS_URI,
     RAG_WEB_LOADER_ENGINE,
     FIRECRAWL_API_BASE_URL,
-    FIRECRAWL_API_KEY
+    FIRECRAWL_API_KEY,
 )
 from open_webui.env import SRC_LOG_LEVELS
 
@@ -75,6 +72,7 @@ def safe_validate_urls(url: Sequence[str]) -> Sequence[str]:
             continue
     return valid_urls
 
+
 def resolve_hostname(hostname):
     # Get address information
     addr_info = socket.getaddrinfo(hostname, None)
@@ -85,16 +83,13 @@ def resolve_hostname(hostname):
 
     return ipv4_addresses, ipv6_addresses
 
+
 def extract_metadata(soup, url):
-    metadata = {
-        "source": url
-    }
+    metadata = {"source": url}
     if title := soup.find("title"):
         metadata["title"] = title.get_text()
     if description := soup.find("meta", attrs={"name": "description"}):
-        metadata["description"] = description.get(
-            "content", "No description found."
-        )
+        metadata["description"] = description.get("content", "No description found.")
     if html := soup.find("html"):
         metadata["language"] = html.get("lang", "No language found.")
     return metadata
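
Reviewer note: a minimal sketch of what the reformatted extract_metadata returns for a simple page; the HTML snippet and URL are illustrative only.

    from bs4 import BeautifulSoup

    html = (
        "<html lang='en'><head><title>Example</title>"
        "<meta name='description' content='A demo page.'></head></html>"
    )
    soup = BeautifulSoup(html, "html.parser")
    extract_metadata(soup, "https://example.com")
    # {'source': 'https://example.com', 'title': 'Example',
    #  'description': 'A demo page.', 'language': 'en'}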
@@ -104,7 +99,7 @@ def verify_ssl_cert(url: str) -> bool:
     """Verify SSL certificate for the given URL."""
     if not url.startswith("https://"):
         return True
-    
+
     try:
         hostname = url.split("://")[-1].split("/")[0]
         context = ssl.create_default_context(cafile=certifi.where())
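
Reviewer note: the hunk only shows the start of verify_ssl_cert. A hedged sketch of how this certifi-backed pattern is typically completed (the port, timeout, and exception handling below are assumptions, not the file's actual tail):

    import socket
    import ssl
    import certifi

    def _check_cert(hostname: str) -> bool:
        context = ssl.create_default_context(cafile=certifi.where())
        try:
            # wrap_socket performs the TLS handshake and validates the chain
            with socket.create_connection((hostname, 443), timeout=5) as sock:
                with context.wrap_socket(sock, server_hostname=hostname):
                    return True
        except (ssl.SSLError, OSError):
            return False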
@@ -133,7 +128,7 @@ def __init__(
         params: Optional[Dict] = None,
     ):
         """Concurrent document loader for FireCrawl operations.
-        
+
         Executes multiple FireCrawlLoader instances concurrently using thread pooling
         to improve bulk processing efficiency.
         Args:
@@ -142,7 +137,7 @@ def __init__(
             trust_env: If True, use proxy settings from environment variables.
             requests_per_second: Number of requests per second to limit to.
             continue_on_failure (bool): If True, continue loading other URLs on failure.
-            api_key: API key for FireCrawl service. Defaults to None 
+            api_key: API key for FireCrawl service. Defaults to None
                 (uses FIRE_CRAWL_API_KEY environment variable if not provided).
             api_url: Base URL for FireCrawl API. Defaults to official API endpoint.
             mode: Operation mode selection:
@@ -154,15 +149,15 @@ def __init__(
                 Examples include crawlerOptions.
                 For more details, visit: https://github.com/mendableai/firecrawl-py
         """
-        proxy_server = proxy.get('server') if proxy else None
+        proxy_server = proxy.get("server") if proxy else None
         if trust_env and not proxy_server:
             env_proxies = urllib.request.getproxies()
-            env_proxy_server = env_proxies.get('https') or env_proxies.get('http')
+            env_proxy_server = env_proxies.get("https") or env_proxies.get("http")
             if env_proxy_server:
                 if proxy:
-                    proxy['server'] = env_proxy_server
+                    proxy["server"] = env_proxy_server
                 else:
-                    proxy = {'server': env_proxy_server}
+                    proxy = {"server": env_proxy_server}
         self.web_paths = web_paths
         self.verify_ssl = verify_ssl
         self.requests_per_second = requests_per_second
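
Reviewer note: the quote-style changes above touch the trust_env fallback, which reads proxies from the environment only when no explicit server is configured. In isolation the logic behaves like this (values are illustrative):

    import urllib.request

    proxy = None
    env_proxies = urllib.request.getproxies()  # e.g. {'https': 'http://proxy:3128'}
    server = env_proxies.get("https") or env_proxies.get("http")
    if server:
        proxy = {"server": server}  # HTTPS proxy wins over HTTP when both are set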
@@ -184,7 +179,7 @@ def lazy_load(self) -> Iterator[Document]:
                     api_key=self.api_key,
                     api_url=self.api_url,
                     mode=self.mode,
-                    params=self.params
+                    params=self.params,
                 )
                 yield from loader.lazy_load()
             except Exception as e:
@@ -203,7 +198,7 @@ async def alazy_load(self):
                     api_key=self.api_key,
                     api_url=self.api_url,
                     mode=self.mode,
-                    params=self.params
+                    params=self.params,
                 )
                 async for document in loader.alazy_load():
                     yield document
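
Reviewer note: a hedged usage sketch for the async path above, assuming defaults for the remaining SafeFireCrawlLoader constructor arguments:

    import asyncio

    async def main():
        loader = SafeFireCrawlLoader(web_paths=["https://example.com"])
        async for document in loader.alazy_load():
            print(document.metadata.get("source"))

    asyncio.run(main())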
@@ -251,7 +246,7 @@ def _safe_process_url_sync(self, url: str) -> bool:
 
 class SafePlaywrightURLLoader(PlaywrightURLLoader):
     """Load HTML pages safely with Playwright, supporting SSL verification, rate limiting, and remote browser connection.
-    
+
     Attributes:
         web_paths (List[str]): List of URLs to load.
         verify_ssl (bool): If True, verify SSL certificates.
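
Reviewer note: a hypothetical construction matching the attributes documented above; argument values are illustrative, and keyword defaults are assumed for the parameters the next hunk shows:

    loader = SafePlaywrightURLLoader(
        web_paths=["https://example.com"],
        verify_ssl=True,
        requests_per_second=1.0,
        continue_on_failure=True,
        playwright_ws_url=None,  # set a ws:// URI to use a remote browser instead
    )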
@@ -273,27 +268,27 @@ def __init__(
         headless: bool = True,
         remove_selectors: Optional[List[str]] = None,
         proxy: Optional[Dict[str, str]] = None,
-        playwright_ws_url: Optional[str] = None
+        playwright_ws_url: Optional[str] = None,
     ):
         """Initialize with additional safety parameters and remote browser support."""
 
-        proxy_server = proxy.get('server') if proxy else None
+        proxy_server = proxy.get("server") if proxy else None
         if trust_env and not proxy_server:
             env_proxies = urllib.request.getproxies()
-            env_proxy_server = env_proxies.get('https') or env_proxies.get('http')
+            env_proxy_server = env_proxies.get("https") or env_proxies.get("http")
             if env_proxy_server:
                 if proxy:
-                    proxy['server'] = env_proxy_server
+                    proxy["server"] = env_proxy_server
                 else:
-                    proxy = {'server': env_proxy_server}
+                    proxy = {"server": env_proxy_server}
 
         # We'll set headless to False if using playwright_ws_url since it's handled by the remote browser
         super().__init__(
             urls=web_paths,
             continue_on_failure=continue_on_failure,
             headless=headless if playwright_ws_url is None else False,
             remove_selectors=remove_selectors,
-            proxy=proxy
+            proxy=proxy,
         )
         self.verify_ssl = verify_ssl
         self.requests_per_second = requests_per_second
@@ -339,7 +334,9 @@ async def alazy_load(self) -> AsyncIterator[Document]:
             if self.playwright_ws_url:
                 browser = await p.chromium.connect(self.playwright_ws_url)
             else:
-                browser = await p.chromium.launch(headless=self.headless, proxy=self.proxy)
+                browser = await p.chromium.launch(
+                    headless=self.headless, proxy=self.proxy
+                )
 
             for url in self.urls:
                 try:
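
Reviewer note: a standalone sketch of the connect-versus-launch branch the reflowed call belongs to; the function name and flow here are assumptions for illustration:

    from typing import Optional
    from playwright.async_api import async_playwright

    async def fetch_html(url: str, ws_url: Optional[str] = None) -> str:
        async with async_playwright() as p:
            # Remote browser over WebSocket when configured, local Chromium otherwise
            browser = (
                await p.chromium.connect(ws_url)
                if ws_url
                else await p.chromium.launch(headless=True)
            )
            page = await browser.new_page()
            await page.goto(url)
            html = await page.content()
            await browser.close()
            return html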
@@ -394,6 +391,7 @@ def _safe_process_url_sync(self, url: str) -> bool:
         self._sync_wait_for_rate_limit()
         return True
 
+
 class SafeWebBaseLoader(WebBaseLoader):
     """WebBaseLoader with enhanced error handling for URLs."""
 
@@ -496,11 +494,13 @@ async def aload(self) -> list[Document]:
         """Load data into Document objects."""
         return [document async for document in self.alazy_load()]
 
+
 RAG_WEB_LOADER_ENGINES = defaultdict(lambda: SafeWebBaseLoader)
 RAG_WEB_LOADER_ENGINES["playwright"] = SafePlaywrightURLLoader
 RAG_WEB_LOADER_ENGINES["safe_web"] = SafeWebBaseLoader
 RAG_WEB_LOADER_ENGINES["firecrawl"] = SafeFireCrawlLoader
 
+
 def get_web_loader(
     urls: Union[str, Sequence[str]],
     verify_ssl: bool = True,
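
Reviewer note: the defaultdict registry above falls back to SafeWebBaseLoader for any engine name it does not know (hypothetical lookups):

    RAG_WEB_LOADER_ENGINES["playwright"]     # SafePlaywrightURLLoader
    RAG_WEB_LOADER_ENGINES["firecrawl"]      # SafeFireCrawlLoader
    RAG_WEB_LOADER_ENGINES["not-an-engine"]  # SafeWebBaseLoader via default factory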
@@ -515,7 +515,7 @@ def get_web_loader(
         "verify_ssl": verify_ssl,
         "requests_per_second": requests_per_second,
         "continue_on_failure": True,
-        "trust_env": trust_env
+        "trust_env": trust_env,
     }
 
     if PLAYWRIGHT_WS_URI.value:
@@ -529,6 +529,10 @@ def get_web_loader(
     WebLoaderClass = RAG_WEB_LOADER_ENGINES[RAG_WEB_LOADER_ENGINE.value]
     web_loader = WebLoaderClass(**web_loader_args)
 
-    log.debug("Using RAG_WEB_LOADER_ENGINE %s for %s URLs", web_loader.__class__.__name__, len(safe_urls))
+    log.debug(
+        "Using RAG_WEB_LOADER_ENGINE %s for %s URLs",
+        web_loader.__class__.__name__,
+        len(safe_urls),
+    )
 
-    return web_loader
+    return web_loader
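
Reviewer note: a hedged usage sketch of get_web_loader as reformatted above. The URLs and rate are illustrative, and the engine actually chosen depends on RAG_WEB_LOADER_ENGINE and PLAYWRIGHT_WS_URI:

    loader = get_web_loader(
        ["https://example.com", "https://openwebui.com"],
        verify_ssl=True,
        requests_per_second=2,
    )
    docs = loader.load()  # BaseLoader.load() drains lazy_load() into a list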