@@ -76,6 +76,10 @@ def __init__(
             None if node_config is None else node_config.get("browser_base", None)
         )
 
+        self.scrape_do = (
+            None if node_config is None else node_config.get("scrape_do", None)
+        )
+
     def execute(self, state):
         """
         Executes the node's logic to fetch HTML content from a specified URL and
@@ -102,7 +106,7 @@ def execute(self, state):
 
         source = input_data[0]
         input_type = input_keys[0]
-
+
         handlers = {
             "json_dir": self.handle_directory,
             "xml_dir": self.handle_directory,
@@ -271,19 +275,34 @@ def handle_web_source(self, state, source):
             try:
                 from ..docloaders.browser_base import browser_base_fetch
             except ImportError:
-                raise ImportError("The browserbase module is not installed. Please install it using `pip install browserbase`.")
+                raise ImportError("""The browserbase module is not installed.
+                                  Please install it using `pip install browserbase`.""")
 
             data = browser_base_fetch(self.browser_base.get("api_key"),
                                       self.browser_base.get("project_id"), [source])
 
             document = [Document(page_content=content,
                                  metadata={"source": source}) for content in data]
+        elif self.scrape_do is not None:
+            from ..docloaders.scrape_do import scrape_do_fetch
+            if self.scrape_do.get("use_proxy") is None or self.scrape_do.get("geoCode") is None or self.scrape_do.get("super_proxy") is None:
+                data = scrape_do_fetch(self.scrape_do.get("api_key"),
+                                       source)
+            else:
+                data = scrape_do_fetch(self.scrape_do.get("api_key"),
+                                       source, self.scrape_do.get("use_proxy"),
+                                       self.scrape_do.get("geoCode"),
+                                       self.scrape_do.get("super_proxy"))
+
+            document = [Document(page_content=data,
+                                 metadata={"source": source})]
         else:
             loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
             document = loader.load()
 
         if not document or not document[0].page_content.strip():
-            raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")
+            raise ValueError("""No HTML body content found in
+                             the document fetched by ChromiumLoader.""")
         parsed_content = document[0].page_content
 
         if (isinstance(self.llm_model, ChatOpenAI) or isinstance(self.llm_model, AzureChatOpenAI)) and not self.script_creator or self.force and not self.script_creator and not self.openai_md_enabled:
@@ -292,7 +311,7 @@ def handle_web_source(self, state, source):
             compressed_document = [
                 Document(page_content=parsed_content, metadata={"source": "html file"})
             ]
-
+
         return self.update_state(state, compressed_document)
 
     def update_state(self, state, compressed_document):
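
The new branch is driven entirely by the node configuration passed as `node_config`. The sketch below shows how the `scrape_do` entry might be supplied; only the keys actually read in this diff (`api_key`, `use_proxy`, `geoCode`, `super_proxy`) come from the change itself, and the concrete values are illustrative assumptions, not part of the PR.

# Hypothetical node_config sketch for the new scrape_do option.
# Keys mirror the .get() calls in the diff; values are placeholders.
node_config = {
    "scrape_do": {
        "api_key": "YOUR_SCRAPE_DO_API_KEY",  # always passed to scrape_do_fetch
        # If any of the three keys below is missing, the node falls back to
        # the plain scrape_do_fetch(api_key, source) call without proxy options.
        "use_proxy": True,
        "geoCode": "us",
        "super_proxy": False,
    },
}

Note the design implication of the added condition: the proxy-related arguments are forwarded only when all three of `use_proxy`, `geoCode`, and `super_proxy` are present in the config; omitting any one of them silently selects the non-proxy call.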