66from langchain_community .document_loaders import AsyncChromiumLoader
77from langchain_core .documents import Document
88from .base_node import BaseNode
9- from ..utils .remover import remover
9+ from ..utils .cleanup_html import cleanup_html
10+ import requests
11+ from bs4 import BeautifulSoup
1012
1113
1214class FetchNode (BaseNode ):
@@ -32,6 +34,7 @@ class FetchNode(BaseNode):
def __init__(self, input: str, output: List[str], node_config: Optional[dict] = None, node_name: str = "Fetch"):
    """
    Initialize the fetch node.

    Args:
        input: Expression naming the state key(s) this node consumes.
        output: State keys this node writes its result to.
        node_config: Optional configuration dict; recognized keys are
            "useSoup" (default True), "headless" (default True) and
            "verbose" (default False).
        node_name: Display name of the node (defaults to "Fetch").
    """
    super().__init__(node_name, "node", input, output, 1)

    # Normalize the absent-config case once, then read each flag with its
    # documented default (matches the original per-flag ternaries exactly).
    config = node_config if node_config is not None else {}
    self.useSoup = config.get("useSoup", True)
    self.headless = config.get("headless", True)
    self.verbose = config.get("verbose", False)
@@ -67,10 +70,22 @@ def execute(self, state):
# NOTE(review): this span is a rendered GitHub diff, not runnable Python —
# old/new line numbers are fused onto each line and '+'/'-' mark removed/
# added lines. The review comments below apply to the NEW (+) code.
6770 })]
6871 # if it is a local directory
6972 elif not source .startswith ("http" ):
70- compressed_document = [Document (page_content = remover (source ), metadata = {
73+ compressed_document = [Document (page_content = cleanup_html (source ), metadata = {
7174 "source" : "local_dir"
7275 })]
7376
# NOTE(review): new BeautifulSoup fast path below. requests.get() is
# called without a timeout, so a stalled server hangs the node forever —
# TODO add timeout= (and consider response.raise_for_status()).
77+ elif self .useSoup :
78+ response = requests .get (source )
79+ if response .status_code == 200 :
80+ soup = BeautifulSoup (response .text , 'html.parser' )
81+ links = soup .find_all ('a' )
82+ link_urls = []
83+ for link in links :
84+ if 'href' in link .attrs :
85+ link_urls .append (link ['href' ])
# NOTE(review): cleanup_html is called with TWO arguments here but with
# ONE argument at the other two call sites in this same diff — verify the
# cleanup_html signature actually accepts an optional links argument.
86+ compressed_document = [Document (page_content = cleanup_html (soup .prettify (), link_urls ))]
87+ else :
# BUG(review): `url` is undefined in this scope — the f-string should
# interpolate `source`. Also, in this non-200 branch `compressed_document`
# is never assigned, so the state.update(...) at the end of execute will
# raise NameError; this branch needs a fallback assignment or a raise
# instead of a bare print.
88+ print (f"Failed to retrieve contents from the webpage at url: { url } " )
7489 else :
7590 if self .node_config is not None and self .node_config .get ("endpoint" ) is not None :
7691
@@ -87,7 +102,7 @@ def execute(self, state):
# NOTE(review): hunk gap — the loader-construction lines between the two
# hunks are not visible in this diff view.
87102
88103 document = loader .load ()
89104 compressed_document = [
90- Document (page_content = remover (str (document [0 ].page_content )))]
105+ Document (page_content = cleanup_html (str (document [0 ].page_content )))]
91106
92107 state .update ({self .output [0 ]: compressed_document })
93108 return state
0 commit comments