88from langchain_core .documents import Document
99from langchain_community .document_loaders import PyPDFLoader
1010from .base_node import BaseNode
11- from ..utils .cleanup_html import cleanup_html
12- import requests
13- from bs4 import BeautifulSoup
11+ from ..utils .remover import remover
1412
1513
1614class FetchNode (BaseNode ):
@@ -36,7 +34,6 @@ class FetchNode(BaseNode):
3634 def __init__ (self , input : str , output : List [str ], node_config : Optional [dict ] = None , node_name : str = "Fetch" ):
3735 super ().__init__ (node_name , "node" , input , output , 1 )
3836
39-
4037 self .headless = True if node_config is None else node_config .get (
4138 "headless" , True )
4239 self .verbose = False if node_config is None else node_config .get (
@@ -97,22 +94,10 @@ def execute(self, state):
9794 pass
9895
9996 elif not source .startswith ("http" ):
100- compressed_document = [Document (page_content = cleanup_html (source ), metadata = {
97+ compressed_document = [Document (page_content = remover (source ), metadata = {
10198 "source" : "local_dir"
10299 })]
103100
104- elif self .useSoup :
105- response = requests .get (source )
106- if response .status_code == 200 :
107- soup = BeautifulSoup (response .text , 'html.parser' )
108- links = soup .find_all ('a' )
109- link_urls = []
110- for link in links :
111- if 'href' in link .attrs :
112- link_urls .append (link ['href' ])
113- compressed_document = [Document (page_content = cleanup_html (soup .prettify (), link_urls ))]
114- else :
115- print (f"Failed to retrieve contents from the webpage at url: { url } " )
116101 else :
117102 if self .node_config is not None and self .node_config .get ("endpoint" ) is not None :
118103
@@ -129,7 +114,7 @@ def execute(self, state):
129114
130115 document = loader .load ()
131116 compressed_document = [
132- Document (page_content = cleanup_html (str (document [0 ].page_content )))]
117+ Document (page_content = remover (str (document [0 ].page_content )))]
133118
134119 state .update ({self .output [0 ]: compressed_document })
135- return state
120+ return state
0 commit comments