33"""
44import pandas as pd
55import json
6+ import requests
67from typing import List , Optional
78from langchain_community .document_loaders import AsyncChromiumLoader
89from langchain_core .documents import Document
910from langchain_community .document_loaders import PyPDFLoader
1011from .base_node import BaseNode
11- from ..utils .remover import remover
12+ from ..utils .cleanup_html import cleanup_html
1213
1314
1415class FetchNode (BaseNode ):
@@ -38,6 +39,8 @@ def __init__(self, input: str, output: List[str], node_config: Optional[dict] =
3839 "headless" , True )
3940 self .verbose = False if node_config is None else node_config .get (
4041 "verbose" , False )
42+ self .useSoup = True if node_config is None else node_config .get (
43+ "useSoup" , True )
4144
4245 def execute (self , state ):
4346 """
@@ -94,9 +97,17 @@ def execute(self, state):
9497 pass
9598
9699 elif not source .startswith ("http" ):
97- compressed_document = [Document (page_content = remover (source ), metadata = {
100+ compressed_document = [Document (page_content = cleanup_html (source ), metadata = {
98101 "source" : "local_dir"
99102 })]
103+
104+ elif self .useSoup :
105+ response = requests .get (source )
106+ if response .status_code == 200 :
107+ cleanedup_html = cleanup_html (response .text , source )
108+ compressed_document = [Document (page_content = cleanedup_html )]
109+ else :
110+ print (f"Failed to retrieve contents from the webpage at url: { source } " )
100111
101112 else :
102113 if self .node_config is not None and self .node_config .get ("endpoint" ) is not None :
@@ -114,7 +125,7 @@ def execute(self, state):
114125
115126 document = loader .load ()
116127 compressed_document = [
117- Document (page_content = remover (str (document [0 ].page_content )))]
128+ Document (page_content = cleanup_html (str (document [0 ].page_content )))]
118129
119130 state .update ({self .output [0 ]: compressed_document })
120131 return state
0 commit comments