@@ -131,15 +131,21 @@ def execute(self, state):
             pass
 
         elif not source.startswith("http"):
+            self.logger.info(f"Fetching local HTML content from: {source}")
+            if not source.strip():
+                raise ValueError("No HTML body content found in the local source.")
             title, minimized_body, link_urls, image_urls = cleanup_html(source, source)
             parsed_content = f"Title: {title}, Body: {minimized_body}, Links: {link_urls}, Images: {image_urls}"
             compressed_document = [
                 Document(page_content=parsed_content, metadata={"source": "local_dir"})
             ]
 
         elif self.useSoup:
+            self.logger.info(f"Fetching HTML content using requests from: {source}")
             response = requests.get(source)
             if response.status_code == 200:
+                if not response.text.strip():
+                    raise ValueError("No HTML body content found in the response.")
                 title, minimized_body, link_urls, image_urls = cleanup_html(
                     response.text, source
                 )
@@ -151,6 +157,7 @@ def execute(self, state):
                 )
 
         else:
+            self.logger.info(f"Fetching HTML content using ChromiumLoader from: {source}")
             loader_kwargs = {}
 
             if self.node_config is not None:
@@ -159,6 +166,9 @@ def execute(self, state):
             loader = ChromiumLoader([source], headless=self.headless, **loader_kwargs)
             document = loader.load()
 
+            if not document or not document[0].page_content.strip():
+                raise ValueError("No HTML body content found in the document fetched by ChromiumLoader.")
+
             title, minimized_body, link_urls, image_urls = cleanup_html(
                 str(document[0].page_content), source
             )
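
Each branch in this diff adds the same guard: check that the fetched markup is non-empty before handing it to cleanup_html, and raise a ValueError early otherwise. A minimal standalone sketch of that guard pattern, assuming a hypothetical fetch_html helper and a plain-requests path that are not part of this PR:

```python
import requests


def fetch_html(source: str) -> str:
    """Illustrative sketch of the empty-content guards added in this diff.

    The helper name and the plain-requests path are assumptions for the
    example; the real node also has a ChromiumLoader-based branch.
    """
    if not source.startswith("http"):
        # Local HTML string: reject empty input before any parsing happens.
        if not source.strip():
            raise ValueError("No HTML body content found in the local source.")
        return source

    response = requests.get(source)
    if response.status_code == 200:
        # Remote fetch: refuse to pass a blank body downstream.
        if not response.text.strip():
            raise ValueError("No HTML body content found in the response.")
        return response.text
    raise ValueError(f"Failed to retrieve contents from: {source}")
```

Failing fast like this keeps the downstream cleanup_html call and Document construction from ever running on empty markup.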