Skip to content

Commit 879c94a

Browse files
Update cleanup_html.py
1 parent 828bdee commit 879c94a

File tree

1 file changed

+10
-4
lines changed

1 file changed

+10
-4
lines changed

scrapegraphai/utils/cleanup_html.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,12 @@ def cleanup_html(html_content: str, base_url: str) -> str:
2424
This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized.
2525
"""
2626

27+
import logging
28+
logging.basicConfig(level=logging.DEBUG)
29+
30+
# Add logging to capture the HTML content before parsing
31+
logging.debug(f'HTML content before parsing: {html_content}')
32+
2733
soup = BeautifulSoup(html_content, 'html.parser')
2834

2935
# Title Extraction
@@ -57,9 +63,9 @@ def cleanup_html(html_content: str, base_url: str) -> str:
5763
if body_content:
5864
# Minify the HTML within the body tag
5965
minimized_body = minify(str(body_content))
60-
6166
return title, minimized_body, link_urls, image_urls
62-
# return "Title: " + title + ", Body: " + minimized_body + ", Links: " + str(link_urls) + ", Images: " + str(image_urls)
6367

64-
# throw an error if no body content is found
65-
raise ValueError("No HTML body content found, please try setting the 'headless' flag to False in the graph configuration.")
68+
# Add fallback mechanism
69+
else:
70+
logging.error(f'No body content found in HTML: {html_content}')
71+
raise ValueError(f"No HTML body content found, please try setting the 'headless' flag to False in the graph configuration. HTML content: {html_content}")

0 commit comments

Comments
 (0)