Skip to content

Commit 4703a0b

Browse files
authored
Update remover.py
1 parent 4233430 commit 4703a0b

File tree

1 file changed

+9
-6
lines changed

1 file changed

+9
-6
lines changed

scrapegraphai/utils/remover.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,9 @@
77

88
def remover(html_content: str) -> str:
99
"""
10-
This function processes HTML content, removes unnecessary tags,
11-
minifies the HTML, and retrieves the title and body content.
10+
This function processes HTML content, removes unnecessary tags
11+
(including style tags), minifies the HTML, and retrieves the
12+
title and body content.
1213
1314
Parameters:
1415
html_content (str): The HTML content to parse
@@ -23,14 +24,16 @@ def remover(html_content: str) -> str:
2324
title_tag = soup.find('title')
2425
title = title_tag.get_text() if title_tag else ""
2526

26-
# Script Tag Removal
27-
[script.extract() for script in soup.find_all('script')]
27+
# Script and Style Tag Removal
28+
for tag in soup.find_all(['script', 'style']):
29+
tag.extract()
2830

2931
# Body Extraction (if it exists)
3032
body_content = soup.find('body')
3133
if body_content:
3234
# Minify the HTML within the body tag
3335
minimized_body = minify(str(body_content))
34-
return "Title: " + title + ", Body: " + minimized_body
36+
return "Title: " + title + ", Body: " + minimized_body
3537
else:
36-
return "Title: " + title + ", Body: No body content found"
38+
return "Title: " + title + ", Body: No body content found"
39+

0 commit comments

Comments
 (0)