Skip to content

Commit 9661c77

Browse files
committed
add minimizer function
1 parent 3fc18b2 commit 9661c77

File tree

1 file changed

+16
-9
lines changed

1 file changed

+16
-9
lines changed

scrapegraphai/utils/remover.py

Lines changed: 16 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,29 +1,36 @@
1-
"""
2-
Module for removing the unused html tags
1+
"""
2+
Module for minimizing the code
33
"""
44
from bs4 import BeautifulSoup
5+
from minify_html import minify
56

67

78
def remover(html_content: str) -> str:
89
"""
9-
This function processes the HTML content, removes unnecessary tags,
10-
and retrieves the title and body content.
10+
This function processes HTML content, removes unnecessary tags,
11+
minifies the HTML, and retrieves the title and body content.
1112
1213
Parameters:
13-
html_content (str): the HTML content to parse
14+
html_content (str): The HTML content to parse
1415
1516
Returns:
16-
str: the parsed title followed by the body content without script tags
17+
str: The parsed title followed by the minified body content
1718
"""
1819

1920
soup = BeautifulSoup(html_content, 'html.parser')
2021

22+
# Title Extraction
2123
title_tag = soup.find('title')
2224
title = title_tag.get_text() if title_tag else ""
2325

26+
# Script Tag Removal
2427
[script.extract() for script in soup.find_all('script')]
2528

29+
# Body Extraction (if it exists)
2630
body_content = soup.find('body')
27-
body = str(body_content) if body_content else ""
28-
29-
return "Title: " + title + ", Body: " + body
31+
if body_content:
32+
# Minify the HTML within the body tag
33+
minimized_body = minify(str(body_content))
34+
return "Title: " + title + ", Body: " + minimized_body
35+
else:
36+
return "Title: " + title + ", Body: No body content found"

0 commit comments

Comments
 (0)