Skip to content

Commit b0e446f

Browse files
committed
feat: apply remove to the document before updating the state
1 parent 4703a0b commit b0e446f

File tree

3 files changed

+17
-7
lines changed

3 files changed

+17
-7
lines changed

scrapegraphai/nodes/fetch_node.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,13 +72,15 @@ def execute(self, state):
7272

7373
# if it is a local directory
7474
if not source.startswith("http"):
75-
document = [Document(page_content=remover(source), metadata={
75+
compressedDocument = [Document(page_content=remover(source), metadata={
7676
"source": "local_dir"
7777
})]
7878

7979
# if it is a URL
8080
else:
8181
loader = AsyncHtmlLoader(source)
8282
document = loader.load()
83-
state.update({self.output[0]: document})
83+
compressedDocument = [Document(page_content=remover(str(document)))]
84+
85+
state.update({self.output[0]: compressedDocument})
8486
return state

scrapegraphai/utils/remover.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,16 +24,24 @@ def remover(html_content: str) -> str:
2424
title_tag = soup.find('title')
2525
title = title_tag.get_text() if title_tag else ""
2626

27-
# Script and Style Tag Removal
27+
# Script and Style Tag Removal
2828
for tag in soup.find_all(['script', 'style']):
2929
tag.extract()
3030

3131
# Body Extraction (if it exists)
3232
body_content = soup.find('body')
3333
if body_content:
34+
# Remove some attributes from tags
35+
""" tagsToRemove = ['style', 'rel', 'width',
36+
'height', 'target', 'media',
37+
'onerror', 'onload', 'onclick']
38+
for tag in body_content.find_all():
39+
for attr in tagsToRemove:
40+
if tag.has_attr(attr):
41+
del tag.attrs[attr] """
42+
3443
# Minify the HTML within the body tag
3544
minimized_body = minify(str(body_content))
36-
return "Title: " + title + ", Body: " + minimized_body
45+
return "Title: " + title + ", Body: " + minimized_body
3746
else:
38-
return "Title: " + title + ", Body: No body content found"
39-
47+
return "Title: " + title + ", Body: No body content found"

tests/Readme.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
# Test section
2-
Regarding the tests for the folder graphs and nodes it was created a specific repo as a example
2+
Regarding the tests for the graphs and nodes folders, a specific repo was created as an example
33
([link of the repo](https://github.com/VinciGit00/Scrapegrah-ai-website-for-tests)). The test website is hosted [here](https://scrapegrah-ai-website-for-tests.onrender.com).

0 commit comments

Comments
 (0)