Skip to content

Commit 6511d0d

Browse files
authored
Merge pull request #161 from Marcusfam-RB/Refactor/remove-bs4-dependency
Refactor/remove bs4 dependency
2 parents d70214d + a21a405 commit 6511d0d

File tree

5 files changed

+120
-40
lines changed

5 files changed

+120
-40
lines changed

poetry.lock

Lines changed: 1 addition & 35 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pydoll/elements/web_element.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
from typing import Optional
44

55
import aiofiles
6-
from bs4 import BeautifulSoup
76

87
from pydoll.commands import (
98
DomCommands,
@@ -34,7 +33,10 @@
3433
from pydoll.protocol.dom.types import Quad
3534
from pydoll.protocol.page.responses import CaptureScreenshotResponse
3635
from pydoll.protocol.page.types import Viewport
37-
from pydoll.utils import decode_base64_to_bytes
36+
from pydoll.utils import (
37+
decode_base64_to_bytes,
38+
extract_text_from_html,
39+
)
3840

3941

4042
class WebElement(FindElementsMixin): # noqa: PLR0904
@@ -99,8 +101,7 @@ def is_enabled(self) -> bool:
99101
async def text(self) -> str:
100102
"""Visible text content of the element."""
101103
outer_html = await self.inner_html
102-
soup = BeautifulSoup(outer_html, 'html.parser')
103-
return soup.get_text(strip=True)
104+
return extract_text_from_html(outer_html, strip=True)
104105

105106
@property
106107
async def bounds(self) -> Quad:

pydoll/utils.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
import logging
33
import os
44
import re
5+
from html import unescape
6+
from html.parser import HTMLParser
57

68
import aiohttp
79

@@ -10,6 +12,94 @@
1012
logger = logging.getLogger(__name__)
1113

1214

15+
class TextExtractor(HTMLParser):
    """
    HTML parser for text extraction.

    Collects the text nodes of an HTML string, excluding everything located
    inside the tags listed in _skip_tags (script, style and template).
    Character references are decoded once, by HTMLParser itself.
    """

    def __init__(self):
        # convert_charrefs=True (the stdlib default, spelled out on purpose):
        # handle_data() then receives text whose character references are
        # already decoded, so it must not be unescaped a second time.
        super().__init__(convert_charrefs=True)
        self._parts = []
        # Number of currently open skip tags. A counter (instead of a bool)
        # keeps skipping active until the outermost skip tag is closed, e.g.
        # for <template><style>...</style>still hidden</template>.
        self._skip_depth = 0
        self._skip_tags = {"script", "style", "template"}

    def handle_starttag(self, tag, attrs):
        """
        Enters a skipped region when the tag is listed in _skip_tags.

        Args:
            tag (str): The tag name.
            attrs (list): A list of (attribute, value) pairs.
        """
        if tag in self._skip_tags:
            self._skip_depth += 1

    def handle_endtag(self, tag):
        """
        Leaves a skipped region when a tag listed in _skip_tags is closed.

        Args:
            tag (str): The tag name.
        """
        # Guard against stray closing tags so the counter never goes negative.
        if tag in self._skip_tags and self._skip_depth > 0:
            self._skip_depth -= 1

    def handle_data(self, data):
        """
        Handles text nodes. Adds them to the result unless they are within a skip tag.

        Args:
            data (str): The text data, already unescaped by HTMLParser.
        """
        if self._skip_depth == 0:
            self._parts.append(data)

    def get_strings(self, strip: bool):
        """
        Yields all collected visible text fragments.

        Args:
            strip (bool): Whether to strip leading/trailing whitespace from each
                fragment. When True, fragments that become empty are dropped so
                they do not produce stray separators in get_text().

        Yields:
            str: Visible text fragments.
        """
        for text in self._parts:
            if not strip:
                yield text
                continue
            stripped = text.strip()
            if stripped:
                yield stripped

    def get_text(self, separator: str, strip: bool) -> str:
        """
        Returns all visible text.

        Args:
            separator (str): String inserted between extracted text fragments.
            strip (bool): Whether to strip whitespace from each fragment.

        Returns:
            str: The visible text.
        """
        return separator.join(self.get_strings(strip=strip))
84+
85+
86+
def extract_text_from_html(html: str, separator: str = '', strip: bool = False) -> str:
    """
    Extracts visible text content from an HTML string.

    Args:
        html (str): The HTML string to extract text from.
        separator (str, optional): String inserted between extracted text fragments. Defaults to ''.
        strip (bool, optional): Whether to strip whitespace from text fragments. Defaults to False.

    Returns:
        str: The extracted visible text.
    """
    parser = TextExtractor()
    parser.feed(html)
    # feed() may keep trailing text buffered (for instance when it ends with an
    # unterminated '&', as in 'AT&T') while waiting for more input. close()
    # tells the parser the document is complete and flushes that text.
    parser.close()
    return parser.get_text(separator=separator, strip=strip)
101+
102+
13103
def decode_base64_to_bytes(image: str) -> bytes:
14104
"""
15105
Decodes a base64 image string to bytes.

pyproject.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ python = "^3.10"
1313
websockets = "^13.1"
1414
aiohttp = "^3.9.5"
1515
aiofiles = "^23.2.1"
16-
beautifulsoup4 = "^4.12.3"
1716
mkdocstrings = "^0.29.1"
1817

1918

tests/test_utils.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
has_return_outside_function,
1414
is_script_already_function,
1515
validate_browser_paths,
16+
extract_text_from_html,
1617
)
1718

1819

@@ -401,3 +402,26 @@ def test_has_return_outside_function_arrow_function(self):
401402
'''
402403
assert has_return_outside_function(script) is False
403404

405+
def test_extract_text_without_strip_without_separator(self):
    """Without strip or separator, fragments are concatenated untouched."""
    html = ('<div>Hello <span> world </span><script>alert(1)</script><style>body { color: red; }</style>'
            '<template>hidden</template></div>')
    result = extract_text_from_html(html)
    # The fragments are 'Hello ' and ' world '; joined with '' they keep both
    # the trailing and the leading space, hence two spaces between the words.
    assert result == 'Hello  world '
410+
411+
def test_extract_text_with_strip_without_separator(self):
    """Stripping every fragment and joining with '' fuses the two words."""
    markup = (
        '<div>Hello <span> world </span><script>alert(1)</script><style>body { color: red; }</style>'
        '<template>hidden</template></div>'
    )
    assert extract_text_from_html(markup, strip=True) == 'Helloworld'
416+
417+
def test_extract_text_without_strip_with_separator(self):
    """The separator is placed between fragments whose whitespace is kept."""
    markup = (
        '<div>Hello <span> world </span><script>alert(1)</script><style>body { color: red; }</style>'
        '<template>hidden</template></div>'
    )
    extracted = extract_text_from_html(markup, separator="/")
    expected = 'Hello / world '
    assert extracted == expected
422+
423+
def test_extract_text_with_strip_with_separator(self):
    """Stripped fragments are joined by the separator alone."""
    markup = (
        '<div>Hello <span> world </span><script>alert(1)</script><style>body { color: red; }</style>'
        '<template>hidden</template></div>'
    )
    extracted = extract_text_from_html(markup, strip=True, separator="/")
    expected = 'Hello/world'
    assert extracted == expected

0 commit comments

Comments
 (0)