Merge pull request #161 from Marcusfam-RB/Refactor/remove-bs4-dependency

thalissonvs · web-flow · commit 6511d0d46c75 · 2025-06-18T00:06:29.000-03:00
Refactor/remove bs4 dependency
diff --git a/poetry.lock b/poetry.lock
diff --git a/pydoll/elements/web_element.py b/pydoll/elements/web_element.py
@@ -3,7 +3,6 @@
 from typing import Optional
 
 import aiofiles
-from bs4 import BeautifulSoup
 
 from pydoll.commands import (
     DomCommands,
@@ -34,7 +33,10 @@
 from pydoll.protocol.dom.types import Quad
 from pydoll.protocol.page.responses import CaptureScreenshotResponse
 from pydoll.protocol.page.types import Viewport
-from pydoll.utils import decode_base64_to_bytes
+from pydoll.utils import (
+    decode_base64_to_bytes,
+    extract_text_from_html,
+)
 
 
 class WebElement(FindElementsMixin):  # noqa: PLR0904
@@ -99,8 +101,7 @@ def is_enabled(self) -> bool:
     async def text(self) -> str:
         """Visible text content of the element."""
         outer_html = await self.inner_html
-        soup = BeautifulSoup(outer_html, 'html.parser')
-        return soup.get_text(strip=True)
+        return extract_text_from_html(outer_html, strip=True)
 
     @property
     async def bounds(self) -> Quad:
diff --git a/pydoll/utils.py b/pydoll/utils.py
@@ -2,6 +2,8 @@
 import logging
 import os
 import re
+from html import unescape
+from html.parser import HTMLParser
 
 import aiohttp
 
@@ -10,6 +12,94 @@
 logger = logging.getLogger(__name__)
 
 
+class TextExtractor(HTMLParser):
+    """
+    HTML parser that collects the visible text of a document.
+
+    Text inside the tags listed in _skip_tags is ignored, including when those
+    tags are nested inside one another; every other text node is kept in order.
+    """
+    def __init__(self):
+        super().__init__()
+        self._parts = []
+        self._skip_depth = 0  # number of currently open skip tags
+        self._skip_tags = {"script", "style", "template"}
+
+    def handle_starttag(self, tag, attrs):
+        """
+        Enters a skipped region when the tag is listed in _skip_tags.
+
+        Args:
+            tag (str): The tag name.
+            attrs (list): A list of (attribute, value) pairs.
+        """
+        if tag in self._skip_tags:
+            self._skip_depth += 1
+
+    def handle_endtag(self, tag):
+        """
+        Leaves a skipped region; a stray closing tag never drops the depth below zero.
+
+        Args:
+            tag (str): The tag name.
+        """
+        if tag in self._skip_tags and self._skip_depth:
+            self._skip_depth -= 1
+
+    def handle_data(self, data):
+        """
+        Stores a text node unless it lies inside a skipped tag.
+
+        Args:
+            data (str): The text data, with character references already decoded by HTMLParser.
+        """
+        if not self._skip_depth:
+            self._parts.append(data)
+
+    def get_strings(self, strip: bool):
+        """
+        Yields the collected visible text fragments in document order.
+
+        Args:
+            strip (bool): Strip each fragment and drop the ones that end up empty.
+
+        Yields:
+            str: Visible text fragments.
+        """
+        fragments = filter(None, map(str.strip, self._parts)) if strip else self._parts
+        yield from fragments
+
+    def get_text(self, separator: str, strip: bool) -> str:
+        """
+        Returns all visible text.
+
+        Args:
+            separator (str): String inserted between extracted text fragments.
+            strip (bool): Whether to strip whitespace from each fragment.
+
+        Returns:
+            str: The visible text.
+        """
+        return separator.join(self.get_strings(strip=strip))
+
+
+def extract_text_from_html(html: str, separator: str = '', strip: bool = False) -> str:
+    """
+    Extracts visible text content from an HTML string.
+
+    Args:
+        html (str): The HTML string to extract text from.
+        separator (str, optional): String inserted between extracted text fragments. Defaults to ''.
+        strip (bool, optional): Whether to strip whitespace from text fragments. Defaults to False.
+
+    Returns:
+        str: The extracted visible text.
+    """
+    parser = TextExtractor()
+    parser.feed(html)
+    parser.close()  # flush trailing text that feed() may hold back (e.g. a dangling '&')
+    return parser.get_text(separator=separator, strip=strip)
+
+
 def decode_base64_to_bytes(image: str) -> bytes:
     """
     Decodes a base64 image string to bytes.
diff --git a/pyproject.toml b/pyproject.toml
@@ -13,7 +13,6 @@ python = "^3.10"
 websockets = "^13.1"
 aiohttp = "^3.9.5"
 aiofiles = "^23.2.1"
-beautifulsoup4 = "^4.12.3"
 mkdocstrings = "^0.29.1"
 
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -13,6 +13,7 @@
     has_return_outside_function,
     is_script_already_function,
     validate_browser_paths,
+    extract_text_from_html,
 )
 
 
@@ -401,3 +402,26 @@ def test_has_return_outside_function_arrow_function(self):
         '''
         assert has_return_outside_function(script) is False
 
+    def test_extract_text_without_strip_without_separator(self):
+        """Default arguments keep fragment whitespace and join fragments directly."""
+        markup = ('<div>Hello <span> world </span><script>alert(1)</script>'
+                  '<style>body { color: red; }</style><template>hidden</template></div>')
+        assert extract_text_from_html(markup) == 'Hello  world '
+
+    def test_extract_text_with_strip_without_separator(self):
+        """With strip=True each fragment is trimmed before being joined directly."""
+        markup = ('<div>Hello <span> world </span><script>alert(1)</script>'
+                  '<style>body { color: red; }</style><template>hidden</template></div>')
+        assert extract_text_from_html(markup, strip=True) == 'Helloworld'
+
+    def test_extract_text_without_strip_with_separator(self):
+        """A separator is placed between fragments while their whitespace is kept."""
+        markup = ('<div>Hello <span> world </span><script>alert(1)</script>'
+                  '<style>body { color: red; }</style><template>hidden</template></div>')
+        assert extract_text_from_html(markup, separator="/") == 'Hello / world '
+
+    def test_extract_text_with_strip_with_separator(self):
+        """Trimmed fragments are joined with the given separator."""
+        markup = ('<div>Hello <span> world </span><script>alert(1)</script>'
+                  '<style>body { color: red; }</style><template>hidden</template></div>')
+        assert extract_text_from_html(markup, strip=True, separator="/") == 'Hello/world'