Skip to content

Commit 90c52c4

Browse files
committed
feat(parser): replacing tldextract with tld library
This might break users' existing adaptive data for websites, BUT: 1. tld uses ~3.7x less memory during extraction operations (1.5 MB vs 5.7 MB). 2. tld uses ~56% less memory on import (5.2 MB vs 11.9 MB). 3. tld has zero dependencies (vs 3 for tldextract). In return, tld is ~30 ms slower when extracting 5000 URLs, which is negligible. Also, tld's type hints aren't always accurate, but that's fine; I corrected them on our side.
1 parent 5dbbd84 commit 90c52c4

File tree

4 files changed

+24
-11
lines changed

4 files changed

+24
-11
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ dependencies = [
6060
"lxml>=6.0.2",
6161
"cssselect>=1.3.0",
6262
"orjson>=3.11.5",
63-
"tldextract>=5.3.1",
63+
"tld>=0.13.1",
6464
"w3lib>=2.3.0"
6565
]
6666

scrapling/cli.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,9 @@ def install(force): # pragma: no cover
128128
],
129129
"Playwright dependencies",
130130
)
131+
from tld.utils import update_tld_names
132+
133+
update_tld_names(fail_silently=True)
131134
# if no errors raised by the above commands, then we add the below file
132135
__PACKAGE_DIR__.joinpath(".scrapling_dependencies_installed").touch()
133136
else:

scrapling/core/storage.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from lxml.html import HtmlElement
99

1010
from scrapling.core.utils import _StorageTools, log
11-
from scrapling.core._types import Dict, Optional, Any
11+
from scrapling.core._types import Dict, Optional, Any, cast
1212

1313

1414
class StorageSystemMixin(ABC): # pragma: no cover
@@ -17,18 +17,24 @@ def __init__(self, url: Optional[str] = None):
1717
"""
1818
:param url: URL of the website we are working on to separate it from other websites data
1919
"""
20-
self.url = url
20+
# Make the url in lowercase to handle this edge case until it's updated: https://github.com/barseghyanartur/tld/issues/124
21+
self.url = url.lower() if (url and isinstance(url, str)) else None
2122

2223
@lru_cache(64, typed=True)
2324
def _get_base_url(self, default_value: str = "default") -> str:
24-
if not self.url or not isinstance(self.url, str):
25+
if not self.url:
2526
return default_value
2627

2728
try:
28-
from tldextract import extract as tld
29+
from tld import get_tld, Result
2930

30-
extracted = tld(self.url)
31-
return extracted.top_domain_under_public_suffix or extracted.domain or default_value
31+
# Fixing the inaccurate return type hint in `get_tld`
32+
extracted: Result | None = cast(
33+
Result, get_tld(self.url, as_object=True, fail_silently=True, fix_protocol=True)
34+
)
35+
if not extracted:
36+
return default_value
37+
return extracted.fld or extracted.domain or default_value
3238
except AttributeError:
3339
return default_value
3440

scrapling/engines/toolbelt/fingerprints.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,11 @@
55
from functools import lru_cache
66
from platform import system as platform_system
77

8-
from tldextract import extract
8+
from tld import get_tld, Result
99
from browserforge.headers import Browser, HeaderGenerator
1010
from browserforge.headers.generator import SUPPORTED_OPERATING_SYSTEMS
1111

12-
from scrapling.core._types import Dict, Literal, Tuple
12+
from scrapling.core._types import Dict, Literal, Tuple, cast
1313

1414
__OS_NAME__ = platform_system()
1515
OSName = Literal["linux", "macos", "windows"]
@@ -28,11 +28,15 @@ def generate_convincing_referer(url: str) -> str | None:
2828
:param url: The URL you are about to fetch.
2929
:return: Google's search URL of the domain name, or None for localhost/IP addresses
3030
"""
31-
extracted = extract(url)
31+
# Fixing the inaccurate return type hint in `get_tld`
32+
extracted: Result | None = cast(Result, get_tld(url, as_object=True, fail_silently=True))
33+
if not extracted:
34+
return None
35+
3236
website_name = extracted.domain
3337

3438
# Skip generating referer for localhost, IP addresses, or when there's no valid domain
35-
if not website_name or not extracted.suffix or website_name in ("localhost", "127.0.0.1", "::1"):
39+
if not website_name or not extracted.tld or website_name in ("localhost", "127.0.0.1", "::1"):
3640
return None
3741

3842
# Check if it's an IP address (simple check for IPv4)

0 commit comments

Comments (0)