Skip to content

Commit 90c52c4

Browse files
committed
feat(parser): replacing tldextract with tld library
This might break users' existing adaptive data for websites, BUT: 1. tld uses ~3.7x less memory during extraction operations (1.5 MB vs 5.7 MB). 2. tld uses ~56% less memory on import (5.2 MB vs 11.9 MB). 3. tld has zero dependencies (vs 3 for tldextract). In return, tld is ~30 ms slower when extracting 5000 URLs, which is negligible. Also, tld's type hints aren't always accurate, but that's fine; I corrected them on our side.
1 parent 5dbbd84 commit 90c52c4

File tree

4 files changed

+24
-11
lines changed

4 files changed

+24
-11
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ dependencies = [
6060
"lxml>=6.0.2",
6161
"cssselect>=1.3.0",
6262
"orjson>=3.11.5",
63-
"tldextract>=5.3.1",
63+
"tld>=0.13.1",
6464
"w3lib>=2.3.0"
6565
]
6666

scrapling/cli.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,9 @@ def install(force): # pragma: no cover
128128
],
129129
"Playwright dependencies",
130130
)
131+
from tld.utils import update_tld_names
132+
133+
update_tld_names(fail_silently=True)
131134
# if no errors raised by the above commands, then we add the below file
132135
__PACKAGE_DIR__.joinpath(".scrapling_dependencies_installed").touch()
133136
else:

scrapling/core/storage.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from lxml.html import HtmlElement
99

1010
from scrapling.core.utils import _StorageTools, log
11-
from scrapling.core._types import Dict, Optional, Any
11+
from scrapling.core._types import Dict, Optional, Any, cast
1212

1313

1414
class StorageSystemMixin(ABC): # pragma: no cover
@@ -17,18 +17,24 @@ def __init__(self, url: Optional[str] = None):
1717
"""
1818
:param url: URL of the website we are working on to separate it from other websites data
1919
"""
20-
self.url = url
20+
# Make the url in lowercase to handle this edge case until it's updated: https://github.com/barseghyanartur/tld/issues/124
21+
self.url = url.lower() if (url and isinstance(url, str)) else None
2122

2223
@lru_cache(64, typed=True)
2324
def _get_base_url(self, default_value: str = "default") -> str:
24-
if not self.url or not isinstance(self.url, str):
25+
if not self.url:
2526
return default_value
2627

2728
try:
28-
from tldextract import extract as tld
29+
from tld import get_tld, Result
2930

30-
extracted = tld(self.url)
31-
return extracted.top_domain_under_public_suffix or extracted.domain or default_value
31+
# Fixing the inaccurate return type hint in `get_tld`
32+
extracted: Result | None = cast(
33+
Result, get_tld(self.url, as_object=True, fail_silently=True, fix_protocol=True)
34+
)
35+
if not extracted:
36+
return default_value
37+
return extracted.fld or extracted.domain or default_value
3238
except AttributeError:
3339
return default_value
3440

scrapling/engines/toolbelt/fingerprints.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,11 @@
55
from functools import lru_cache
66
from platform import system as platform_system
77

8-
from tldextract import extract
8+
from tld import get_tld, Result
99
from browserforge.headers import Browser, HeaderGenerator
1010
from browserforge.headers.generator import SUPPORTED_OPERATING_SYSTEMS
1111

12-
from scrapling.core._types import Dict, Literal, Tuple
12+
from scrapling.core._types import Dict, Literal, Tuple, cast
1313

1414
__OS_NAME__ = platform_system()
1515
OSName = Literal["linux", "macos", "windows"]
@@ -28,11 +28,15 @@ def generate_convincing_referer(url: str) -> str | None:
2828
:param url: The URL you are about to fetch.
2929
:return: Google's search URL of the domain name, or None for localhost/IP addresses
3030
"""
31-
extracted = extract(url)
31+
# Fixing the inaccurate return type hint in `get_tld`
32+
extracted: Result | None = cast(Result, get_tld(url, as_object=True, fail_silently=True))
33+
if not extracted:
34+
return None
35+
3236
website_name = extracted.domain
3337

3438
# Skip generating referer for localhost, IP addresses, or when there's no valid domain
35-
if not website_name or not extracted.suffix or website_name in ("localhost", "127.0.0.1", "::1"):
39+
if not website_name or not extracted.tld or website_name in ("localhost", "127.0.0.1", "::1"):
3640
return None
3741

3842
# Check if it's an IP address (simple check for IPv4)

0 commit comments

Comments (0)