Skip to content

Commit 8cdb183

Browse files
committed
bookmarkmgr: Fix URLs with outdated subdomains
1 parent 0de1513 commit 8cdb183

File tree

3 files changed

+61
-5
lines changed

3 files changed

+61
-5
lines changed

bookmarkmgr/bookmarkmgr/checks/link_status.py

Lines changed: 48 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,14 @@
1-
from collections.abc import Awaitable, Callable
1+
from collections.abc import Awaitable
22
from enum import IntEnum, unique
3+
from functools import partial
34
from http import HTTPStatus
45
import itertools
56
import re
7+
from typing import Any, Protocol
68
from urllib.parse import ParseResult, quote, urlparse
79

10+
from tld import get_fld
11+
812
from bookmarkmgr.cronet import RequestError, Response
913
from bookmarkmgr.scraper import Page
1014

@@ -20,6 +24,12 @@
2024
HTTPStatus.GONE.value,
2125
}
2226

27+
# Lenient variant of ``tld.get_fld``: ``fail_silently=True`` makes it return
# ``None`` instead of raising when no domain can be determined, and
# ``fix_protocol=True`` lets it accept bare hostnames without a scheme.
_get_fld_lax = partial(get_fld, fail_silently=True, fix_protocol=True)
32+
2333

2434
@unique
2535
class LinkStatus(IntEnum):
@@ -78,21 +88,54 @@ async def check_link_status(
7888
return link_status, error
7989

8090

81-
def _fix_url_quoting(
    url: ParseResult,
    **_: Any,
) -> ParseResult:
    """Return *url* with the unsafe characters of its path percent-encoded.

    Percent-escapes already present in the path are preserved: ``%`` is
    treated as safe so that ``%20`` is not re-encoded into ``%2520``, which
    would make a partially-encoded path impossible to match against the
    redirect target.

    Extra keyword arguments (such as ``redirect_url``) are accepted and
    ignored so that every fixer can be invoked with the same call signature.
    """
    return url._replace(path=quote(url.path, safe="/%"))
8396

8497

85-
def _fix_url_trailing_slash(url: ParseResult) -> ParseResult:
98+
def _fix_url_subdomain(
    url: ParseResult,
    redirect_url: ParseResult,
) -> ParseResult:
    """Adopt the redirect's network location when only the subdomain differs.

    Returns *url* unchanged when either URL has no hostname, when both
    hostnames are already identical, or when the two hosts do not share the
    same first-level domain. Otherwise returns *url* with its ``netloc``
    replaced by the one from *redirect_url*.
    """
    if (
        url.hostname is None
        or redirect_url.hostname is None
        or url.hostname == redirect_url.hostname
    ):
        return url

    fld = _get_fld_lax(url.hostname)

    # The lax lookup yields None when no domain can be determined. Two
    # unrelated hosts whose lookups both fail must not be treated as the
    # same domain merely because ``None == None``.
    if fld is None or fld != _get_fld_lax(redirect_url.hostname):
        return url

    return url._replace(netloc=redirect_url.netloc)
113+
114+
115+
def _fix_url_trailing_slash(
    url: ParseResult,
    **_: Any,
) -> ParseResult:
    """Toggle the trailing slash of the path of *url*.

    A path that ends with ``/`` has all of its trailing slashes stripped;
    any other path (including an empty one) gets a single ``/`` appended.
    Extra keyword arguments are accepted and ignored.
    """
    current_path = url.path

    if current_path.endswith("/"):
        toggled_path = current_path.rstrip("/")
    else:
        toggled_path = f"{current_path}/"

    return url._replace(path=toggled_path)
91124

92125

93-
_URL_FIXERS: list[Callable[[ParseResult], ParseResult]] = [
126+
class _FixerCallable(Protocol):
    """Call signature shared by the URL fixer functions.

    A fixer takes the parsed URL being fixed positionally and the parsed
    redirect target as the keyword-only ``redirect_url`` argument, and
    returns a parsed URL.
    """

    def __call__(
        self, url: ParseResult, *, redirect_url: ParseResult
    ) -> ParseResult:
        """Return the (possibly modified) parsed URL."""
        ...
133+
134+
135+
# Registry of the available URL fixers. Each entry follows the
# ``_FixerCallable`` signature; the order of the entries is kept as-is.
_URL_FIXERS: list[_FixerCallable] = [
    _fix_url_quoting,
    _fix_url_trailing_slash,
    _fix_url_subdomain,
]
97140

98141

@@ -113,6 +156,7 @@ def get_fixed_url(response: Response, url: str) -> None | str:
113156
for fixer in fixer_combination:
114157
fixed_parsed_url = fixer(
115158
fixed_parsed_url,
159+
redirect_url=parsed_redirect_url,
116160
)
117161

118162
if fixed_parsed_url == parsed_redirect_url:

bookmarkmgr/poetry.lock

Lines changed: 12 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bookmarkmgr/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ cffi = "^1.17.1"
3737
enlighten = "^1.13.0"
3838
overrides = "^7.7.0"
3939
python = "^3.12"
40+
tld = "^0.13"
4041
yarl = "^1.18.3"
4142

4243
[tool.poetry.group.dev.dependencies]

0 commit comments

Comments
 (0)