1- from collections .abc import Awaitable , Callable
1+ from collections .abc import Awaitable
22from enum import IntEnum , unique
3+ from functools import partial
34from http import HTTPStatus
45import itertools
56import re
7+ from typing import Any , Protocol
68from urllib .parse import ParseResult , quote , urlparse
79
10+ from tld import get_fld
11+
812from bookmarkmgr .cronet import RequestError , Response
913from bookmarkmgr .scraper import Page
1014
2024 HTTPStatus .GONE .value ,
2125}
2226
# Lenient first-level-domain lookup: ``fail_silently`` makes ``get_fld`` return
# ``None`` instead of raising when no known TLD is found, and ``fix_protocol``
# lets it accept bare hostnames that carry no scheme.
_get_fld_lax = partial(
    get_fld,
    fail_silently=True,
    fix_protocol=True,
)
32+
2333
2434@unique
2535class LinkStatus (IntEnum ):
@@ -78,21 +88,54 @@ async def check_link_status(
7888 return link_status , error
7989
8090
def _fix_url_quoting(
    url: ParseResult,
    **_: Any,
) -> ParseResult:
    """Return *url* with its path percent-encoded.

    Extra keyword arguments are accepted and ignored so that this fixer can
    be called with the same keywords as every other fixer.
    """
    return url._replace(path=quote(url.path))
8396
8497
def _fix_url_subdomain(
    url: ParseResult,
    redirect_url: ParseResult,
) -> ParseResult:
    """Adopt the redirect's network location when only the subdomain differs.

    *url* is returned unchanged when either URL has no hostname, when both
    hostnames are already equal, or when the two hostnames do not share the
    same first-level domain. Otherwise the ``netloc`` of *redirect_url*
    replaces the one in *url*.
    """
    if (
        url.hostname is None
        or redirect_url.hostname is None
        or url.hostname == redirect_url.hostname
    ):
        return url

    fld = _get_fld_lax(url.hostname)

    # The lax lookup yields None for hosts without a recognised TLD; two
    # unresolvable hosts must not be mistaken for the same domain.
    if fld is not None and fld == _get_fld_lax(redirect_url.hostname):
        return url._replace(netloc=redirect_url.netloc)

    return url
113+
114+
def _fix_url_trailing_slash(
    url: ParseResult,
    **_: Any,
) -> ParseResult:
    """Toggle the trailing slash on the path of *url*.

    A path ending in ``/`` has all of its trailing slashes stripped; any other
    path (including the empty one) gets a single ``/`` appended. Extra keyword
    arguments are accepted and ignored so that this fixer can be called with
    the same keywords as every other fixer.
    """
    return url._replace(
        path=(
            url.path.rstrip("/") if url.path.endswith("/") else f"{url.path}/"
        ),
    )
91124
92125
class _FixerCallable(Protocol):
    """Structural type of the URL fixers stored in ``_URL_FIXERS``.

    A fixer takes the URL being repaired positionally and the redirect target
    as the ``redirect_url`` keyword, and returns the (possibly modified) URL.
    """

    def __call__(
        self,
        url: ParseResult,
        *,
        redirect_url: ParseResult,
    ) -> ParseResult: ...
133+
134+
# Fixers applied one after another, each with the parsed redirect URL passed
# as ``redirect_url``, to check whether a stored URL can be turned into its
# redirect target.
_URL_FIXERS: list[_FixerCallable] = [
    _fix_url_quoting,
    _fix_url_trailing_slash,
    _fix_url_subdomain,
]
97140
98141
@@ -113,6 +156,7 @@ def get_fixed_url(response: Response, url: str) -> None | str:
113156 for fixer in fixer_combination :
114157 fixed_parsed_url = fixer (
115158 fixed_parsed_url ,
159+ redirect_url = parsed_redirect_url ,
116160 )
117161
118162 if fixed_parsed_url == parsed_redirect_url :
0 commit comments