Skip to content

Commit d7a9d05

Browse files
committed
chg: [psl_faup] add support for additional TLDs like b32.i2p + improve performance
1 parent e9e34ea commit d7a9d05

File tree

1 file changed

+33
-16
lines changed

1 file changed

+33
-16
lines changed

bin/lib/psl_faup.py

Lines changed: 33 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
from publicsuffixlist import PublicSuffixList
99
from urllib.parse import urlparse, urlunparse
1010

11+
ADDED_TLD = {'b32.i2p'}
12+
1113
def _ensure_bytes(binary):
1214
if isinstance(binary, bytes):
1315
return binary
@@ -38,6 +40,8 @@ def __init__(self):
3840
self._url = None
3941
self._retval = {}
4042
self.ip_as_host = ''
43+
self.host = None
44+
self.tld = None
4145

4246
def _clear(self):
4347
self.decoded = False
@@ -52,12 +56,12 @@ def decode(self, url):
5256
"""
5357
self._clear()
5458
if isinstance(url, bytes) and b'//' not in url[:10]:
55-
if b'.onion' in url:
59+
if b'.onion' in url or b'.i2p':
5660
url = b'http://' + url
5761
else:
5862
url = b'https://' + url
5963
elif '//' not in url[:10]:
60-
if '.onion' in url:
64+
if '.onion' in url or '.i2p' in url:
6165
url = f'http://{url}'
6266
else:
6367
url = f'https://{url}'
@@ -84,6 +88,7 @@ def decode(self, url):
8488

8589
self.decoded = True
8690
self._retval = {}
91+
self.host = self.get_host()
8792

8893
@property
8994
def url(self):
@@ -123,6 +128,9 @@ def get_host(self):
123128
if not self.decoded or not self._url:
124129
raise UrlNotDecoded("You must call pslfaup.decode() first")
125130

131+
if self.host:
132+
return self.host
133+
126134
if self._url.hostname is None:
127135
return None
128136
elif self._url.hostname.isascii():
@@ -134,8 +142,11 @@ def get_domain(self):
134142
if not self.decoded or not self._url:
135143
raise UrlNotDecoded("You must call pslfaup.decode() first")
136144

137-
if self.get_host() is not None and not self.ip_as_host:
138-
return self.psl.privatesuffix(self.get_host())
145+
if self.host is not None and not self.ip_as_host:
146+
domain = self.host[:-(len(self.get_tld()) + 1)].rsplit('.', 1)[-1]
147+
if domain:
148+
return f'{domain}.{self.tld}'
149+
# return self.psl.privatesuffix(self.host)
139150
return None
140151

141152
def get_domain_without_tld(self):
@@ -144,26 +155,32 @@ def get_domain_without_tld(self):
144155

145156
if self.get_tld() is not None and not self.ip_as_host:
146157
if domain := self.get_domain():
147-
return domain.rsplit(self.get_tld(), 1)[0].rstrip('.')
158+
return domain.rsplit(self.tld, 1)[0].rstrip('.')
148159
return None
149160

150161
def get_subdomain(self):
151162
if not self.decoded or not self._url:
152163
raise UrlNotDecoded("You must call pslfaup.decode() first")
153164

154-
if self.get_host() is not None and not self.ip_as_host:
165+
if self.host is not None and not self.ip_as_host:
155166
domain = self.get_domain()
156-
host = self.get_host()
157-
if domain and host and domain in host:
158-
return host.rsplit(domain, 1)[0].rstrip('.') or None
167+
if domain and self.host and domain in self.host:
168+
return self.host.rsplit(domain, 1)[0].rstrip('.') or None
159169
return None
160170

161171
def get_tld(self):
162-
if not self.decoded or not self._url:
163-
raise UrlNotDecoded("You must call pslfaup.decode() first")
164-
165-
if self.get_host() is not None and not self.ip_as_host:
166-
return self.psl.publicsuffix(self.get_host())
172+
if self.tld:
173+
return self.tld
174+
175+
if self.host is not None and not self.ip_as_host:
176+
for added_tld in ADDED_TLD:
177+
if self.host.endswith(added_tld):
178+
print('added')
179+
self.tld = added_tld
180+
return added_tld
181+
print('standard')
182+
self.tld = self.psl.publicsuffix(self.host)
183+
return self.tld
167184
return None
168185

169186
def get_port(self):
@@ -198,7 +215,7 @@ def get(self):
198215
self._retval["domain"] = self.get_domain()
199216
# self._retval["domain_without_tld"] = self.get_domain_without_tld()
200217
self._retval["subdomain"] = self.get_subdomain()
201-
self._retval["host"] = self.get_host()
218+
self._retval["host"] = self.host
202219
self._retval["port"] = self.get_port()
203220
self._retval["resource_path"] = self.get_resource_path()
204221
self._retval["query_string"] = self.get_query_string()
@@ -224,4 +241,4 @@ def unparse_url(url):
224241

225242

226243
if __name__ == '__main__':
227-
print(unparse_url('TEST.onion'))
244+
print(unparse_url('http://www.TEST.github.io'))

0 commit comments

Comments
 (0)