88from publicsuffixlist import PublicSuffixList
99from urllib .parse import urlparse , urlunparse
1010
11+ ADDED_TLD = {'b32.i2p' }
12+
1113def _ensure_bytes (binary ):
1214 if isinstance (binary , bytes ):
1315 return binary
@@ -38,6 +40,8 @@ def __init__(self):
3840 self ._url = None
3941 self ._retval = {}
4042 self .ip_as_host = ''
43+ self .host = None
44+ self .tld = None
4145
4246 def _clear (self ):
4347 self .decoded = False
@@ -52,12 +56,12 @@ def decode(self, url):
5256 """
5357 self ._clear ()
# Give the URL a default scheme when '//' does not appear in its first
# 10 characters: http:// for .onion / .i2p hosts, https:// otherwise.
if isinstance(url, bytes):
    # bytes input is handled entirely in this branch: testing a str
    # needle against a bytes URL ('//' in b'...') raises TypeError.
    if b'//' not in url[:10]:
        # Each marker needs its own `in url` test; a bare non-empty bytes
        # literal is always truthy and would select http:// for every URL.
        if b'.onion' in url or b'.i2p' in url:
            url = b'http://' + url
        else:
            url = b'https://' + url
elif '//' not in url[:10]:
    if '.onion' in url or '.i2p' in url:
        url = f'http://{url}'
    else:
        url = f'https://{url}'
@@ -84,6 +88,7 @@ def decode(self, url):
8488
8589 self .decoded = True
8690 self ._retval = {}
91+ self .host = self .get_host ()
8792
8893 @property
8994 def url (self ):
@@ -123,6 +128,9 @@ def get_host(self):
123128 if not self .decoded or not self ._url :
124129 raise UrlNotDecoded ("You must call pslfaup.decode() first" )
125130
131+ if self .host :
132+ return self .host
133+
126134 if self ._url .hostname is None :
127135 return None
128136 elif self ._url .hostname .isascii ():
@@ -134,8 +142,11 @@ def get_domain(self):
134142 if not self .decoded or not self ._url :
135143 raise UrlNotDecoded ("You must call pslfaup.decode() first" )
136144
137- if self .get_host () is not None and not self .ip_as_host :
138- return self .psl .privatesuffix (self .get_host ())
145+ if self .host is not None and not self .ip_as_host :
146+ domain = self .host [:- (len (self .get_tld ()) + 1 )].rsplit ('.' , 1 )[- 1 ]
147+ if domain :
148+ return f'{ domain } .{ self .tld } '
149+ # return self.psl.privatesuffix(self.host)
139150 return None
140151
141152 def get_domain_without_tld (self ):
@@ -144,26 +155,32 @@ def get_domain_without_tld(self):
144155
145156 if self .get_tld () is not None and not self .ip_as_host :
146157 if domain := self .get_domain ():
147- return domain .rsplit (self .get_tld () , 1 )[0 ].rstrip ('.' )
158+ return domain .rsplit (self .tld , 1 )[0 ].rstrip ('.' )
148159 return None
149160
def get_subdomain(self):
    """
    Return the part of the host that precedes the domain.

    :return: the subdomain with its trailing dot removed, or None when there
             is no host, ``self.ip_as_host`` is set, no domain is found, or
             nothing precedes the domain
    :raises UrlNotDecoded: if decode() has not been called successfully
    """
    if not (self.decoded and self._url):
        raise UrlNotDecoded("You must call pslfaup.decode() first")

    host = self.host
    if host is None or self.ip_as_host:
        return None

    registered = self.get_domain()
    if not registered or not host or registered not in host:
        return None

    # Cut at the last occurrence of the domain and drop the separating dot;
    # an empty remainder means the host has no subdomain.
    leading = host.rsplit(registered, 1)[0].rstrip('.')
    return leading or None
160170
def get_tld(self):
    """
    Return the TLD (public suffix) of the current host, caching it in ``self.tld``.

    Suffixes listed in ``ADDED_TLD`` are checked first; any other host is
    resolved with ``self.psl.publicsuffix``.

    :return: the suffix string, or None when ``self.host`` is None or
             ``self.ip_as_host`` is set
    """
    if self.tld:
        return self.tld

    if self.host is None or self.ip_as_host:
        return None

    # Longest entry first so the most specific suffix wins if entries overlap.
    for added_tld in sorted(ADDED_TLD, key=len, reverse=True):
        # Match whole labels only: 'xb32.i2p' must not count as 'b32.i2p'.
        if self.host == added_tld or self.host.endswith(f'.{added_tld}'):
            self.tld = added_tld
            return added_tld

    self.tld = self.psl.publicsuffix(self.host)
    return self.tld
168185
169186 def get_port (self ):
@@ -198,7 +215,7 @@ def get(self):
198215 self ._retval ["domain" ] = self .get_domain ()
199216 # self._retval["domain_without_tld"] = self.get_domain_without_tld()
200217 self ._retval ["subdomain" ] = self .get_subdomain ()
201- self ._retval ["host" ] = self .get_host ()
218+ self ._retval ["host" ] = self .host
202219 self ._retval ["port" ] = self .get_port ()
203220 self ._retval ["resource_path" ] = self .get_resource_path ()
204221 self ._retval ["query_string" ] = self .get_query_string ()
@@ -224,4 +241,4 @@ def unparse_url(url):
224241
225242
226243if __name__ == '__main__' :
227- print (unparse_url ('TEST.onion ' ))
244+ print (unparse_url ('http://www. TEST.github.io ' ))
0 commit comments