Commit 2048571

chg: [faup] replace and remove faup

1 parent db1930b

12 files changed, +253 -99 lines

bin/lib/crawlers.py

Lines changed: 5 additions & 17 deletions
@@ -47,6 +47,7 @@
 from lib.objects import HHHashs
 from lib.objects.Items import Item
 from lib import Tag
+from lib import psl_faup

 config_loader = ConfigLoader()
 r_db = config_loader.get_db_conn("Kvrocks_DB")
@@ -60,8 +61,6 @@
 D_SCREENSHOT = config_loader.get_config_boolean('Crawler', 'default_screenshot')
 config_loader = None

-faup = Faup()
-
 # logger_crawler = logging.getLogger('crawlers.log')

 # # # # # # # #
@@ -185,18 +184,11 @@ def is_valid_onion_domain(domain):
 #     return False

 def is_valid_domain(domain):
-    faup.decode(domain)
-    url_unpack = faup.get()
-    unpack_domain = url_unpack['domain'].lower()
+    unpack_domain = psl_faup.get_domain(domain)
     return domain == unpack_domain

-def get_faup():
-    return faup
-
 def unpack_url(url):
-    f = get_faup()
-    f.decode(url)
-    url_decoded = f.get()
+    url_decoded = psl_faup.unparse_url(url)
     port = url_decoded['port']
     if not port:
         if url_decoded['scheme'] == 'http':
@@ -274,9 +266,7 @@ def extract_favicon_from_html(html, url):
     #  - <meta name="msapplication-config" content="/icons/browserconfig.xml">

     # Root Favicon
-    f = get_faup()
-    f.decode(url)
-    url_decoded = f.get()
+    url_decoded = psl_faup.unparse_url(url)
     root_domain = f"{url_decoded['scheme']}://{url_decoded['domain']}"
     default_icon = f'{root_domain}/favicon.ico'
     favicons_urls.add(default_icon)
@@ -503,9 +493,7 @@ def extract_hhhash(har, domain, date):
        if entrie.get('response').get('status') == 200:  # != 301:
            # print(url, entrie.get('response').get('status'))

-           f = get_faup()
-           f.decode(url)
-           domain_url = f.get().get('domain')
+           domain_url = psl_faup.get_domain(url)
            if domain_url == domain:

                headers = entrie.get('response').get('headers')
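
The call-pattern change is the same across every touched module: the shared, stateful module-level Faup() instance is replaced by stateless helpers that build a fresh parser per call. A minimal before/after sketch, assuming it runs inside AIL with bin/ on sys.path (the URL is illustrative):

# Old pattern: one module-level parser, mutated by every caller
#   faup = Faup()
#   faup.decode('https://www.example.com:8443/path')
#   domain = faup.get()['domain']   # bytes or str, depending on the faup build

# New pattern: no shared state, values are always str (or None)
from lib import psl_faup

domain = psl_faup.get_domain('https://www.example.com:8443/path')       # 'example.com'
url_decoded = psl_faup.unparse_url('https://www.example.com:8443/path')
print(url_decoded['scheme'], url_decoded['port'])                       # https 8443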

bin/lib/psl_faup.py

Lines changed: 218 additions & 0 deletions
@@ -0,0 +1,218 @@ (new file)
#!/usr/bin/env python3
# -*-coding:UTF-8 -*

import idna
import ipaddress
import socket

from publicsuffixlist import PublicSuffixList
from urllib.parse import urlparse, urlunparse

def _ensure_bytes(binary):
    if isinstance(binary, bytes):
        return binary
    else:
        return binary.encode('utf-8')


def _ensure_str(string):
    if isinstance(string, str):
        return string
    else:
        return string.decode('utf-8')


class UrlNotDecoded(Exception):
    pass


# https://github.com/MISP/PyMISP/blob/main/pymisp/tools/_psl_faup.py
class PSLFaup:
    """
    Fake Faup Python Library using PSL for Windows support
    """

    def __init__(self):
        self.decoded = False
        self.psl = PublicSuffixList()
        self._url = None
        self._retval = {}
        self.ip_as_host = ''

    def _clear(self):
        self.decoded = False
        self._url = None
        self._retval = {}
        self.ip_as_host = ''

    def decode(self, url):
        """
        This function creates a dict of all the url fields.
        :param url: The URL to normalize
        """
        self._clear()
        if isinstance(url, bytes) and b'//' not in url[:10]:
            url = b'//' + url
        elif '//' not in url[:10]:
            url = '//' + url
        self._url = urlparse(url)

        if self._url is None:
            raise UrlNotDecoded("Unable to parse URL")

        self.ip_as_host = ''
        if self._url.hostname is None:
            raise UrlNotDecoded("Unable to parse URL")
        hostname = _ensure_str(self._url.hostname)
        try:
            ipv4_bytes = socket.inet_aton(hostname)
            ipv4 = ipaddress.IPv4Address(ipv4_bytes)
            self.ip_as_host = ipv4.compressed
        except (OSError, ValueError):
            try:
                addr, _, _ = hostname.partition('%')
                ipv6 = ipaddress.IPv6Address(addr)
                self.ip_as_host = ipv6.compressed
            except ValueError:
                pass

        self.decoded = True
        self._retval = {}

    @property
    def url(self):
        if not self.decoded or not self._url:
            raise UrlNotDecoded("You must call pslfaup.decode() first")

        if host := self.get_host():
            netloc = host + ('' if self.get_port() is None else f':{self.get_port()}')
            return _ensure_bytes(
                urlunparse(
                    (self.get_scheme(), netloc, self.get_resource_path(),
                     '', self.get_query_string(), self.get_fragment(),)
                )
            )
        return None

    def get_credential(self):
        if not self.decoded or not self._url:
            raise UrlNotDecoded("You must call pslfaup.decode() first")

        if self._url.username and self._url.password:
            return _ensure_str(self._url.username) + ':' + _ensure_str(self._url.password)
        if self._url.username:
            return _ensure_str(self._url.username)
        return None

    def get_scheme(self):
        """
        Get the scheme of the url given in the decode function
        :returns: The URL scheme
        """
        if not self.decoded or not self._url:
            raise UrlNotDecoded("You must call pslfaup.decode() first")
        return _ensure_str(self._url.scheme if self._url.scheme else '')

    def get_host(self):
        if not self.decoded or not self._url:
            raise UrlNotDecoded("You must call pslfaup.decode() first")

        if self._url.hostname is None:
            return None
        elif self._url.hostname.isascii():
            return _ensure_str(self._url.hostname)
        else:
            return _ensure_str(idna.encode(self._url.hostname, uts46=True))

    def get_domain(self):
        if not self.decoded or not self._url:
            raise UrlNotDecoded("You must call pslfaup.decode() first")

        if self.get_host() is not None and not self.ip_as_host:
            return self.psl.privatesuffix(self.get_host())
        return None

    def get_domain_without_tld(self):
        if not self.decoded or not self._url:
            raise UrlNotDecoded("You must call pslfaup.decode() first")

        if self.get_tld() is not None and not self.ip_as_host:
            if domain := self.get_domain():
                return domain.rsplit(self.get_tld(), 1)[0].rstrip('.')
        return None

    def get_subdomain(self):
        if not self.decoded or not self._url:
            raise UrlNotDecoded("You must call pslfaup.decode() first")

        if self.get_host() is not None and not self.ip_as_host:
            domain = self.get_domain()
            host = self.get_host()
            if domain and host and domain in host:
                return host.rsplit(domain, 1)[0].rstrip('.') or None
        return None

    def get_tld(self):
        if not self.decoded or not self._url:
            raise UrlNotDecoded("You must call pslfaup.decode() first")

        if self.get_host() is not None and not self.ip_as_host:
            return self.psl.publicsuffix(self.get_host())
        return None

    def get_port(self):
        if not self.decoded or not self._url:
            raise UrlNotDecoded("You must call pslfaup.decode() first")
        return self._url.port

    def get_resource_path(self):
        if not self.decoded or not self._url:
            raise UrlNotDecoded("You must call pslfaup.decode() first")

        return _ensure_str(self._url.path)

    def get_query_string(self):
        if not self.decoded or not self._url:
            raise UrlNotDecoded("You must call pslfaup.decode() first")

        return _ensure_str(self._url.query)

    def get_fragment(self):
        if not self.decoded or not self._url:
            raise UrlNotDecoded("You must call pslfaup.decode() first")

        return _ensure_str(self._url.fragment)

    def get(self):
        self._retval["scheme"] = self.get_scheme()
        self._retval["tld"] = self.get_tld()
        self._retval["domain"] = self.get_domain()
        # self._retval["domain_without_tld"] = self.get_domain_without_tld()
        self._retval["subdomain"] = self.get_subdomain()
        self._retval["host"] = self.get_host()
        self._retval["port"] = self.get_port()
        self._retval["resource_path"] = self.get_resource_path()
        self._retval["query_string"] = self.get_query_string()
        self._retval["fragment"] = self.get_fragment()
        self._retval["url"] = self.url
        return self._retval


def get_domain(url):
    f = PSLFaup()
    f.decode(url)
    return f.get_domain()

def get_url(url):
    f = PSLFaup()
    f.decode(url)
    return f.url

def unparse_url(url):
    f = PSLFaup()
    f.decode(url)
    return f.get()


if __name__ == '__main__':
    print(unparse_url('Example.COM'))
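
The __main__ block doubles as a smoke test on a bare, mixed-case host. A sketch of the expected output (urlparse lowercases the hostname once the '//' prefix is prepended, and the public suffix list resolves 'com'; key order follows get() above):

$ python3 bin/lib/psl_faup.py
{'scheme': '', 'tld': 'com', 'domain': 'example.com', 'subdomain': None, 'host': 'example.com', 'port': None, 'resource_path': '', 'query_string': '', 'fragment': '', 'url': b'//example.com'}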

bin/modules/Credential.py

Lines changed: 2 additions & 11 deletions
@@ -28,15 +28,14 @@
 ##################################
 import os
 import sys
-import time
-from pyfaup.faup import Faup

 sys.path.append(os.environ['AIL_BIN'])
 ##################################
 # Import Project packages
 ##################################
 from modules.abstract_module import AbstractModule
 from lib import ConfigLoader
+from lib import psl_faup


 class Credential(AbstractModule):
@@ -57,8 +56,6 @@ class Credential(AbstractModule):
     def __init__(self):
         super(Credential, self).__init__()

-        self.faup = Faup()
-
         self.regex_web = r"((?:https?:\/\/)[\.-_0-9a-zA-Z]+\.[0-9a-zA-Z]+)"
         self.regex_cred = r"[a-zA-Z0-9\\._-]+@[a-zA-Z0-9\\.-]+\.[a-zA-Z]{2,6}[\\rn :\_\-]{1,10}[a-zA-Z0-9\_\-]+"
         self.regex_site_for_stats = r"@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}:"
@@ -118,13 +115,7 @@ def compute(self, message):
             creds_sites[site_domain] = 1

         for url in all_sites:
-            self.faup.decode(url)
-            domain = self.faup.get()['domain']
-            # # TODO: # FIXME: remove me, check faup versionb
-            try:
-                domain = domain.decode()
-            except:
-                pass
+            domain = psl_faup.get_domain(url)
             if domain in creds_sites.keys():
                 creds_sites[domain] += 1
             else:
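
The deleted try/except guarded against faup versions that returned the domain as bytes. psl_faup routes every string through _ensure_str, so a hedged sketch of what callers can now rely on (URLs are illustrative):

from lib import psl_faup

domain = psl_faup.get_domain('https://login.example.org/auth')
assert isinstance(domain, str)          # 'example.org', never bytes
# IP literals have no registrable domain under the PSL, so callers
# should still be prepared for None:
assert psl_faup.get_domain('192.168.0.1') is None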

bin/modules/LibInjection.py

Lines changed: 2 additions & 6 deletions
@@ -15,8 +15,6 @@
 import sys
 import pylibinjection

-from datetime import datetime
-from pyfaup.faup import Faup
 from urllib.parse import unquote


@@ -25,22 +23,20 @@
 # Import Project packages
 ##################################
 from modules.abstract_module import AbstractModule
+from lib import psl_faup

 class LibInjection(AbstractModule):
     """docstring for LibInjection module."""

     def __init__(self):
         super(LibInjection, self).__init__()

-        self.faup = Faup()
-
         self.logger.info(f"Module: {self.module_name} Launched")

     def compute(self, message):
         url = message

-        self.faup.decode(url)
-        url_parsed = self.faup.get()
+        url_parsed = psl_faup.unparse_url(url)
         # # TODO: # FIXME: remove me
         try:
             resource_path = url_parsed['resource_path'].encode()
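
The kept FIXME still applies: the module encodes the parsed fields before handing them to pylibinjection, and with psl_faup those fields are consistently str, so the .encode() now always succeeds. A short sketch under that assumption (the URL is illustrative):

from lib import psl_faup

url_parsed = psl_faup.unparse_url("http://example.com/page.php?id=1' OR '1'='1")
resource_path = url_parsed['resource_path'].encode()   # b'/page.php', str in, bytes out
query_string = url_parsed['query_string'].encode()     # the str/bytes juggling the FIXME refers to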

bin/modules/Mail.py

Lines changed: 0 additions & 4 deletions
@@ -19,8 +19,6 @@
 import dns.resolver
 import dns.exception

-# from pyfaup.faup import Faup
-
 sys.path.append(os.environ['AIL_BIN'])
 ##################################
 # Import Project packages #
@@ -44,8 +42,6 @@ def __init__(self, queue=True):

         self.dns_server = config_loader.get_config_str('Mail', 'dns')

-        # self.faup = Faup()
-
         # Numbers of Mails needed to Tags
         self.mail_threshold = 10
bin/modules/Onion.py

Lines changed: 2 additions & 4 deletions
@@ -25,6 +25,7 @@
 from lib.ConfigLoader import ConfigLoader
 from lib.objects.Domains import Domain
 from lib import crawlers
+from lib import psl_faup

 class Onion(AbstractModule):
     """docstring for Onion module."""
@@ -39,8 +40,6 @@ def __init__(self, queue=True):
         # regex timeout
         self.regex_timeout = config_loader.get_config_int("Onion", "max_execution_time")

-        self.faup = crawlers.get_faup()
-
         # activate_crawler = p.config.get("Crawler", "activate_crawler")
         self.har = config_loader.get_config_boolean('Crawler', 'default_har')
         self.screenshot = config_loader.get_config_boolean('Crawler', 'default_screenshot')
@@ -86,8 +85,7 @@ def compute(self, message):
             print(url)

             # TODO Crawl subdomain
-            url_unpack = crawlers.unpack_url(url)
-            domain = url_unpack['domain']
+            domain = psl_faup.get_domain(url)
             if crawlers.is_valid_onion_domain(domain):
                 domains.append(domain)
                 onion_urls.append(url)
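
Onion hosts also pass through the PSL here: assuming the bundled public suffix list carries the 'onion' entry (the upstream list does, per RFC 7686), privatesuffix() yields the '<address>.onion' registrable part, which is_valid_onion_domain() then validates. A sketch with a stand-in address (the 'a' run below is a placeholder for a 56-character base32 v3 address, not a real service):

from lib import crawlers, psl_faup

url = 'http://sub.' + 'a' * 56 + '.onion/login'
domain = psl_faup.get_domain(url)     # 'aaa...a.onion', the registrable part under the PSL
print(crawlers.is_valid_onion_domain(domain))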
