
Commit c1d0c6a

new: [crawler] add I2P crawler + auto discovery crawler
1 parent d7a9d05 commit c1d0c6a

9 files changed: +164 -39 lines changed

bin/crawlers/Crawler.py

Lines changed: 1 addition & 1 deletion
@@ -379,7 +379,7 @@ def compute(self, capture):
                 print('task: ', task.uuid, 'Unsafe Content Filtered')
                 print()
 
-        # onion messages correlation
+        # onion/i2p messages correlation
         if crawlers.is_domain_correlation_cache(self.original_domain.id):
             crawlers.save_domain_correlation_cache(self.original_domain.was_up(), domain)

bin/lib/crawlers.py

Lines changed: 36 additions & 5 deletions
@@ -195,6 +195,32 @@ def is_valid_onion_domain(domain):
 #         return True
 #     return False
 
+def get_reserved_i2p_domains():
+    return {'console.i2p', 'mail.i2p', 'proxy.i2p', 'router.i2p'}
+
+def is_valid_i2p_b32_domain(domain):
+    dom = domain[:-8]
+    # Distinguish old from new flavors by length. Old b32 addresses are always {52 chars}.b32.i2p.
+    # New ones are {56+ chars}.b32.i2p
+    if len(dom) == 52 or 56 <= len(dom) <= 64:
+        return dom.isalnum()
+    else:
+        return False
+
+def is_valid_i2p_domain(domain):
+    if not domain.endswith('.i2p'):
+        return False
+    if domain.endswith('b32.i2p'):
+        return is_valid_i2p_b32_domain(domain)
+    else:
+        if domain in get_reserved_i2p_domains():
+            return False
+        # 67 characters maximum, including the '.i2p'
+        if len(domain) > 67:
+            return False
+        return domain[:-4].replace('-', '').isalnum()
+
+
 def is_valid_domain(domain):
     unpack_domain = psl_faup.get_domain(domain)
     return domain == unpack_domain

@@ -1442,8 +1468,11 @@ def create(self, frequency, user, url,
             self._set_field('cookiejar', cookiejar)
         if header:
             self._set_field('header', header)
+
+        if url_decoded['domain'].endswith('i2p'):
+            proxy = None
         if proxy:
-            if proxy == 'web':
+            if proxy == 'web' or proxy == 'i2p':
                 proxy = None
             elif proxy == 'force_tor' or proxy == 'tor' or proxy == 'onion':
                 proxy = 'force_tor'

@@ -1782,7 +1811,9 @@ def create(self, url, depth=1, har=True, screenshot=True, header=None, cookiejar
         har = int(har)
         screenshot = int(screenshot)
 
-        if proxy == 'web':
+        if domain.endswith('i2p'):
+            proxy = None
+        if proxy == 'web' or proxy == 'i2p':
             proxy = None
         elif proxy == 'force_tor' or proxy == 'tor' or proxy == 'onion':
             proxy = 'force_tor'

@@ -1954,7 +1985,7 @@ def api_parse_task_dict_basic(data, user_id):
     proxy = data.get('proxy', None)
     if proxy == 'onion' or proxy == 'tor' or proxy == 'force_tor':
         proxy = 'force_tor'
-    elif proxy == 'web':
+    elif proxy == 'web' or proxy == 'i2p':
         proxy = None
     elif proxy:
         verify = api_verify_proxy(proxy)

@@ -1964,7 +1995,7 @@ def api_parse_task_dict_basic(data, user_id):
     tags = data.get('tags', [])
 
     data = {'depth_limit': depth_limit, 'har': har, 'screenshot': screenshot, 'proxy': proxy, 'tags': tags}
-    if url :
+    if url:
         data['url'] = url
     elif urls:
         data['urls'] = urls

@@ -2087,7 +2118,7 @@ def is_crawler_activated():
     return activate_crawler == 'True'
 
 def get_crawler_all_types():
-    return ['onion', 'web']
+    return ['i2p', 'onion', 'web']
 
 ##-- CRAWLER GLOBAL --##
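
For reference, a quick sanity check of the new validators (a sketch: the import path and sample names are illustrative, and the b32 payloads are made-up placeholders, not real addresses):

from lib import crawlers  # assumed import path inside an AIL checkout

print(crawlers.is_valid_i2p_domain('stats.i2p'))                 # True: named .i2p host
print(crawlers.is_valid_i2p_domain('router.i2p'))                # False: reserved name
print(crawlers.is_valid_i2p_b32_domain('x' * 52 + '.b32.i2p'))   # True: old-style b32 (52 chars)
print(crawlers.is_valid_i2p_b32_domain('x' * 53 + '.b32.i2p'))   # False: neither 52 nor 56-64 chars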

bin/lib/objects/Domains.py

Lines changed: 14 additions & 10 deletions
@@ -55,6 +55,8 @@ def __init__(self, id):
     def get_domain_type(self):
         if str(self.id).endswith('.onion'):
             return 'onion'
+        elif str(self.id).endswith('.i2p'):
+            return 'i2p'
         else:
             return 'web'

@@ -262,6 +264,9 @@ def get_svg_icon(self):
         if self.get_domain_type() == 'onion':
             style = 'fas'
             icon = '\uf06e'
+        elif self.get_domain_type() == 'i2p':
+            style = 'fas'
+            icon = '\uf21b'  # TODO change me
         else:
             style = 'fab'
             icon = '\uf13b'

@@ -525,7 +530,7 @@ def _write_in_zip_buffer(zf, path, filename):
 ############################################################################
 
 def get_all_domains_types():
-    return ['onion', 'web']  # i2p
+    return ['i2p', 'onion', 'web']
 
 def sanitize_domains_types(types):
     domains_types = get_all_domains_types()

@@ -628,14 +633,11 @@ def get_domains_dates_by_daterange(date_from, date_to, domain_types, up=True, do
 def get_domains_by_month(date_month, domains_types, up=True, down=True):
     start = f'{date_month}01'
     end = Date.get_month_last_day(date_month)
-    if 'onion' in domains_types:
-        domains = get_domains_by_daterange(start, end, 'onion', up=up, down=down)
-    else:
-        domains = []
-    if 'web' in domains_types:
-        web = get_domains_by_daterange(start, end, 'web', up=up, down=down)
-        if web:
-            domains.extend(web)
+    domains = []
+    for domain_type in domains_types:
+        doms = get_domains_by_daterange(start, end, domain_type, up=up, down=down)
+        if doms:
+            domains.extend(doms)
     return domains
 
 def get_domain_up_iterator():

@@ -653,7 +655,7 @@ def get_domains_meta(domains):
 # TODO ADD TAGS FILTER
 def get_domains_up_by_filers(domain_types, date_from=None, date_to=None, tags=[], nb_obj=28, page=1):
     if not domain_types:
-        domain_types = ['onion', 'web']
+        domain_types = get_all_domains_types()
     if not tags:
         domains = []
         if not date_from and not date_to:

@@ -683,6 +685,8 @@ def sanitize_domain_name_to_search(name_to_search, domain_type):
         return ""
     if domain_type == 'onion':
         r_name = r'[a-z0-9\.]+'
+    elif domain_type == 'i2p':
+        r_name = r'[a-z0-9-\.]+'
     else:
         r_name = r'[a-zA-Z0-9-_\.]+'
     # invalid domain name
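
The new type dispatch is a plain suffix check; a condensed, self-contained sketch of the rule (hypothetical helper, not a function from the codebase):

def domain_type(domain_id: str) -> str:
    # Mirrors Domain.get_domain_type(): the suffix decides the domain type.
    if domain_id.endswith('.onion'):
        return 'onion'
    if domain_id.endswith('.i2p'):
        return 'i2p'
    return 'web'

assert domain_type('example.onion') == 'onion'
assert domain_type('stats.i2p') == 'i2p'
assert domain_type('example.com') == 'web'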

bin/lib/psl_faup.py

Lines changed: 0 additions & 2 deletions
@@ -175,10 +175,8 @@ def get_tld(self):
         if self.host is not None and not self.ip_as_host:
             for added_tld in ADDED_TLD:
                 if self.host.endswith(added_tld):
-                    print('added')
                     self.tld = added_tld
                     return added_tld
-            print('standard')
             self.tld = self.psl.publicsuffix(self.host)
             return self.tld
         return None
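
For context, get_tld() consults a list of extra TLDs before falling back to the public suffix list; a reduced sketch of that lookup (the ADDED_TLD contents here are an assumption, not copied from the module):

from publicsuffixlist import PublicSuffixList

ADDED_TLD = ('onion', 'i2p')  # assumed extra TLDs absent from the PSL

def get_tld(host, psl=PublicSuffixList()):
    # Custom TLDs win over the public suffix list.
    for added_tld in ADDED_TLD:
        if host.endswith(added_tld):
            return added_tld
    return psl.publicsuffix(host)

print(get_tld('example.i2p'))  # 'i2p'
print(get_tld('example.com'))  # 'com'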

bin/modules/Onion.py

Lines changed: 48 additions & 12 deletions
@@ -45,16 +45,13 @@ def __init__(self, queue=True):
         self.screenshot = config_loader.get_config_boolean('Crawler', 'default_screenshot')
 
         self.onion_regex = r"((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
-        # self.i2p_regex = r"((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
+        self.i2p_regex = r"((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
         re.compile(self.onion_regex)
-        # re.compile(self.i2p_regex)
+        re.compile(self.i2p_regex)
 
         self.logger.info(f"Module: {self.module_name} Launched")
 
-        # TEMP var: SAVE I2P Domain (future I2P crawler)
-        # self.save_i2p = config_loader.get_config_boolean("Onion", "save_i2p")
-
-    def extract(self, obj, content, tag):
+    def extract(self, obj, content, tag):  # TODO add I2P
         extracted = []
         if obj.type == 'item':
             if 'infoleak:submission="crawler"' in obj.get_tags():

@@ -74,13 +71,16 @@ def extract(self, obj, content, tag):
 
     def compute(self, message):
         onion_urls = []
+        i2p_urls = []
         domains = set()
 
         obj = self.get_obj()
         content = obj.get_content()
 
         # max execution time on regex
         res = self.regex_findall(self.onion_regex, obj.get_id(), content, r_set=True)
+        for r in res:
+            domains.add(r['domain'])
         for x in res:
             # String to tuple
             x = x[2:-2].replace(" '", "").split("',")

@@ -91,7 +91,7 @@ def compute(self, message):
 
             # TODO Crawl subdomain
             if len(url) >= 62:
-                print(url)
+                # perf
                 if len(url) == 69 and url.endswith(".onion"):
                     domain = url[7:]
                     domains.add(domain)

@@ -103,7 +103,42 @@ def compute(self, message):
                     domains.add(domain)
             onion_urls.append(url)
 
-        if onion_urls:
+        res = self.regex_findall(self.i2p_regex, obj.get_id(), content, r_set=True)
+        for x in res:
+            # String to tuple
+            x = x[2:-2].replace(" '", "").split("',")
+            url = x[0]
+            if url.startswith("://"):
+                url = url[3:]
+            url = url.lower()
+
+            # perf
+            if url.endswith('b32.i2p'):
+                b32_url = url
+                if url.startswith('http://'):
+                    b32_url = url[7:]
+                if crawlers.is_valid_i2p_b32_domain(b32_url):
+                    domains.add(b32_url)
+                    i2p_urls.append(b32_url)
+                    continue
+            elif url.endswith('.i2p'):
+                dom_url = url
+                if url.startswith('http://'):
+                    dom_url = url[7:]
+                if '.' not in dom_url[:-4]:
+                    if crawlers.is_valid_i2p_domain(dom_url):
+                        domains.add(dom_url)
+                        i2p_urls.append(dom_url)
+                        continue
+
+            domain = psl_faup.get_domain(url)
+            if domain:
+                if crawlers.is_valid_i2p_domain(domain):
+                    domains.add(domain)
+                    i2p_urls.append(url)
+
+        # Onion + I2P
+        if domains:
             if crawlers.is_crawler_activated():
                 for domain in domains:
                     dom = Domain(domain)

@@ -128,11 +163,12 @@ def compute(self, message):
                         crawlers.add_domain_correlation_cache(domain, f'chat:{chat_subtype}:{chat_id}')
                     crawlers.add_domain_correlation_cache(domain, self.obj.get_global_id())
             else:
                print(f'Detected {len(domains)} .onion/i2p;{self.obj.get_global_id()}')
 
-        # TAG Object
-        tag = 'infoleak:automatic-detection="onion"'
-        self.add_message_to_queue(message=tag, queue='Tags')
+        if onion_urls:
+            # TAG Object
+            tag = 'infoleak:automatic-detection="onion"'
+            self.add_message_to_queue(message=tag, queue='Tags')
 
 
 if __name__ == "__main__":
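
A condensed sketch of the classification order compute() now applies to I2P matches: b32 hosts first, then single-label named domains, then a psl_faup fallback that reduces subdomains (the helper name and import paths are illustrative, not from the module):

from lib import crawlers, psl_faup  # assumed import paths inside an AIL checkout

def extract_i2p_domain(url):
    url = url.lower()
    dom = url[7:] if url.startswith('http://') else url
    # b32 destination, e.g. '{52 chars}.b32.i2p'
    if dom.endswith('b32.i2p') and crawlers.is_valid_i2p_b32_domain(dom):
        return dom
    # single-label named host, e.g. 'stats.i2p'
    if dom.endswith('.i2p') and '.' not in dom[:-4] and crawlers.is_valid_i2p_domain(dom):
        return dom
    # fallback: reduce subdomains, e.g. 'sub.example.i2p' -> 'example.i2p'
    domain = psl_faup.get_domain(url)
    if domain and crawlers.is_valid_i2p_domain(domain):
        return domain
    return None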

var/www/blueprints/crawler_splash.py

Lines changed: 25 additions & 3 deletions
@@ -105,7 +105,7 @@ def manual():
     user_org = current_user.get_org()
     user_id = current_user.get_user_id()
     l_cookiejar = crawlers.api_get_cookiejars_selector(user_org, user_id)
-    crawlers_types = crawlers.get_crawler_all_types()
+    crawlers_types = ['onion', 'web']
     proxies = []  # TODO HANDLE PROXIES
     return render_template("crawler_manual.html",
                            is_manager_connected=crawlers.get_lacus_connection_metadata(),

@@ -219,6 +219,7 @@ def send_to_spider():
         return create_json_response(res[0], res[1])
     return redirect(url_for('crawler_splash.manual'))
 
+# Send Unknown onion to crawler
 @crawler_splash.route("/crawlers/domain_discovery", methods=['GET'])
 @login_required
 @login_user_no_api

@@ -482,6 +483,7 @@ def crawlers_domain_download():
 @login_read_only
 def domains_explorer_post_filter():
     domain_onion = request.form.get('domain_onion_switch')
+    domain_i2p = request.form.get('domain_i2p_switch')
     domain_regular = request.form.get('domain_regular_switch')
     date_from = request.form.get('date_from')
     date_to = request.form.get('date_to')

@@ -493,7 +495,7 @@ def domains_explorer_post_filter():
         date_from = None
         date_to = None
 
-    if domain_onion and domain_regular:
+    if domain_onion and domain_regular and domain_i2p:
         if date_from and date_to:
             return redirect(url_for('crawler_splash.domains_explorer_all', date_from=date_from, date_to=date_to))
         else:

@@ -503,6 +505,11 @@ def domains_explorer_post_filter():
             return redirect(url_for('crawler_splash.domains_explorer_web', date_from=date_from, date_to=date_to))
         else:
             return redirect(url_for('crawler_splash.domains_explorer_web'))
+    elif domain_i2p:
+        if date_from and date_to:
+            return redirect(url_for('crawler_splash.domains_explorer_i2p', date_from=date_from, date_to=date_to))
+        else:
+            return redirect(url_for('crawler_splash.domains_explorer_i2p'))
     else:
         if date_from and date_to:
             return redirect(url_for('crawler_splash.domains_explorer_onion', date_from=date_from, date_to=date_to))

@@ -522,7 +529,7 @@ def domains_explorer_all():
     except:
         page = 1
 
-    dict_data = Domains.get_domains_up_by_filers(['onion', 'web'], page=page, date_from=date_from, date_to=date_to)
+    dict_data = Domains.get_domains_up_by_filers(Domains.get_all_domains_types(), page=page, date_from=date_from, date_to=date_to)
     return render_template("domain_explorer.html", dict_data=dict_data, bootstrap_label=bootstrap_label, domain_type='all')

@@ -542,6 +549,21 @@ def domains_explorer_onion():
     return render_template("domain_explorer.html", dict_data=dict_data, bootstrap_label=bootstrap_label,
                            domain_type='onion')
 
+@crawler_splash.route('/domains/explorer/i2p', methods=['GET'])
+@login_required
+@login_read_only
+def domains_explorer_i2p():
+    page = request.args.get('page')
+    date_from = request.args.get('date_from')
+    date_to = request.args.get('date_to')
+    try:
+        page = int(page)
+    except:
+        page = 1
+
+    dict_data = Domains.get_domains_up_by_filers(['i2p'], page=page, date_from=date_from, date_to=date_to)
+    return render_template("domain_explorer.html", dict_data=dict_data, bootstrap_label=bootstrap_label,
+                           domain_type='i2p')
 
 @crawler_splash.route('/domains/explorer/web', methods=['GET'])
 @login_required
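
The new explorer route accepts the same paging and date parameters as its onion/web siblings; a quick illustrative call (host, port, and session cookie are placeholders for a local AIL instance):

import requests

resp = requests.get('https://127.0.0.1:7000/domains/explorer/i2p',
                    params={'page': 1, 'date_from': '20240101', 'date_to': '20240131'},
                    cookies={'session': '<session-cookie>'},  # placeholder auth
                    verify=False)  # local self-signed certificate
print(resp.status_code)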
