
Commit c1d0c6a

new: [crawler] add I2P crawler + auto discovery crawler
1 parent d7a9d05 commit c1d0c6a

9 files changed: +164 -39 lines changed

bin/crawlers/Crawler.py

Lines changed: 1 addition & 1 deletion
@@ -379,7 +379,7 @@ def compute(self, capture):
                 print('task: ', task.uuid, 'Unsafe Content Filtered')
                 print()
 
-        # onion messages correlation
+        # onion/i2p messages correlation
         if crawlers.is_domain_correlation_cache(self.original_domain.id):
             crawlers.save_domain_correlation_cache(self.original_domain.was_up(), domain)

bin/lib/crawlers.py

Lines changed: 36 additions & 5 deletions
@@ -195,6 +195,32 @@ def is_valid_onion_domain(domain):
 #         return True
 #     return False
 
+def get_reserved_i2p_domains():
+    return {'console.i2p', 'mail.i2p', 'proxy.i2p', 'router.i2p'}
+
+def is_valid_i2p_b32_domain(domain):
+    dom = domain[:-8]
+    # Distinguish old from new flavors by length. Old b32 addresses are always {52 chars}.b32.i2p.
+    # New ones are {56+ chars}.b32.i2p
+    if len(dom) == 52 or 56 <= len(dom) <= 64:
+        return dom.isalnum()
+    else:
+        return False
+
+def is_valid_i2p_domain(domain):
+    if not domain.endswith('.i2p'):
+        return False
+    if domain.endswith('b32.i2p'):
+        return is_valid_i2p_b32_domain(domain)
+    else:
+        if domain in get_reserved_i2p_domains():
+            return False
+        # 67 characters maximum, including the '.i2p'
+        if len(domain) > 67:
+            return False
+        return domain[:-4].replace('-', '').isalnum()
+
+
 def is_valid_domain(domain):
     unpack_domain = psl_faup.get_domain(domain)
     return domain == unpack_domain

@@ -1442,8 +1468,11 @@ def create(self, frequency, user, url,
             self._set_field('cookiejar', cookiejar)
         if header:
             self._set_field('header', header)
+
+        if url_decoded['domain'].endswith('i2p'):
+            proxy = None
         if proxy:
-            if proxy == 'web':
+            if proxy == 'web' or proxy == 'i2p':
                 proxy = None
             elif proxy == 'force_tor' or proxy == 'tor' or proxy == 'onion':
                 proxy = 'force_tor'

@@ -1782,7 +1811,9 @@ def create(self, url, depth=1, har=True, screenshot=True, header=None, cookiejar
         har = int(har)
         screenshot = int(screenshot)
 
-        if proxy == 'web':
+        if domain.endswith('i2p'):
+            proxy = None
+        if proxy == 'web' or proxy == 'i2p':
             proxy = None
         elif proxy == 'force_tor' or proxy == 'tor' or proxy == 'onion':
             proxy = 'force_tor'

@@ -1954,7 +1985,7 @@ def api_parse_task_dict_basic(data, user_id):
     proxy = data.get('proxy', None)
     if proxy == 'onion' or proxy == 'tor' or proxy == 'force_tor':
         proxy = 'force_tor'
-    elif proxy == 'web':
+    elif proxy == 'web' or proxy == 'i2p':
         proxy = None
     elif proxy:
         verify = api_verify_proxy(proxy)

@@ -1964,7 +1995,7 @@ def api_parse_task_dict_basic(data, user_id):
     tags = data.get('tags', [])
 
     data = {'depth_limit': depth_limit, 'har': har, 'screenshot': screenshot, 'proxy': proxy, 'tags': tags}
-    if url :
+    if url:
         data['url'] = url
     elif urls:
         data['urls'] = urls

@@ -2087,7 +2118,7 @@ def is_crawler_activated():
     return activate_crawler == 'True'
 
 def get_crawler_all_types():
-    return ['onion', 'web']
+    return ['i2p', 'onion', 'web']
 
 ##-- CRAWLER GLOBAL --##
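
For reference, a quick sanity check of the new validators (a sketch: the import path and sample names are illustrative, and the b32 payloads are made-up placeholders, not real addresses):

from lib import crawlers  # assumed import path inside an AIL checkout

print(crawlers.is_valid_i2p_domain('stats.i2p'))                 # True: named .i2p host
print(crawlers.is_valid_i2p_domain('router.i2p'))                # False: reserved name
print(crawlers.is_valid_i2p_b32_domain('x' * 52 + '.b32.i2p'))   # True: old-style b32 (52 chars)
print(crawlers.is_valid_i2p_b32_domain('x' * 53 + '.b32.i2p'))   # False: neither 52 nor 56-64 chars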

bin/lib/objects/Domains.py

Lines changed: 14 additions & 10 deletions
@@ -55,6 +55,8 @@ def __init__(self, id):
     def get_domain_type(self):
         if str(self.id).endswith('.onion'):
             return 'onion'
+        elif str(self.id).endswith('.i2p'):
+            return 'i2p'
         else:
             return 'web'

@@ -262,6 +264,9 @@ def get_svg_icon(self):
         if self.get_domain_type() == 'onion':
             style = 'fas'
             icon = '\uf06e'
+        elif self.get_domain_type() == 'i2p':
+            style = 'fas'
+            icon = '\uf21b'  # TODO change me
         else:
             style = 'fab'
             icon = '\uf13b'

@@ -525,7 +530,7 @@ def _write_in_zip_buffer(zf, path, filename):
 ############################################################################
 
 def get_all_domains_types():
-    return ['onion', 'web']  # i2p
+    return ['i2p', 'onion', 'web']
 
 def sanitize_domains_types(types):
     domains_types = get_all_domains_types()

@@ -628,14 +633,11 @@ def get_domains_dates_by_daterange(date_from, date_to, domain_types, up=True, do
 def get_domains_by_month(date_month, domains_types, up=True, down=True):
     start = f'{date_month}01'
     end = Date.get_month_last_day(date_month)
-    if 'onion' in domains_types:
-        domains = get_domains_by_daterange(start, end, 'onion', up=up, down=down)
-    else:
-        domains = []
-    if 'web' in domains_types:
-        web = get_domains_by_daterange(start, end, 'web', up=up, down=down)
-        if web:
-            domains.extend(web)
+    domains = []
+    for domain_type in domains_types:
+        doms = get_domains_by_daterange(start, end, domain_type, up=up, down=down)
+        if doms:
+            domains.extend(doms)
     return domains
 
 def get_domain_up_iterator():

@@ -653,7 +655,7 @@ def get_domains_meta(domains):
 # TODO ADD TAGS FILTER
 def get_domains_up_by_filers(domain_types, date_from=None, date_to=None, tags=[], nb_obj=28, page=1):
     if not domain_types:
-        domain_types = ['onion', 'web']
+        domain_types = get_all_domains_types()
     if not tags:
         domains = []
         if not date_from and not date_to:

@@ -683,6 +685,8 @@ def sanitize_domain_name_to_search(name_to_search, domain_type):
         return ""
     if domain_type == 'onion':
         r_name = r'[a-z0-9\.]+'
+    elif domain_type == 'i2p':
+        r_name = r'[a-z0-9-\.]+'
     else:
         r_name = r'[a-zA-Z0-9-_\.]+'
     # invalid domain name
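
The new type dispatch is a plain suffix check; a condensed, self-contained sketch of the rule (hypothetical helper, not a function from the codebase):

def domain_type(domain_id: str) -> str:
    # Mirrors Domain.get_domain_type(): the suffix decides the domain type.
    if domain_id.endswith('.onion'):
        return 'onion'
    if domain_id.endswith('.i2p'):
        return 'i2p'
    return 'web'

assert domain_type('example.onion') == 'onion'
assert domain_type('stats.i2p') == 'i2p'
assert domain_type('example.com') == 'web'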

bin/lib/psl_faup.py

Lines changed: 0 additions & 2 deletions
@@ -175,10 +175,8 @@ def get_tld(self):
         if self.host is not None and not self.ip_as_host:
             for added_tld in ADDED_TLD:
                 if self.host.endswith(added_tld):
-                    print('added')
                     self.tld = added_tld
                     return added_tld
-            print('standard')
             self.tld = self.psl.publicsuffix(self.host)
             return self.tld
         return None
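
For context, get_tld() consults a list of extra TLDs before falling back to the public suffix list; a reduced sketch of that lookup (the ADDED_TLD contents here are an assumption, not copied from the module):

from publicsuffixlist import PublicSuffixList

ADDED_TLD = ('onion', 'i2p')  # assumed extra TLDs absent from the PSL

def get_tld(host, psl=PublicSuffixList()):
    # Custom TLDs win over the public suffix list.
    for added_tld in ADDED_TLD:
        if host.endswith(added_tld):
            return added_tld
    return psl.publicsuffix(host)

print(get_tld('example.i2p'))  # 'i2p'
print(get_tld('example.com'))  # 'com'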

bin/modules/Onion.py

Lines changed: 48 additions & 12 deletions
@@ -45,16 +45,13 @@ def __init__(self, queue=True):
         self.screenshot = config_loader.get_config_boolean('Crawler', 'default_screenshot')
 
         self.onion_regex = r"((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
-        # self.i2p_regex = r"((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
+        self.i2p_regex = r"((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
         re.compile(self.onion_regex)
-        # re.compile(self.i2p_regex)
+        re.compile(self.i2p_regex)
 
         self.logger.info(f"Module: {self.module_name} Launched")
 
-        # TEMP var: SAVE I2P Domain (future I2P crawler)
-        # self.save_i2p = config_loader.get_config_boolean("Onion", "save_i2p")
-
-    def extract(self, obj, content, tag):
+    def extract(self, obj, content, tag):  # TODO add I2P
         extracted = []
         if obj.type == 'item':
             if 'infoleak:submission="crawler"' in obj.get_tags():

@@ -74,13 +71,16 @@ def extract(self, obj, content, tag):
 
     def compute(self, message):
         onion_urls = []
+        i2p_urls = []
         domains = set()
 
         obj = self.get_obj()
         content = obj.get_content()
 
         # max execution time on regex
         res = self.regex_findall(self.onion_regex, obj.get_id(), content, r_set=True)
+        for r in res:
+            domains.add(r['domain'])
         for x in res:
             # String to tuple
             x = x[2:-2].replace(" '", "").split("',")

@@ -91,7 +91,7 @@ def compute(self, message):
 
             # TODO Crawl subdomain
             if len(url) >= 62:
-                print(url)
+                # perf
                 if len(url) == 69 and url.endswith(".onion"):
                     domain = url[7:]
                     domains.add(domain)

@@ -103,7 +103,42 @@ def compute(self, message):
                     domains.add(domain)
             onion_urls.append(url)
 
-        if onion_urls:
+        res = self.regex_findall(self.i2p_regex, obj.get_id(), content, r_set=True)
+        for x in res:
+            # String to tuple
+            x = x[2:-2].replace(" '", "").split("',")
+            url = x[0]
+            if url.startswith("://"):
+                url = url[3:]
+            url = url.lower()
+
+            # perf
+            if url.endswith('b32.i2p'):
+                b32_url = url
+                if url.startswith('http://'):
+                    b32_url = url[7:]
+                if crawlers.is_valid_i2p_b32_domain(b32_url):
+                    domains.add(b32_url)
+                    i2p_urls.append(b32_url)
+                    continue
+            elif url.endswith('.i2p'):
+                dom_url = url
+                if url.startswith('http://'):
+                    dom_url = url[7:]
+                if '.' not in dom_url[:-4]:
+                    if crawlers.is_valid_i2p_domain(dom_url):
+                        domains.add(dom_url)
+                        i2p_urls.append(dom_url)
+                        continue
+
+            domain = psl_faup.get_domain(url)
+            if domain:
+                if crawlers.is_valid_i2p_domain(domain):
+                    domains.add(domain)
+                    i2p_urls.append(url)
+
+        # Onion + I2P
+        if domains:
             if crawlers.is_crawler_activated():
                 for domain in domains:
                     dom = Domain(domain)

@@ -128,11 +163,12 @@ def compute(self, message):
                         crawlers.add_domain_correlation_cache(domain, f'chat:{chat_subtype}:{chat_id}')
                     crawlers.add_domain_correlation_cache(domain, self.obj.get_global_id())
             else:
                print(f'Detected {len(domains)} .onion/i2p;{self.obj.get_global_id()}')
 
-        # TAG Object
-        tag = 'infoleak:automatic-detection="onion"'
-        self.add_message_to_queue(message=tag, queue='Tags')
+        if onion_urls:
+            # TAG Object
+            tag = 'infoleak:automatic-detection="onion"'
+            self.add_message_to_queue(message=tag, queue='Tags')
 
 
 if __name__ == "__main__":
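
A condensed sketch of the classification order compute() now applies to I2P matches: b32 hosts first, then single-label named domains, then a psl_faup fallback that reduces subdomains (the helper name and import paths are illustrative, not from the module):

from lib import crawlers, psl_faup  # assumed import paths inside an AIL checkout

def extract_i2p_domain(url):
    url = url.lower()
    dom = url[7:] if url.startswith('http://') else url
    # b32 destination, e.g. '{52 chars}.b32.i2p'
    if dom.endswith('b32.i2p') and crawlers.is_valid_i2p_b32_domain(dom):
        return dom
    # single-label named host, e.g. 'stats.i2p'
    if dom.endswith('.i2p') and '.' not in dom[:-4] and crawlers.is_valid_i2p_domain(dom):
        return dom
    # fallback: reduce subdomains, e.g. 'sub.example.i2p' -> 'example.i2p'
    domain = psl_faup.get_domain(url)
    if domain and crawlers.is_valid_i2p_domain(domain):
        return domain
    return None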

var/www/blueprints/crawler_splash.py

Lines changed: 25 additions & 3 deletions
@@ -105,7 +105,7 @@ def manual():
     user_org = current_user.get_org()
     user_id = current_user.get_user_id()
     l_cookiejar = crawlers.api_get_cookiejars_selector(user_org, user_id)
-    crawlers_types = crawlers.get_crawler_all_types()
+    crawlers_types = ['onion', 'web']
     proxies = []  # TODO HANDLE PROXIES
     return render_template("crawler_manual.html",
                            is_manager_connected=crawlers.get_lacus_connection_metadata(),

@@ -219,6 +219,7 @@ def send_to_spider():
         return create_json_response(res[0], res[1])
     return redirect(url_for('crawler_splash.manual'))
 
+# Send Unknown onion to crawler
 @crawler_splash.route("/crawlers/domain_discovery", methods=['GET'])
 @login_required
 @login_user_no_api

@@ -482,6 +483,7 @@ def crawlers_domain_download():
 @login_read_only
 def domains_explorer_post_filter():
     domain_onion = request.form.get('domain_onion_switch')
+    domain_i2p = request.form.get('domain_i2p_switch')
     domain_regular = request.form.get('domain_regular_switch')
     date_from = request.form.get('date_from')
     date_to = request.form.get('date_to')

@@ -493,7 +495,7 @@ def domains_explorer_post_filter():
         date_from = None
         date_to = None
 
-    if domain_onion and domain_regular:
+    if domain_onion and domain_regular and domain_i2p:
         if date_from and date_to:
             return redirect(url_for('crawler_splash.domains_explorer_all', date_from=date_from, date_to=date_to))
         else:

@@ -503,6 +505,11 @@ def domains_explorer_post_filter():
             return redirect(url_for('crawler_splash.domains_explorer_web', date_from=date_from, date_to=date_to))
         else:
             return redirect(url_for('crawler_splash.domains_explorer_web'))
+    elif domain_i2p:
+        if date_from and date_to:
+            return redirect(url_for('crawler_splash.domains_explorer_i2p', date_from=date_from, date_to=date_to))
+        else:
+            return redirect(url_for('crawler_splash.domains_explorer_i2p'))
     else:
         if date_from and date_to:
             return redirect(url_for('crawler_splash.domains_explorer_onion', date_from=date_from, date_to=date_to))

@@ -522,7 +529,7 @@ def domains_explorer_all():
     except:
         page = 1
 
-    dict_data = Domains.get_domains_up_by_filers(['onion', 'web'], page=page, date_from=date_from, date_to=date_to)
+    dict_data = Domains.get_domains_up_by_filers(Domains.get_all_domains_types(), page=page, date_from=date_from, date_to=date_to)
     return render_template("domain_explorer.html", dict_data=dict_data, bootstrap_label=bootstrap_label, domain_type='all')

@@ -542,6 +549,21 @@ def domains_explorer_onion():
     return render_template("domain_explorer.html", dict_data=dict_data, bootstrap_label=bootstrap_label,
                            domain_type='onion')
 
+@crawler_splash.route('/domains/explorer/i2p', methods=['GET'])
+@login_required
+@login_read_only
+def domains_explorer_i2p():
+    page = request.args.get('page')
+    date_from = request.args.get('date_from')
+    date_to = request.args.get('date_to')
+    try:
+        page = int(page)
+    except:
+        page = 1
+
+    dict_data = Domains.get_domains_up_by_filers(['i2p'], page=page, date_from=date_from, date_to=date_to)
+    return render_template("domain_explorer.html", dict_data=dict_data, bootstrap_label=bootstrap_label,
+                           domain_type='i2p')
 
 @crawler_splash.route('/domains/explorer/web', methods=['GET'])
 @login_required
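
The new explorer route accepts the same paging and date parameters as its onion/web siblings; a quick illustrative call (host, port, and session cookie are placeholders for a local AIL instance):

import requests

resp = requests.get('https://127.0.0.1:7000/domains/explorer/i2p',
                    params={'page': 1, 'date_from': '20240101', 'date_to': '20240131'},
                    cookies={'session': '<session-cookie>'},  # placeholder auth
                    verify=False)  # local self-signed certificate
print(resp.status_code)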
