
Commit 0f3b12e

fix: [i2p crawler] only filter errored I2P pages + fix I2P crawler stats
1 parent 62d2b4a

1 file changed

bin/crawlers/Crawler.py (75 additions, 73 deletions)

@@ -435,88 +435,90 @@ def save_capture_response(self, parent_id, entries):
             dom_hash_id = DomHashs.extract_dom_hash(entries['html'])
 
             # FILTER I2P 'Website Unknown' and 'Website Unreachable'
+            filter_page = False
             if self.domain.id.endswith('.i2p'):
-                if dom_hash_id == '186eff95227efa351e6acfc00a807a7b': # 'Website Unreachable'
+                if dom_hash_id == '186eff95227efa351e6acfc00a807a7b' or dom_hash_id == '58f5624724ece6452bf2fd50975df06a': # 'Website Unreachable'
                     print(title_content.encode())
                     print('I2P Website Unreachable')
-                    return False
+                    filter_page = True
                 elif dom_hash_id == 'd71f204a2ee135a45b1e34deb8377094': # b'Website Unknown'
                     print(title_content.encode())
                     print('Website Unknown - Website Not Found in Addressbook')
-                    return False
+                    filter_page = True
                 elif dom_hash_id == 'a530b30b5921d45f591a0c6a716ffcd9': # 'Website Unreachable'
                     print(title_content.encode())
                     print('Invalid Destination')
-                    return False
-
-
-            # DOM-HASH
-            dom_hash = DomHashs.create(entries['html'], obj_id=dom_hash_id)
-            dom_hash.add(self.date.replace('/', ''), item)
-            dom_hash.add_correlation('domain', '', self.domain.id)
-
-            gzip64encoded = crawlers.get_gzipped_b64_item(item.id, entries['html'])
-            # send item to Global
-            relay_message = f'crawler {gzip64encoded}'
-            self.add_message_to_queue(obj=item, message=relay_message, queue='Importers')
-
-            # Tag # TODO replace me with metadata to tags
-            msg = f'infoleak:submission="crawler"' # TODO FIXME
-            self.add_message_to_queue(obj=item, message=msg, queue='Tags')
-
-            # TODO replace me with metadata to add
-            crawlers.create_item_metadata(item_id, last_url, parent_id)
-            if self.root_item is None:
-                self.root_item = item_id
-            parent_id = item_id
-
-            # TITLE
-            if title_content:
-                title = Titles.create_title(title_content)
-                title.add(item.get_date(), item)
-                # Tracker
-                self.tracker_yara.compute_manual(title)
-                # if not title.is_tags_safe():
-                #     unsafe_tag = 'dark-web:topic="pornography-child-exploitation"'
-                #     self.domain.add_tag(unsafe_tag)
-                #     item.add_tag(unsafe_tag)
-                self.add_message_to_queue(obj=title, message=self.domain.id, queue='Titles')
-
-            # SCREENSHOT
-            if self.screenshot:
-                if 'png' in entries and entries.get('png'):
-                    screenshot = Screenshots.create_screenshot(entries['png'], b64=False)
-                    if screenshot:
-                        if not screenshot.is_tags_safe():
-                            unsafe_tag = 'dark-web:topic="pornography-child-exploitation"'
-                            self.domain.add_tag(unsafe_tag)
-                            item.add_tag(unsafe_tag)
-                        # Remove Placeholder pages # TODO Replace with warning list ???
-                        if screenshot.id not in self.placeholder_screenshots:
-                            # Create Correlations
-                            screenshot.add_correlation('item', '', item_id)
-                            screenshot.add_correlation('domain', '', self.domain.id)
-                            self.add_message_to_queue(obj=screenshot, queue='Images')
-            # HAR
-            if self.har:
-                if 'har' in entries and entries.get('har'):
-                    har_id = crawlers.create_har_id(self.date, item_id)
-                    crawlers.save_har(har_id, entries['har'])
-                    for cookie_name in crawlers.extract_cookies_names_from_har(entries['har']):
-                        print(cookie_name)
-                        cookie = CookiesNames.create(cookie_name)
-                        cookie.add(self.date.replace('/', ''), self.domain)
-                    for etag_content in crawlers.extract_etag_from_har(entries['har']):
-                        print(etag_content)
-                        etag = Etags.create(etag_content)
-                        etag.add(self.date.replace('/', ''), self.domain)
-                    crawlers.extract_hhhash(entries['har'], self.domain.id, self.date.replace('/', ''))
-
-            # FAVICON
-            if entries.get('potential_favicons'):
-                for favicon in entries['potential_favicons']:
-                    fav = Favicons.create(favicon)
-                    fav.add(item.get_date(), item)
+                    filter_page = True
+
+            if not filter_page:
+
+                # DOM-HASH
+                dom_hash = DomHashs.create(entries['html'], obj_id=dom_hash_id)
+                dom_hash.add(self.date.replace('/', ''), item)
+                dom_hash.add_correlation('domain', '', self.domain.id)
+
+                gzip64encoded = crawlers.get_gzipped_b64_item(item.id, entries['html'])
+                # send item to Global
+                relay_message = f'crawler {gzip64encoded}'
+                self.add_message_to_queue(obj=item, message=relay_message, queue='Importers')
+
+                # Tag # TODO replace me with metadata to tags
+                msg = f'infoleak:submission="crawler"' # TODO FIXME
+                self.add_message_to_queue(obj=item, message=msg, queue='Tags')
+
+                # TODO replace me with metadata to add
+                crawlers.create_item_metadata(item_id, last_url, parent_id)
+                if self.root_item is None:
+                    self.root_item = item_id
+                parent_id = item_id
+
+                # TITLE
+                if title_content:
+                    title = Titles.create_title(title_content)
+                    title.add(item.get_date(), item)
+                    # Tracker
+                    self.tracker_yara.compute_manual(title)
+                    # if not title.is_tags_safe():
+                    #     unsafe_tag = 'dark-web:topic="pornography-child-exploitation"'
+                    #     self.domain.add_tag(unsafe_tag)
+                    #     item.add_tag(unsafe_tag)
+                    self.add_message_to_queue(obj=title, message=self.domain.id, queue='Titles')
+
+                # SCREENSHOT
+                if self.screenshot:
+                    if 'png' in entries and entries.get('png'):
+                        screenshot = Screenshots.create_screenshot(entries['png'], b64=False)
+                        if screenshot:
+                            if not screenshot.is_tags_safe():
+                                unsafe_tag = 'dark-web:topic="pornography-child-exploitation"'
+                                self.domain.add_tag(unsafe_tag)
+                                item.add_tag(unsafe_tag)
+                            # Remove Placeholder pages # TODO Replace with warning list ???
+                            if screenshot.id not in self.placeholder_screenshots:
+                                # Create Correlations
+                                screenshot.add_correlation('item', '', item_id)
+                                screenshot.add_correlation('domain', '', self.domain.id)
+                                self.add_message_to_queue(obj=screenshot, queue='Images')
+                # HAR
+                if self.har:
+                    if 'har' in entries and entries.get('har'):
+                        har_id = crawlers.create_har_id(self.date, item_id)
+                        crawlers.save_har(har_id, entries['har'])
+                        for cookie_name in crawlers.extract_cookies_names_from_har(entries['har']):
+                            print(cookie_name)
+                            cookie = CookiesNames.create(cookie_name)
+                            cookie.add(self.date.replace('/', ''), self.domain)
+                        for etag_content in crawlers.extract_etag_from_har(entries['har']):
+                            print(etag_content)
+                            etag = Etags.create(etag_content)
+                            etag.add(self.date.replace('/', ''), self.domain)
+                        crawlers.extract_hhhash(entries['har'], self.domain.id, self.date.replace('/', ''))
+
+                # FAVICON
+                if entries.get('potential_favicons'):
+                    for favicon in entries['potential_favicons']:
+                        fav = Favicons.create(favicon)
+                        fav.add(item.get_date(), item)
 
             # Next Children
             entries_children = entries.get('children')
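
In short, the commit replaces the early `return False` on recognised I2P error pages with a `filter_page` flag (and adds a second 'Website Unreachable' dom-hash, '58f5624724ece6452bf2fd50975df06a'), so only the content-processing block is skipped while the rest of `save_capture_response`, including the children handling that the crawler statistics depend on, still runs. A minimal sketch of that control flow; the helper names (`is_i2p_error_page`, `process_page`, `save_capture_response_sketch`) are illustrative and not part of the AIL code base:

```python
# Minimal, illustrative sketch of the control-flow change in this commit.
# Helper names below are assumptions for the example, not AIL functions.

# dom-hashes of known I2P error/placeholder pages, taken from the diff above
I2P_ERROR_DOM_HASHES = {
    '186eff95227efa351e6acfc00a807a7b',  # 'Website Unreachable'
    '58f5624724ece6452bf2fd50975df06a',  # 'Website Unreachable' (variant added by this commit)
    'd71f204a2ee135a45b1e34deb8377094',  # 'Website Unknown' - not found in addressbook
    'a530b30b5921d45f591a0c6a716ffcd9',  # 'Invalid Destination'
}


def is_i2p_error_page(domain_id: str, dom_hash_id: str) -> bool:
    """Return True when an .i2p capture matches a known error/placeholder page."""
    return domain_id.endswith('.i2p') and dom_hash_id in I2P_ERROR_DOM_HASHES


def process_page(entries: dict) -> None:
    """Stand-in for the dom-hash/title/screenshot/HAR/favicon handling."""
    print('processing', len(entries.get('html', '')), 'bytes of HTML')


def save_capture_response_sketch(domain_id: str, dom_hash_id: str, entries: dict):
    # Old behaviour: `return False` as soon as an error page was recognised,
    # which also skipped the children handling below and broke the stats.
    filter_page = is_i2p_error_page(domain_id, dom_hash_id)

    if not filter_page:
        process_page(entries)

    # New behaviour: always reached, so child URLs (and the crawler
    # statistics derived from them) are still handled for filtered pages.
    return entries.get('children', [])


if __name__ == '__main__':
    entries = {'html': '<html>Website Unreachable</html>', 'children': []}
    print(save_capture_response_sketch('example.i2p', '186eff95227efa351e6acfc00a807a7b', entries))
```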

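The filter keys on dom-hashes produced by `DomHashs.extract_dom_hash`, whose implementation is not part of this diff. As a rough, assumption-laden illustration only: a structure-only hash of this kind can be built by hashing the sequence of tag names instead of the page text, so every copy of the router's placeholder page collapses to the same identifier regardless of which .i2p hostname it mentions. The sketch below is not the AIL implementation.

```python
# Purely illustrative: NOT the AIL DomHashs implementation, just one way to get a
# structure-only hash so that identical placeholder pages share the same id.
import hashlib
from html.parser import HTMLParser


class TagCollector(HTMLParser):
    """Collects the sequence of opening tag names from an HTML document."""

    def __init__(self):
        super().__init__()
        self.tags = []

    def handle_starttag(self, tag, attrs):
        self.tags.append(tag)


def structural_dom_hash(html: str) -> str:
    parser = TagCollector()
    parser.feed(html)
    # Hash only the tag structure: text differences (e.g. which .i2p hostname
    # the error page mentions) do not change the resulting hash.
    return hashlib.md5('|'.join(parser.tags).encode()).hexdigest()


if __name__ == '__main__':
    a = structural_dom_hash('<html><body><h1>Website Unreachable</h1><p>x.i2p</p></body></html>')
    b = structural_dom_hash('<html><body><h1>Website Unreachable</h1><p>y.i2p</p></body></html>')
    print(a == b)  # True: same structure, same hash
```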