@@ -435,88 +435,90 @@ def save_capture_response(self, parent_id, entries):
         dom_hash_id = DomHashs.extract_dom_hash(entries['html'])
 
         # FILTER I2P 'Website Unknown' and 'Website Unreachable'
+        filter_page = False
         if self.domain.id.endswith('.i2p'):
-            if dom_hash_id == '186eff95227efa351e6acfc00a807a7b':  # 'Website Unreachable'
+            if dom_hash_id == '186eff95227efa351e6acfc00a807a7b' or dom_hash_id == '58f5624724ece6452bf2fd50975df06a':  # 'Website Unreachable'
                 print(title_content.encode())
                 print('I2P Website Unreachable')
-                return False
+                filter_page = True
             elif dom_hash_id == 'd71f204a2ee135a45b1e34deb8377094':  # b'Website Unknown'
                 print(title_content.encode())
                 print('Website Unknown - Website Not Found in Addressbook')
-                return False
+                filter_page = True
             elif dom_hash_id == 'a530b30b5921d45f591a0c6a716ffcd9':  # 'Website Unreachable'
                 print(title_content.encode())
                 print('Invalid Destination')
-                return False
-
-
-        # DOM-HASH
-        dom_hash = DomHashs.create(entries['html'], obj_id=dom_hash_id)
-        dom_hash.add(self.date.replace('/', ''), item)
-        dom_hash.add_correlation('domain', '', self.domain.id)
-
-        gzip64encoded = crawlers.get_gzipped_b64_item(item.id, entries['html'])
-        # send item to Global
-        relay_message = f'crawler {gzip64encoded}'
-        self.add_message_to_queue(obj=item, message=relay_message, queue='Importers')
-
-        # Tag  # TODO replace me with metadata to tags
-        msg = f'infoleak:submission="crawler"'  # TODO FIXME
-        self.add_message_to_queue(obj=item, message=msg, queue='Tags')
-
-        # TODO replace me with metadata to add
-        crawlers.create_item_metadata(item_id, last_url, parent_id)
-        if self.root_item is None:
-            self.root_item = item_id
-        parent_id = item_id
-
-        # TITLE
-        if title_content:
-            title = Titles.create_title(title_content)
-            title.add(item.get_date(), item)
-            # Tracker
-            self.tracker_yara.compute_manual(title)
-            # if not title.is_tags_safe():
-            #     unsafe_tag = 'dark-web:topic="pornography-child-exploitation"'
-            #     self.domain.add_tag(unsafe_tag)
-            #     item.add_tag(unsafe_tag)
-            self.add_message_to_queue(obj=title, message=self.domain.id, queue='Titles')
-
-        # SCREENSHOT
-        if self.screenshot:
-            if 'png' in entries and entries.get('png'):
-                screenshot = Screenshots.create_screenshot(entries['png'], b64=False)
-                if screenshot:
-                    if not screenshot.is_tags_safe():
-                        unsafe_tag = 'dark-web:topic="pornography-child-exploitation"'
-                        self.domain.add_tag(unsafe_tag)
-                        item.add_tag(unsafe_tag)
-                    # Remove Placeholder pages  # TODO Replace with warning list ???
-                    if screenshot.id not in self.placeholder_screenshots:
-                        # Create Correlations
-                        screenshot.add_correlation('item', '', item_id)
-                        screenshot.add_correlation('domain', '', self.domain.id)
-                        self.add_message_to_queue(obj=screenshot, queue='Images')
-        # HAR
-        if self.har:
-            if 'har' in entries and entries.get('har'):
-                har_id = crawlers.create_har_id(self.date, item_id)
-                crawlers.save_har(har_id, entries['har'])
-                for cookie_name in crawlers.extract_cookies_names_from_har(entries['har']):
-                    print(cookie_name)
-                    cookie = CookiesNames.create(cookie_name)
-                    cookie.add(self.date.replace('/', ''), self.domain)
-                for etag_content in crawlers.extract_etag_from_har(entries['har']):
-                    print(etag_content)
-                    etag = Etags.create(etag_content)
-                    etag.add(self.date.replace('/', ''), self.domain)
-                crawlers.extract_hhhash(entries['har'], self.domain.id, self.date.replace('/', ''))
-
-        # FAVICON
-        if entries.get('potential_favicons'):
-            for favicon in entries['potential_favicons']:
-                fav = Favicons.create(favicon)
-                fav.add(item.get_date(), item)
+                filter_page = True
+
+        if not filter_page:
+
+            # DOM-HASH
+            dom_hash = DomHashs.create(entries['html'], obj_id=dom_hash_id)
+            dom_hash.add(self.date.replace('/', ''), item)
+            dom_hash.add_correlation('domain', '', self.domain.id)
+
+            gzip64encoded = crawlers.get_gzipped_b64_item(item.id, entries['html'])
+            # send item to Global
+            relay_message = f'crawler {gzip64encoded}'
+            self.add_message_to_queue(obj=item, message=relay_message, queue='Importers')
+
+            # Tag  # TODO replace me with metadata to tags
+            msg = f'infoleak:submission="crawler"'  # TODO FIXME
+            self.add_message_to_queue(obj=item, message=msg, queue='Tags')
+
+            # TODO replace me with metadata to add
+            crawlers.create_item_metadata(item_id, last_url, parent_id)
+            if self.root_item is None:
+                self.root_item = item_id
+            parent_id = item_id
+
+            # TITLE
+            if title_content:
+                title = Titles.create_title(title_content)
+                title.add(item.get_date(), item)
+                # Tracker
+                self.tracker_yara.compute_manual(title)
+                # if not title.is_tags_safe():
+                #     unsafe_tag = 'dark-web:topic="pornography-child-exploitation"'
+                #     self.domain.add_tag(unsafe_tag)
+                #     item.add_tag(unsafe_tag)
+                self.add_message_to_queue(obj=title, message=self.domain.id, queue='Titles')
+
+            # SCREENSHOT
+            if self.screenshot:
+                if 'png' in entries and entries.get('png'):
+                    screenshot = Screenshots.create_screenshot(entries['png'], b64=False)
+                    if screenshot:
+                        if not screenshot.is_tags_safe():
+                            unsafe_tag = 'dark-web:topic="pornography-child-exploitation"'
+                            self.domain.add_tag(unsafe_tag)
+                            item.add_tag(unsafe_tag)
+                        # Remove Placeholder pages  # TODO Replace with warning list ???
+                        if screenshot.id not in self.placeholder_screenshots:
+                            # Create Correlations
+                            screenshot.add_correlation('item', '', item_id)
+                            screenshot.add_correlation('domain', '', self.domain.id)
+                            self.add_message_to_queue(obj=screenshot, queue='Images')
+            # HAR
+            if self.har:
+                if 'har' in entries and entries.get('har'):
+                    har_id = crawlers.create_har_id(self.date, item_id)
+                    crawlers.save_har(har_id, entries['har'])
+                    for cookie_name in crawlers.extract_cookies_names_from_har(entries['har']):
+                        print(cookie_name)
+                        cookie = CookiesNames.create(cookie_name)
+                        cookie.add(self.date.replace('/', ''), self.domain)
+                    for etag_content in crawlers.extract_etag_from_har(entries['har']):
+                        print(etag_content)
+                        etag = Etags.create(etag_content)
+                        etag.add(self.date.replace('/', ''), self.domain)
+                    crawlers.extract_hhhash(entries['har'], self.domain.id, self.date.replace('/', ''))
+
+            # FAVICON
+            if entries.get('potential_favicons'):
+                for favicon in entries['potential_favicons']:
+                    fav = Favicons.create(favicon)
+                    fav.add(item.get_date(), item)
 
         # Next Children
         entries_children = entries.get('children')
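
For context on the filter above: the check keys on the page's dom-hash, a fingerprint of the DOM structure, rather than on the title or body text, so reworded or localized I2P error pages still collapse onto a handful of known hashes. Below is a minimal standalone sketch of the idea, assuming a simplified dom-hash (MD5 over the ordered tag names of the parsed HTML; the real DomHashs.extract_dom_hash recipe is not shown in this hunk and may differ) and a hypothetical is_i2p_placeholder() helper:

    # Simplified sketch, NOT the AIL implementation.
    import hashlib
    from bs4 import BeautifulSoup

    # Known placeholder hashes, taken from the diff above.
    I2P_PLACEHOLDER_HASHES = {
        '186eff95227efa351e6acfc00a807a7b',  # 'Website Unreachable'
        '58f5624724ece6452bf2fd50975df06a',  # 'Website Unreachable' (variant)
        'd71f204a2ee135a45b1e34deb8377094',  # 'Website Unknown'
        'a530b30b5921d45f591a0c6a716ffcd9',  # 'Invalid Destination'
    }

    def extract_dom_hash(html):
        # Hash the tag structure, not the text: boilerplate error pages
        # then map to a stable fingerprint regardless of wording.
        soup = BeautifulSoup(html, 'html.parser')
        tags = ','.join(tag.name for tag in soup.find_all(True))
        return hashlib.md5(tags.encode()).hexdigest()

    def is_i2p_placeholder(domain_id, html):
        return domain_id.endswith('.i2p') and extract_dom_hash(html) in I2P_PLACEHOLDER_HASHES

Note the behavioral change the commit makes: the old code returned False as soon as a placeholder was detected, which also skipped the 'Next Children' handling at the end of the hunk; the new filter_page flag only skips the per-page processing, so child captures are still followed.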
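
The HAR handling calls crawlers.extract_cookies_names_from_har(), whose body is also outside this hunk. Since HAR is plain JSON (HAR 1.2: log.entries[].request/response.cookies[].name), a hedged sketch of what such an extractor can look like:

    # Simplified sketch, NOT the AIL implementation.
    import json

    def extract_cookies_names_from_har(har):
        # Accept either an already-parsed dict or a raw JSON string.
        if not isinstance(har, dict):
            har = json.loads(har)
        names = set()
        for entry in har.get('log', {}).get('entries', []):
            for side in ('request', 'response'):
                for cookie in entry.get(side, {}).get('cookies', []):
                    if cookie.get('name'):
                        names.add(cookie['name'])
        return names

The same walk generalizes to the ETag extraction a few lines later: iterate entries[].response.headers and collect the values whose header name equals 'etag' (case-insensitively).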