@@ -375,8 +375,8 @@ def compute(self, capture):
375375 print ('task: ' , task .uuid , 'completed' )
376376 print ()
377377 else :
378- print ('capture:' , capture .uuid , 'Unsafe Content Filtered' )
379- print ('task: ' , task .uuid , 'Unsafe Content Filtered' )
378+ print ('capture:' , capture .uuid , 'Unsafe Content Filtered or error ' )
379+ print ('task: ' , task .uuid , 'Unsafe Content Filtered or error ' )
380380 print ()
381381
382382 # onion/i2p messages correlation
@@ -421,6 +421,37 @@ def save_capture_response(self, parent_id, entries):
421421 item = Item (item_id )
422422 print (item .id )
423423
424+ # TITLE
425+ signal .alarm (60 )
426+ try :
427+ title_content = crawlers .extract_title_from_html (entries ['html' ])
428+ except TimeoutException :
429+ self .logger .warning (f'BeautifulSoup HTML parser timeout: { item_id } ' )
430+ title_content = None
431+ else :
432+ signal .alarm (0 )
433+
434+ # DOM-HASH
435+ dom_hash = DomHashs .create (entries ['html' ])
436+
437+ # FILTER I2P 'Website Unknown' and 'Website Unreachable'
438+ if self .domain .id .endswith ('.i2p' ):
439+ if dom_hash == '186eff95227efa351e6acfc00a807a7b' and title_content == 'Website Unreachable' :
440+ print ('I2P Website Unreachable' )
441+ return False
442+ if dom_hash == 'd71f204a2ee135a45b1e34deb8377094' and title_content == 'Website Unknown' :
443+ print ('Website Unknown - Website Not Found in Addressbook' )
444+ return False
445+
446+ # DOM-HASH
447+ dom_hash .add (self .date .replace ('/' , '' ), item )
448+ dom_hash .add_correlation ('domain' , '' , self .domain .id )
449+
450+ if self .domain .id .endswith ('.i2p' ):
451+
452+ if title_content == 'Website Unknown' or title_content == 'Website Unreachable' :
453+ return False
454+
424455 gzip64encoded = crawlers .get_gzipped_b64_item (item .id , entries ['html' ])
425456 # send item to Global
426457 relay_message = f'crawler { gzip64encoded } '
@@ -436,21 +467,7 @@ def save_capture_response(self, parent_id, entries):
436467 self .root_item = item_id
437468 parent_id = item_id
438469
439- # DOM-HASH
440- dom_hash = DomHashs .create (entries ['html' ])
441- dom_hash .add (self .date .replace ('/' , '' ), item )
442- dom_hash .add_correlation ('domain' , '' , self .domain .id )
443-
444470 # TITLE
445- signal .alarm (60 )
446- try :
447- title_content = crawlers .extract_title_from_html (entries ['html' ])
448- except TimeoutException :
449- self .logger .warning (f'BeautifulSoup HTML parser timeout: { item_id } ' )
450- title_content = None
451- else :
452- signal .alarm (0 )
453-
454471 if title_content :
455472 title = Titles .create_title (title_content )
456473 title .add (item .get_date (), item )
0 commit comments