Commit 180ca15

chg: [i2p crawler] filter I2P Website Unreachable and Website Unknown - Website Not Found in Addressbook domains
1 parent: c1d0c6a

File tree: 1 file changed (+32, -16 lines)
bin/crawlers/Crawler.py

Lines changed: 32 additions & 16 deletions
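In short, the commit hoists title extraction and dom-hash computation to the top of save_capture_response() so that boilerplate I2P error pages can be dropped before a capture is indexed, correlated, or gzipped and relayed to the Global queue. A condensed sketch of the new check follows; the helper name and the pair table are paraphrased from the diff and are not part of AIL's API:

# Hypothetical helper condensing the filter added by this commit.
# The two dom-hash values are copied verbatim from the diff; they fingerprint
# the I2P router's boilerplate error pages by DOM structure.
I2P_ERROR_PAGES = {
    ('186eff95227efa351e6acfc00a807a7b', 'Website Unreachable'),
    ('d71f204a2ee135a45b1e34deb8377094', 'Website Unknown'),
}

def is_i2p_error_page(domain, dom_hash, title):
    if not domain.endswith('.i2p'):
        return False
    if (dom_hash, title) in I2P_ERROR_PAGES:
        return True
    # the commit also drops .i2p captures on title alone
    return title in ('Website Unknown', 'Website Unreachable')

When a page matches, save_capture_response() returns False and the capture goes no further.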
@@ -375,8 +375,8 @@ def compute(self, capture):
                 print('task: ', task.uuid, 'completed')
                 print()
             else:
-                print('capture:', capture.uuid, 'Unsafe Content Filtered')
-                print('task: ', task.uuid, 'Unsafe Content Filtered')
+                print('capture:', capture.uuid, 'Unsafe Content Filtered or error')
+                print('task: ', task.uuid, 'Unsafe Content Filtered or error')
                 print()

         # onion/i2p messages correlation
@@ -421,6 +421,36 @@ def save_capture_response(self, parent_id, entries):
             item = Item(item_id)
             print(item.id)

+            # TITLE
+            signal.alarm(60)
+            try:
+                title_content = crawlers.extract_title_from_html(entries['html'])
+            except TimeoutException:
+                self.logger.warning(f'BeautifulSoup HTML parser timeout: {item_id}')
+                title_content = None
+            else:
+                signal.alarm(0)
+
+            # DOM-HASH
+            dom_hash = DomHashs.create(entries['html'])
+
+            # FILTER I2P 'Website Unknown' and 'Website Unreachable'
+            if self.domain.id.endswith('.i2p'):
+                if dom_hash == '186eff95227efa351e6acfc00a807a7b' and title_content == 'Website Unreachable':
+                    print('I2P Website Unreachable')
+                    return False
+                if dom_hash == 'd71f204a2ee135a45b1e34deb8377094' and title_content == 'Website Unknown':
+                    print('Website Unknown - Website Not Found in Addressbook')
+                    return False
+
+            # DOM-HASH
+            dom_hash.add(self.date.replace('/', ''), item)
+            dom_hash.add_correlation('domain', '', self.domain.id)
+
+            if self.domain.id.endswith('.i2p'):
+                if title_content == 'Website Unknown' or title_content == 'Website Unreachable':
+                    return False
+
             gzip64encoded = crawlers.get_gzipped_b64_item(item.id, entries['html'])
             # send item to Global
             relay_message = f'crawler {gzip64encoded}'
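The relocated TITLE block wraps the BeautifulSoup parse in a SIGALRM timeout so a pathological page cannot hang the crawler. Below is a minimal, self-contained version of that pattern (Unix-only); in AIL, TimeoutException, the signal handler, and extract_title_from_html are defined elsewhere, so the stand-ins here are assumptions:

import signal
from bs4 import BeautifulSoup

class TimeoutException(Exception):
    pass

def _timeout_handler(signum, frame):
    raise TimeoutException()

# route SIGALRM to the handler (AIL registers its own equivalent)
signal.signal(signal.SIGALRM, _timeout_handler)

def extract_title_from_html(html):
    # stand-in for crawlers.extract_title_from_html
    soup = BeautifulSoup(html, 'html.parser')
    return str(soup.title.string) if soup.title and soup.title.string else None

html = '<html><head><title>Website Unreachable</title></head></html>'
signal.alarm(60)                # arm: SIGALRM fires in 60 seconds
try:
    title_content = extract_title_from_html(html)
except TimeoutException:
    title_content = None        # parser exceeded its budget
else:
    signal.alarm(0)             # finished in time: disarm the pending alarm

As in the diff, the alarm is only disarmed on the success path; any other exception would leave the pending SIGALRM armed.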
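Captures that survive the filter are then compressed and relayed. Judging only by its name and the f'crawler {gzip64encoded}' message, crawlers.get_gzipped_b64_item plausibly gzips the HTML and base64-encodes it for the text-based relay; this guessed equivalent is not AIL's actual implementation:

import base64
import gzip

def get_gzipped_b64_item(item_id, content):
    # item_id is presumably used for error reporting (assumption); the HTML
    # is gzipped, then base64-encoded so it survives the text relay message
    return base64.standard_b64encode(gzip.compress(content.encode())).decode()

relay_message = f"crawler {get_gzipped_b64_item('some_item_id', '<html></html>')}"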
@@ -436,21 +466,7 @@ def save_capture_response(self, parent_id, entries):
             self.root_item = item_id
             parent_id = item_id

-            # DOM-HASH
-            dom_hash = DomHashs.create(entries['html'])
-            dom_hash.add(self.date.replace('/', ''), item)
-            dom_hash.add_correlation('domain', '', self.domain.id)
-
             # TITLE
-            signal.alarm(60)
-            try:
-                title_content = crawlers.extract_title_from_html(entries['html'])
-            except TimeoutException:
-                self.logger.warning(f'BeautifulSoup HTML parser timeout: {item_id}')
-                title_content = None
-            else:
-                signal.alarm(0)
-
             if title_content:
                 title = Titles.create_title(title_content)
                 title.add(item.get_date(), item)
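The filter keys on a dom-hash plus title rather than on raw HTML because every copy of an I2P router error page renders from the same template while its text and links vary. The 32-hex-character values suggest an MD5 over some normalized DOM structure; the recipe below is one plausible illustration, not AIL's actual DomHashs implementation:

import hashlib
from bs4 import BeautifulSoup

def dom_hash(html):
    # hash only the sequence of tag names, ignoring text and attributes,
    # so template-identical pages collapse to a single value
    tags = [tag.name for tag in BeautifulSoup(html, 'html.parser').find_all(True)]
    return hashlib.md5('|'.join(tags).encode()).hexdigest()

Pairing the hash with the title keeps the two listed matches strict; the final title-only check then accepts a trade-off, dropping any .i2p capture titled 'Website Unknown' or 'Website Unreachable' even when its dom-hash differs.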
