Skip to content

Commit 1666e4f

Browse files
committed
fix: [crawler] add domhash extraction timeout on malformed html
1 parent 833f743 commit 1666e4f

File tree

1 file changed

+17
-11
lines changed

1 file changed

+17
-11
lines changed

bin/crawlers/Crawler.py

Lines changed: 17 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -435,29 +435,35 @@ def save_capture_response(self, parent_id, entries):
435435
item = Item(item_id)
436436
print(item.id)
437437

438+
is_valid_html = True
439+
438440
# TITLE
439441
signal.alarm(60)
440442
try:
441443
title_content = crawlers.extract_title_from_html(entries['html'])
442444
except TimeoutException:
443445
self.logger.warning(f'BeautifulSoup HTML parser timeout: {item_id}')
444446
title_content = None
447+
is_valid_html = False
445448
else:
446449
signal.alarm(0)
447450

448451
# DOM-HASH ID
449-
signal.alarm(60)
450-
try:
451-
dom_hash_id = DomHashs.extract_dom_hash(entries['html'])
452-
except TimeoutException:
453-
self.logger.warning(f'BeautifulSoup HTML parser for domhash timeout: {item_id}')
454-
dom_hash_id = None
455-
except ValueError as e:
456-
signal.alarm(0)
457-
self.logger.warning(f'BeautifulSoup HTML invalid: {str(e)} {item_id}')
458-
dom_hash_id = None
452+
if is_valid_html:
453+
signal.alarm(60)
454+
try:
455+
dom_hash_id = DomHashs.extract_dom_hash(entries['html'])
456+
except TimeoutException:
457+
self.logger.warning(f'BeautifulSoup HTML parser for domhash timeout: {item_id}')
458+
dom_hash_id = None
459+
except ValueError as e:
460+
signal.alarm(0)
461+
self.logger.warning(f'BeautifulSoup HTML invalid: {str(e)} {item_id}')
462+
dom_hash_id = None
463+
else:
464+
signal.alarm(0)
459465
else:
460-
signal.alarm(0)
466+
dom_hash_id = None
461467

462468
# FILTER I2P 'Website Unknown' and 'Website Unreachable'
463469
if self.domain.id.endswith('.i2p') and dom_hash_id:

0 commit comments

Comments
 (0)