Skip to content

Commit 35050b3

Browse files
committed
Use r.jina.ai to parse dynamic js webpages
1 parent a8ca8c6 commit 35050b3

File tree

5 files changed: +23 −6 lines changed

hacker_news/llm/openai.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -130,8 +130,7 @@ def call_openai_family(content: str, sys_prompt: str) -> str:
130130

131131

132132
def summarize_by_openai_family(content: str) -> str:
133-
return call_openai_family(content, "You are a helpful summarizer. Please think step by step and use third person mood to summarize all user's input in 2 short English sentences. "
134-
"Ensure the summary does not exceed 100 characters. Provide response in plain text format without any Markdown formatting.")
133+
return call_openai_family(content, "You are a helpful summarizer. Please think step by step to summarize all user's input in 2 concise English sentences. Ensure the summary does not exceed 100 characters. Provide response in plain text format without any Markdown formatting.")
135134

136135

137136
def translate_by_openai_family(content: str, lang: str) -> str:

page_content_extractor/__init__.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,17 @@
1515

1616

1717
# dispatcher
18-
def parser_factory(url):
18+
def parser_factory(url, use_jina=False):
1919
"""
2020
Returns the extracted object, which should have at least two
2121
methods `get_content` and `get_illustration`
2222
"""
2323
if not url.startswith('http'):
2424
url = 'http://' + url
25-
resp = session.get(url)
25+
headers = None
26+
if use_jina:
27+
headers = {'x-respond-with': 'html'}
28+
resp = session.get(url, headers=headers)
2629
# Some sites like science.org forbid us by responding 403, but still have meta description tags, so do not raise here
2730
# resp.raise_for_status()
2831

@@ -43,6 +46,13 @@ def parser_factory(url):
4346
logger.exception('Failed to parse this pdf file, %s', resp.url)
4447
elif ct.startswith('text') or 'html' in ct or 'xml' in ct or 'charset' in ct:
4548
logger.info('Get an %s to parse', ct)
46-
return HtmlContentExtractor(resp.text, resp.url)
49+
p = HtmlContentExtractor(resp.text, resp.url)
50+
if not use_jina and p.is_empty():
51+
logger.info('%s is empty? switch to jina', resp.url)
52+
try:
53+
return parser_factory('https://r.jina.ai/'+url, use_jina=True)
54+
except Exception as e:
55+
logger.warning('jina %s throws an error: %s', 'https://r.jina.ai/'+url, e)
56+
return p
4757

4858
raise TypeError(f'I have no idea how the {ct} is formatted')

page_content_extractor/html.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,9 @@ def __init__(self, html, url=''):
6969
# self.clean_up_html()
7070
self.relative_path2_abs_url()
7171

72+
def is_empty(self):
73+
return not self.article.get_text(separator='', strip=True)
74+
7275
# def __del__(self):
7376
# # TODO won't call
7477
# logger.info('calc_effective_text_len: %s, parents_of_article_header: %s, calc_img_area_len: %s',

templates/base.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ <h3 id="{{ news.slug() }}">
208208
{% endif %}
209209
{% if news.summary %}
210210
<div class="{% if news.summary.startswith('<iframe') %}embed-responsive embed-responsive-16by9 {% else %}summary-text{% endif %}"
211-
{% if news.summarized_by.value == 'OpenAI' and news.summary|translate(lang)|length > config.summary_size %}
211+
{% if news.summarized_by.is_finally() and news.summary|translate(lang)|length > config.summary_size %}
212212
title="{{ news.summary|translate(lang)|trim }}"
213213
{% endif %}>
214214
{% set summary = news.summary|translate(lang) -%}

test/test_html_parser.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,11 @@ def test_link_intensive_wikipedia(self):
239239
content = parser.get_content()
240240
self.assertTrue(content.startswith('Google Sidewiki was a web annotation tool from Google'))
241241

242+
def test_dynamic_js_page(self):
243+
parser = parser_factory('https://www.science.org/content/article/u-s-wants-change-how-researchers-get-access-huge-trove-health-data-many-don-t-idea')
244+
content = parser.get_content()
245+
self.assertTrue(content.startswith('Health researchers'))
246+
242247
def test_longer_meta_description(self):
243248
html_doc = """
244249
<meta property="og:description" content="aaaa" />

0 commit comments

Comments
 (0)