Use r.jina.ai to parse dynamic, JavaScript-rendered web pages

polyrabbit · polyrabbit · commit 35050b398e37 · 2024-05-13T23:13:58.000+08:00
diff --git a/hacker_news/llm/openai.py b/hacker_news/llm/openai.py
@@ -130,8 +130,7 @@ def call_openai_family(content: str, sys_prompt: str) -> str:
 
 
 def summarize_by_openai_family(content: str) -> str:
-    return call_openai_family(content, "You are a helpful summarizer. Please think step by step and use third person mood to summarize all user's input in 2 short English sentences. "
-                                       "Ensure the summary does not exceed 100 characters. Provide response in plain text format without any Markdown formatting.")
+    return call_openai_family(content, "You are a helpful summarizer. Please think step by step to summarize all user's input in 2 concise English sentences. Ensure the summary does not exceed 100 characters. Provide response in plain text format without any Markdown formatting.")
 
 
 def translate_by_openai_family(content: str, lang: str) -> str:
diff --git a/page_content_extractor/__init__.py b/page_content_extractor/__init__.py
@@ -15,14 +15,17 @@
 
 
 # dispatcher
-def parser_factory(url):
+def parser_factory(url, use_jina=False):
     """
         Returns the extracted object, which should have at least two
         methods `get_content` and `get_illustration`
     """
     if not url.startswith('http'):
         url = 'http://' + url
-    resp = session.get(url)
+    headers = None
+    if use_jina:
+        headers = {'x-respond-with': 'html'}
+    resp = session.get(url, headers=headers)
     # Some sites like science.org forbid us by responding 403, but still have meta description tags, so donot raise here
     # resp.raise_for_status()
 
@@ -43,6 +46,13 @@ def parser_factory(url):
             logger.exception('Failed to parse this pdf file, %s', resp.url)
     elif ct.startswith('text') or 'html' in ct or 'xml' in ct or 'charset' in ct:
         logger.info('Get an %s to parse', ct)
-        return HtmlContentExtractor(resp.text, resp.url)
+        p = HtmlContentExtractor(resp.text, resp.url)
+        if not use_jina and p.is_empty():
+            logger.info('%s is empty? switch to jina', resp.url)
+            try:
+                return parser_factory('https://r.jina.ai/'+url, use_jina=True)
+            except Exception as e:
+                logger.warning('jina %s throws an error: %s', 'https://r.jina.ai/'+url, e)
+        return p
 
     raise TypeError(f'I have no idea how the {ct} is formatted')
diff --git a/page_content_extractor/html.py b/page_content_extractor/html.py
@@ -69,6 +69,9 @@ def __init__(self, html, url=''):
         # self.clean_up_html()
         self.relative_path2_abs_url()
 
+    def is_empty(self):
+        return not self.article.get_text(separator='', strip=True)
+
     # def __del__(self):
     #     # TODO won't call
     #     logger.info('calc_effective_text_len: %s, parents_of_article_header: %s, calc_img_area_len: %s',
diff --git a/templates/base.html b/templates/base.html
@@ -208,7 +208,7 @@ <h3 id="{{ news.slug() }}">
                 {% endif %}
                 {% if news.summary %}
                     <div class="{% if news.summary.startswith('<iframe') %}embed-responsive embed-responsive-16by9 {% else %}summary-text{% endif %}"
-                        {% if news.summarized_by.value == 'OpenAI' and news.summary|translate(lang)|length > config.summary_size %}
+                        {% if news.summarized_by.is_finally() and news.summary|translate(lang)|length > config.summary_size %}
                          title="{{ news.summary|translate(lang)|trim }}"
                         {% endif %}>
                         {% set summary = news.summary|translate(lang) -%}
diff --git a/test/test_html_parser.py b/test/test_html_parser.py
@@ -239,6 +239,11 @@ def test_link_intensive_wikipedia(self):
         content = parser.get_content()
         self.assertTrue(content.startswith('Google Sidewiki was a web annotation tool from Google'))
 
+    def test_dynamic_js_page(self):
+        parser = parser_factory('https://www.science.org/content/article/u-s-wants-change-how-researchers-get-access-huge-trove-health-data-many-don-t-idea')
+        content = parser.get_content()
+        self.assertTrue(content.startswith('Health researchers'))
+
     def test_longer_meta_description(self):
         html_doc = """
         <meta property="og:description" content="aaaa" />