From 35050b398e3752bf9369d68055c2bd27735064ba Mon Sep 17 00:00:00 2001
From: Changxin Miao <miaochangxin@step.ai>
Date: Sun, 12 May 2024 00:21:42 +0800
Subject: [PATCH] Use r.jina.ai to parse dynamic js webpages

---
 hacker_news/llm/openai.py          |  3 +--
 page_content_extractor/__init__.py | 16 +++++++++++++---
 page_content_extractor/html.py     |  3 +++
 templates/base.html                |  2 +-
 test/test_html_parser.py           |  5 +++++
 5 files changed, 23 insertions(+), 6 deletions(-)
diff --git a/hacker_news/llm/openai.py b/hacker_news/llm/openai.py
index 8a1c02f..296015d 100644
--- a/hacker_news/llm/openai.py
+++ b/hacker_news/llm/openai.py
@@ -130,8 +130,7 @@ def call_openai_family(content: str, sys_prompt: str) -> str:
 
 
 def summarize_by_openai_family(content: str) -> str:
-    return call_openai_family(content, "You are a helpful summarizer. Please think step by step and use third person mood to summarize all user's input in 2 short English sentences. "
-                                       "Ensure the summary does not exceed 100 characters. Provide response in plain text format without any Markdown formatting.")
+    return call_openai_family(content, "You are a helpful summarizer. Please think step by step to summarize all user's input in 2 concise English sentences. Ensure the summary does not exceed 100 characters. Provide response in plain text format without any Markdown formatting.")
 
 
 def translate_by_openai_family(content: str, lang: str) -> str:
diff --git a/page_content_extractor/__init__.py b/page_content_extractor/__init__.py
index ca1e8ae..969353c 100644
--- a/page_content_extractor/__init__.py
+++ b/page_content_extractor/__init__.py
@@ -15,14 +15,17 @@
 
 
 # dispatcher
-def parser_factory(url):
+def parser_factory(url, use_jina=False):
     """
         Returns the extracted object, which should have at least two
         methods `get_content` and `get_illustration`
     """
     if not url.startswith('http'):
         url = 'http://' + url
-    resp = session.get(url)
+    headers = None
+    if use_jina:
+        headers = {'x-respond-with': 'html'}
+    resp = session.get(url, headers=headers)
     # Some sites like science.org forbid us by responding 403, but still have meta description tags, so donot raise here
     # resp.raise_for_status()
 
@@ -43,6 +46,13 @@ def parser_factory(url):
             logger.exception('Failed to parse this pdf file, %s', resp.url)
     elif ct.startswith('text') or 'html' in ct or 'xml' in ct or 'charset' in ct:
         logger.info('Get an %s to parse', ct)
-        return HtmlContentExtractor(resp.text, resp.url)
+        p = HtmlContentExtractor(resp.text, resp.url)
+        if not use_jina and p.is_empty():
+            logger.info('%s is empty? switch to jina', resp.url)
+            try:
+                return parser_factory('https://r.jina.ai/'+url, use_jina=True)
+            except Exception as e:
+                logger.warning('jina %s throws an error: %s', 'https://r.jina.ai/'+url, e)
+        return p
 
     raise TypeError(f'I have no idea how the {ct} is formatted')
diff --git a/page_content_extractor/html.py b/page_content_extractor/html.py
index 6fdb6d2..796dd73 100644
--- a/page_content_extractor/html.py
+++ b/page_content_extractor/html.py
@@ -69,6 +69,9 @@ def __init__(self, html, url=''):
         # self.clean_up_html()
         self.relative_path2_abs_url()
 
+    def is_empty(self):  # True when self.article.get_text() comes back as an empty string
+        return not self.article.get_text(separator='', strip=True)
+
     # def __del__(self):
     #     # TODO won't call
     #     logger.info('calc_effective_text_len: %s, parents_of_article_header: %s, calc_img_area_len: %s',
diff --git a/templates/base.html b/templates/base.html
index 56c1d89..25c1ec5 100644
--- a/templates/base.html
+++ b/templates/base.html
@@ -208,7 +208,7 @@ <h3 id="{{ news.slug() }}">
                 {% endif %}
                 {% if news.summary %}
                     <div class="{% if news.summary.startswith('<iframe') %}embed-responsive embed-responsive-16by9 {% else %}summary-text{% endif %}"
-                        {% if news.summarized_by.value == 'OpenAI' and news.summary|translate(lang)|length > config.summary_size %}
+                        {% if news.summarized_by.is_finally() and news.summary|translate(lang)|length > config.summary_size %}
                          title="{{ news.summary|translate(lang)|trim }}"
                         {% endif %}>
                         {% set summary = news.summary|translate(lang) -%}
diff --git a/test/test_html_parser.py b/test/test_html_parser.py
index a239d4d..cdc337c 100644
--- a/test/test_html_parser.py
+++ b/test/test_html_parser.py
@@ -239,6 +239,11 @@ def test_link_intensive_wikipedia(self):
         content = parser.get_content()
         self.assertTrue(content.startswith('Google Sidewiki was a web annotation tool from Google'))
 
+    def test_dynamic_js_page(self):  # NOTE(review): parser_factory calls session.get, so this appears to need live network access and stable remote content - confirm
+        parser = parser_factory('https://www.science.org/content/article/u-s-wants-change-how-researchers-get-access-huge-trove-health-data-many-don-t-idea')
+        content = parser.get_content()
+        self.assertTrue(content.startswith('Health researchers'))
+
     def test_longer_meta_description(self):
         html_doc = """
         <meta property="og:description" content="aaaa" />