From 35050b398e3752bf9369d68055c2bd27735064ba Mon Sep 17 00:00:00 2001 From: Changxin Miao Date: Sun, 12 May 2024 00:21:42 +0800 Subject: [PATCH] Use r.jina.ai to parse dynamic js webpages --- hacker_news/llm/openai.py | 3 +-- page_content_extractor/__init__.py | 16 +++++++++++++--- page_content_extractor/html.py | 3 +++ templates/base.html | 2 +- test/test_html_parser.py | 5 +++++ 5 files changed, 23 insertions(+), 6 deletions(-) diff --git a/hacker_news/llm/openai.py b/hacker_news/llm/openai.py index 8a1c02f..296015d 100644 --- a/hacker_news/llm/openai.py +++ b/hacker_news/llm/openai.py @@ -130,8 +130,7 @@ def call_openai_family(content: str, sys_prompt: str) -> str: def summarize_by_openai_family(content: str) -> str: - return call_openai_family(content, "You are a helpful summarizer. Please think step by step and use third person mood to summarize all user's input in 2 short English sentences. " - "Ensure the summary does not exceed 100 characters. Provide response in plain text format without any Markdown formatting.") + return call_openai_family(content, "You are a helpful summarizer. Please think step by step to summarize all user's input in 2 concise English sentences. Ensure the summary does not exceed 100 characters. Provide response in plain text format without any Markdown formatting.") def translate_by_openai_family(content: str, lang: str) -> str: diff --git a/page_content_extractor/__init__.py b/page_content_extractor/__init__.py index ca1e8ae..969353c 100644 --- a/page_content_extractor/__init__.py +++ b/page_content_extractor/__init__.py @@ -15,14 +15,17 @@ # dispatcher -def parser_factory(url): +def parser_factory(url, use_jina=False): """ Returns the extracted object, which should have at least two methods `get_content` and `get_illustration` """ if not url.startswith('http'): url = 'http://' + url - resp = session.get(url) + headers = None + if use_jina: + headers = {'x-respond-with': 'html'} + resp = session.get(url, headers=headers) # Some sites like science.org forbid us by responding 403, but still have meta description tags, so donot raise here # resp.raise_for_status() @@ -43,6 +46,13 @@ def parser_factory(url): logger.exception('Failed to parse this pdf file, %s', resp.url) elif ct.startswith('text') or 'html' in ct or 'xml' in ct or 'charset' in ct: logger.info('Get an %s to parse', ct) - return HtmlContentExtractor(resp.text, resp.url) + p = HtmlContentExtractor(resp.text, resp.url) + if not use_jina and p.is_empty(): + logger.info('%s is empty? switch to jina', resp.url) + try: + return parser_factory('https://r.jina.ai/'+url, use_jina=True) + except Exception as e: + logger.warning('jina %s throws an error: %s', 'https://r.jina.ai/'+url, e) + return p raise TypeError(f'I have no idea how the {ct} is formatted') diff --git a/page_content_extractor/html.py b/page_content_extractor/html.py index 6fdb6d2..796dd73 100644 --- a/page_content_extractor/html.py +++ b/page_content_extractor/html.py @@ -69,6 +69,9 @@ def __init__(self, html, url=''): # self.clean_up_html() self.relative_path2_abs_url() + def is_empty(self): + return not self.article.get_text(separator='', strip=True) + # def __del__(self): # # TODO won't call # logger.info('calc_effective_text_len: %s, parents_of_article_header: %s, calc_img_area_len: %s', diff --git a/templates/base.html b/templates/base.html index 56c1d89..25c1ec5 100644 --- a/templates/base.html +++ b/templates/base.html @@ -208,7 +208,7 @@

{% endif %} {% if news.summary %}
config.summary_size %} + {% if news.summarized_by.is_finally() and news.summary|translate(lang)|length > config.summary_size %} title="{{ news.summary|translate(lang)|trim }}" {% endif %}> {% set summary = news.summary|translate(lang) -%} diff --git a/test/test_html_parser.py b/test/test_html_parser.py index a239d4d..cdc337c 100644 --- a/test/test_html_parser.py +++ b/test/test_html_parser.py @@ -239,6 +239,11 @@ def test_link_intensive_wikipedia(self): content = parser.get_content() self.assertTrue(content.startswith('Google Sidewiki was a web annotation tool from Google')) + def test_dynamic_js_page(self): + parser = parser_factory('https://www.science.org/content/article/u-s-wants-change-how-researchers-get-access-huge-trove-health-data-many-don-t-idea') + content = parser.get_content() + self.assertTrue(content.startswith('Health researchers')) + def test_longer_meta_description(self): html_doc = """