Skip to content

Commit

Permalink
Use r.jina.ai to parse dynamic js webpages
Browse files Browse the repository at this point in the history
  • Loading branch information
polyrabbit committed May 13, 2024
1 parent a8ca8c6 commit 35050b3
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 6 deletions.
3 changes: 1 addition & 2 deletions hacker_news/llm/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,7 @@ def call_openai_family(content: str, sys_prompt: str) -> str:


def summarize_by_openai_family(content: str) -> str:
return call_openai_family(content, "You are a helpful summarizer. Please think step by step and use third person mood to summarize all user's input in 2 short English sentences. "
"Ensure the summary does not exceed 100 characters. Provide response in plain text format without any Markdown formatting.")
return call_openai_family(content, "You are a helpful summarizer. Please think step by step to summarize all user's input in 2 concise English sentences. Ensure the summary does not exceed 100 characters. Provide response in plain text format without any Markdown formatting.")


def translate_by_openai_family(content: str, lang: str) -> str:
Expand Down
16 changes: 13 additions & 3 deletions page_content_extractor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,17 @@


# dispatcher
def parser_factory(url):
def parser_factory(url, use_jina=False):
"""
Returns the extracted object, which should have at least two
methods `get_content` and `get_illustration`
"""
if not url.startswith('http'):
url = 'http://' + url
resp = session.get(url)
headers = None
if use_jina:
headers = {'x-respond-with': 'html'}
resp = session.get(url, headers=headers)
# Some sites like science.org forbid us by responding 403, but still have meta description tags, so do not raise here
# resp.raise_for_status()

Expand All @@ -43,6 +46,13 @@ def parser_factory(url):
logger.exception('Failed to parse this pdf file, %s', resp.url)
elif ct.startswith('text') or 'html' in ct or 'xml' in ct or 'charset' in ct:
logger.info('Get an %s to parse', ct)
return HtmlContentExtractor(resp.text, resp.url)
p = HtmlContentExtractor(resp.text, resp.url)
if not use_jina and p.is_empty():
logger.info('%s is empty? switch to jina', resp.url)
try:
return parser_factory('https://r.jina.ai/'+url, use_jina=True)
except Exception as e:
logger.warning('jina %s throws an error: %s', 'https://r.jina.ai/'+url, e)
return p

raise TypeError(f'I have no idea how the {ct} is formatted')
3 changes: 3 additions & 0 deletions page_content_extractor/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ def __init__(self, html, url=''):
# self.clean_up_html()
self.relative_path2_abs_url()

def is_empty(self):
    """Return True when the extracted article yields no text.

    The text is gathered from ``self.article`` via
    ``get_text(separator='', strip=True)``; an empty result means the
    extractor found nothing to read.
    NOTE(review): ``self.article`` is presumably a BeautifulSoup node set
    up in ``__init__`` -- confirm it can never be ``None`` here.
    """
    return not self.article.get_text(separator='', strip=True)

# def __del__(self):
# # TODO won't call
# logger.info('calc_effective_text_len: %s, parents_of_article_header: %s, calc_img_area_len: %s',
Expand Down
2 changes: 1 addition & 1 deletion templates/base.html
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ <h3 id="{{ news.slug() }}">
{% endif %}
{% if news.summary %}
<div class="{% if news.summary.startswith('<iframe') %}embed-responsive embed-responsive-16by9 {% else %}summary-text{% endif %}"
{% if news.summarized_by.value == 'OpenAI' and news.summary|translate(lang)|length > config.summary_size %}
{% if news.summarized_by.is_finally() and news.summary|translate(lang)|length > config.summary_size %}
title="{{ news.summary|translate(lang)|trim }}"
{% endif %}>
{% set summary = news.summary|translate(lang) -%}
Expand Down
5 changes: 5 additions & 0 deletions test/test_html_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,11 @@ def test_link_intensive_wikipedia(self):
content = parser.get_content()
self.assertTrue(content.startswith('Google Sidewiki was a web annotation tool from Google'))

def test_dynamic_js_page(self):
    # Fetches a science.org article through parser_factory and checks that
    # the extracted content begins with the article's opening words.
    # NOTE(review): this needs live network access; the page is presumably
    # one where the direct fetch yields no article text, so the r.jina.ai
    # fallback is what gets exercised -- confirm.
    parser = parser_factory('https://www.science.org/content/article/u-s-wants-change-how-researchers-get-access-huge-trove-health-data-many-don-t-idea')
    content = parser.get_content()
    self.assertTrue(content.startswith('Health researchers'))

def test_longer_meta_description(self):
html_doc = """
<meta property="og:description" content="aaaa" />
Expand Down

0 comments on commit 35050b3

Please sign in to comment.