diff --git a/hacker_news/llm/openai.py b/hacker_news/llm/openai.py index 8a1c02f..296015d 100644 --- a/hacker_news/llm/openai.py +++ b/hacker_news/llm/openai.py @@ -130,8 +130,7 @@ def call_openai_family(content: str, sys_prompt: str) -> str: def summarize_by_openai_family(content: str) -> str: - return call_openai_family(content, "You are a helpful summarizer. Please think step by step and use third person mood to summarize all user's input in 2 short English sentences. " - "Ensure the summary does not exceed 100 characters. Provide response in plain text format without any Markdown formatting.") + return call_openai_family(content, "You are a helpful summarizer. Please think step by step to summarize all user's input in 2 concise English sentences. Ensure the summary does not exceed 100 characters. Provide response in plain text format without any Markdown formatting.") def translate_by_openai_family(content: str, lang: str) -> str: diff --git a/page_content_extractor/__init__.py b/page_content_extractor/__init__.py index ca1e8ae..969353c 100644 --- a/page_content_extractor/__init__.py +++ b/page_content_extractor/__init__.py @@ -15,14 +15,17 @@ # dispatcher -def parser_factory(url): +def parser_factory(url, use_jina=False): """ Returns the extracted object, which should have at least two methods `get_content` and `get_illustration` """ if not url.startswith('http'): url = 'http://' + url - resp = session.get(url) + headers = None + if use_jina: + headers = {'x-respond-with': 'html'} + resp = session.get(url, headers=headers) # Some sites like science.org forbid us by responding 403, but still have meta description tags, so donot raise here # resp.raise_for_status() @@ -43,6 +46,13 @@ def parser_factory(url): logger.exception('Failed to parse this pdf file, %s', resp.url) elif ct.startswith('text') or 'html' in ct or 'xml' in ct or 'charset' in ct: logger.info('Get an %s to parse', ct) - return HtmlContentExtractor(resp.text, resp.url) + p = HtmlContentExtractor(resp.text, resp.url) + if not use_jina and p.is_empty(): + logger.info('%s is empty? switch to jina', resp.url) + try: + return parser_factory('https://r.jina.ai/'+url, use_jina=True) + except Exception as e: + logger.warning('jina %s throws an error: %s', 'https://r.jina.ai/'+url, e) + return p raise TypeError(f'I have no idea how the {ct} is formatted') diff --git a/page_content_extractor/html.py b/page_content_extractor/html.py index 6fdb6d2..796dd73 100644 --- a/page_content_extractor/html.py +++ b/page_content_extractor/html.py @@ -69,6 +69,9 @@ def __init__(self, html, url=''): # self.clean_up_html() self.relative_path2_abs_url() + def is_empty(self): + return not self.article.get_text(separator='', strip=True) + # def __del__(self): # # TODO won't call # logger.info('calc_effective_text_len: %s, parents_of_article_header: %s, calc_img_area_len: %s', diff --git a/templates/base.html b/templates/base.html index 56c1d89..25c1ec5 100644 --- a/templates/base.html +++ b/templates/base.html @@ -208,7 +208,7 @@