diff --git a/hacker_news/llm/openai.py b/hacker_news/llm/openai.py
index 296015d..1b044b0 100644
--- a/hacker_news/llm/openai.py
+++ b/hacker_news/llm/openai.py
@@ -79,45 +79,38 @@ def call_openai_family(content: str, sys_prompt: str) -> str:
         # Gemma outputs weird words like Kün/viciss/▁purcha/▁xPos/▁Gorb
         kwargs['logit_bias'] = {200507: -100, 225856: -100, 6204: -100, 232014: -100, 172406: -100}
 
-    if config.openai_model.startswith('text-'):
-        prompt = (f'Use third person mood to summarize the following article delimited by triple backticks in 2 concise English sentences. Ensure the summary does not exceed 100 characters.\n'
-                  f'```{content.strip(".")}.```')
-        resp = openai.Completion.create(
-            prompt=prompt,
-            **kwargs
-        )
-        answer = resp['choices'][0]['text'].strip()
-    else:
-        resp = openai.ChatCompletion.create(
-            messages=[
-                {
-                    "role": "system",
-                    "content": sys_prompt
-                },
-                {'role': 'user', 'content': content},
-            ],
-            **kwargs)
-        message = resp["choices"][0]["message"]
-        if message.get('function_call'):
-            json_str = message['function_call']['arguments']
-            if resp["choices"][0]['finish_reason'] == 'length':
-                json_str += '"}'  # best effort to save truncated answers
-            try:
-                answer = json.loads(json_str)
-            except JSONDecodeError as e:
-                logger.warning(f'Failed to decode answer from openai, will fallback to plain text, error: {e}')
-                return ''  # Let fallback code kicks in
-        else:
-            answer = message['content'].strip()
+    resp = openai.ChatCompletion.create(
+        messages=[
+            {
+                "role": "system",
+                "content": sys_prompt
+            },
+            {'role': 'user', 'content': content},
+        ],
+        **kwargs)
     logger.info(f'content: {content}')
     logger.info(f'took {time.time() - start_time}s to generate: '
                 # Default str(resp) prints \u516c
                 f'{json.dumps(resp.to_dict_recursive(), sort_keys=True, indent=2, ensure_ascii=False)}')
+    if 'error' in resp:
+        raise Exception(f'error message: {resp["error"].get("message")}, code: {resp["error"].get("code")}')
+    message = resp["choices"][0]["message"]
+    if message.get('function_call'):
+        json_str = message['function_call']['arguments']
+        if resp["choices"][0]['finish_reason'] == 'length':
+            json_str += '"}'  # best effort to save truncated answers
+        try:
+            answer = json.loads(json_str)
+        except JSONDecodeError as e:
+            logger.warning(f'Failed to decode answer from openai, will fallback to plain text, error: {e}')
+            return ''  # Let fallback code kicks in
+    else:
+        answer = message['content'].strip()
     # Gemma sometimes returns "**Summary:**\n\nXXX\n\n**Key points:**\n\nXXX", extract the summary part
     for line in answer.split('\n'):
         if not line.strip():
             continue
-        if 'summary' in line.lower() and len(line) <= 100:
+        if 'summary' in line.lower() and line.strip().rstrip('*').endswith(':'):  # header may be bold, e.g. "**Summary:**"
             continue
         answer = line
         break
diff --git a/hacker_news/news.py b/hacker_news/news.py
index 27bd64c..620cde3 100644
--- a/hacker_news/news.py
+++ b/hacker_news/news.py
@@ -80,6 +80,9 @@ def get_score(self) -> int:
         except:
             return 0
 
+    def is_hiring_job(self) -> bool:
+        return self.get_score() == 0 and not self.author and 'YC ' in (self.title or '')  # title may be unset, as slug() assumes
+
     def slug(self):
         return slugify(self.title or 'no title')
 
@@ -130,7 +133,8 @@ def summarize_by_openai(self, content):
         if not openai.api_key:
             logger.info("OpenAI API key is not set")
             return ''
-        if self.get_score() < config.openai_score_threshold:  # Avoid expensive openai
+        if (self.get_score() < config.openai_score_threshold  # Avoid expensive openai
+                and not self.is_hiring_job()):
             logger.info("Score %d is too small, ignore openai", self.get_score())
             return ''
 
diff --git a/test/test_hackernews_parser.py b/test/test_hackernews_parser.py
index fa755ac..f343bd1 100644
--- a/test/test_hackernews_parser.py
+++ b/test/test_hackernews_parser.py
@@ -2,6 +2,7 @@
 from datetime import datetime, timedelta
 
 from hacker_news.algolia_api import get_news
+from hacker_news.news import News
 from hacker_news.parser import HackerNewsParser
 
 
@@ -52,3 +53,7 @@ def test_algolia_api(self):
         date = news_list[0].submit_time.date()
         for news in news_list:
             self.assertEqual(date, news.submit_time.date())
+
+    def test_maybe_jobs_post(self):
+        news = News(title='MixRank (YC S11) Is Hiring Software Engineers and Founders Globally')
+        self.assertTrue(news.is_hiring_job())
\ No newline at end of file