diff --git a/.github/workflows/static.yml b/.github/workflows/static.yml index 4edb69c..3284251 100644 --- a/.github/workflows/static.yml +++ b/.github/workflows/static.yml @@ -106,6 +106,10 @@ jobs: - name: Unit Test run: make test if: ${{ github.event_name == 'push' }} + env: + COZE_API_ENDPOINT: ${{ secrets.COZE_API_ENDPOINT }} + COZE_API_KEY: ${{ secrets.COZE_API_KEY }} + COZE_BOT_ID: ${{ secrets.COZE_BOT_ID }} - name: Generate Daily Page run: make gh_daily_page @@ -113,6 +117,9 @@ jobs: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} DATABASE_URL: ${{ secrets.DATABASE_URL }} SYSLOG_ADDRESS: ${{ secrets.SYSLOG_ADDRESS }} + COZE_API_ENDPOINT: ${{ secrets.COZE_API_ENDPOINT }} + COZE_API_KEY: ${{ secrets.COZE_API_KEY }} + COZE_BOT_ID: ${{ secrets.COZE_BOT_ID }} if: ${{ github.event_name == 'push' || endswith(github.run_id, '3') || endswith(github.run_id, '6') || endswith(github.run_id, '9') }} - name: Generate Home Page @@ -121,6 +128,9 @@ jobs: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} DATABASE_URL: ${{ secrets.DATABASE_URL }} SYSLOG_ADDRESS: ${{ secrets.SYSLOG_ADDRESS }} + COZE_API_ENDPOINT: ${{ secrets.COZE_API_ENDPOINT }} + COZE_API_KEY: ${{ secrets.COZE_API_KEY }} + COZE_BOT_ID: ${{ secrets.COZE_BOT_ID }} - name: Setup Pages uses: actions/configure-pages@v2 diff --git a/config.py b/config.py index e289dbb..d974ad0 100644 --- a/config.py +++ b/config.py @@ -62,12 +62,23 @@ def int_env(name, default): transformer_model = os.getenv('TRANSFORMER_MODEL') or 't5-large' logger.info(f'Use transformer model {transformer_model}') + +def coze_enabled(): + return coze_api_endpoint and coze_api_key and coze_bot_id + + +coze_api_endpoint = os.getenv('COZE_API_ENDPOINT') +coze_api_key = os.getenv('COZE_API_KEY') +coze_bot_id = os.getenv('COZE_BOT_ID') +logger.info(f'Coze api {"enabled" if coze_enabled() else "disabled"}') + openai_keys = os.getenv('OPENAI_API_KEY').split(',') if os.getenv('OPENAI_API_KEY') else [None] openai.api_key = random.choice(openai_keys) # 
Round-robin available keys openai_key_index = openai_keys.index(openai.api_key) logger.info(f'Use openai api key #{openai_key_index}') openai_model = os.getenv('OPENAI_MODEL') or 'gpt-3.5-turbo' openai_score_threshold = int_env('OPENAI_SCORE_THRESHOLD', 20) +local_llm_score_threshold = 10 logger.info(f'Use openai model {openai_model}') output_dir = os.path.join(os.path.dirname(__file__), 'output/') diff --git a/hacker_news/llm/coze.py b/hacker_news/llm/coze.py new file mode 100644 index 0000000..d50589d --- /dev/null +++ b/hacker_news/llm/coze.py @@ -0,0 +1,107 @@ +import json +import logging +import random +import time + +import config +from hacker_news.llm.openai import sanitize_for_openai, sanitize_title +from page_content_extractor import session + +logger = logging.getLogger(__name__) + +coze_headers = { + 'Authorization': f'Bearer {config.coze_api_key}', +} + +# example response (SSEs): +# b'event:message +# data:{ +# "messages": [ +# { +# "role": "assistant", +# "type": "answer", +# "content": "Sure, how about this one:\n\n\"Read the following article and provide a concise summary, outlining the main points, key findings, and any conclusions drawn.\"", +# "content_type": "text" +# }, +# { +# "role": "assistant", +# "type": "follow_up", +# "content": "What are the main points of the article?", +# "content_type": "text" +# }, +# { +# "role": "assistant", +# "type": "follow_up", +# "content": "What were the key findings mentioned in the article?", +# "content_type": "text" +# }, +# { +# "role": "assistant", +# "type": "follow_up", +# "content": "Were any conclusions drawn in the article?", +# "content_type": "text" +# } +# ], +# "conversation_id": "123", +# "code": 0, +# "msg": "success" +# }' + + +def summarize_by_coze(content: str, title: str) -> str: + if not config.coze_enabled(): + return '' + + start_time = time.time() + # Seems coze adds more context in prompt, and even answer is counted + content = sanitize_for_openai(content, overhead=1000) + title = 
sanitize_title(title) + + prompt = (f'Use third person mood to summarize the main points of the following page delimited by triple backticks in 2 concise sentences. ' + f'Ensure the summary does not exceed 100 characters.\n' + f'Title: "{title}"\n' + f'```{content}.```') + + try: + resp = session.post(config.coze_api_endpoint, headers=coze_headers, stream=True, json={ + 'conversation_id': f'{random.randint(100, 9999)}', + 'bot_id': config.coze_bot_id, + 'user': 'single_user', + 'query': prompt, + 'stream': False, + }) + resp.raise_for_status() + + for line in resp.iter_lines(): + if line and line.startswith(b'data:'): + line = line[len(b'data:'):].strip() + try: + resp_json = json.loads(line) + except json.JSONDecodeError as e: + logger.warning(f'Failed to decode coze response, unexpected json {line}, error: {e}') + return '' + break + else: + logger.warning(f'Unexpected coze response, no data line found') + return '' + + except Exception as e: + logger.warning(f'Failed to summarize using coze, {e}') + return '' + + if resp_json.get('code', 'not-exist') != 0: + logger.warning(f'Unexpected coze response, code: {resp_json.get("code", "not-exist")}, msg: {resp_json.get("msg", "not-exist")}') + return '' + + if len(resp_json.get('messages', [])) == 0: + logger.warning(f'Unexpected coze response, no message list') + return '' + + for msg in resp_json['messages']: + if msg['type'] == 'answer' and msg.get('content'): + summary = msg['content'].strip().strip('"').strip() + logger.info(f'took {time.time() - start_time}s to generate: {summary}') + return summary + + logger.warning(f'Unexpected coze response, no answer message found') + return '' diff --git a/hacker_news/llm/openai.py b/hacker_news/llm/openai.py new file mode 100644 index 0000000..e1715ca --- /dev/null +++ b/hacker_news/llm/openai.py @@ -0,0 +1,19 @@ +import tiktoken + +import config + + +def sanitize_for_openai(text, overhead): + text = text.replace('```', ' ').strip() # in case of prompt injection + + # one 
token generally corresponds to ~4 characters, from https://platform.openai.com/tokenizer + if len(text) > 4096 * 2: + enc = tiktoken.encoding_for_model(config.openai_model) + tokens = enc.encode(text) + if len(tokens) > 4096 - overhead: # 4096: model's context limit + text = enc.decode(tokens[:4096 - overhead]) + return text.strip(".").strip() + + +def sanitize_title(title): + return title.replace('"', "'").replace('\n', ' ').strip() diff --git a/hacker_news/news.py b/hacker_news/news.py index a6844f7..8b6846f 100644 --- a/hacker_news/news.py +++ b/hacker_news/news.py @@ -6,12 +6,13 @@ from json import JSONDecodeError import openai -import tiktoken from slugify import slugify import config import db.summary from db.summary import Model +from hacker_news.llm.coze import summarize_by_coze +from hacker_news.llm.openai import sanitize_for_openai, sanitize_title from page_content_extractor import parser_factory from page_content_extractor.webimage import WebImage @@ -104,10 +105,10 @@ def summarize(self, content=None) -> (str, Model): f'No need to summarize since we have a small text of size {len(content)}') return content, Model.FULL - summary = self.summarize_by_openai(content) + summary = self.summarize_by_coze(content) or self.summarize_by_openai(content) if summary: return summary, Model.OPENAI - if self.get_score() >= 10: # Avoid slow local inference + if self.get_score() >= config.local_llm_score_threshold: # Avoid slow local inference if Model.from_value(self.cache.model).local_llm() and self.cache.summary: logger.info(f'Cache hit for {self.url}, model {self.cache.model}') return self.cache.summary, self.cache.get_summary_model() @@ -121,6 +122,11 @@ def summarize(self, content=None) -> (str, Model): logger.info("Score %d is too small, ignore local llm", self.get_score()) return content, Model.PREFIX + def summarize_by_coze(self, content): + if self.get_score() < config.local_llm_score_threshold: + return '' + return summarize_by_coze(content, self.title) + def 
summarize_by_openai(self, content): if not openai.api_key: logger.info("OpenAI API key is not set") @@ -129,16 +135,10 @@ def summarize_by_openai(self, content): logger.info("Score %d is too small, ignore openai", self.get_score()) return '' - content = content.replace('```', ' ').strip() # in case of prompt injection - - # one token generally corresponds to ~4 characters, from https://platform.openai.com/tokenizer - if len(content) > 4096 * 2: - enc = tiktoken.encoding_for_model(config.openai_model) - tokens = enc.encode(content) - if len(tokens) > 4096 - 200: # 4096: model's context limit, 200: function + prompt tokens (to reduce hitting rate limit) - content = enc.decode(tokens[:4096 - 200]) + # 200: function + prompt tokens (to reduce hitting rate limit) + content = sanitize_for_openai(content, overhead=200) - title = self.title.replace('"', "'").replace('\n', ' ').strip() or 'no title' + title = sanitize_title(self.title) or 'no title' # Hope one day this model will be clever enough to output correct json # Note: sentence should end with ".", "third person" - https://news.ycombinator.com/item?id=36262670 prompt = f'Output only answers to following 3 steps.\n' \ @@ -158,6 +158,7 @@ def summarize_by_openai(self, content): logger.exception(f'Failed to summarize using openai, key #{config.openai_key_index}, {e}') # Make this error explicit in the log return '' + # TODO: move to llm module def openai_complete(self, prompt, need_json): start_time = time.time() kwargs = {'model': config.openai_model, diff --git a/page_content_extractor/webimage.py b/page_content_extractor/webimage.py index ee56a1b..b8c2375 100644 --- a/page_content_extractor/webimage.py +++ b/page_content_extractor/webimage.py @@ -63,6 +63,11 @@ def is_candidate(self): logger.info('Failed on image bytesize check, size is %s, %s', len(self.raw_data), self.url) return False + img = Image.open(io.BytesIO(self.raw_data)) + colors = img.getcolors(maxcolors=2) + if colors is not None and len(colors) == 1: 
+            logger.info('Maybe a solid color image(%s), colors=%s', self.url, len(colors))
+            return False
         self._is_candidate = True
         self.width, self.height = width, height
         return True
diff --git a/requirements.txt b/requirements.txt
index 38dc659..d09d282 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,8 +10,8 @@ requests==2.31.0
 mock==5.1.0
 werkzeug==2.3.7
 feedwerk==1.1.0
-Pillow==10.0.1
-fake-useragent==1.2.1
+Pillow==10.2.0
+fake-useragent==1.4.0
 olefile
 openai==0.28.1
 torch==2.1.0
@@ -23,4 +23,4 @@ sqlalchemy==2.0.21
 psycopg2==2.9.9
 humanize==4.8.0
 llama-cpp-python==0.2.11
-tiktoken==0.5.1
\ No newline at end of file
+tiktoken==0.5.2
\ No newline at end of file
diff --git a/test/test_news_summary.py b/test/test_news_summary.py
index 7f9934a..82661a8 100644
--- a/test/test_news_summary.py
+++ b/test/test_news_summary.py
@@ -8,6 +8,7 @@ import db
 from db.engine import session
 from db.summary import Model
+from hacker_news.llm.coze import summarize_by_coze
 from hacker_news.news import News
 
@@ -47,6 +48,15 @@ def test_summarize_by_llama(self):
         self.assertGreater(len(summary), 80)
         self.assertLess(len(summary), config.summary_size * 2)
 
+    @unittest.skipUnless(config.coze_enabled(), 'coze is disabled')
+    def test_summarize_by_coze(self):
+        fpath = os.path.join(os.path.dirname(__file__), 'fixtures/telnet.txt')
+        with open(fpath, 'r') as fp:
+            content = fp.read()
+        summary = summarize_by_coze(content, 'telnet')
+        self.assertGreater(len(summary), 80)
+        self.assertLess(len(summary), config.summary_size * 2)
+
     def test_parse_step_answer(self):
         news = News('The guide to software development with Guix')
         self.assertEqual(news.parse_title_translation('"Guix软件开发指南"的中文翻译。'),