feat: server mode
pond918 committed Jul 11, 2023
1 parent 64a720a commit 5be901a
Showing 10 changed files with 97 additions and 26 deletions.
1 change: 1 addition & 0 deletions .eslintrc.cjs
@@ -13,5 +13,6 @@ module.exports = {
     '@typescript-eslint/no-dupe-class-members': ['error'],
     '@typescript-eslint/no-useless-constructor': ['error'],
     '@typescript-eslint/no-inferrable-types': ['off'],
+    '@typescript-eslint/no-explicit-any': 'off',
   },
 }
2 changes: 1 addition & 1 deletion package.json
@@ -3,7 +3,7 @@
   "types": "dist/index.d.ts",
   "main": "dist/index.js",
   "description": "extendable llm bots sdk to integrate with any system, with tree structured history and conversations support. you can easily add your own bots under the hood.",
-  "version": "0.0.1006",
+  "version": "0.0.2",
   "keywords": [
     "llm",
     "chatbot",
47 changes: 42 additions & 5 deletions src/bots/LangChainBot.ts
@@ -1,16 +1,18 @@
 import { ChatOpenAI } from 'langchain/chat_models/openai'
-import { AIChatMessage, HumanChatMessage, SystemChatMessage } from 'langchain/schema'
+import { AIChatMessage, HumanChatMessage, LLMResult, SystemChatMessage } from 'langchain/schema'
 import { LLMBot, LLMServerType } from './base-bot'
 import { ChatDto } from './chat.dto'
 
 export default abstract class LangChainBot extends LLMBot {
   protected _chatModel!: ChatOpenAI
+  /** if stream = true, token usage not available for azure-gpt */
+  protected _usage = true
 
   constructor(protected readonly _brandId = 'langChainBot', outputFormat = 'markdown') {
     super(_brandId, outputFormat)
   }
 
-  async _sendPrompt(msg: ChatDto, streamCallback?: (msg: ChatDto) => void): Promise<ChatDto> {
+  async _sendPrompt(msg: ChatDto): Promise<ChatDto> {
     // Convert the messages to the correct format
     const messages =
       msg.options.__history?.map(m => {
@@ -25,12 +27,47 @@ export default abstract class LangChainBot extends LLMBot {
       {
         handleLLMNewToken(token: string) {
           if (token) res += token
-          streamCallback && streamCallback(new ChatDto(res, token ? -1 : 0))
+          msg.options?.stream && msg.options.stream(new ChatDto(res, -1))
+        },
+        handleLLMEnd: (val: LLMResult) => {
+          res || (res = val.generations[0][0].text) // TODO
+          const tokens = val.llmOutput?.tokenUsage?.totalTokens
+          if (this._usage) {
+            if (!tokens) {
+              msg.statusCode = 500
+              throw new Error((msg.message = 'LLM token usage should not be empty'))
+            }
+            msg.options.tokens = tokens
+            msg.options.quotaTokens = (msg.options.quotaTokens || 0) - tokens
+          }
+          msg.options?.stream && msg.options.stream(new ChatDto(res, 0))
         },
       },
     ]
-    await this._chatModel.call(messages, undefined, callbacks)
-    return new ChatDto(res)
+
+    const { streaming, temperature, n, topP, maxTokens } = this._chatModel
+
+    try {
+      this._chatModel.streaming = !this._usage && !!msg.options.stream
+      msg.options?.n && (this._chatModel.n = msg.options.n)
+      msg.options?.topP && (this._chatModel.topP = msg.options.topP)
+      msg.options?.modelName && (this._chatModel.modelName = msg.options.modelName)
+      msg.options?.temperature && (this._chatModel.temperature = msg.options.temperature)
+      msg.options?.maxTokens &&
+        (this._chatModel.maxTokens = msg.options.quotaTokens
+          ? Math.min(msg.options.quotaTokens, msg.options.maxTokens)
+          : msg.options.maxTokens)
+
+      await this._chatModel.call(messages, undefined, callbacks)
+    } finally {
+      for (const [p, v] of Object.entries({ streaming, temperature, n, topP, maxTokens }))
+        (this._chatModel as any)[p] = v
+    }
+    const resp = new ChatDto(res, msg.statusCode)
+    msg.options.tokens &&
+      ([resp.options.tokens, resp.options.quotaTokens] = [msg.options.tokens, msg.options.quotaTokens])
+    msg.message && (resp.message = msg.message)
+    return resp
   }
 
   _getServerType(): LLMServerType {
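The try/finally above is a save/override/restore pattern: per-message options temporarily reconfigure the shared ChatOpenAI instance, and the snapshot taken before the call is restored even if the call throws; note that maxTokens is additionally capped by the remaining quotaTokens. A condensed sketch of the same pattern, assuming a `model` object with the same tunables (not a drop-in for the code above):

    // snapshot the shared model's settings before mutating them
    const saved = {
      streaming: model.streaming,
      temperature: model.temperature,
      n: model.n,
      topP: model.topP,
      maxTokens: model.maxTokens,
    }
    try {
      // apply per-message overrides from msg.options, then call the model
    } finally {
      Object.assign(model, saved) // always restore the shared instance
    }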
8 changes: 4 additions & 4 deletions src/bots/base-bot.ts
@@ -79,12 +79,12 @@ export abstract class LLMBot {
     await this._userStorage.set('_isAvailable', v)
   }
 
-  async sendPrompt(msg: ChatDto, streamCallback?: (msg: ChatDto) => void): Promise<ChatDto> {
+  async sendPrompt(msg: ChatDto): Promise<ChatDto> {
     if (!msg.text) return new ChatDto('')
 
     if (!(await this.isAvailable())) {
       const msg = new ChatDto('bot.notAvailable', 404)
-      streamCallback && streamCallback(msg)
+      msg.options?.stream && msg.options.stream(msg)
       return msg
     }
 
@@ -109,7 +109,7 @@ export abstract class LLMBot {
       await this._setConversation(msg.options._conversationKey)
     }
 
-    return this._sendPrompt(msg, streamCallback).then(async resp => {
+    return this._sendPrompt(msg).then(async resp => {
       // store response msg into history
       resp.options.lastMsgId = msg.id
       resp.options.type = 'ai'
@@ -122,7 +122,7 @@ export abstract class LLMBot {
    * @param msg prompt msg, or whole chat thread.
    * @param streamCallback
    */
-  abstract _sendPrompt(msg: ChatDto, streamCallback?: (msg: ChatDto) => void): Promise<ChatDto>
+  abstract _sendPrompt(msg: ChatDto): Promise<ChatDto>
 
   /**
    * @returns the LLM server type:
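For callers, the stream callback now travels inside the message instead of as a second argument. A before/after sketch (`bot` and `render` are placeholders):

    // before this commit: callback passed alongside the message
    const resp = await bot.sendPrompt(msg, m => render(m))

    // after: callback carried in the message options
    msg.options.stream = m => render(m)
    const resp = await bot.sendPrompt(msg)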
18 changes: 16 additions & 2 deletions src/bots/chat.dto.ts
@@ -21,12 +21,26 @@ export class ChatDto {
       lastMsgId?: string
       /** conversation key from llm server */
       _conversationKey?: string
-      /** TODO approximately max words of new response */
-      maxResponse?: number
       /** if true, this msg & its response will not be stored into history */
       stateless?: boolean
       /** history of this msg */
       __history?: ChatDto[]
+
+      //// llm props
+      /** stream callback */
+      stream?: (msg: ChatDto) => void
+      modelName?: string
+      temperature?: number
+      /** Number of completions to generate for each prompt */
+      n?: number
+      /** Total probability mass of tokens to consider at each step */
+      topP?: number
+      /** respond token usage for current QA */
+      tokens?: number
+      /** Maximum number of tokens to generate in the completion. -1 returns as many tokens as possible given the prompt and the model's maximum context size. */
+      maxTokens?: number
+      /** quota number of tokens, will be updated in LLM response. */
+      quotaTokens?: number
     } & Record<string, unknown> = {},
   ) {
     statusCode ? (this.statusCode = statusCode) : (this.id = nanoid())
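Taken together, the new options allow a call shaped like the following sketch (values are illustrative; `bot` is any LLMBot instance):

    const msg = new ChatDto('Summarize the README in one sentence.')
    msg.options.temperature = 0.3
    msg.options.maxTokens = 256    // capped by quotaTokens when both are set
    msg.options.quotaTokens = 1000 // remaining budget; decremented by the reply's usage
    msg.options.stream = m => process.stdout.write(m.text ?? '')
    const resp = await bot.sendPrompt(msg)
    // resp.options.tokens / resp.options.quotaTokens report the accounting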
24 changes: 15 additions & 9 deletions src/bots/huggingface/GradioBot.ts
@@ -52,11 +52,11 @@ export default abstract class GradioBot extends LLMBot {
     return available
   }
 
-  async _sendPrompt(prompt: ChatDto, streamCallback?: (msg: ChatDto) => void): Promise<ChatDto> {
+  async _sendPrompt(msg: ChatDto): Promise<ChatDto> {
     let result: ChatDto = new ChatDto('', -1)
     for (const key in this._fnIndexes) {
       const fn_index = this._fnIndexes[key]
-      const resp = await this._sendFnIndex(fn_index, prompt, streamCallback)
+      const resp = await this._sendFnIndex(fn_index, msg, msg.options?.stream)
       resp && !resp.statusCode && resp.text && (result = resp)
     }
     this._formalizeResponse(result)
@@ -115,13 +115,18 @@ export default abstract class GradioBot extends LLMBot {
       } else if (event.msg === 'process_completed') {
         // Done
         if (event.success && event.output.data) {
-          const prompt = this.parseData(fn_index, event.output.data)
-          const resp = new ChatDto(
-            prompt,
-            fn_index == this._fnIndexes.at(-1) ? 0 : -1, // Only the last one is done
-          )
-          streamCallback && streamCallback(resp)
-          resolve(resp)
+          if (typeof event.output.data[2] !== 'string' || event.output.data[2] === '') {
+            const prompt = this.parseData(fn_index, event.output.data)
+            const resp = new ChatDto(
+              prompt,
+              fn_index == this._fnIndexes.at(-1) ? 0 : -1, // Only the last one is done
+            )
+            streamCallback && streamCallback(resp)
+            resolve(resp)
+          } else {
+            const errorMsg = this.parseError(event.output.data[2])
+            reject(new Error(errorMsg))
+          }
         } else {
           reject(new Error(event.output.error))
         }
@@ -163,6 +168,7 @@ export default abstract class GradioBot extends LLMBot {
     }
   }
 
+  abstract parseError(error: string): string
   abstract makeData(fn_index: number, prompt: ChatDto): unknown
   abstract parseData(fn_index: number, data: unknown): string
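The new guard reads the third element of event.output.data as an error channel: a non-empty string there is routed through the new parseError hook and rejects the promise; anything else falls through to the normal parseData path. The implied payload shapes (inferred from the guard, not from Gradio documentation):

    // resolves: data[2] missing, non-string, or empty
    //   { msg: 'process_completed', success: true, output: { data: [..., ''] } }
    // rejects:  data[2] is a non-empty error string
    //   { msg: 'process_completed', success: true, output: { data: [..., 'REFRESH THIS PAGE ...'] } }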
14 changes: 12 additions & 2 deletions src/bots/lmsys/LMSYSBot.ts
@@ -14,7 +14,7 @@ export default class LMSYSBot extends GradioBot {
   _lock = new AsyncLock() // FIXME Send requests in queue to save LMSYS
 
   constructor(name: string, readonly _model: string) {
-    super(name, 'https://chat.lmsys.org/', [7, 8], 'html')
+    super(name, 'https://chat.lmsys.org/', [9, 10], 'html')
   }
 
   /** needn't token */
@@ -42,6 +42,16 @@ export default class LMSYSBot extends GradioBot {
     if (fn_index === this._fnIndexes[1]) {
       r = data[1].at(-1)[1]
     }
-    return r
+    return r || ''
   }
+
+  parseError(errorMsg: string) {
+    if (errorMsg.includes('REFRESH THIS PAGE')) {
+      errorMsg = errorMsg.replace(
+        'REFRESH THIS PAGE',
+        `<a href="${this._loginUrl}" target="innerWindow">REFRESH THIS PAGE</a>`,
+      )
+    }
+    return errorMsg
+  }
 }
5 changes: 4 additions & 1 deletion src/bots/microsoft/AzureOpenAIAPIBot.ts
@@ -10,14 +10,17 @@ export default class AzureOpenAIAPIBot extends LangChainBot {
     const conf: Record<string, any> = typeof config == 'function' ? (config = config()) : config
     if (!conf?.azureApiKey) return false
 
+    this._usage = !conf.no_usage_report // if stream = true, token usage not available
     this._chatModel = new ChatOpenAI({
       modelName: conf.modelName,
       azureOpenAIApiKey: conf.azureApiKey,
       azureOpenAIApiInstanceName: conf.azureApiInstanceName,
       azureOpenAIApiDeploymentName: conf.azureOpenAIApiDeploymentName,
       azureOpenAIApiVersion: conf.azureOpenAIApiVersion,
       temperature: conf.temperature,
-      streaming: true,
+      streaming: false,
     })
+    // curl -X POST -H 'Content-type: application/json' -H 'User-Agent: OpenAI/NodeJS/3.3.0' -H 'api-key: c30fe7f14b464d52b515f77148643d60' -H 'Authorization: Bearer undefined' --data '{"model":"gpt-3.5-turbo","temperature":0.7,"top_p":1,"frequency_penalty":0,"presence_penalty":0,"n":1,"stream":false,"messages":[{"role":"user","content":"!"}]}' https://openai-kanjian.openai.azure.com/openai/deployments/gpt-35-turbo/chat/completions\?api-version=2023-05-15
 
     return !!this._chatModel
   }
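For reference, the config fields this bot reads, collected from the hunk above into one object (values are placeholders; the name of the method receiving `config` is not shown in this diff):

    const conf = {
      azureApiKey: '<azure-openai-api-key>',
      azureApiInstanceName: '<instance-name>',
      azureOpenAIApiDeploymentName: '<deployment-name>',
      azureOpenAIApiVersion: '2023-05-15',
      modelName: '<model-name>',
      temperature: 0.7,
      no_usage_report: false, // false keeps token accounting on, which forces non-streaming calls
    }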
2 changes: 1 addition & 1 deletion src/storage/chat-history.ts
@@ -37,7 +37,7 @@ export class ChatHistory {
 
     if (!msg.options.stateless && parent.options.leaf) {
       delete parent.options.leaf
-      await this._storage.set(msg.id as string, [preId, parent])
+      await this._storage.set(parent.id as string, [preId, parent])
     }
   } else if (pid !== '') {
     // append to current conversation
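This one-liner is a real bug fix, assuming the storage layout implied by the call site (key = message id, value = [previousMessageId, message]):

    // before: the updated parent (with its `leaf` flag removed) was written
    //         under msg.id, clobbering the child's own history entry
    // after:  the parent record is rewritten under parent.id, leaving the
    //         child's entry intact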
2 changes: 1 addition & 1 deletion test/llmbots.e2e.spec.ts
@@ -17,7 +17,7 @@ describe('builtin LLMBots: vicuna-13b (e2e)', () => {
 
     // contextual conversation
     const req = new ChatDto("What's his wife's full name. reply 5 words most")
-    const resp = await claudeBot?.sendPrompt(req, msg => console.log(msg))
+    const resp = await claudeBot?.sendPrompt(req)
     console.log(resp)
     expect(resp?.text).not.toBeNull()
     expect(resp?.options.lastMsgId).toEqual(req.id)
