Commit

add twitch llm max tokens vs. story tokens
Allow setting the Twitch LLM max tokens separately from the main story generation.

Reduce the max tokens used for LLM story messages.
Chris Kennedy committed Mar 30, 2024
1 parent b20c18d commit 4d137e0
Showing 5 changed files with 43 additions and 21 deletions.
10 changes: 6 additions & 4 deletions scripts/twitch.sh
@@ -21,7 +21,7 @@ MODEL_ID=7b-it
 # Generic settings
 USE_API=1
 CHAT_FORMAT=chatml
-MAX_TOKENS=2000
+MAX_TOKENS=800
 TEMPERATURE=0.8
 CONTEXT_SIZE=8000
 QUANTIZED=0
@@ -30,15 +30,16 @@ SD_MAX_LENGTH=50
 ## Pipeline Settings
 DAEMON=1
 CONTINUOUS=1
-POLL_INTERVAL=1000
+POLL_INTERVAL=60000
 PIPELINE_CONCURRENCY=6
 ASYNC_CONCURRENCY=0
 NDI_TIMEOUT=600
 ## Twitch Chat Settings
 TWITCH_MODEL=mistral
 TWITCH_LLM_CONCURRENCY=1
 TWITCH_CHAT_HISTORY=16
-TWITCH_MAX_TOKENS=120
+TWITCH_MAX_TOKENS_CHAT=120
+TWITCH_MAX_TOKENS_LLM=500
 ## Stable Diffusion Settings
 SD_TEXT_MIN=70
 SD_WIDTH=512
@@ -106,7 +107,8 @@ DYLD_LIBRARY_PATH=`pwd`:/usr/local/lib:$DYLD_LIBRARY_PATH \
 --twitch-chat-history $TWITCH_CHAT_HISTORY \
 --twitch-llm-concurrency $TWITCH_LLM_CONCURRENCY \
 --twitch-model $TWITCH_MODEL \
---twitch-max-tokens $TWITCH_MAX_TOKENS \
+--twitch-max-tokens-chat $TWITCH_MAX_TOKENS_CHAT \
+--twitch-max-tokens-llm $TWITCH_MAX_TOKENS_LLM \
 --twitch-prompt "$TWITCH_PROMPT" \
 --mimic3-tts \
 $SD_API_CMD \
13 changes: 11 additions & 2 deletions src/args.rs
@@ -756,11 +756,20 @@ pub struct Args {
 /// Twitch Max Tokens - max tokens for LLM
 #[clap(
 long,
-env = "TWITCH_MAX_TOKENS",
+env = "TWITCH_MAX_TOKENS_CHAT",
 default_value_t = 150,
 help = "Twitch Max Tokens."
 )]
-pub twitch_max_tokens: usize,
+pub twitch_max_tokens_chat: usize,
+
+//// Twitch Max Tokens LLM - max tokens for LLM
+#[clap(
+long,
+env = "TWITCH_MAX_TOKENS_LLM",
+default_value_t = 150,
+help = "Twitch Max Tokens LLM."
+)]
+pub twitch_max_tokens_llm: usize,

 /// single concurrency - bool single concurrency for all models, wait between each request
 #[clap(
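As an aside, the two new options follow the same clap pattern as the rest of Args. Below is a minimal standalone sketch of how they resolve, assuming clap v4 with the "derive" and "env" features enabled (which the #[clap(...)] attributes above imply); the struct is trimmed to just these two fields and is not the repository's full Args:

use clap::Parser;

#[derive(Parser, Debug)]
struct TokenArgs {
    /// Max tokens for Twitch chat replies.
    #[clap(long, env = "TWITCH_MAX_TOKENS_CHAT", default_value_t = 150)]
    twitch_max_tokens_chat: usize,

    /// Max tokens for LLM stories triggered from Twitch chat.
    #[clap(long, env = "TWITCH_MAX_TOKENS_LLM", default_value_t = 150)]
    twitch_max_tokens_llm: usize,
}

fn main() {
    // Flags (--twitch-max-tokens-chat 120 --twitch-max-tokens-llm 500) or the
    // matching environment variables override the default of 150.
    let args = TokenArgs::parse();
    println!("chat cap = {}, llm cap = {}", args.twitch_max_tokens_chat, args.twitch_max_tokens_llm);
}

Clap resolves flag over environment variable over default, which is why twitch.sh can simply pass the two shell variables through as --twitch-max-tokens-chat and --twitch-max-tokens-llm.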
37 changes: 24 additions & 13 deletions src/main.rs
@@ -782,15 +782,6 @@ async fn main() {
 }
 }

-// Did not get a message from twitch, so don't process the query
-if !twitch_query && args.twitch_client {
-if !args.continuous {
-// sleep for a while to avoid busy loop
-tokio::time::sleep(Duration::from_millis(100)).await;
-continue;
-}
-}
-
 // break the loop if we are not running as a daemon or hit max iterations
 let rctrlc_clone = running_ctrlc.clone();
 if (!rctrlc_clone.load(Ordering::SeqCst)
@@ -880,8 +871,28 @@
 // Calculate elapsed time since last start
 let elapsed = poll_start_time.elapsed();

+let mut max_tokens = args.max_tokens as usize;
+
+// Did not get a message from twitch, so don't process the query
+if !twitch_query && args.twitch_client {
+if args.continuous {
+// only play a story after poll_interval_duration has passed, else continue
+if elapsed < poll_interval_duration {
+tokio::time::sleep(Duration::from_millis(100)).await;
+continue;
+}
+} else {
+// sleep for a while to avoid busy loop
+tokio::time::sleep(Duration::from_millis(100)).await;
+continue;
+}
+} else if args.twitch_client && twitch_query {
+// reset the max tokens
+max_tokens = args.twitch_max_tokens_llm;
+}
+
 // Sleep only if the elapsed time is less than the poll interval
-if !twitch_query
+if !args.twitch_client
 && iterations > 0
 && !args.interactive
 && (args.daemon || args.max_iterations > 1)
@@ -1148,7 +1159,7 @@
 tokio::spawn(async move {
 let open_ai_request = OpenAIRequest {
 model: &model_clone,
-max_tokens: &args.max_tokens,
+max_tokens: &max_tokens,
 messages: messages_clone,
 temperature: &args.temperature,
 top_p: &args.top_p,
@@ -1173,7 +1184,7 @@
 let mistral_clone = mistral.clone();
 if let Err(e) = mistral_clone(
 prompt_clone,
-args.max_tokens as usize,
+max_tokens as usize,
 args.temperature as f64,
 args.quantized,
 Some(model_id),
@@ -1187,7 +1198,7 @@
 let gemma_clone = gemma.clone();
 if let Err(e) = gemma_clone(
 prompt_clone,
-args.max_tokens as usize,
+max_tokens as usize,
 args.temperature as f64,
 args.quantized,
 Some(model_id),
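The heart of the main.rs change, as a simplified synchronous sketch (hypothetical helper name; the real code lives inline in the event loop and sleeps then continues rather than returning): when the Twitch client is enabled and no chat message is pending, the iteration is skipped until the poll interval has elapsed (or indefinitely in non-continuous mode), and a story that was triggered by a Twitch message is capped at --twitch-max-tokens-llm instead of the global --max-tokens:

use std::time::Duration;

/// Returns Some(max_tokens) when the iteration should run, None when it should idle.
fn gate_iteration(
    twitch_client: bool,          // args.twitch_client
    twitch_query: bool,           // did a Twitch chat message trigger this iteration?
    continuous: bool,             // args.continuous
    elapsed: Duration,            // time since the last poll started
    poll_interval: Duration,      // POLL_INTERVAL from twitch.sh
    default_max_tokens: usize,    // args.max_tokens
    twitch_llm_max_tokens: usize, // args.twitch_max_tokens_llm
) -> Option<usize> {
    if twitch_client && !twitch_query {
        // No chat message: in continuous mode wait out the poll interval,
        // otherwise idle until a message arrives.
        if !continuous || elapsed < poll_interval {
            return None;
        }
        Some(default_max_tokens)
    } else if twitch_client && twitch_query {
        // Twitch-triggered story: use the smaller LLM cap.
        Some(twitch_llm_max_tokens)
    } else {
        Some(default_max_tokens)
    }
}

fn main() {
    // A Twitch-triggered story is capped at the LLM limit (e.g. 500 instead of 800).
    assert_eq!(
        gate_iteration(true, true, true, Duration::from_secs(5), Duration::from_secs(60), 800, 500),
        Some(500)
    );
    // No message yet and the poll interval has not elapsed: skip this iteration.
    assert_eq!(
        gate_iteration(true, false, true, Duration::from_secs(5), Duration::from_secs(60), 800, 500),
        None
    );
    println!("gating sketch ok");
}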
2 changes: 1 addition & 1 deletion src/openai_api.rs
@@ -21,7 +21,7 @@ pub struct Message {
 pub struct OpenAIRequest<'a> {
 pub model: &'a str,
 pub messages: Vec<Message>,
-pub max_tokens: &'a i32, // add this field to the request struct
+pub max_tokens: &'a usize, // add this field to the request struct
 pub temperature: &'a f32, // add this field to the request struct
 pub top_p: &'a f32, // add this field to the request struct
 pub presence_penalty: &'a f32, // add this field to the request struct
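This type change just keeps the request field in step with main.rs, where the chosen max_tokens is now a usize local that the request borrows. A small sketch of the idea, assuming the real struct derives serde's Serialize (as an API request body normally would) and that serde_json is available for illustration; the fields are trimmed:

use serde::Serialize;

#[derive(Serialize)]
struct OpenAIRequest<'a> {
    model: &'a str,
    max_tokens: &'a usize, // was &'a i32; now borrows the usize picked in main.rs
    temperature: &'a f32,
}

fn main() {
    let max_tokens: usize = 500; // e.g. args.twitch_max_tokens_llm
    let temperature: f32 = 0.8;
    let request = OpenAIRequest {
        model: "mistral",
        max_tokens: &max_tokens,
        temperature: &temperature,
    };
    // A borrowed integer serializes the same as the value itself, so the JSON
    // body sent to the API is unchanged by the i32 -> usize switch.
    println!("{}", serde_json::to_string(&request).unwrap());
}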
2 changes: 1 addition & 1 deletion src/twitch_client.rs
@@ -97,7 +97,7 @@ async fn on_msg(
 if !msg.text().starts_with("!help") && !msg.text().starts_with("!message") {
 // LLM Thread
 let (external_sender, mut external_receiver) = tokio::sync::mpsc::channel::<String>(100);
-let max_tokens = args.twitch_max_tokens;
+let max_tokens = args.twitch_max_tokens_chat;
 let temperature = 0.8;
 let quantized = false;
 let max_messages = args.twitch_chat_history;
