twitch chat interactive mode (#31)
* twitch chat interactive mode

- twitch chat is interactive.
- cap the stable diffusion token count to avoid crashing.
- better prompt segmentation.
- STT no longer makes awful sounds.
- pad NDI audio output and remove stray characters.

* add cabextract dep and fonts extraction

- less history in twitch bot.

* unpack fonts fixes

* twitch chat info added to readme

* v0.5.2

* build without needing fonts extracted

Allow building without the fonts extracted so docs.rs can build the crate.

* add build info for fonts inclusion

---------

Co-authored-by: Chris Kennedy <[email protected]>
groovybits and Chris Kennedy committed Mar 22, 2024
1 parent 5f1e079 commit 01fd037
Showing 17 changed files with 830 additions and 401 deletions.
36 changes: 19 additions & 17 deletions .github/workflows/rust.yml
@@ -1,25 +1,27 @@
 name: Rust

 on:
-  push:
-    branches: [ "main", "staging" ]
-  pull_request:
-    branches: [ "main", "staging" ]
+    push:
+        branches: ["main", "staging"]
+    pull_request:
+        branches: ["main", "staging"]

 env:
-  CARGO_TERM_COLOR: always
+    CARGO_TERM_COLOR: always

 jobs:
-  build:
-    runs-on: macos-latest # Changed from ubuntu-latest to macos-latest
+    build:
+        runs-on: macos-latest # Changed from ubuntu-latest to macos-latest

-    steps:
-      - uses: actions/checkout@v3
-      - name: Install dependencies
-        run: |
-          brew update
-          brew install libpcap zmq capnp
-      - name: Build
-        run: cargo build --verbose
-      - name: Run tests
-        run: cargo test --verbose
+        steps:
+            - uses: actions/checkout@v3
+            - name: Install dependencies
+              run: |
+                  brew update
+                  brew install libpcap zmq capnp cabextract
+            - name: Build
+              run: |
+                  cd fonts && sh unpack_fonts.sh && cd ../
+                  cargo build --verbose
+            - name: Run tests
+              run: cargo test --verbose
7 changes: 4 additions & 3 deletions Cargo.toml
@@ -9,7 +9,7 @@ license-file = "LICENSE"
 homepage = "https://github.com/groovybits/rsllm/wiki"
 repository = "https://github.com/groovybits/rsllm"
 authors = ["Chris Kennedy"]
-version = "0.5.1"
+version = "0.5.2"
 edition = "2021"

 [lib]
@@ -22,6 +22,7 @@ mps = ["candle-core/metal", "candle-nn/metal", "metal", "candle-metal-kernels"]
 ndi = ["ndi-sdk-rsllm"]
 metavoice = []
 audioplayer = ["rodio"]
+fonts = ["rusttype", "imageproc"]

 [profile.release-with-debug]
 inherits = "release"
@@ -71,8 +72,8 @@ candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.
 ndi-sdk-rsllm = { git = "https://github.com/groovybits/rust-ndi.git", version = "0.1.2", optional = true }
 rand = { version = "0.8.5", features = ["small_rng"] }
 uuid = { version = "1.7.0", features = ["v4"] }
-imageproc = "0.23.0"
-rusttype = "0.9.3"
+imageproc = { version = "0.23.0", optional = true }
+rusttype = { version = "0.9.3", optional = true }
 rodio = { version = "0.17.3", features = ["wav", "mp3"], optional = true }
 minimp3 = "0.5.1"
 tmi = "0.5.0"
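The new `fonts` feature is what makes the docs.rs build possible: `rusttype` and `imageproc` become optional dependencies that only compile when font rendering is wanted. A minimal sketch of how the code side might gate on the feature (the function name and the `fonts/arial.ttf` path are assumptions for illustration, not taken from this commit):

```rust
// Sketch: only compile font loading when built with --features fonts.
#[cfg(feature = "fonts")]
pub fn load_overlay_font() -> Option<rusttype::Font<'static>> {
    // include_bytes! requires the unpacked font to exist at compile time,
    // which is exactly what a docs.rs build can't provide, hence the gate.
    rusttype::Font::try_from_bytes(include_bytes!("../fonts/arial.ttf"))
}

#[cfg(not(feature = "fonts"))]
pub fn load_overlay_font() -> Option<()> {
    None // font-based image text overlay is simply disabled in fontless builds
}
```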
7 changes: 4 additions & 3 deletions README.md
@@ -9,6 +9,7 @@ The Rust LLM Stream Analyzer and Content Generator is optimized for MacOS Metal
 - **Voice and Speech Integration**: Plans to incorporate Whisper for voice-driven interactions, akin to Alexa, allowing users to communicate with the toolkit using voice commands and receive streaming text inputs in response. (Planned Feature)
 - **Image Generation and NDI Output**: Supports generating images from text descriptions and outputting through NDI for a wide range of applications, including real-time content creation and broadcasting. (In Beta Testing)
 - **TTS MetaVoice / Mimic3 TTS API / OpenAI TTS API**: Candle implements TTS using MetaVoice, which is the default but a WIP while the author shores up the implementation quality and optimizes for Metal GPUs (it isn't realtime currently and sounds very "wavy"). The OpenAI TTS API generates very nice speech for a price if you want quality/realtime speech generation. The Mimic3 TTS API requires running the mimic3-server and is a bit more involved, but it is a good free alternative to the OpenAI TTS API if you have a local system. <https://github.com/MycroftAI/mimic3>
+- **Twitch Chat Interactive AI**: Integrated Twitch chat for real-time AI interactions, enabling users to engage with the toolkit through chat commands and receive AI-generated responses.

 ![RSLLM](https://storage.googleapis.com/groovybits/images/rsllm/rsllm.webp)

@@ -50,7 +51,8 @@ The toolkit excels in analyzing real-time data streams and generating AI-driven
 3. **Compile with Metal GPU Support and NDI SDK support**:
    ```bash
    # export DYLD_LIBRARY_PATH=`pwd`:$DYLD_LIBRARY_PATH
-   # cargo build --release --features=mps,ndi,audioplayer,metavoice
+   # cargo build --release --features=fonts,mps,ndi,audioplayer,metavoice
+   ## or run compile.sh, which does the above command and also gets libndi.dylib
    ./scripts/compile.sh # Script helps handle the NDI SDK dependency and DYLD_LIBRARY_PATH
    ```

@@ -77,7 +79,7 @@ The toolkit is designed to facilitate a wide range of AI-driven operations, from

 - **Running with Candle and OS Stats for AI System Analysis**:
   ```bash
-  cargo run --release --features ndi,mps,metavoice,audioplayer -- \
+  cargo run --release --features fonts,ndi,mps,metavoice,audioplayer -- \
   --candle_llm gemma \
   --model-id "2b-it" \
   --max-tokens 1000 \

@@ -120,7 +122,6 @@ RsLLM has a mission to research and explore implementing a versatile, high-perfo

 ### Priority:
 - Twitch Chat integration fixes for threading and input/output through mpsc channels async (WIP).
 - MpegTS Chat for analysis freeform over current and historical mpegts streams data.
-- Improve Image/TTS Latency and async coordination of output. Use an NDI pre-Queue for images and audio to ensure they are in sync and non-latent.
 - RAG document chromium use and caching of embeddings for augmented documentation based LLM context.
4 changes: 1 addition & 3 deletions scripts/alice.sh
@@ -16,8 +16,7 @@ MAX_TOKENS=3000
 ALIGNMENT=right
 TEMPERATURE=0.8
 POLL_INTERVAL=3000
-PIPELINE_CONCURRENCY=2
-LLM_CONCURRENCY=1
+PIPELINE_CONCURRENCY=3
 CONTEXT_SIZE=4000
 SUBTITLES=1
 DAEMON=0
@@ -51,7 +50,6 @@ DYLD_LIBRARY_PATH=`pwd`:/usr/local/lib:$DYLD_LIBRARY_PATH \
     --image-alignment $ALIGNMENT \
     --temperature $TEMPERATURE \
     --pipeline-concurrency $PIPELINE_CONCURRENCY \
-    --llm-concurrency $LLM_CONCURRENCY \
     --poll-interval $POLL_INTERVAL \
    --llm-history-size $CONTEXT_SIZE \
     $SUBTITLE_CMD \
4 changes: 2 additions & 2 deletions scripts/compile.sh
@@ -15,11 +15,11 @@ fi
 ## Build release version
 cargo build \
     --release \
-    --features mps,ndi,audioplayer,metavoice
+    --features mps,ndi,audioplayer,metavoice,fonts

 # Build debug version
 cargo build \
-    --features mps,ndi,audioplayer,metavoice
+    --features mps,ndi,audioplayer,metavoice,fonts

 if [ ! -f "target/release/rsllm" ]; then
     echo "Error building rsllm, please check output"
70 changes: 70 additions & 0 deletions scripts/twitch.sh
@@ -0,0 +1,70 @@
#!/bin/bash
#
# Alice's AI Wonderland Character:
# - Parody of Walt Disney's original Alice animations, the first ones that were published.
#
# RsLLM configuration script:
# - @2024 Christi Kennedy - This is not related to any known alices or wonderlands.
#
#

# === CONFIGURATION ===
BUILD_TYPE=release
MODEL=gemma
MODEL_ID=2b-it
MAX_TOKENS=800
ALIGNMENT=center
TEMPERATURE=0.8
CONTINUOUS=0
POLL_INTERVAL=0
PIPELINE_CONCURRENCY=3
CONTEXT_SIZE=3000
SUBTITLES=1
DAEMON=1
KEEP_HISTORY=1
GREETING="Hi I'm Alice, ask me a question!"
# === END OF CONFIGURATION ===
#
#
SUBTITLE_CMD=
DAEMON_CMD=
NO_HISTORY_CMD=
if [ "$SUBTITLES" == 1 ]; then
SUBTITLE_CMD="--subtitles"
fi
if [ "$DAEMON" == 1 ]; then
DAEMON_CMD="--daemon"
fi
if [ "$CONTINUOUS" == 1 ]; then
CONTINUOUS_CMD="--continuous"
fi
if [ "$KEEP_HISTORY" == 0 ]; then
NO_HISTORY_CMD="--no-history"
fi

SYSTEM_PROMPT="you are Alice and run Alice's AI Wonderland twitch streaming channel, you chat with twitch users in your twitch channel who ask you questions and have discussions. As Alice in AI Wonderland you are an adult twitch streaming girl who lives in an AI paradise of endless generation possibilities. Always talk in first person as the character speaking. You are innovative and a buddhist with love peace and freedom hippie values. Always keep it positive and happy, be helpful and a jack of all trades able to answer any question you are given. If you don't know for sure, just make up something funny."
PROMPT="create a story based on an anime about Alice, an adult twitch streaming girl who lives in AI Wonderland. Have it vary off the title 'Alice in AI Wonderland' with a random plotline you create based on classic anime characters appearing in the wonderland. Alice's AI Wonderland is a happy fun show where Alice goes through experiences similar to Alice in Wonderland, where she grows small or large depending on what she eats. Add in AI technology twists. Have it fully formatted like a transcript with the character speaking parts mostly speaking in first person, minimal narration. Create a whole episode full length with classic anime characters, with Alice the main character of AI Wonderland."


DYLD_LIBRARY_PATH=`pwd`:/usr/local/lib:$DYLD_LIBRARY_PATH \
RUST_BACKTRACE=full target/${BUILD_TYPE}/rsllm \
--query "$PROMPT" \
--system-prompt "$SYSTEM_PROMPT" \
--candle-llm $MODEL \
--twitch-client \
--sd-image \
--ndi-audio \
--ndi-images \
--mimic3-tts \
--model-id $MODEL_ID \
--image-alignment $ALIGNMENT \
--temperature $TEMPERATURE \
--pipeline-concurrency $PIPELINE_CONCURRENCY \
--poll-interval $POLL_INTERVAL \
--llm-history-size $CONTEXT_SIZE \
--greeting "$GREETING" \
$SUBTITLE_CMD \
$DAEMON_CMD \
$CONTINUOUS_CMD \
$NO_HISTORY_CMD \
--max-tokens $MAX_TOKENS $@
52 changes: 26 additions & 26 deletions src/args.rs
@@ -4,7 +4,7 @@ use clap::Parser;
 #[derive(Parser, Debug, Clone)]
 #[clap(
     author = "Chris Kennedy",
-    version = "0.5.1",
+    version = "0.5.2",
     about = "Rust LLM Stream Analyzer and Content Generator"
 )]
 pub struct Args {
@@ -215,24 +215,6 @@ pub struct Args {
     )]
     pub audio_chunk_size: f32,

-    /// llm concurrency
-    #[clap(
-        long,
-        env = "LLM_CONCURRENCY",
-        default_value = "1",
-        help = "llm concurrency."
-    )]
-    pub llm_concurrency: usize,
-
-    /// max_concurrent_sd_image_tasks for the sd image semaphore
-    #[clap(
-        long,
-        env = "MAX_CONCURRENT_SD_IMAGE_TASKS",
-        default_value = "8",
-        help = "max_concurrent_sd_image_tasks for the sd image semaphore."
-    )]
-    pub max_concurrent_sd_image_tasks: usize,
-
     /// Pipeline concurrency - max concurrent pipeline tasks
     #[clap(
         long,
@@ -507,12 +489,12 @@ pub struct Args {
     )]
     pub sd_image: bool,

-    /// SD Max Length for SD Image
+    /// SD Max Length in tokens for SD Image
     #[clap(
         long,
         env = "SD_MAX_LENGTH",
-        default_value_t = 80,
-        help = "SD Max Length for SD Image hardsub text segments. Will be less than this amount."
+        default_value_t = 77,
+        help = "SD Max Length in tokens for SD Image hardsub text segments. example: 77 tokens is avg 77 * 4 == 308 chars."
     )]
     pub sd_max_length: usize,

@@ -626,11 +608,20 @@ pub struct Args {
     /// shutdown_msg - message to send when shutting down
     #[clap(
         long,
-        env = "SHUTDOWN_MSG",
-        default_value = "End of Episode.\nBrought to you by GroovyLife.AI\nDeveloped by The Groovy Organization",
-        help = "shutdown_msg - message to send when shutting down."
+        env = "GREETING",
+        default_value = "Hi I'm Alice, ask me a question!",
+        help = "greeting - message to send after done speaking."
     )]
-    pub shutdown_msg: String,
+    pub greeting: String,
+
+    /// assistant image description
+    #[clap(
+        long,
+        env = "ASSISTANT_IMAGE_DESCRIPTION",
+        default_value = "A head shot of Alice from Alice in AI Wonderland. A streaming girl on twitch who is live streaming AI generated content. Similar to a magical anime girl in appearance.",
+        help = "assistant image description."
+    )]
+    pub assistant_image_prompt: String,

     /// Subtitles - enable subtitles
     #[clap(
@@ -650,6 +641,15 @@ pub struct Args {
     )]
     pub subtitle_position: String,

+    /// Continuous - continuous mode where it will keep running the query until stopped
+    #[clap(
+        long,
+        env = "CONTINUOUS",
+        default_value_t = false,
+        help = "Continuous - continuous mode where it will keep running the query until stopped."
+    )]
+    pub continuous: bool,
+
     /// enable twitch client
     #[clap(
         long,
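The 80 to 77 default change lines up with the commit message's "stable diffusion token count to avoid crashing": the CLIP text encoder that Stable Diffusion uses accepts at most 77 tokens. A rough sketch of clamping a prompt with the 4 chars per token heuristic quoted in the help text (the helper name is hypothetical, and the real code would presumably count tokenizer tokens rather than characters):

```rust
/// Hypothetical helper: clamp a prompt using the ~4 chars per token rule of
/// thumb from the --sd-max-length help text (77 tokens ~= 308 chars).
fn clamp_sd_prompt(prompt: &str, sd_max_length: usize) -> String {
    let max_chars = sd_max_length * 4;
    // chars().take() truncates on a character boundary, so multi-byte
    // characters are never split mid-codepoint.
    prompt.chars().take(max_chars).collect()
}
```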
69 changes: 69 additions & 0 deletions src/audio.rs
@@ -0,0 +1,69 @@
use minimp3::{Decoder, Frame};
use std::io::Cursor;
use std::io::Result;

/// Converts WAV PCM data to f32 samples, returning an empty sample vector (rather than an error) when the WAV data cannot be parsed.
///
/// # Arguments
/// * `wav_data` - The bytes of a WAV file.
///
/// # Returns
/// A `Result` containing a `Vec<f32>` of normalized audio samples, or an `Error`.
pub fn wav_to_f32(wav_data: Vec<u8>) -> Result<Vec<f32>> {
let cursor = Cursor::new(wav_data);
let reader_result = hound::WavReader::new(cursor);

// Check if the reader was successfully created
let reader = match reader_result {
Ok(r) => r,
Err(_) => return Ok(Vec::new()), // In case of an error, return an empty vector to match the mp3_to_f32 strategy
};

// Depending on the sample format, process the samples differently
let spec = reader.spec();
let sample_format = spec.sample_format;
let bits_per_sample = spec.bits_per_sample;

let samples = match sample_format {
hound::SampleFormat::Float => reader
.into_samples::<f32>()
.filter_map(|result_sample| result_sample.ok()) // Convert Result<f32, hound::Error> to Option<f32>, and then filter_map will filter out the None values
.collect(),

hound::SampleFormat::Int => match bits_per_sample {
16 => reader
.into_samples::<i16>()
.filter_map(|result_sample| result_sample.ok()) // Convert Result<i16, hound::Error> to Option<i16>
.map(|sample| sample as f32 / i16::MAX as f32) // Normalize
.collect(),

24 => reader
.into_samples::<i32>()
.filter_map(|result_sample| result_sample.ok()) // Convert Result<i32, hound::Error> to Option<i32>
.map(|sample| (sample >> 8) as f32 / i16::MAX as f32) // Shift and normalize for 24-bit stored in i32
.collect(),

// In case of an unsupported bit depth, return an empty vector
_ => Vec::new(),
},
};

Ok(samples)
}

pub fn mp3_to_f32(mp3_data: Vec<u8>) -> Result<Vec<f32>> {
let cursor = Cursor::new(mp3_data);
let mut decoder = Decoder::new(cursor);
let mut samples_f32 = Vec::new();

while let Ok(Frame { data, .. }) = decoder.next_frame() {
for &sample in &data {
// Convert each sample to f32; MP3 samples are typically s16.
// Normalize the s16 sample to the range [-1.0, 1.0].
let sample_f32 = sample as f32 / i16::MAX as f32;
samples_f32.push(sample_f32);
}
}

Ok(samples_f32)
}
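A quick usage sketch for the two new helpers; the `rsllm::audio` module path and the file names are assumptions:

```rust
use rsllm::audio::{mp3_to_f32, wav_to_f32}; // assumed module path

fn main() -> std::io::Result<()> {
    // Both helpers take raw file bytes and return samples normalized to
    // [-1.0, 1.0]; unparseable WAV input yields an empty vector, not an error.
    let wav_samples = wav_to_f32(std::fs::read("speech.wav")?)?;
    println!("decoded {} WAV samples", wav_samples.len());

    let mp3_samples = mp3_to_f32(std::fs::read("speech.mp3")?)?;
    println!("decoded {} MP3 samples", mp3_samples.len());
    Ok(())
}
```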
7 changes: 7 additions & 0 deletions src/candle_gemma.rs
@@ -77,6 +77,13 @@ impl TextGeneration {
             }
         }

+        // Skip the first token
+        for &t in tokens.iter() {
+            if let Some(_) = self.tokenizer.next_token(t)? {
+                break;
+            }
+        }
+
         debug!("prompt: {:?}", prompt);

         let eos_token = match self.tokenizer.get_token("<eos>") {
7 changes: 7 additions & 0 deletions src/candle_mistral.rs
@@ -85,6 +85,13 @@ impl TextGeneration {
             }
         }

+        // Skip the first token
+        for &t in tokens.iter() {
+            if let Some(_) = self.tokenizer.next_token(t)? {
+                break;
+            }
+        }
+
         debug!("prompt: {:?}", prompt);

         let eos_token = match self.tokenizer.get_token("</s>") {
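Both hunks are identical: they feed prompt token ids into the streaming detokenizer until it emits its first decodable piece, then stop. One plausible reading (an assumption, since the commit doesn't say) is that this consumes a leading prompt artifact such as a BOS token so it never shows up in the streamed output. A toy model of the loop's shape, with an invented next_token() that buffers ids:

```rust
// Toy stand-in for the streaming detokenizer; its buffering rule is invented
// for illustration and is not candle's actual TokenOutputStream behavior.
struct ToyStream {
    ids: Vec<u32>,
}

impl ToyStream {
    /// Buffer an id; pretend text becomes decodable once two ids accumulate.
    fn next_token(&mut self, id: u32) -> Option<String> {
        self.ids.push(id);
        (self.ids.len() >= 2).then(|| format!("{:?}", self.ids))
    }
}

fn main() {
    let prompt_tokens = [1u32, 42, 7];
    let mut stream = ToyStream { ids: Vec::new() };
    // Same shape as the committed loops: prime the stream, stop at first output.
    for &t in prompt_tokens.iter() {
        if let Some(piece) = stream.next_token(t) {
            println!("consumed leading piece: {piece}");
            break;
        }
    }
}
```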