From 25c3a9ae4dbd10ccb06abeed19f464c09ecf1a20 Mon Sep 17 00:00:00 2001
From: Vulcanraven <11424186+Vulcanraven91@users.noreply.github.com>
Date: Fri, 15 Sep 2023 10:47:20 +0200
Subject: [PATCH 1/4] fix: team-sharing/Dockerfile to reduce vulnerabilities
 (#953)

---
 team-sharing/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/team-sharing/Dockerfile b/team-sharing/Dockerfile
index 6a1d5f80..b0fc3afe 100644
--- a/team-sharing/Dockerfile
+++ b/team-sharing/Dockerfile
@@ -1,4 +1,4 @@
-FROM caddy:2.4.6
+FROM caddy:2.7.3
 
 COPY ./Caddyfile /etc/caddy/Caddyfile
 COPY ./main.sh /usr/src/www/main.sh

From 21b997ddb9311be979acbca602f1da42e5c62976 Mon Sep 17 00:00:00 2001
From: JasonGrass <junjiegrass@qq.com>
Date: Tue, 14 Nov 2023 11:32:10 +0800
Subject: [PATCH 2/4] =?UTF-8?q?=E2=9C=A8=20feat(TTS):=20=E6=B7=BB=E5=8A=A0?=
 =?UTF-8?q?=E5=AF=B9=E7=94=9F=E6=88=90=E5=86=85=E5=AE=B9=E7=9A=84=20TTS=20?=
 =?UTF-8?q?=E6=94=AF=E6=8C=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/components/tts/CollapsedAudio.tsx   |  65 ++++++++++++
 src/components/tts/SentenceBuilder.ts   | 132 ++++++++++++++++++++++++
 src/components/tts/SpeechController.tsx |  40 +++++++
 src/components/tts/TtsWorker.tsx        | 130 +++++++++++++++++++++++
 src/components/tts/debounce.js          |  14 +++
 src/components/tts/tts.ts               |  81 +++++++++++++++
 6 files changed, 462 insertions(+)
 create mode 100644 src/components/tts/CollapsedAudio.tsx
 create mode 100644 src/components/tts/SentenceBuilder.ts
 create mode 100644 src/components/tts/SpeechController.tsx
 create mode 100644 src/components/tts/TtsWorker.tsx
 create mode 100644 src/components/tts/debounce.js
 create mode 100644 src/components/tts/tts.ts

diff --git a/src/components/tts/CollapsedAudio.tsx b/src/components/tts/CollapsedAudio.tsx
new file mode 100644
index 00000000..62517261
--- /dev/null
+++ b/src/components/tts/CollapsedAudio.tsx
@@ -0,0 +1,65 @@
+import React, { memo, useState, useRef, useEffect, useCallback } from 'react'
+
+interface AudioOutputProps {
+    /** URL of the newest clip; every new non-empty value is queued for playback. */
+    source: string
+}
+
+/**
+ * Hidden <audio> player. Successive `source` values are queued and played one
+ * after another; a clip that fails to load or play is skipped so the queue
+ * never stalls. blob: URLs are revoked after use; no src is set while idle.
+ */
+const CollapsedAudio = memo((props: AudioOutputProps) => {
+    const { source } = props
+    const audioPlayerRef = useRef<HTMLAudioElement>(null)
+    const [sourceQueue, setSourceQueue] = useState<string[]>([])
+    const [currentSource, setCurrentSource] = useState('')
+
+    // Enqueue every new non-empty source.
+    useEffect(() => {
+        if (source) {
+            setSourceQueue((queue) => [...queue, source])
+        }
+    }, [source])
+
+    // When idle, promote the head of the queue to the playing source.
+    useEffect(() => {
+        const next = sourceQueue[0]
+        if (!currentSource && next) {
+            setCurrentSource(next)
+        }
+    }, [sourceQueue, currentSource])
+
+    // Clip finished or failed: release it and dequeue so the next one can start.
+    // Updates are keyed on the clip so a late or duplicate event cannot drop another one.
+    const onPlayEnd = () => {
+        const done = currentSource
+        if (done.startsWith('blob:')) {
+            URL.revokeObjectURL(done)
+        }
+        setSourceQueue((queue) => (queue[0] === done ? queue.slice(1) : queue))
+        setCurrentSource((current) => (current === done ? '' : current))
+    }
+
+    const onLoadedMetadata = () => {
+        // play() rejects when playback is refused (e.g. autoplay policy); skip, don't stall.
+        audioPlayerRef.current?.play().catch((err: unknown) => {
+            console.error(err)
+            onPlayEnd()
+        })
+    }
+
+    return (
+        <audio
+            id="audio-player"
+            ref={audioPlayerRef}
+            onLoadedMetadata={onLoadedMetadata}
+            onEnded={onPlayEnd}
+            onError={onPlayEnd}
+            src={currentSource || undefined}
+        ></audio>
+    )
+})
+
+export default CollapsedAudio
diff --git a/src/components/tts/SentenceBuilder.ts b/src/components/tts/SentenceBuilder.ts
new file mode 100644
index 00000000..f4dd544e
--- /dev/null
+++ b/src/components/tts/SentenceBuilder.ts
@@ -0,0 +1,132 @@
+const SENTENCE_SPLICE_CHAR = '.;!?。；！？'
+// Patterns are compiled once here instead of on every character or call.
+const SENTENCE_END_CHAR = new RegExp(`[${SENTENCE_SPLICE_CHAR}]`)
+const SENTENCE_END = new RegExp(`[${SENTENCE_SPLICE_CHAR}]\\s*$`)
+const LIST_MARKER = new RegExp(`^\\s*\\d+[${SENTENCE_SPLICE_CHAR}]\\s*$`)
+const SENTENCE_GAP = new RegExp(`(?<=[${SENTENCE_SPLICE_CHAR}])\\s+`)
+const WHITESPACE = /\s/
+
+/**
+ * Splits streamed text into sentences; use one instance per message.
+ * `input` takes the whole text received so far on every call, e.g.
+ * input("This guide"), then input("This guide will help you get started.").
+ * Completed sentences are available from `generateSentence` right away, so
+ * they can be consumed while the message is still streaming in.
+ */
+export default class SentenceBuilder {
+    private cache: string[] = [] // completed sentences not yet handed out
+    private cursor = 0 // index up to which the text has been consumed
+    private left = '' // consumed fragment that is not a full sentence yet, e.g. "1."
+    private latest = '' // latest text given to input; getLeft flushes its unconsumed tail
+
+    public readonly id: Readonly<string> = ''
+
+    constructor(id: string) {
+        this.id = id
+    }
+
+    /** Feeds the whole text received so far (not a delta) and queues each newly completed sentence. */
+    public input(text: string) {
+        this.latest = text
+        const lastCursor = this.cursor
+        const currentCursor = this.positionCursor(text)
+        if (currentCursor <= lastCursor) {
+            return // no new cut point yet
+        }
+
+        // Prepend the fragment held back by the previous call.
+        const content = this.left.trim() + text.slice(lastCursor, currentCursor)
+        this.left = ''
+        this.cursor = currentCursor
+        const lines = content.split(/[\r\n]+/)
+        const last = lines.pop() ?? ''
+        // A line followed by a line break is complete whatever it ends with.
+        for (const line of lines) {
+            this.enqueue(this.line2sentences(line))
+        }
+        if (!last) {
+            return
+        }
+        // The final piece may be a fragment such as a list marker; hold it back.
+        const sentences = this.line2sentences(last)
+        const tail = sentences.pop() ?? ''
+        this.enqueue(sentences)
+        if (this.isSentence(tail)) {
+            this.enqueue([tail])
+        } else {
+            this.left = tail.trim()
+        }
+    }
+
+    /** Yields the queued sentences in order, then a final '' marking the queue as drained. */
+    public async *generateSentence() {
+        while (this.cache.length > 0) {
+            yield this.cache.shift()
+        }
+        yield ''
+    }
+
+    /**
+     * Returns and consumes everything not yet emitted as a sentence: the held-back fragment plus
+     * any text after the last cut point. Flushes a tail that never got closing punctuation.
+     */
+    public getLeft() {
+        const rest = (this.left + this.latest.slice(this.cursor)).trim()
+        this.left = ''
+        this.cursor = Math.max(this.cursor, this.latest.length)
+        return rest
+    }
+
+    /**
+     * Returns the index just past the last point where `text` can be cut: a
+     * line break, or sentence punctuation that ends the text or is followed
+     * by whitespace. Returns 0 when there is none beyond the current cursor.
+     */
+    private positionCursor(text: string) {
+        for (let i = text.length - 1; i >= this.cursor; i--) {
+            const char = text[i]
+            const next = text[i + 1]
+            if (char === '\r' || char === '\n') {
+                return i + 1
+            }
+            if (SENTENCE_END_CHAR.test(char) && (next === undefined || WHITESPACE.test(next))) {
+                return i + 1
+            }
+        }
+        return 0
+    }
+
+    /** True when the text ends with sentence punctuation and is not a bare list marker like "1.". */
+    private isSentence(sentence: string) {
+        return !LIST_MARKER.test(sentence) && SENTENCE_END.test(sentence)
+    }
+
+    /**
+     * Splits one line into sentences. Lines under 50 characters stay whole;
+     * longer ones are cut after punctuation followed by whitespace, and the
+     * pieces are merged until a chunk is longer than 10 characters.
+     */
+    private line2sentences(line: string) {
+        if (line.length < 50) {
+            return [line]
+        }
+        const result: string[] = []
+        let next = ''
+        for (const sentence of line.split(SENTENCE_GAP)) {
+            next += sentence + ' '
+            if (next.length > 10) {
+                result.push(next.trim())
+                next = ''
+            }
+        }
+        if (next.trim()) {
+            result.push(next.trim())
+        }
+        return result
+    }
+
+    /** Queues sentences, dropping blank ones: a '' in the queue would read as "drained". */
+    private enqueue(sentences: string[]) {
+        this.cache.push(...sentences.filter((sentence) => sentence.trim() !== ''))
+    }
+}
diff --git a/src/components/tts/SpeechController.tsx b/src/components/tts/SpeechController.tsx
new file mode 100644
index 00000000..d39a5830
--- /dev/null
+++ b/src/components/tts/SpeechController.tsx
@@ -0,0 +1,40 @@
+import React, { memo, useMemo } from 'react'
+import TtsWorker from './TtsWorker'
+
+interface Props {
+    text: string
+    messageId: string
+    enable: boolean
+}
+
+const key = '' // placeholder ElevenLabs API key (see the TODO below)
+const id = '' // placeholder ElevenLabs voice id
+
+const LAST_BREAK_TIMEOUT = 3000 // ms; passed to TtsWorker as breakSentenceTimeout
+const SENTENCE_MERGE_WAIT_TIME = 1000 // ms; passed to TtsWorker as mergeSentenceTimeout
+/** Wraps TtsWorker with fixed options; renders nothing when `enable` is false. */
+const SpeechController = memo((props: Props) => {
+    const { enable } = props
+
+    // TODO: read this configuration from the settings
+    const options = useMemo(() => {
+        return {
+            elevenlabsKey: key,
+            elevenlabsVoiceId: id,
+            breakSentenceTimeout: LAST_BREAK_TIMEOUT,
+            mergeSentenceTimeout: SENTENCE_MERGE_WAIT_TIME,
+        }
+    }, [])
+
+    if (!enable) {
+        return null
+    }
+
+    return (
+        <div>
+            <TtsWorker text={props.text} messageId={props.messageId} options={options}></TtsWorker>
+        </div>
+    )
+})
+
+export default SpeechController
diff --git a/src/components/tts/TtsWorker.tsx b/src/components/tts/TtsWorker.tsx
new file mode 100644
index 00000000..4fb8201c
--- /dev/null
+++ b/src/components/tts/TtsWorker.tsx
@@ -0,0 +1,130 @@
+import React, { memo, useState, useRef, useEffect, useCallback } from 'react'
+
+import { speech } from './tts'
+import debounce from './debounce'
+
+import CollapsedAudio from './CollapsedAudio'
+import SentenceBuilder from './SentenceBuilder'
+
+interface Props {
+    text: string
+    messageId: string
+    options: {
+        elevenlabsKey: string
+        elevenlabsVoiceId: string
+        /**
+         * How long to wait for a trailing incomplete sentence. After this time the incomplete sentence (if there is one; sentences returned by GPT normally end with punctuation) is read as it is and sent to TTS.
+         */
+        breakSentenceTimeout: number
+        /**
+         * Wait time between sentences. Sentences generated less than this time apart are merged, which reduces the number of requests to the TTS server. The smaller the value, the more TTS requests are likely.
+         */
+        mergeSentenceTimeout: number
+    }
+}
+
+/**
+ * Non-visual TTS component: splits the streamed `text` of a message into
+ * sentences, requests speech for them and passes the audio to CollapsedAudio.
+ */
+const TtsWorker = memo((props: Props) => {
+    const { text, messageId, options } = props
+
+    // Sentences waiting to be consumed (spoken).
+    const [speechSentence, setSpeechSentence] = useState<string[]>([])
+
+    // Audio source; may change before playback ends because CollapsedAudio
+    // queues the clips itself.
+    const [audioSource, setAudioSource] = useState('')
+
+    // Timer that flushes a trailing fragment lacking closing punctuation. Kept in a ref, not
+    // state, so fetchSentences always clears the latest timer rather than a stale closure value.
+    const timeoutRef = useRef<ReturnType<typeof setTimeout> | undefined>(undefined)
+
+    // text -> sentences producer
+    const [sentenceBuilder, setSentenceBuilder] = useState<SentenceBuilder>()
+
+    // Each messageId needs its own SentenceBuilder instance.
+    useEffect(() => {
+        if (!messageId || sentenceBuilder?.id === messageId) {
+            return
+        }
+        setSentenceBuilder(new SentenceBuilder(messageId))
+    }, [messageId])
+
+    // Cancel a pending flush when the component unmounts.
+    useEffect(() => () => clearTimeout(timeoutRef.current), [])
+
+    // Collect the sentences SentenceBuilder has completed so far.
+    const fetchSentences = useCallback(async () => {
+        if (!sentenceBuilder) {
+            return
+        }
+
+        clearTimeout(timeoutRef.current)
+        const sentences: string[] = []
+        for await (const sentence of sentenceBuilder.generateSentence()) {
+            if (!sentence) {
+                break // a falsy value marks the end of what is ready
+            }
+            sentences.push(sentence)
+        }
+
+        setSpeechSentence((originSentences) => {
+            return originSentences.concat(sentences)
+        })
+
+        // Overlapping calls can both get here; clear again so only one timer stays armed.
+        clearTimeout(timeoutRef.current)
+        timeoutRef.current = setTimeout(() => {
+            const lastSentence = sentenceBuilder.getLeft()
+            if (lastSentence) {
+                setSpeechSentence((originSentences) => {
+                    return [...originSentences, lastSentence]
+                })
+            }
+        }, options.breakSentenceTimeout)
+    }, [sentenceBuilder, options.breakSentenceTimeout])
+
+    // Call the TTS service to get audio for the pending sentences. Debounced so that
+    // sentences arriving within mergeSentenceTimeout are sent as one request.
+    const speechSentenceHandler = useCallback<(_: string[]) => void>(
+        debounce((sentences: string[]) => {
+            setSpeechSentence([])
+
+            const longSentence = sentences.join(' ')
+            console.log('[speech] ', longSentence)
+
+            speech(longSentence, {
+                apiKey: options.elevenlabsKey,
+                voiceId: options.elevenlabsVoiceId,
+            })
+                .then(setAudioSource)
+                .catch(console.error) // TODO: surface the error in the UI
+        }, options.mergeSentenceTimeout),
+        [],
+    )
+
+    // text -> sentences production
+    useEffect(() => {
+        if (!sentenceBuilder || sentenceBuilder.id !== messageId) {
+            return
+        }
+
+        sentenceBuilder.input(text)
+        fetchSentences()
+    }, [text, sentenceBuilder])
+
+    // sentences -> speech consumption
+    useEffect(() => {
+        if (speechSentence.length < 1) {
+            return
+        }
+        const sentences = [...speechSentence]
+        speechSentenceHandler(sentences)
+    }, [speechSentence])
+
+    return <CollapsedAudio source={audioSource} />
+})
+
+export default TtsWorker
diff --git a/src/components/tts/debounce.js b/src/components/tts/debounce.js
new file mode 100644
index 00000000..5c0ae470
--- /dev/null
+++ b/src/components/tts/debounce.js
@@ -0,0 +1,14 @@
+// Returns a wrapper that postpones `handler` until `delay` ms pass without another call.
+export default function debounce(handler, delay = 500) {
+    let pending = null
+
+    return function (...args) {
+        if (pending) {
+            clearTimeout(pending)
+        }
+        pending = setTimeout(() => {
+            handler?.apply(this, args)
+            pending = null
+        }, delay)
+    }
+}
diff --git a/src/components/tts/tts.ts b/src/components/tts/tts.ts
new file mode 100644
index 00000000..9c2348e2
--- /dev/null
+++ b/src/components/tts/tts.ts
@@ -0,0 +1,81 @@
+// Reference: [Virtuo1/talk-to-gpt](https://github.com/Virtuo1/talk-to-gpt)
+
+/** Minimal ElevenLabs REST client; `apiKey` and `voiceId` must be set before a request is made. */
+class Elevenlabs {
+    public voiceId = ''
+    public apiKey = ''
+
+    /** Fetches the voices available to `apiKey`; throws when the response is not OK. */
+    public async getVoices() {
+        const response = await fetch(`https://api.elevenlabs.io/v1/voices`, {
+            method: 'GET',
+            headers: { 'xi-api-key': this.apiKey },
+        })
+        if (!response.ok) {
+            throw new Error("Couldn't fetch voices, is your elevenlabs key correct?")
+        }
+        const data = await response.json()
+        return data.voices
+    }
+
+    /** Lazily yields one audio URL per sentence, requesting them one at a time. */
+    public async *textToSpeak(sentences: string[]) {
+        for (const sentence of sentences) {
+            yield await this.fetchAudio(sentence)
+        }
+    }
+
+    /**
+     * Requests speech for `message`; resolves to an object URL for the audio blob and throws when
+     * the response is not OK. The voice id is URL-encoded so '/' or '?' cannot alter the path.
+     */
+    public async fetchAudio(message: string) {
+        const voice = encodeURIComponent(this.voiceId)
+        const response = await fetch(`https://api.elevenlabs.io/v1/text-to-speech/${voice}`, {
+            method: 'POST',
+            headers: {
+                'Content-Type': 'application/json',
+                'xi-api-key': this.apiKey,
+            },
+            body: JSON.stringify({
+                text: message,
+                model_id: 'eleven_multilingual_v2',
+                voice_settings: {
+                    stability: 0.5,
+                    similarity_boost: 0.5,
+                },
+            }),
+        })
+        if (!response.ok) {
+            throw new Error("Couldn't fetch audio")
+        }
+        return URL.createObjectURL(await response.blob())
+    }
+}
+
+const elevenlabs = new Elevenlabs() // one client instance reused by speech() and speechMulti()
+// Credentials that speech() and speechMulti() copy onto that client before a request.
+interface ElevenlabsOptions {
+    apiKey: string
+    voiceId: string
+}
+
+// Point the shared client at the caller's credentials, then stream one audio URL per sentence.
+export function speechMulti(sentences: string[], options: ElevenlabsOptions) {
+    Object.assign(elevenlabs, { apiKey: options.apiKey, voiceId: options.voiceId })
+    return elevenlabs.textToSpeak(sentences)
+}
+
+// Synthesises `sentence` via the shared client: resolves to '' for empty text,
+// rejects when apiKey or voiceId is missing, else resolves to the audio URL.
+export async function speech(sentence: string, options: ElevenlabsOptions) {
+    if (!sentence) {
+        return ''
+    }
+    const { apiKey, voiceId } = options
+    if (!(apiKey && voiceId)) {
+        throw new Error('apiKey or voiceId is empty')
+    }
+    Object.assign(elevenlabs, { apiKey, voiceId })
+    return await elevenlabs.fetchAudio(sentence)
+}

From a5d4a02c08203cf34782eff66e55eb3f88468000 Mon Sep 17 00:00:00 2001
From: JasonGrass <junjiegrass@qq.com>
Date: Tue, 14 Nov 2023 11:32:38 +0800
Subject: [PATCH 3/4] =?UTF-8?q?=E2=9C=A8=20feat(TTS):=20=E5=B0=86=20TTS=20?=
 =?UTF-8?q?=E6=94=AF=E6=8C=81=E5=BC=95=E5=85=A5=20APP=20=E4=B8=AD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/App.tsx | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/App.tsx b/src/App.tsx
index cd5e4527..973bcb6f 100644
--- a/src/App.tsx
+++ b/src/App.tsx
@@ -61,6 +61,7 @@ import { SortableContext, arrayMove, sortableKeyboardCoordinates, verticalListSo
 import { restrictToVerticalAxis } from '@dnd-kit/modifiers'
 import { SortableItem } from './components/SortableItem'
 import InputBox from './components/InputBox'
+import SpeechController from './components/tts/SpeechController'
 
 function Main() {
     const { t } = useTranslation()
@@ -281,6 +282,9 @@ function Main() {
         }
     }
 
+    const [speechText, setSpeechText] = useState('')
+    const [speechMessageId, setSpeechMessageId] = useState('')
+
     const generate = async (session: Session, promptMsgs: Message[], targetMsg: Message) => {
         messageScrollRef.current = { msgId: targetMsg.id, smooth: false }
         await llm.chat(
@@ -305,6 +309,8 @@ function Main() {
                     }
                 }
                 store.updateChatSession(session)
+                setSpeechText(text)
+                setSpeechMessageId(targetMsg.id)
             },
             (err) => {
                 for (let i = 0; i < session.messages.length; i++) {
@@ -757,6 +763,7 @@ function Main() {
                     />
                 ))}
             </Grid>
+            <SpeechController text={speechText} messageId={speechMessageId} enable></SpeechController>
         </Box>
     )
 }

From 84087d7e350652c88c8f2398af40ad6fb679f2a7 Mon Sep 17 00:00:00 2001
From: JasonGrass <junjiegrass@qq.com>
Date: Tue, 14 Nov 2023 11:37:44 +0800
Subject: [PATCH 4/4] =?UTF-8?q?todo:=20=E5=B1=8F=E8=94=BD=20TTS=20?=
 =?UTF-8?q?=E6=94=AF=E6=8C=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/components/tts/SpeechController.tsx | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/components/tts/SpeechController.tsx b/src/components/tts/SpeechController.tsx
index d39a5830..03ea2ccc 100644
--- a/src/components/tts/SpeechController.tsx
+++ b/src/components/tts/SpeechController.tsx
@@ -30,6 +30,9 @@ const SpeechController = memo((props: Props) => {
         return null
     }
 
+    // TODO: the configuration work is not finished yet, so return null here for now
+    return null
+
     return (
         <div>
             <TtsWorker text={props.text} messageId={props.messageId} options={options}></TtsWorker>