From 25c3a9ae4dbd10ccb06abeed19f464c09ecf1a20 Mon Sep 17 00:00:00 2001 From: Vulcanraven <11424186+Vulcanraven91@users.noreply.github.com> Date: Fri, 15 Sep 2023 10:47:20 +0200 Subject: [PATCH 1/4] fix: team-sharing/Dockerfile to reduce vulnerabilities (#953) --- team-sharing/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/team-sharing/Dockerfile b/team-sharing/Dockerfile index 6a1d5f80..b0fc3afe 100644 --- a/team-sharing/Dockerfile +++ b/team-sharing/Dockerfile @@ -1,4 +1,4 @@ -FROM caddy:2.4.6 +FROM caddy:2.7.3 COPY ./Caddyfile /etc/caddy/Caddyfile COPY ./main.sh /usr/src/www/main.sh From 21b997ddb9311be979acbca602f1da42e5c62976 Mon Sep 17 00:00:00 2001 From: JasonGrass Date: Tue, 14 Nov 2023 11:32:10 +0800 Subject: [PATCH 2/4] =?UTF-8?q?=E2=9C=A8=20feat(TTS):=20=E6=B7=BB=E5=8A=A0?= =?UTF-8?q?=E5=AF=B9=E7=94=9F=E6=88=90=E5=86=85=E5=AE=B9=E7=9A=84=20TTS=20?= =?UTF-8?q?=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/components/tts/CollapsedAudio.tsx | 65 ++++++++++++ src/components/tts/SentenceBuilder.ts | 132 ++++++++++++++++++++++++ src/components/tts/SpeechController.tsx | 40 +++++++ src/components/tts/TtsWorker.tsx | 130 +++++++++++++++++++++++ src/components/tts/debounce.js | 14 +++ src/components/tts/tts.ts | 81 +++++++++++++++ 6 files changed, 462 insertions(+) create mode 100644 src/components/tts/CollapsedAudio.tsx create mode 100644 src/components/tts/SentenceBuilder.ts create mode 100644 src/components/tts/SpeechController.tsx create mode 100644 src/components/tts/TtsWorker.tsx create mode 100644 src/components/tts/debounce.js create mode 100644 src/components/tts/tts.ts diff --git a/src/components/tts/CollapsedAudio.tsx b/src/components/tts/CollapsedAudio.tsx new file mode 100644 index 00000000..62517261 --- /dev/null +++ b/src/components/tts/CollapsedAudio.tsx @@ -0,0 +1,65 @@ +import React, { memo, useState, useRef, useEffect, useCallback } from 'react' + +interface AudioOutputProps { + source: string +} + +/** + * 隐藏的 audio 播放器 + * 在连续设置 source 时,会自动排队,先播放完前面的内容,然后继续播放下一条 + */ +const CollapsedAudio = memo((props: AudioOutputProps) => { + const { source } = props + + const audioPlayerRef = useRef(null) + + const [sourceQueue, setSourceQueue] = useState([]) + const [currentSource, setCurrentSource] = useState('') + + // 入队 + useEffect(() => { + if (!source) { + return + } + + setSourceQueue((queue) => { + return [...queue, source] + }) + }, [source]) + + // 播放当前 source 为队头 + useEffect(() => { + if (currentSource) { + return + } + + const next = sourceQueue[0] + if (next) { + setCurrentSource(next) + } + }, [sourceQueue, currentSource]) + + const onLoadedMetadata = () => { + audioPlayerRef.current?.play() + } + + // 播放完成,出队 + const onPlayEnd = () => { + setSourceQueue((queue) => { + return queue.slice(1) + }) + setCurrentSource('') + } + + return ( + + ) +}) + +export default CollapsedAudio diff --git a/src/components/tts/SentenceBuilder.ts b/src/components/tts/SentenceBuilder.ts new file mode 100644 index 00000000..f4dd544e --- /dev/null +++ b/src/components/tts/SentenceBuilder.ts @@ -0,0 +1,132 @@ +const SENTENCE_SPLICE_CHAR = '.;!?。;!?' + +/** + * 将输入内容按照 sentence 进行分割,每个 Message 需要使用单独的实例。 + * 设计目标:在 input 不完整内容时,也尽可能先提取出已经完成的 sentence。以便可以尽快进入下一个消费流程。(类比:按流的形式进行 sentence 的生产) + * 场景:input 可能被多次调用 e.g. + * + * input("This guide") + * input("This guide will help you") + * input("This guide will help you get started with ElevenLabs.") + */ +export default class SentenceBuilder { + private cache: string[] = [] + private cursor = 0 + private left = '' + + public readonly id: Readonly = '' + + constructor(id: string) { + this.id = id + } + + public input(text: string) { + const lastCursor = this.cursor + const currentCursor = this.positionCursor(text) + 1 + + let content = text.slice(lastCursor, currentCursor) + if (!content) { + return + } + content = this.left.trim() + content + + this.left = '' // 清空上一轮的最后一个断句的记录,重新生成 + this.cursor = currentCursor + + const lines = content.split(/[\r\n]+/) + const readyLines = lines.slice(0, -1) + if (readyLines.length > 0) { + for (const line of readyLines) { + const sentences = this.line2sentences(line) + this.cache.push(...sentences) + } + } + + const last = lines[lines.length - 1] + + if (last) { + const sentences = this.line2sentences(last) + const readySentences = sentences.slice(0, -1) + this.cache.push(...readySentences) + + const left = sentences[sentences.length - 1] + + if (this.isSentence(left)) { + this.cache.push(left) + } else { + this.left = left.trim() + } + } + } + + public async *generateSentence() { + while (this.cache.length > 0) { + yield this.cache.shift() + } + yield '' + } + + public getLeft() { + const l = this.left + this.left = '' + return l + } + + private positionCursor(text: string) { + let cursor = 0 + for (let i = 0; i < text.length; i++) { + const char = text[i] + if (char === '\r' || char === '\n') { + cursor = i + continue + } + + if (i === text.length - 1 && char.match(new RegExp(`[${SENTENCE_SPLICE_CHAR}]`))) { + cursor = i + continue + } + + if ( + i < text.length - 1 && + char.match(new RegExp(`[${SENTENCE_SPLICE_CHAR}]`)) && + !new Boolean(text[i + 1]) + ) { + cursor = i + continue + } + } + + return cursor + } + + private isSentence(sentence: string) { + if (sentence.match(new RegExp(`^\\s*\\d+[${SENTENCE_SPLICE_CHAR}]\\s*$`))) { + // 类似 "1." 这样的内容 + return false + } + return sentence.match(new RegExp(`[${SENTENCE_SPLICE_CHAR}]\\s*$`)) + } + + private line2sentences(line: string) { + if (line.length < 50) { + return [line] + } + + const sentences = line.split(new RegExp(`(?<=[${SENTENCE_SPLICE_CHAR}])\\s+`)) + + const result: string[] = [] + + let next = '' + for (const sentence of sentences) { + next += sentence + ' ' + if (next.length > 10) { + result.push(next.trim()) + next = '' + } + } + if (next.trim()) { + result.push(next.trim()) + } + return result + } +} diff --git a/src/components/tts/SpeechController.tsx b/src/components/tts/SpeechController.tsx new file mode 100644 index 00000000..d39a5830 --- /dev/null +++ b/src/components/tts/SpeechController.tsx @@ -0,0 +1,40 @@ +import React, { memo, useMemo } from 'react' +import TtsWorker from './TtsWorker' + +interface Props { + text: string + messageId: string + enable: boolean +} + +const key = '' +const id = '' + +const LAST_BREAK_TIMEOUT = 3000 +const SENTENCE_MERGE_WAIT_TIME = 1000 + +const SpeechController = memo((props: Props) => { + const { enable } = props + + // TODO 从设置中读取配置 + const options = useMemo(() => { + return { + elevenlabsKey: key, + elevenlabsVoiceId: id, + breakSentenceTimeout: LAST_BREAK_TIMEOUT, + mergeSentenceTimeout: SENTENCE_MERGE_WAIT_TIME, + } + }, []) + + if (!enable) { + return null + } + + return ( +
+ +
+ ) +}) + +export default SpeechController diff --git a/src/components/tts/TtsWorker.tsx b/src/components/tts/TtsWorker.tsx new file mode 100644 index 00000000..4fb8201c --- /dev/null +++ b/src/components/tts/TtsWorker.tsx @@ -0,0 +1,130 @@ +import React, { memo, useState, useRef, useEffect, useCallback } from 'react' + +import { speech } from './tts' +import debounce from './debounce' + +import CollapsedAudio from './CollapsedAudio' +import SentenceBuilder from './SentenceBuilder' + +interface Props { + text: string + messageId: string + options: { + elevenlabsKey: string + elevenlabsVoiceId: string + /** + * 最后一个不完整句子的等待时间,在这个时间之后,会直接读取不完整的句子(如果存在,一般 GPT 反馈的句子都有结束标点符号),进行 tts + */ + breakSentenceTimeout: number + /** + * 句子之间的等待时间,如果句子之间的生成时间小于这个时间,则会被合并。减少请求 tts 服务器的次数。数值越小,可能的 tts 请求越多。 + */ + mergeSentenceTimeout: number + } +} + +/** + * 一个不占据实际空间的 TTS 组件 + */ +const TtsWorker = memo((props: Props) => { + const { text, messageId, options } = props + + // 等待被消费(发音)的句子 + const [speechSentence, setSpeechSentence] = useState([]) + + // 音频 source, 此内容可以快速更新,不必等待音频播放完成(CollapsedAudio 组件会自行处理) + const [audioSource, setAudioSource] = useState('') + + // 用于处理最后可能存在的断句(没有结束标点符号的句子) + const [timeoutId, setTimeoutId] = useState() + + // text to sentences 生成器 + const [sentenceBuilder, setSentenceBuilder] = useState() + + // 每个 messageId 需要有不同的 SentenceBuilder 实例 + useEffect(() => { + if (!messageId || sentenceBuilder?.id === messageId) { + return + } + setSentenceBuilder(new SentenceBuilder(messageId)) + }, [messageId]) + + // 从 SentenceBuilder 获取已经生成的 sentence + const fetchSentences = useCallback(async () => { + if (!sentenceBuilder) { + return + } + + clearTimeout(timeoutId) + const sentences: string[] = [] + const generator = sentenceBuilder.generateSentence() + + let continueGenerator = true + do { + const sentence = await generator.next() + if (sentence.value) { + sentences.push(sentence.value) + } else { + continueGenerator = false + } + } while (continueGenerator) + + setSpeechSentence((originSentences) => { + return originSentences.concat(sentences) + }) + + const tid = setTimeout(() => { + const lastSentence = sentenceBuilder.getLeft() + if (lastSentence) { + setSpeechSentence((originSentences) => { + return [...originSentences, lastSentence] + }) + } + }, options.breakSentenceTimeout) + setTimeoutId(tid) + }, [sentenceBuilder]) + + // 调用 tts 服务,获取 sentence 的音频 + const speechSentenceHandler = useCallback<(_: string[]) => void>( + debounce((sentences: string[]) => { + setSpeechSentence([]) + + const longSentence = sentences.join(' ') + console.log('[speech] ', longSentence) + + speech(longSentence, { + apiKey: options.elevenlabsKey, + voiceId: options.elevenlabsVoiceId, + }) + .then(setAudioSource) + .catch(console.error) // TODO 界面提示 + }, options.mergeSentenceTimeout), + [], + ) + + // text to sentences 生产 + useEffect(() => { + if (!sentenceBuilder) { + return + } + if (sentenceBuilder.id !== messageId) { + return + } + + sentenceBuilder.input(text) + fetchSentences() + }, [text, sentenceBuilder]) + + // sentences to speech 消费 + useEffect(() => { + if (speechSentence.length < 1) { + return + } + const sentences = [...speechSentence] + speechSentenceHandler(sentences) + }, [speechSentence]) + + return +}) + +export default TtsWorker diff --git a/src/components/tts/debounce.js b/src/components/tts/debounce.js new file mode 100644 index 00000000..5c0ae470 --- /dev/null +++ b/src/components/tts/debounce.js @@ -0,0 +1,14 @@ +export default function debounce(handler, delay = 500) { + let timer = null + + return function () { + if (timer) { + clearTimeout(timer) + } + + timer = setTimeout(() => { + handler?.apply(this, arguments) + timer = null + }, delay) + } +} diff --git a/src/components/tts/tts.ts b/src/components/tts/tts.ts new file mode 100644 index 00000000..9c2348e2 --- /dev/null +++ b/src/components/tts/tts.ts @@ -0,0 +1,81 @@ +// 参考:[Virtuo1/talk-to-gpt](https://github.com/Virtuo1/talk-to-gpt ) + +class Elevenlabs { + public voiceId = '' + public apiKey = '' + + public async getVoices() { + const response = await fetch(`https://api.elevenlabs.io/v1/voices`, { + method: 'GET', + headers: { + 'xi-api-key': this.apiKey, + }, + }) + + if (response.ok) { + const data = await response.json() + return data.voices + } else { + throw new Error("Couldn't fetch voices, is your elevenlabs key correct?") + } + } + + public async *textToSpeak(sentences: string[]) { + for (const sentence of sentences) { + const response = await this.fetchAudio(sentence) + yield response + } + } + + public async fetchAudio(message: string) { + const response = await fetch(`https://api.elevenlabs.io/v1/text-to-speech/${this.voiceId}`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + 'xi-api-key': this.apiKey, + }, + body: JSON.stringify({ + text: message, + model_id: 'eleven_multilingual_v2', + voice_settings: { + stability: 0.5, + similarity_boost: 0.5, + }, + }), + }) + + if (response.ok) { + const blob = await response.blob() + return URL.createObjectURL(blob) + } else { + throw new Error("Couldn't fetch audio") + } + } +} + +const elevenlabs = new Elevenlabs() + +interface ElevenlabsOptions { + apiKey: string + voiceId: string +} + +export function speechMulti(sentences: string[], options: ElevenlabsOptions) { + elevenlabs.apiKey = options.apiKey + elevenlabs.voiceId = options.voiceId + return elevenlabs.textToSpeak(sentences) +} + +export async function speech(sentence: string, options: ElevenlabsOptions) { + if (!sentence) { + return '' + } + + if (!options.apiKey || !options.voiceId) { + throw new Error('apiKey or voiceId is empty') + } + + elevenlabs.apiKey = options.apiKey + elevenlabs.voiceId = options.voiceId + return await elevenlabs.fetchAudio(sentence) +} From a5d4a02c08203cf34782eff66e55eb3f88468000 Mon Sep 17 00:00:00 2001 From: JasonGrass Date: Tue, 14 Nov 2023 11:32:38 +0800 Subject: [PATCH 3/4] =?UTF-8?q?=E2=9C=A8=20feat(TTS):=20=E5=B0=86=20TTS=20?= =?UTF-8?q?=E6=94=AF=E6=8C=81=E5=BC=95=E5=85=A5=20APP=20=E4=B8=AD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/App.tsx | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/App.tsx b/src/App.tsx index cd5e4527..973bcb6f 100644 --- a/src/App.tsx +++ b/src/App.tsx @@ -61,6 +61,7 @@ import { SortableContext, arrayMove, sortableKeyboardCoordinates, verticalListSo import { restrictToVerticalAxis } from '@dnd-kit/modifiers' import { SortableItem } from './components/SortableItem' import InputBox from './components/InputBox' +import SpeechController from './components/tts/SpeechController' function Main() { const { t } = useTranslation() @@ -281,6 +282,9 @@ function Main() { } } + const [speechText, setSpeechText] = useState('') + const [speechMessageId, setSpeechMessageId] = useState('') + const generate = async (session: Session, promptMsgs: Message[], targetMsg: Message) => { messageScrollRef.current = { msgId: targetMsg.id, smooth: false } await llm.chat( @@ -305,6 +309,8 @@ function Main() { } } store.updateChatSession(session) + setSpeechText(text) + setSpeechMessageId(targetMsg.id) }, (err) => { for (let i = 0; i < session.messages.length; i++) { @@ -757,6 +763,7 @@ function Main() { /> ))} + ) } From 84087d7e350652c88c8f2398af40ad6fb679f2a7 Mon Sep 17 00:00:00 2001 From: JasonGrass Date: Tue, 14 Nov 2023 11:37:44 +0800 Subject: [PATCH 4/4] =?UTF-8?q?todo:=20=E5=B1=8F=E8=94=BD=20TTS=20?= =?UTF-8?q?=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/components/tts/SpeechController.tsx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/components/tts/SpeechController.tsx b/src/components/tts/SpeechController.tsx index d39a5830..03ea2ccc 100644 --- a/src/components/tts/SpeechController.tsx +++ b/src/components/tts/SpeechController.tsx @@ -30,6 +30,9 @@ const SpeechController = memo((props: Props) => { return null } + // TODO 因为还有配置相关的工作未完成,这里先返回 null + return null + return (