Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Auto TTS #1031

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/App.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ import { SortableContext, arrayMove, sortableKeyboardCoordinates, verticalListSo
import { restrictToVerticalAxis } from '@dnd-kit/modifiers'
import { SortableItem } from './components/SortableItem'
import InputBox from './components/InputBox'
import SpeechController from './components/tts/SpeechController'

function Main() {
const { t } = useTranslation()
Expand Down Expand Up @@ -281,6 +282,9 @@ function Main() {
}
}

const [speechText, setSpeechText] = useState('')
const [speechMessageId, setSpeechMessageId] = useState('')

const generate = async (session: Session, promptMsgs: Message[], targetMsg: Message) => {
messageScrollRef.current = { msgId: targetMsg.id, smooth: false }
await llm.chat(
Expand All @@ -305,6 +309,8 @@ function Main() {
}
}
store.updateChatSession(session)
setSpeechText(text)
setSpeechMessageId(targetMsg.id)
},
(err) => {
for (let i = 0; i < session.messages.length; i++) {
Expand Down Expand Up @@ -757,6 +763,7 @@ function Main() {
/>
))}
</Grid>
<SpeechController text={speechText} messageId={speechMessageId} enable></SpeechController>
</Box>
)
}
Expand Down
65 changes: 65 additions & 0 deletions src/components/tts/CollapsedAudio.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import React, { memo, useState, useRef, useEffect, useCallback } from 'react'

interface AudioOutputProps {
  /** Value used as the `src` of the hidden audio element; every new non-empty value is queued. */
  source: string
}

/**
 * Hidden `<audio>` player with a first-in-first-out queue.
 *
 * Each time the `source` prop changes to a new non-empty value it is appended
 * to the queue. Clips are played one at a time: the next clip starts only
 * after the current one has ended (or has failed to load).
 */
const CollapsedAudio = memo((props: AudioOutputProps) => {
  const { source } = props

  const audioPlayerRef = useRef<HTMLAudioElement>(null)

  const [sourceQueue, setSourceQueue] = useState<string[]>([])
  const [currentSource, setCurrentSource] = useState('')

  // Enqueue: append every new non-empty source to the tail of the queue.
  useEffect(() => {
    if (!source) {
      return
    }
    setSourceQueue((queue) => [...queue, source])
  }, [source])

  // When nothing is playing, promote the head of the queue to the current source.
  useEffect(() => {
    if (currentSource) {
      return
    }
    const next = sourceQueue[0]
    if (next) {
      setCurrentSource(next)
    }
  }, [sourceQueue, currentSource])

  // Drop the head of the queue and clear the current source so the effect
  // above can pick up the next clip.
  const dequeue = useCallback(() => {
    setSourceQueue((queue) => queue.slice(1))
    setCurrentSource('')
  }, [])

  const onLoadedMetadata = () => {
    // play() returns a promise; without a handler a rejected playback
    // (blocked autoplay, unsupported media, ...) surfaces as an unhandled rejection.
    audioPlayerRef.current?.play().catch((err: unknown) => {
      console.error('[CollapsedAudio] failed to start playback', err)
    })
  }

  // A clip that cannot be loaded never fires `ended`; skip it so it does not
  // block the rest of the queue.
  const onLoadError = (event: React.SyntheticEvent<HTMLAudioElement>) => {
    // Between clips the element has an empty src, which can itself raise an
    // error event. That one must not consume a queued clip.
    if (!event.currentTarget.getAttribute('src')) {
      return
    }
    console.error('[CollapsedAudio] failed to load audio source, skipping it')
    dequeue()
  }

  return (
    <audio
      id="audio-player"
      ref={audioPlayerRef}
      onLoadedMetadata={onLoadedMetadata}
      onEnded={dequeue}
      onError={onLoadError}
      src={currentSource}
    ></audio>
  )
})

export default CollapsedAudio
132 changes: 132 additions & 0 deletions src/components/tts/SentenceBuilder.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
const SENTENCE_SPLICE_CHAR = '.;!?。;!?'

// Patterns derived from SENTENCE_SPLICE_CHAR, built once instead of on every call.
const SPLICE_CHAR_PATTERN = new RegExp(`[${SENTENCE_SPLICE_CHAR}]`)
const SENTENCE_END_PATTERN = new RegExp(`[${SENTENCE_SPLICE_CHAR}]\\s*$`)
// Matches a bare list marker such as "1." which is not a sentence on its own.
const LIST_MARKER_PATTERN = new RegExp(`^\\s*\\d+[${SENTENCE_SPLICE_CHAR}]\\s*$`)
// Whitespace that directly follows a sentence-ending character.
const SENTENCE_GAP_PATTERN = new RegExp(`(?<=[${SENTENCE_SPLICE_CHAR}])\\s+`)
const LINE_BREAK_PATTERN = /[\r\n]+/

/** Lines shorter than this are kept whole instead of being split into sentences. */
const MIN_SPLIT_LINE_LENGTH = 50
/** Pieces of a split line are merged until the merged text is longer than this. */
const MIN_MERGED_SENTENCE_LENGTH = 10

/**
 * Splits incoming text into sentences. Every message needs its own instance.
 *
 * Design goal: even while the text is still incomplete, extract the sentences
 * that are already finished so the next stage can start consuming them as
 * early as possible (sentences are produced like a stream).
 *
 * `input` is expected to be called repeatedly with the full text so far, e.g.
 *
 * input("This guide")
 * input("This guide will help you")
 * input("This guide will help you get started with ElevenLabs.")
 */
export default class SentenceBuilder {
  /** Finished sentences waiting to be consumed by `generateSentence`. */
  private cache: string[] = []
  /** Index in the accumulated text up to which content has been consumed. */
  private cursor = 0
  /** Trailing fragment of the last consumed chunk that is not a sentence yet. */
  private left = ''

  public readonly id: Readonly<string> = ''

  /**
   * @param id Identifier stored on the instance as `id`.
   */
  constructor(id: string) {
    this.id = id
  }

  /**
   * Consumes the part of `text` between the previous cursor and the last
   * boundary (a line break anywhere, or a sentence-ending character at the
   * very end of `text`). Finished sentences are appended to the internal queue.
   *
   * Nothing is consumed while `text` contains no boundary at all.
   *
   * @param text The full text accumulated so far, not just the new delta.
   */
  public input(text: string): void {
    const lastCursor = this.cursor
    const currentCursor = this.positionCursor(text) + 1

    const fresh = text.slice(lastCursor, currentCursor)
    if (!fresh) {
      return
    }
    const content = this.left.trim() + fresh

    // The fragment kept from the previous round is merged into `content`
    // above, so it is reset and rebuilt below.
    this.left = ''
    this.cursor = currentCursor

    const lines = content.split(LINE_BREAK_PATTERN)
    const last = lines.pop() ?? ''

    // Every line followed by a line break is complete.
    for (const line of lines) {
      this.enqueue(this.line2sentences(line))
    }

    if (!last.trim()) {
      return
    }

    const sentences = this.line2sentences(last)
    const tail = sentences.pop()
    this.enqueue(sentences)
    if (tail === undefined) {
      return
    }

    if (this.isSentence(tail)) {
      this.enqueue([tail])
    } else {
      // Not a sentence yet (e.g. a bare "1."): keep it for the next round.
      this.left = tail.trim()
    }
  }

  /**
   * Drains the queue of finished sentences in order, then yields one empty
   * string as an end marker.
   */
  public async *generateSentence(): AsyncGenerator<string, void, undefined> {
    let next = this.cache.shift()
    while (next !== undefined) {
      yield next
      next = this.cache.shift()
    }
    yield ''
  }

  /**
   * Returns the pending fragment that has not become a sentence yet and
   * clears it.
   */
  public getLeft(): string {
    const pending = this.left
    this.left = ''
    return pending
  }

  /** Appends sentences to the queue, skipping blank ones ('' is reserved as the end marker). */
  private enqueue(sentences: readonly string[]): void {
    for (const sentence of sentences) {
      if (sentence.trim()) {
        this.cache.push(sentence)
      }
    }
  }

  /**
   * Index of the last boundary in `text`: the final character when it is a
   * sentence-ending character, otherwise the last line break. Returns -1 when
   * there is no boundary, so the caller's `+ 1` yields an empty slice instead
   * of swallowing the first character.
   */
  private positionCursor(text: string): number {
    if (SPLICE_CHAR_PATTERN.test(text.charAt(text.length - 1))) {
      return text.length - 1
    }
    return Math.max(text.lastIndexOf('\r'), text.lastIndexOf('\n'))
  }

  /** True when the text ends with a sentence-ending character and is not a bare list marker like "1.". */
  private isSentence(sentence: string): boolean {
    if (LIST_MARKER_PATTERN.test(sentence)) {
      return false
    }
    return SENTENCE_END_PATTERN.test(sentence)
  }

  /**
   * Splits a long line at the whitespace following sentence-ending characters
   * and merges very short pieces with their successors. Short lines are
   * returned unchanged as a single element.
   */
  private line2sentences(line: string): string[] {
    if (line.length < MIN_SPLIT_LINE_LENGTH) {
      return [line]
    }

    const result: string[] = []
    let pending = ''
    for (const piece of line.split(SENTENCE_GAP_PATTERN)) {
      pending += piece + ' '
      if (pending.length > MIN_MERGED_SENTENCE_LENGTH) {
        result.push(pending.trim())
        pending = ''
      }
    }

    const rest = pending.trim()
    if (rest) {
      result.push(rest)
    }
    return result
  }
}
43 changes: 43 additions & 0 deletions src/components/tts/SpeechController.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import React, { memo, useMemo } from 'react'
import TtsWorker from './TtsWorker'

/** Props accepted by SpeechController. */
interface Props {
// Passed on as the `text` prop of TtsWorker in the JSX below.
text: string
// Passed on as the `messageId` prop of TtsWorker in the JSX below.
messageId: string
// When false the component renders nothing.
enable: boolean
}

// Placeholder values used for `elevenlabsKey` and `elevenlabsVoiceId` in the
// options object below; both are empty strings for now.
const key = ''
const id = ''

// Used for `breakSentenceTimeout` and `mergeSentenceTimeout` in the options object below.
// NOTE(review): presumably milliseconds — confirm against TtsWorker.
const LAST_BREAK_TIMEOUT = 3000
const SENTENCE_MERGE_WAIT_TIME = 1000

/**
 * Wrapper around TtsWorker that supplies the TTS options.
 *
 * The component currently always renders nothing: after the `enable` check
 * there is an unconditional `return null`, which leaves the JSX that mounts
 * TtsWorker unreachable until the configuration work is done.
 */
const SpeechController = memo((props: Props) => {
const { enable } = props

// TODO: read this configuration from the settings
// Memoized with an empty dependency list, so the same options object is reused across renders.
const options = useMemo(() => {
return {
elevenlabsKey: key,
elevenlabsVoiceId: id,
breakSentenceTimeout: LAST_BREAK_TIMEOUT,
mergeSentenceTimeout: SENTENCE_MERGE_WAIT_TIME,
}
}, [])

if (!enable) {
return null
}

// TODO: the configuration-related work is not finished yet, so return null for now
return null

return (
<div>
<TtsWorker text={props.text} messageId={props.messageId} options={options}></TtsWorker>
</div>
)
})

export default SpeechController
Loading