diff --git a/src/services/speaker/base.ts b/src/services/speaker/base.ts index 22e59af..80ba622 100644 --- a/src/services/speaker/base.ts +++ b/src/services/speaker/base.ts @@ -8,6 +8,7 @@ import { } from "mi-service-lite"; import { sleep } from "../../utils/base"; import { Http } from "../http"; +import { ResponseStream } from "./stream"; export type TTSProvider = "xiaoai" | "doubao"; @@ -49,6 +50,7 @@ export class BaseSpeaker { } async unWakeUp() { + // ! FIXME 新版小爱音箱固件在关闭/打开麦克风时会有提示音 await this.MiIOT!.setProperty(4, 1, true); // 关闭麦克风 await this.MiIOT!.setProperty(4, 1, false); // 打开麦克风 } @@ -57,6 +59,7 @@ export class BaseSpeaker { async response(options: { tts?: TTSProvider; text?: string; + stream?: ResponseStream; audio?: string; speaker?: string; keepAlive?: boolean; @@ -65,16 +68,98 @@ export class BaseSpeaker { let { text, audio, + stream, + playSFX = true, + keepAlive = false, + tts = this.tts, + } = options ?? {}; + + const ttsNotXiaoai = (!!stream || !!text) && !audio && tts !== "xiaoai"; + playSFX = ttsNotXiaoai && playSFX; + + if (ttsNotXiaoai && !stream) { + // 长文本 TTS 转化成 stream 分段模式 + stream = ResponseStream.createResponseStream(text!); + } + + let res; + this.responding = true; + // 开始响应 + if (stream) { + let _response = ""; + while (true) { + const { nextSentence, noMore } = stream.getNextResponse(); + if (nextSentence) { + if (_response.length < 1) { + // 播放开始提示音 + if (playSFX) { + await this.MiNA!.play({ url: process.env.AUDIO_BEEP }); + } + // 在播放 TTS 语音之前,先取消小爱音箱的唤醒状态,防止将 TTS 语音识别成用户指令 + if (ttsNotXiaoai) { + await this.unWakeUp(); + } + } + res = await this._response({ + ...options, + text: nextSentence, + playSFX: false, + keepAlive: false, + }); + if (res === "break") { + // 终止回复 + stream.cancel(); + break; + } + _response += nextSentence; + } + if (noMore) { + if (_response.length > 0) { + // 播放结束提示音 + if (playSFX) { + await this.MiNA!.play({ url: process.env.AUDIO_BEEP }); + } + } + // 保持唤醒状态 + if (keepAlive) { + await this.wakeUp(); + 
} + // 播放完毕 + break; + } + await sleep(this.interval); + } + } else { + res = await this._response(options); + } + this.responding = false; + return res; + } + + private async _response(options: { + tts?: TTSProvider; + text?: string; + stream?: ResponseStream; + audio?: string; + speaker?: string; + keepAlive?: boolean; + playSFX?: boolean; + }) { + let { + text, + audio, + stream, playSFX = true, keepAlive = false, tts = this.tts, speaker = this._defaultSpeaker, } = options ?? {}; + const ttsNotXiaoai = !stream && !!text && !audio && tts !== "xiaoai"; + playSFX = ttsNotXiaoai && playSFX; + // 播放回复 const play = async (args?: { tts?: string; url?: string }) => { - const ttsNotXiaoai = !audio && tts !== "xiaoai"; - playSFX = ttsNotXiaoai && playSFX; // 播放开始提示音 if (playSFX) { await this.MiNA!.play({ url: process.env.AUDIO_BEEP }); @@ -112,7 +197,6 @@ export class BaseSpeaker { // 开始响应 let res; - this.responding = true; if (audio) { // 音频回复 res = await play({ url: audio }); @@ -120,18 +204,18 @@ export class BaseSpeaker { // 文字回复 switch (tts) { case "doubao": - text = encodeURIComponent(text); + const _text = encodeURIComponent(text); const doubaoTTS = process.env.TTS_DOUBAO; - const url = `${doubaoTTS}?speaker=${speaker}&text=${text}`; + const url = `${doubaoTTS}?speaker=${speaker}&text=${_text}`; res = await play({ url }); break; case "xiaoai": default: res = await play({ tts: text }); + break; } - this.responding = false; - return res; } + return res; } private _doubaoSpeakers?: Speaker[]; diff --git a/src/services/speaker/speaker.ts b/src/services/speaker/speaker.ts index 02512ff..fa47d52 100644 --- a/src/services/speaker/speaker.ts +++ b/src/services/speaker/speaker.ts @@ -266,7 +266,7 @@ export class Speaker extends BaseSpeaker { const ttsAnswer = e.answers.find((e) => e.type === "TTS") as any; return { text: e.query, - answer: ttsAnswer?.tts?.text, + answer: ttsAnswer?.tts?.text?.trim(), timestamp: e.time, }; }); diff --git a/src/services/speaker/stream.ts 
/** Lifecycle of a {@link ResponseStream}. */
type ResponseStatus = "idle" | "responding" | "finished" | "canceled";

interface ResponseStreamOptions {
  /**
   * Maximum length of a single sentence chunk, in UTF-16 code units.
   *
   * Default: 100.
   */
  maxSentenceLength?: number;
  /**
   * Collection window for the very first chunk, in milliseconds.
   *
   * Example: 100 => starting from the first received text, everything that
   * arrives within the next 100ms is aggregated into the first response.
   *
   * Default: 200. Any value below 100 means "submit immediately".
   */
  firstSubmitTimeout?: number;
  /**
   * Collection window for every following batch, in milliseconds.
   *
   * Example: 1000 => text received within 1s is aggregated into one response.
   *
   * Default: 1000. Any value below 100 means "submit immediately".
   */
  batchSubmitTimeout?: number;
}

/**
 * Buffers incrementally received text (e.g. a streamed AI answer) and exposes
 * it as a queue of sentence-sized chunks that can be spoken one by one.
 *
 * Producer side: call {@link ResponseStream.addResponse} for every new piece
 * of text and {@link ResponseStream.finish} once the answer is complete.
 * Consumer side: poll {@link ResponseStream.getNextResponse} until `noMore`
 * is true, or call {@link ResponseStream.cancel} to stop early.
 *
 * @example
 * const stream = new ResponseStream();
 * // inside the AI "onNewText" callback:
 * //   if (stream.status === "canceled") return "canceled";
 * //   if (finished) stream.finish(); else stream.addResponse(text);
 */
export class ResponseStream {
  /**
   * Turns an already complete, long text into a finished stream of chunks.
   *
   * @param text Full response text.
   * @param options Chunking options.
   * @returns A finished stream, or `undefined` when `text` is not longer than
   * `maxSentenceLength` and therefore does not need to be chunked.
   */
  static createResponseStream(
    text: string,
    options?: ResponseStreamOptions
  ): ResponseStream | undefined {
    const { maxSentenceLength = 100 } = options ?? {};
    if (text.length <= maxSentenceLength) {
      return undefined;
    }
    const stream = new ResponseStream(options);
    stream.addResponse(text);
    stream.finish();
    return stream;
  }

  maxSentenceLength: number;
  firstSubmitTimeout: number;
  batchSubmitTimeout: number;

  status: ResponseStatus = "responding";

  /** Chunks that are ready to be consumed, in order. */
  private _chunks: string[] = [];
  /** Index of the next chunk handed out by `getNextResponse`. */
  private _nextChunkIdx = 0;
  /** Text received but not yet submitted for sentence splitting. */
  private _tempText = "";
  /** Submitted text that does not yet end in a sentence boundary. */
  private _remainingText = "";
  /** Timestamp (ms) of the previous submit; 0 until the first text arrives. */
  private _preSubmitTimestamp = 0;
  /** The single pending batch-flush timer, if any. */
  private _flushTimer: ReturnType<typeof setTimeout> | undefined;

  constructor(options?: ResponseStreamOptions) {
    const {
      maxSentenceLength = 100,
      firstSubmitTimeout = 200,
      batchSubmitTimeout = 1000,
    } = options ?? {};
    this.maxSentenceLength = maxSentenceLength;
    this.firstSubmitTimeout = firstSubmitTimeout;
    this.batchSubmitTimeout = batchSubmitTimeout;
  }

  /**
   * Cancels the stream. Text that is still buffered is dropped; chunks that
   * were already produced remain readable.
   *
   * @returns `true` if the stream is canceled after the call.
   */
  cancel(): boolean {
    if (this.status === "idle" || this.status === "responding") {
      // A late timer must not push chunks into a canceled stream.
      this._clearFlushTimer();
      this.status = "canceled";
    }
    return this.status === "canceled";
  }

  /**
   * Appends newly received text. Ignored once the stream is finished or
   * canceled.
   */
  addResponse(text: string): void {
    if (this.status === "idle") {
      this.status = "responding";
    }
    if (this.status !== "responding") {
      return;
    }
    this._batchSubmit(text);
  }

  /**
   * Pops the next ready chunk.
   *
   * @returns `nextSentence` is `undefined` when nothing is ready yet;
   * `noMore` is `true` once the stream has ended and every chunk was consumed.
   */
  getNextResponse(): { nextSentence: string | undefined; noMore: boolean } {
    const nextSentence: string | undefined = this._chunks[this._nextChunkIdx];
    if (nextSentence) {
      this._nextChunkIdx++;
    }
    const ended = this.status === "finished" || this.status === "canceled";
    const noMore = this._nextChunkIdx > this._chunks.length - 1 && ended;
    return { nextSentence, noMore };
  }

  /**
   * Marks the response as complete and flushes everything still buffered,
   * including a trailing fragment without punctuation.
   *
   * @returns `true` if the stream is finished after the call.
   */
  finish(): boolean {
    if (this.status === "idle" || this.status === "responding") {
      this._clearFlushTimer();
      if (this._tempText) {
        // Submit the text that was still waiting for its batch window.
        this._addResponse(this._tempText);
        this._tempText = "";
      }
      if (this._remainingText) {
        // Whatever is left is the final (possibly unterminated) sentence.
        this._chunks.push(this._remainingText);
        this._remainingText = "";
      }
      this.status = "finished";
    }
    return this.status === "finished";
  }

  /**
   * Collects incoming text and submits it in batches so that the sentences
   * built from a token stream are neither too short nor too long.
   *
   * @param text Newly received text.
   * @param immediately Force (or suppress) an immediate submit. Defaults to
   * `true` when either timeout is configured below 100ms.
   */
  private _batchSubmit(text: string, immediately?: boolean): void {
    this._tempText += text;
    const flushNow =
      immediately ??
      (this.firstSubmitTimeout < 100 || this.batchSubmitTimeout < 100);
    if (flushNow) {
      this._flush();
      return;
    }
    const isFirstSubmit = this._preSubmitTimestamp === 0;
    if (isFirstSubmit) {
      // The first collection window starts with the first received text.
      this._preSubmitTimestamp = Date.now();
    }
    const timeout = isFirstSubmit
      ? this.firstSubmitTimeout
      : this.batchSubmitTimeout;
    const overdue = Date.now() - this._preSubmitTimestamp > timeout;
    if (overdue || this._tempText.length > this.maxSentenceLength) {
      this._flush();
      return;
    }
    // One timer per batch is enough: it is armed by the first text of the
    // batch and disarmed by whichever flush happens first.
    if (this._flushTimer === undefined) {
      this._flushTimer = setTimeout(() => {
        this._flushTimer = undefined;
        // The timer firing *is* the timeout; re-checking the clock here could
        // miss the flush when the timer fires exactly on (or just before)
        // the deadline.
        if (this.status === "responding") {
          this._flush();
        }
      }, timeout);
    }
  }

  /** Submits the buffered text for sentence splitting and restarts the window. */
  private _flush(): void {
    this._clearFlushTimer();
    if (this._tempText) {
      this._addResponse(this._tempText);
      this._tempText = "";
    }
    this._preSubmitTimestamp = Date.now();
  }

  /** Cancels the pending batch timer, if there is one. */
  private _clearFlushTimer(): void {
    if (this._flushTimer !== undefined) {
      clearTimeout(this._flushTimer);
      this._flushTimer = undefined;
    }
  }

  /**
   * Splits submitted text into chunks. Text is cut after the last punctuation
   * mark inside the first `maxSentenceLength` characters; when that window
   * holds no punctuation and the text is longer than the window, it is cut at
   * the window boundary so that a chunk never exceeds the limit. A short tail
   * without punctuation is kept for the next call (or for `finish`).
   */
  private _addResponse(text: string): void {
    this._remainingText += text;
    const limit =
      this.maxSentenceLength >= 1 ? Math.floor(this.maxSentenceLength) : 1;
    while (this._remainingText.length > 0) {
      let cutIndex = this._findLastCutIndex(this._remainingText);
      if (cutIndex <= 0) {
        if (this._remainingText.length <= limit) {
          // No sentence boundary yet and still short enough: wait for more.
          break;
        }
        cutIndex = this._hardCutIndex(this._remainingText, limit);
      }
      this._chunks.push(this._remainingText.substring(0, cutIndex));
      this._remainingText = this._remainingText.substring(cutIndex);
    }
  }

  /**
   * Picks a forced cut position at `limit`, stepping back by one when that
   * would separate a UTF-16 surrogate pair.
   */
  private _hardCutIndex(text: string, limit: number): number {
    const lastUnit = text.charCodeAt(limit - 1);
    const isHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
    return limit > 1 && isHighSurrogate ? limit - 1 : limit;
  }

  /**
   * Finds the position right after the last punctuation mark within the first
   * `maxSentenceLength` characters of `text`.
   *
   * @returns The cut index, or -1 when the window contains no punctuation.
   */
  private _findLastCutIndex(text: string): number {
    const punctuations = ",。?!:;……,.?!:;…";
    let lastCutIndex = -1;
    const end = Math.min(text.length, this.maxSentenceLength);
    for (let i = 0; i < end; i++) {
      if (punctuations.includes(text.charAt(i))) {
        lastCutIndex = i + 1;
      }
    }
    return lastCutIndex;
  }
}
} from "./speaker"; dotenv.config(); async function main() { println(kBannerASCII); - testDB(); + // testDB(); + testSpeaker(); } runWithDB(main); diff --git a/tests/speaker.ts b/tests/speaker.ts index 580f361..0820142 100644 --- a/tests/speaker.ts +++ b/tests/speaker.ts @@ -1,7 +1,8 @@ import { AISpeaker } from "../src/services/speaker/ai"; +import { ResponseStream } from "../src/services/speaker/stream"; import { sleep } from "../src/utils/base"; -export async function main() { +export async function testSpeaker() { const config: any = { userId: process.env.MI_USER!, password: process.env.MI_PASS!, @@ -12,10 +13,11 @@ export async function main() { const speaker = new AISpeaker(config); await speaker.initMiServices(); // await testSpeakerResponse(speaker); + await testSpeakerStreamResponse(speaker); // await testSpeakerGetMessages(speaker); // await testSwitchSpeaker(speaker); // await testSpeakerUnWakeUp(speaker); - await testAISpeaker(speaker); + // await testAISpeaker(speaker); } async function testAISpeaker(speaker: AISpeaker) { @@ -51,8 +53,34 @@ async function testSpeakerGetMessages(speaker: AISpeaker) { async function testSpeakerResponse(speaker: AISpeaker) { let status = await speaker.MiNA!.getStatus(); console.log("curent status", status); - speaker.response({ text: "你好,我是豆包,很高兴认识你!" }); + await speaker.response({ text: "你好,我是豆包,很高兴认识你!" 
}); sleep(1000); status = await speaker.MiNA!.getStatus(); console.log("tts status", status); } + +async function testSpeakerStreamResponse(speaker: AISpeaker) { + const stream = new ResponseStream(); + const add = async (text: string) => { + stream.addResponse(text); + await sleep(100); + }; + setTimeout(async () => { + await add(`地球是圆的主要原因`); + await add(`是由于地球的引力和自转。`); + await add(`地球的引力使得地球在形成过程中变得更加圆滑,因为引力会使得地球`); + await add(`的物质向地心靠拢,从而使得地球的形状更接近于一个球体。此外,`); + await add( + `地球的自转也会导致地球呈现出圆形,因为地球自转会使得地球的物质在赤道附近向外扩散,从而使得` + ); + await add( + `地球在赤道处稍微膨胀,而在极地处稍微收缩,最终形成一个近似于球体的形状。因此,地球是圆的` + ); + await add(`主要原因是由于地球的引力和自转共同作用所致。`); + await sleep(10 * 1000); + console.log("finished!"); + stream.finish(); + }); + await speaker.response({ stream }); + console.log("hello!"); +} diff --git a/yarn.lock b/yarn.lock index c77e5fa..b927a33 100644 --- a/yarn.lock +++ b/yarn.lock @@ -874,8 +874,10 @@ merge2@^1.3.0, merge2@^1.4.1: resolved "https://registry.yarnpkg.com/merge2/-/merge2-1.4.1.tgz#4368892f885e907455a6fd7dc55c0c9d404990ae" integrity sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg== -"mi-service-lite@file:../mi-service-lite": +mi-service-lite@^2.0.0: version "2.0.0" + resolved "https://registry.yarnpkg.com/mi-service-lite/-/mi-service-lite-2.0.0.tgz#c043a931574011c154a3113ecabe4fc2a61b328a" + integrity sha512-PqMWtvEHQ7a6mhKee9RAnT6Xh+rqf+RvhlCki/8VsSTnjREAzl/kxZh3U0ogFhN5iQzwlK4YC8Is0rnSljl2og== dependencies: axios "^1.6.5" pako "^2.1.0"