TTS: route all synthesizeAndPlay calls through Stage 3 streaming session
Replaces the four separate one-shot TTS entry points (pipeline.speak, REPEAT
voice command, echo-mode TTS, LLM-response TTS) with a single shared
pipeline.speakText() that:
* opens a Qwen3TtsEngine streaming session when the TTS backend is
  Qwen3 (voice-cloning path);
* feeds the whole response through a SentenceStreamer so the first
  sentence starts playing as soon as it's decoded;
* falls back to the old one-shot synthesizeAndPlay for non-Qwen3 TTS
  engines (AndroidTts, Chatterbox) that don't expose a session API.
KazeiaPipeline.speakText is now public so KazeiaService can use the
same dispatch — previously each call site re-implemented the
"streaming-or-fallback" logic or just called synthesizeAndPlay and
waited for the full synthesis.
Enabling the real on-device LLM is a separate task (task #48): the
existing llama-cli binary has ggml-hexagon linked in and fails to
initialise the DSP (error 0x80000406) while the TTS Hexagon runners hold
the session. Fixing that needs either a CPU-only llama-cli build or the
restored ExecuTorch qnn_llama_runner setup.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
2f07901ff3
commit
0833d1bd21
|
|
@@ -129,13 +129,35 @@ class KazeiaPipeline {
|
||||||
return ProcessorResult(responseText = text, metadata = mapOf("mode" to "echo"))
|
return ProcessorResult(responseText = text, metadata = mapOf("mode" to "echo"))
|
||||||
}
|
}
|
||||||
|
|
||||||
private suspend fun speak(text: String) {
|
private suspend fun speak(text: String) = speakText(text)
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Public entry point for speaking a full (possibly multi-sentence) text.
|
||||||
|
* When TTS is Qwen3, text is sentence-split and fed through a streaming
|
||||||
|
* session so first audio arrives after the first sentence rather than
|
||||||
|
* after the full response is synthesised. Other TTS backends fall back
|
||||||
|
* to the legacy one-shot synthesizeAndPlay call.
|
||||||
|
*
|
||||||
|
* Made public so KazeiaService can route its voice-command replies and
|
||||||
|
* the echo-mode playback through the same path — otherwise each TTS
|
||||||
|
* site reimplemented the "streaming-or-fallback" dispatch.
|
||||||
|
*/
|
||||||
|
suspend fun speakText(text: String) {
|
||||||
val ttsEngine = tts ?: return
|
val ttsEngine = tts ?: return
|
||||||
_pipelineState.value = PipelineState.Speaking
|
_pipelineState.value = PipelineState.Speaking
|
||||||
try {
|
try {
|
||||||
ttsEngine.synthesizeAndPlay(text, context.language,
|
val qwen = ttsEngine as? com.kazeia.tts.Qwen3TtsEngine
|
||||||
onComplete = { _pipelineState.value = PipelineState.Idle }
|
if (qwen != null) {
|
||||||
)
|
qwen.startStreamingSession()
|
||||||
|
val streamer = com.kazeia.tts.SentenceStreamer { s -> qwen.enqueueSentence(s) }
|
||||||
|
streamer.append(text)
|
||||||
|
streamer.flush()
|
||||||
|
qwen.endStreamingSession()
|
||||||
|
} else {
|
||||||
|
ttsEngine.synthesizeAndPlay(text, context.language,
|
||||||
|
onComplete = { _pipelineState.value = PipelineState.Idle }
|
||||||
|
)
|
||||||
|
}
|
||||||
} catch (e: Exception) {
|
} catch (e: Exception) {
|
||||||
log("TTS error: ${e.message}")
|
log("TTS error: ${e.message}")
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@@ -1008,13 +1008,13 @@ class KazeiaService : Service() {
|
||||||
val lastKazeia = _messages.value.lastOrNull { it.role == ChatMessage.Role.KAZEIA }
|
val lastKazeia = _messages.value.lastOrNull { it.role == ChatMessage.Role.KAZEIA }
|
||||||
if (lastKazeia != null) {
|
if (lastKazeia != null) {
|
||||||
serviceScope.launch {
|
serviceScope.launch {
|
||||||
_pipelineState.value = PipelineState.Speaking
|
// Route the "repeat last answer" command through the
|
||||||
tts.synthesizeAndPlay(lastKazeia.text, "fr",
|
// streaming TTS session so the user hears the first
|
||||||
onComplete = {
|
// sentence immediately instead of waiting for a full
|
||||||
_pipelineState.value = if (_isListening.value)
|
// re-synthesis of a long previous response.
|
||||||
PipelineState.Listening else PipelineState.Idle
|
pipeline.speakText(lastKazeia.text)
|
||||||
}
|
_pipelineState.value = if (_isListening.value)
|
||||||
)
|
PipelineState.Listening else PipelineState.Idle
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@@ -1132,20 +1132,15 @@ class KazeiaService : Service() {
|
||||||
_pipelineState.value = PipelineState.Thinking
|
_pipelineState.value = PipelineState.Thinking
|
||||||
conversationManager.onNewTurn()
|
conversationManager.onNewTurn()
|
||||||
|
|
||||||
// If LLM not loaded, use echo mode with TTS
|
// If LLM not loaded, use echo mode with TTS. Route through the
|
||||||
|
// streaming session — in debug builds this is the fast path most
|
||||||
|
// exercised, and it benefits from per-sentence playback like any
|
||||||
|
// real response would.
|
||||||
if (!llm.isLoaded()) {
|
if (!llm.isLoaded()) {
|
||||||
log("Echo mode: '$patientMessage' → TTS (${tts::class.simpleName})")
|
log("Echo mode: '$patientMessage' → TTS (${tts::class.simpleName})")
|
||||||
val echoResponse = patientMessage
|
val echoResponse = patientMessage
|
||||||
addMessage(ChatMessage(role = ChatMessage.Role.KAZEIA, text = echoResponse))
|
addMessage(ChatMessage(role = ChatMessage.Role.KAZEIA, text = echoResponse))
|
||||||
_pipelineState.value = PipelineState.Speaking
|
pipeline.speakText(echoResponse)
|
||||||
tts.synthesizeAndPlay(
|
|
||||||
text = echoResponse,
|
|
||||||
language = "fr",
|
|
||||||
onComplete = {
|
|
||||||
_pipelineState.value = if (_isListening.value)
|
|
||||||
PipelineState.Listening else PipelineState.Idle
|
|
||||||
}
|
|
||||||
)
|
|
||||||
_pipelineState.value = if (_isListening.value) PipelineState.Listening else PipelineState.Idle
|
_pipelineState.value = if (_isListening.value) PipelineState.Listening else PipelineState.Idle
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
@@ -1185,16 +1180,7 @@ class KazeiaService : Service() {
|
||||||
|
|
||||||
if (responseText.isNotEmpty()) {
|
if (responseText.isNotEmpty()) {
|
||||||
addMessage(ChatMessage(role = ChatMessage.Role.KAZEIA, text = responseText))
|
addMessage(ChatMessage(role = ChatMessage.Role.KAZEIA, text = responseText))
|
||||||
|
pipeline.speakText(responseText)
|
||||||
_pipelineState.value = PipelineState.Speaking
|
|
||||||
tts.synthesizeAndPlay(
|
|
||||||
text = responseText,
|
|
||||||
language = "fr",
|
|
||||||
onComplete = {
|
|
||||||
_pipelineState.value = if (_isListening.value)
|
|
||||||
PipelineState.Listening else PipelineState.Idle
|
|
||||||
}
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
_pipelineState.value = if (_isListening.value)
|
_pipelineState.value = if (_isListening.value)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue