From 0833d1bd21da89f6724ce09f38e32a59e8a564f2 Mon Sep 17 00:00:00 2001
From: Kazeia Team
Date: Mon, 13 Apr 2026 11:12:14 +0200
Subject: [PATCH] TTS: route all synthesizeAndPlay calls through Stage 3 streaming session
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces the four per-sentence TTS entry points (pipeline.speak, REPEAT
voice command, echo-mode TTS, LLM-response TTS) with a single shared
pipeline.speakText() that:

  * opens a Qwen3TtsEngine streaming session when the TTS backend is
    Qwen3 (voice-cloning path);
  * feeds the whole response through a SentenceStreamer so the first
    sentence starts playing as soon as it's decoded;
  * falls back to the old one-shot synthesizeAndPlay for non-Qwen3 TTS
    engines (AndroidTts, Chatterbox) that don't expose a session API.

KazeiaPipeline.speakText is now public so KazeiaService can use the same
dispatch — previously each call site re-implemented the
"streaming-or-fallback" logic or just called synthesizeAndPlay and waited
for the full synthesis.

Enabling the real on-device LLM is a separate task (task #48): the
existing llama-cli binary has ggml-hexagon linked in and fails to
initialize the DSP (0x80000406) when the TTS Hexagon runners hold the
session. Fixing that needs either a CPU-only llama-cli build or the
restored ExecuTorch qnn_llama_runner setup.

Co-Authored-By: Claude Opus 4.6 (1M context) --- .../java/com/kazeia/service/KazeiaPipeline.kt | 30 ++++++++++++-- .../java/com/kazeia/service/KazeiaService.kt | 40 ++++++------------- 2 files changed, 39 insertions(+), 31 deletions(-) diff --git a/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaPipeline.kt b/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaPipeline.kt index 30d112e..104c924 100644 --- a/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaPipeline.kt +++ b/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaPipeline.kt @@ -129,13 +129,35 @@ class KazeiaPipeline { return ProcessorResult(responseText = text, metadata = mapOf("mode" to "echo")) } - private suspend fun speak(text: String) { + private suspend fun speak(text: String) = speakText(text) + + /** + * Public entry point for speaking a full (possibly multi-sentence) text. + * When TTS is Qwen3, text is sentence-split and fed through a streaming + * session so first audio arrives after the first sentence rather than + * after the full response is synthesised. Other TTS backends fall back + * to the legacy one-shot synthesizeAndPlay call. + * + * Made public so KazeiaService can route its voice-command replies and + * the echo-mode playback through the same path — otherwise each TTS + * site reimplemented the "streaming-or-fallback" dispatch. + */ + suspend fun speakText(text: String) { val ttsEngine = tts ?: return _pipelineState.value = PipelineState.Speaking try { - ttsEngine.synthesizeAndPlay(text, context.language, - onComplete = { _pipelineState.value = PipelineState.Idle } - ) + val qwen = ttsEngine as? 
com.kazeia.tts.Qwen3TtsEngine + if (qwen != null) { + qwen.startStreamingSession() + val streamer = com.kazeia.tts.SentenceStreamer { s -> qwen.enqueueSentence(s) } + streamer.append(text) + streamer.flush() + qwen.endStreamingSession() + } else { + ttsEngine.synthesizeAndPlay(text, context.language, + onComplete = { _pipelineState.value = PipelineState.Idle } + ) + } } catch (e: Exception) { log("TTS error: ${e.message}") } diff --git a/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaService.kt b/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaService.kt index 5084dba..e973a7d 100644 --- a/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaService.kt +++ b/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaService.kt @@ -1008,13 +1008,13 @@ class KazeiaService : Service() { val lastKazeia = _messages.value.lastOrNull { it.role == ChatMessage.Role.KAZEIA } if (lastKazeia != null) { serviceScope.launch { - _pipelineState.value = PipelineState.Speaking - tts.synthesizeAndPlay(lastKazeia.text, "fr", - onComplete = { - _pipelineState.value = if (_isListening.value) - PipelineState.Listening else PipelineState.Idle - } - ) + // Route the "repeat last answer" command through the + // streaming TTS session so the user hears the first + // sentence immediately instead of waiting for a full + // re-synthesis of a long previous response. + pipeline.speakText(lastKazeia.text) + _pipelineState.value = if (_isListening.value) + PipelineState.Listening else PipelineState.Idle } } } @@ -1132,20 +1132,15 @@ class KazeiaService : Service() { _pipelineState.value = PipelineState.Thinking conversationManager.onNewTurn() - // If LLM not loaded, use echo mode with TTS + // If LLM not loaded, use echo mode with TTS. Route through the + // streaming session — in debug builds this is the fast path most + // exercised, and it benefits from per-sentence playback like any + // real response would. 
if (!llm.isLoaded()) { log("Echo mode: '$patientMessage' → TTS (${tts::class.simpleName})") val echoResponse = patientMessage addMessage(ChatMessage(role = ChatMessage.Role.KAZEIA, text = echoResponse)) - _pipelineState.value = PipelineState.Speaking - tts.synthesizeAndPlay( - text = echoResponse, - language = "fr", - onComplete = { - _pipelineState.value = if (_isListening.value) - PipelineState.Listening else PipelineState.Idle - } - ) + pipeline.speakText(echoResponse) _pipelineState.value = if (_isListening.value) PipelineState.Listening else PipelineState.Idle return } @@ -1185,16 +1180,7 @@ class KazeiaService : Service() { if (responseText.isNotEmpty()) { addMessage(ChatMessage(role = ChatMessage.Role.KAZEIA, text = responseText)) - - _pipelineState.value = PipelineState.Speaking - tts.synthesizeAndPlay( - text = responseText, - language = "fr", - onComplete = { - _pipelineState.value = if (_isListening.value) - PipelineState.Listening else PipelineState.Idle - } - ) + pipeline.speakText(responseText) } _pipelineState.value = if (_isListening.value)