TTS: route all synthesizeAndPlay calls through Stage 3 streaming session

Replaces the four per-sentence TTS entry points (pipeline.speak, REPEAT voice command, echo-mode TTS, LLM-response TTS) with a single shared pipeline.speakText() that:

* opens a Qwen3TtsEngine streaming session when the TTS backend is Qwen3 (voice-cloning path);
* feeds the whole response through a SentenceStreamer so the first sentence starts playing as soon as it's decoded;
* falls back to the old one-shot synthesizeAndPlay for non-Qwen3 TTS engines (AndroidTts, Chatterbox) that don't expose a session API.

KazeiaPipeline.speakText is now public so KazeiaService can use the same dispatch — previously each call site re-implemented the "streaming-or-fallback" logic or just called synthesizeAndPlay and waited for the full synthesis.

Enabling the real on-device LLM is a separate task (task #48): the existing llama-cli binary has ggml-hexagon linked in and fails to initialize the DSP (0x80000406) when the TTS Hexagon runners hold the session. It needs either a CPU-only llama-cli build or the restored ExecuTorch qnn_llama_runner setup.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-13 11:12:14 +02:00 · 2026-04-13 11:12:14 +02:00 · 0833d1bd21
parent 2f07901ff3
commit 0833d1bd21
2 changed files with 39 additions and 31 deletions
--- a/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaPipeline.kt
+++ b/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaPipeline.kt
@ -129,13 +129,35 @@ class KazeiaPipeline {
        return ProcessorResult(responseText = text, metadata = mapOf("mode" to "echo"))
    }
-    private suspend fun speak(text: String) {
+    private suspend fun speak(text: String) = speakText(text)
    /**
     * Public entry point for speaking a full (possibly multi-sentence) text.
     * When TTS is Qwen3, text is sentence-split and fed through a streaming
     * session so first audio arrives after the first sentence rather than
     * after the full response is synthesised. Other TTS backends fall back
     * to the legacy one-shot synthesizeAndPlay call.
     *
     * Made public so KazeiaService can route its voice-command replies and
     * the echo-mode playback through the same path — otherwise each TTS
     * site reimplemented the "streaming-or-fallback" dispatch.
     */
    suspend fun speakText(text: String) {
        val ttsEngine = tts ?: return
        _pipelineState.value = PipelineState.Speaking
        try {
-            ttsEngine.synthesizeAndPlay(text, context.language,
+            val qwen = ttsEngine as? com.kazeia.tts.Qwen3TtsEngine
-                onComplete = { _pipelineState.value = PipelineState.Idle }
+            if (qwen != null) {
-            )
+                qwen.startStreamingSession()
                val streamer = com.kazeia.tts.SentenceStreamer { s -> qwen.enqueueSentence(s) }
                streamer.append(text)
                streamer.flush()
                qwen.endStreamingSession()
            } else {
                ttsEngine.synthesizeAndPlay(text, context.language,
                    onComplete = { _pipelineState.value = PipelineState.Idle }
                )
            }
        } catch (e: Exception) {
            log("TTS error: ${e.message}")
        }
--- a/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaService.kt
+++ b/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaService.kt
@ -1008,13 +1008,13 @@ class KazeiaService : Service() {
                val lastKazeia = _messages.value.lastOrNull { it.role == ChatMessage.Role.KAZEIA }
                if (lastKazeia != null) {
                    serviceScope.launch {
-                        _pipelineState.value = PipelineState.Speaking
+                        // Route the "repeat last answer" command through the
-                        tts.synthesizeAndPlay(lastKazeia.text, "fr",
+                        // streaming TTS session so the user hears the first
-                            onComplete = {
+                        // sentence immediately instead of waiting for a full
-                                _pipelineState.value = if (_isListening.value)
+                        // re-synthesis of a long previous response.
-                                    PipelineState.Listening else PipelineState.Idle
+                        pipeline.speakText(lastKazeia.text)
-                            }
+                        _pipelineState.value = if (_isListening.value)
-                        )
+                            PipelineState.Listening else PipelineState.Idle
                    }
                }
            }
@ -1132,20 +1132,15 @@ class KazeiaService : Service() {
        _pipelineState.value = PipelineState.Thinking
        conversationManager.onNewTurn()
-        // If LLM not loaded, use echo mode with TTS
+        // If LLM not loaded, use echo mode with TTS. Route through the
        // streaming session — in debug builds this is the fast path most
        // exercised, and it benefits from per-sentence playback like any
        // real response would.
        if (!llm.isLoaded()) {
            log("Echo mode: '$patientMessage' → TTS (${tts::class.simpleName})")
            val echoResponse = patientMessage
            addMessage(ChatMessage(role = ChatMessage.Role.KAZEIA, text = echoResponse))
-            _pipelineState.value = PipelineState.Speaking
+            pipeline.speakText(echoResponse)
            tts.synthesizeAndPlay(
                text = echoResponse,
                language = "fr",
                onComplete = {
                    _pipelineState.value = if (_isListening.value)
                        PipelineState.Listening else PipelineState.Idle
                }
            )
            _pipelineState.value = if (_isListening.value) PipelineState.Listening else PipelineState.Idle
            return
        }
@ -1185,16 +1180,7 @@ class KazeiaService : Service() {
            if (responseText.isNotEmpty()) {
                addMessage(ChatMessage(role = ChatMessage.Role.KAZEIA, text = responseText))
-
+                pipeline.speakText(responseText)
                _pipelineState.value = PipelineState.Speaking
                tts.synthesizeAndPlay(
                    text = responseText,
                    language = "fr",
                    onComplete = {
                        _pipelineState.value = if (_isListening.value)
                            PipelineState.Listening else PipelineState.Idle
                    }
                )
            }
            _pipelineState.value = if (_isListening.value)