From b57719fa5e190a58bbd1c894e050b8aecbb452ed Mon Sep 17 00:00:00 2001 From: Kazeia Team Date: Tue, 14 Apr 2026 11:16:08 +0200 Subject: [PATCH] LLM: filter <think> tokens out of the streaming TTS path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Even with /no_think in the system prompt Qwen3 still emits an empty <think></think> wrapper before the real answer. Without filtering, the SentenceStreamer treats '<think></think>' as a sentence boundary and feeds three tokens of XML into the TTS, producing audible parasites at the start of each reply. The new in-callback filter buffers a small lookahead (just enough to span "</think>"), suppresses everything between the open and close tags, and flushes the surrounding prose to onToken in order. With the lookahead, tags that arrive split across decoded pieces ("<th" + "ink>") still match. Validated end-to-end: prompt 'Bonjour, comment vas-tu ?' now streams sentence-by-sentence to the TTS — first segment "Bonjour !" reaches the talker at 4.6 s, no <think> sneak-through. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../com/kazeia/llm/ExecuTorchLlmEngine.kt | 49 ++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt b/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt index d0910cc..b40d080 100644 --- a/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt +++ b/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt @@ -108,12 +108,52 @@ class ExecuTorchLlmEngine( val responseBuilder = StringBuilder() var firstTokenMs = -1L + // Track whether we're inside a <think> block so the upstream + // SentenceStreamer / TTS doesn't get fed reasoning tokens. Even with + // /no_think in the system prompt Qwen3 still emits empty <think></think> + // wrappers for ~3 tokens before the real answer.
+        var inThink = false
+        val tokenScan = StringBuilder() // small lookahead to spot tag boundaries
 
         val cb = object : LlmCallback {
             override fun onResult(result: String) {
                 if (firstTokenMs < 0) firstTokenMs = System.currentTimeMillis() - startTime
                 responseBuilder.append(result)
-                onToken?.invoke(result)
+
+                // Forward to caller only outside <think> blocks. We accumulate
+                // a tiny lookahead buffer so tag tokens that arrive split
+                // ("<th" + "ink>") still match.
+                tokenScan.append(result)
+                while (true) {
+                    if (!inThink) {
+                        val open = tokenScan.indexOf("<think>")
+                        if (open < 0) {
+                            // No tag pending — flush everything up to a safe point
+                            // (hold back "<think>".length = 7 chars: a split tag may still complete).
+                            val safe = tokenScan.length - "<think>".length
+                            if (safe > 0) {
+                                onToken?.invoke(tokenScan.substring(0, safe))
+                                tokenScan.delete(0, safe)
+                            }
+                            break
+                        }
+                        // Flush the prose before the tag, then enter think mode.
+                        if (open > 0) onToken?.invoke(tokenScan.substring(0, open))
+                        tokenScan.delete(0, open + "<think>".length)
+                        inThink = true
+                    } else {
+                        val close = tokenScan.indexOf("</think>")
+                        if (close < 0) {
+                            // Drop all buffered chars except a small tail in case
+                            // the closing tag is split across tokens.
+                            val keep = "</think>".length - 1
+                            if (tokenScan.length > keep) tokenScan.delete(0, tokenScan.length - keep)
+                            break
+                        }
+                        tokenScan.delete(0, close + "</think>".length)
+                        inThink = false
+                    }
+                }
             }
             override fun onStats(stats: String) {
                 nlog("stats: ${stats.take(200)}")
@@ -131,6 +171,13 @@ class ExecuTorchLlmEngine(
             -1
         }
 
+        // Drain any leftover prose buffered during <think>-suppression so the
+        // last sentence reaches the TTS even if it ran past the closing tag.
+        if (!inThink && tokenScan.isNotEmpty()) {
+            onToken?.invoke(tokenScan.toString())
+            tokenScan.clear()
+        }
+
         val elapsed = System.currentTimeMillis() - startTime
         val rawText = responseBuilder.toString()
         val responseText = cleanResponse(rawText)