From b57719fa5e190a58bbd1c894e050b8aecbb452ed Mon Sep 17 00:00:00 2001
From: Kazeia Team <support@kazeia.com>
Date: Tue, 14 Apr 2026 11:16:08 +0200
Subject: [PATCH] LLM: filter <think> tokens out of the streaming TTS path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Even with /no_think in the system prompt, Qwen3 still emits an empty
<think>…</think> wrapper before the real answer. Without filtering, the
SentenceStreamer treats '<think>' as a sentence boundary and feeds three
tokens of XML into the TTS, producing audible artifacts at the start of
each reply.

The new in-callback filter holds back a small lookahead tail (7 characters,
enough to catch a tag split across decoded pieces), suppresses everything
between the open and close tags, and flushes the surrounding prose to
onToken in order. With the lookahead, tags that arrive split across decoded
pieces ("<thi"+"nk>") still match.

Validated end-to-end: prompt 'Bonjour, comment vas-tu ?' now streams
sentence-by-sentence to the TTS — first segment "Bonjour !" reaches the
talker at 4.6 s, no <think> sneak-through.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../com/kazeia/llm/ExecuTorchLlmEngine.kt     | 49 ++++++++++++++++++-
 1 file changed, 48 insertions(+), 1 deletion(-)
diff --git a/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt b/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt
index d0910cc..b40d080 100644
--- a/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt
+++ b/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt
@@ -108,12 +108,52 @@ class ExecuTorchLlmEngine(
 
         val responseBuilder = StringBuilder()
         var firstTokenMs = -1L
+        // Track whether we're inside a <think>…</think> block so the upstream
+        // SentenceStreamer / TTS doesn't get fed reasoning tokens. Even with
+        // /no_think in the system prompt Qwen3 still emits empty <think></think>
+        // wrappers for ~3 tokens before the real answer.
+        var inThink = false
+        val tokenScan = StringBuilder()  // small lookahead to spot tag boundaries
 
         val cb = object : LlmCallback {
             override fun onResult(result: String) {
                 if (firstTokenMs < 0) firstTokenMs = System.currentTimeMillis() - startTime
                 responseBuilder.append(result)
-                onToken?.invoke(result)
+
+                // Filter only the onToken stream (responseBuilder keeps the raw
+                // text): forward prose outside <think>…</think>, buffering a small
+                // tail so tags split across callbacks ("<thi", "nk>") still match.
+                tokenScan.append(result)
+                while (true) {  // rescan until no complete tag remains in the buffer
+                    if (!inThink) {
+                        val open = tokenScan.indexOf("<think>")
+                        if (open < 0) {
+                            // No complete opening tag yet — flush all but the last
+                            // "<think>".length (7) chars, which may be a split tag.
+                            val safe = tokenScan.length - "<think>".length
+                            if (safe > 0) {
+                                onToken?.invoke(tokenScan.substring(0, safe))
+                                tokenScan.delete(0, safe)
+                            }
+                            break
+                        }
+                        // Emit the prose before the tag, drop the tag, enter think mode.
+                        if (open > 0) onToken?.invoke(tokenScan.substring(0, open))
+                        tokenScan.delete(0, open + "<think>".length)
+                        inThink = true
+                    } else {
+                        val close = tokenScan.indexOf("</think>")
+                        if (close < 0) {
+                            // Discard reasoning text, keeping only the last 7 chars
+                            // (one short of "</think>") in case the tag is split.
+                            val keep = "</think>".length - 1
+                            if (tokenScan.length > keep) tokenScan.delete(0, tokenScan.length - keep)
+                            break
+                        }
+                        tokenScan.delete(0, close + "</think>".length)  // drop through the closing tag
+                        inThink = false
+                    }
+                }
             }
             override fun onStats(stats: String) {
                 nlog("stats: ${stats.take(200)}")
@@ -131,6 +171,13 @@ class ExecuTorchLlmEngine(
             -1
         }
 
+        // Drain the lookahead tail held back by the <think> filter so the end
+        // of the reply reaches the TTS; skip it if a <think> block never closed.
+        if (!inThink && tokenScan.isNotEmpty()) {
+            onToken?.invoke(tokenScan.toString())
+            tokenScan.clear()
+        }
+
         val elapsed = System.currentTimeMillis() - startTime
         val rawText = responseBuilder.toString()
         val responseText = cleanResponse(rawText)