diff --git a/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt b/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt
index d0910cc..b40d080 100644
--- a/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt
+++ b/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt
@@ -108,12 +108,52 @@ class ExecuTorchLlmEngine(
val responseBuilder = StringBuilder()
var firstTokenMs = -1L
+ // Track whether we're inside a <think>…</think> block so the upstream
+ // SentenceStreamer / TTS doesn't get fed reasoning tokens. Even with
+ // /no_think in the system prompt Qwen3 still emits empty <think></think>
+ // wrappers for ~3 tokens before the real answer.
+ var inThink = false
+ val tokenScan = StringBuilder() // small lookahead to spot tag boundaries
val cb = object : LlmCallback {
override fun onResult(result: String) {
if (firstTokenMs < 0) firstTokenMs = System.currentTimeMillis() - startTime
responseBuilder.append(result)
- onToken?.invoke(result)
+
+ // Forward to caller only outside <think> blocks. We accumulate
+ // a tiny lookahead buffer so tag tokens that arrive split
+ // (e.g. "<th" followed by "ink>") still match.
+ tokenScan.append(result)
+ while (true) {
+ if (!inThink) {
+ val open = tokenScan.indexOf("<think>")
+ if (open < 0) {
+ // No tag pending — flush everything up to a safe point
+ // (length minus 7, the length of the opening tag "<think>").
+ val safe = tokenScan.length - "<think>".length
+ if (safe > 0) {
+ onToken?.invoke(tokenScan.substring(0, safe))
+ tokenScan.delete(0, safe)
+ }
+ break
+ }
+ // Flush the prose before the tag, then enter think mode.
+ if (open > 0) onToken?.invoke(tokenScan.substring(0, open))
+ tokenScan.delete(0, open + "<think>".length)
+ inThink = true
+ } else {
+ val close = tokenScan.indexOf("</think>")
+ if (close < 0) {
+ // Drop all buffered chars except a small tail in case
+ // the closing tag is split across tokens.
+ val keep = "</think>".length - 1
+ if (tokenScan.length > keep) tokenScan.delete(0, tokenScan.length - keep)
+ break
+ }
+ tokenScan.delete(0, close + "</think>".length)
+ inThink = false
+ }
+ }
}
override fun onStats(stats: String) {
nlog("stats: ${stats.take(200)}")
@@ -131,6 +171,13 @@ class ExecuTorchLlmEngine(
-1
}
+ // Drain any leftover prose buffered during <think>-suppression so the
+ // last sentence reaches the TTS even if it ran past the closing tag.
+ if (!inThink && tokenScan.isNotEmpty()) {
+ onToken?.invoke(tokenScan.toString())
+ tokenScan.clear()
+ }
+
val elapsed = System.currentTimeMillis() - startTime
val rawText = responseBuilder.toString()
val responseText = cleanResponse(rawText)