diff --git a/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt b/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt index d0910cc..b40d080 100644 --- a/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt +++ b/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt @@ -108,12 +108,52 @@ class ExecuTorchLlmEngine( val responseBuilder = StringBuilder() var firstTokenMs = -1L + // Track whether we're inside a block so the upstream + // SentenceStreamer / TTS doesn't get fed reasoning tokens. Even with + // /no_think in the system prompt Qwen3 still emits empty + // wrappers for ~3 tokens before the real answer. + var inThink = false + val tokenScan = StringBuilder() // small lookahead to spot tag boundaries val cb = object : LlmCallback { override fun onResult(result: String) { if (firstTokenMs < 0) firstTokenMs = System.currentTimeMillis() - startTime responseBuilder.append(result) - onToken?.invoke(result) + + // Forward to caller only outside blocks. We accumulate + // a tiny lookahead buffer so tag tokens that arrive split + // ("") still match. + tokenScan.append(result) + while (true) { + if (!inThink) { + val open = tokenScan.indexOf("") + if (open < 0) { + // No tag pending — flush everything up to a safe point + // (length minus 7 for the longest tag we look for). + val safe = tokenScan.length - "".length + if (safe > 0) { + onToken?.invoke(tokenScan.substring(0, safe)) + tokenScan.delete(0, safe) + } + break + } + // Flush the prose before the tag, then enter think mode. + if (open > 0) onToken?.invoke(tokenScan.substring(0, open)) + tokenScan.delete(0, open + "".length) + inThink = true + } else { + val close = tokenScan.indexOf("") + if (close < 0) { + // Drop all buffered chars except a small tail in case + // the closing tag is split across tokens. + val keep = "".length - 1 + if (tokenScan.length > keep) tokenScan.delete(0, tokenScan.length - keep) + break + } + tokenScan.delete(0, close + "".length) + inThink = false + } + } } override fun onStats(stats: String) { nlog("stats: ${stats.take(200)}") @@ -131,6 +171,13 @@ class ExecuTorchLlmEngine( -1 } + // Drain any leftover prose buffered during -suppression so the + // last sentence reaches the TTS even if it ran past the closing tag. + if (!inThink && tokenScan.isNotEmpty()) { + onToken?.invoke(tokenScan.toString()) + tokenScan.clear() + } + val elapsed = System.currentTimeMillis() - startTime val rawText = responseBuilder.toString() val responseText = cleanResponse(rawText)