LLM: filter <think> tokens out of the streaming TTS path
Even with /no_think in the system prompt, Qwen3 still emits an empty
<think>…</think> wrapper before the real answer. Without filtering, the
SentenceStreamer treats '<think>' as a sentence boundary and feeds three
tokens of XML into the TTS, producing audible artifacts at the start of
each reply.
The new in-callback filter buffers a small lookahead (just enough to span
"</think>"), suppresses everything between the open and close tags, and
flushes the surrounding prose to onToken in order. With the lookahead, tags
that arrive split across decoded pieces ("<thi"+"nk>") still match.
Validated end-to-end: the prompt 'Bonjour, comment vas-tu ?' now streams
sentence-by-sentence to the TTS — the first segment "Bonjour !" reaches the
talker at 4.6 s, and no <think> content leaks through.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
f32b5ddfdd
commit
b57719fa5e
|
|
@ -108,12 +108,52 @@ class ExecuTorchLlmEngine(
|
||||||
|
|
||||||
val responseBuilder = StringBuilder()
|
val responseBuilder = StringBuilder()
|
||||||
var firstTokenMs = -1L
|
var firstTokenMs = -1L
|
||||||
|
// Track whether we're inside a <think>…</think> block so the upstream
|
||||||
|
// SentenceStreamer / TTS doesn't get fed reasoning tokens. Even with
|
||||||
|
// /no_think in the system prompt Qwen3 still emits empty <think></think>
|
||||||
|
// wrappers for ~3 tokens before the real answer.
|
||||||
|
var inThink = false
|
||||||
|
val tokenScan = StringBuilder() // small lookahead to spot tag boundaries
|
||||||
|
|
||||||
val cb = object : LlmCallback {
|
val cb = object : LlmCallback {
|
||||||
override fun onResult(result: String) {
|
override fun onResult(result: String) {
|
||||||
if (firstTokenMs < 0) firstTokenMs = System.currentTimeMillis() - startTime
|
if (firstTokenMs < 0) firstTokenMs = System.currentTimeMillis() - startTime
|
||||||
responseBuilder.append(result)
|
responseBuilder.append(result)
|
||||||
onToken?.invoke(result)
|
|
||||||
|
// Forward to caller only outside <think> blocks. We accumulate
|
||||||
|
// a tiny lookahead buffer so tag tokens that arrive split
|
||||||
|
// ("<thi", "nk>") still match.
|
||||||
|
tokenScan.append(result)
|
||||||
|
while (true) {
|
||||||
|
if (!inThink) {
|
||||||
|
val open = tokenScan.indexOf("<think>")
|
||||||
|
if (open < 0) {
|
||||||
|
// No tag pending — flush everything up to a safe point
|
||||||
|
// (length minus 7 for the longest tag we look for).
|
||||||
|
val safe = tokenScan.length - "<think>".length
|
||||||
|
if (safe > 0) {
|
||||||
|
onToken?.invoke(tokenScan.substring(0, safe))
|
||||||
|
tokenScan.delete(0, safe)
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
// Flush the prose before the tag, then enter think mode.
|
||||||
|
if (open > 0) onToken?.invoke(tokenScan.substring(0, open))
|
||||||
|
tokenScan.delete(0, open + "<think>".length)
|
||||||
|
inThink = true
|
||||||
|
} else {
|
||||||
|
val close = tokenScan.indexOf("</think>")
|
||||||
|
if (close < 0) {
|
||||||
|
// Drop all buffered chars except a small tail in case
|
||||||
|
// the closing tag is split across tokens.
|
||||||
|
val keep = "</think>".length - 1
|
||||||
|
if (tokenScan.length > keep) tokenScan.delete(0, tokenScan.length - keep)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
tokenScan.delete(0, close + "</think>".length)
|
||||||
|
inThink = false
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
override fun onStats(stats: String) {
|
override fun onStats(stats: String) {
|
||||||
nlog("stats: ${stats.take(200)}")
|
nlog("stats: ${stats.take(200)}")
|
||||||
|
|
@ -131,6 +171,13 @@ class ExecuTorchLlmEngine(
|
||||||
-1
|
-1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Drain any leftover prose buffered during <think>-suppression so the
|
||||||
|
// last sentence reaches the TTS even if it ran past the closing tag.
|
||||||
|
if (!inThink && tokenScan.isNotEmpty()) {
|
||||||
|
onToken?.invoke(tokenScan.toString())
|
||||||
|
tokenScan.clear()
|
||||||
|
}
|
||||||
|
|
||||||
val elapsed = System.currentTimeMillis() - startTime
|
val elapsed = System.currentTimeMillis() - startTime
|
||||||
val rawText = responseBuilder.toString()
|
val rawText = responseBuilder.toString()
|
||||||
val responseText = cleanResponse(rawText)
|
val responseText = cleanResponse(rawText)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue