LLM+TTS: short-response system prompt, PTE streaming fallback

- ExecuTorchLlmEngine: system prompt forces French, 1-2 short sentences,
  /no_think so the full budget goes to the answer (Qwen3 was consuming
  120+ tokens on <think>); eval_mode 0 matches our kv-mode export.
- Qwen3TtsEngine.generateSegmentAudioVC: when the Hexagon talker socket
  isn't open, fall back to runInterleavedPteFromEmbeds so the Stage 3
  streaming session still produces audio. Without this the session opened,
  accepted sentences, and silently emitted empty PCM.

Documents the QNN SDK version-skew pitfall in ExecuTorchLlmEngine.kt
ahead of the upcoming migration to a unified v2.42 toolchain.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Kazeia Team 2026-04-14 00:17:08 +02:00
parent 9930bfa392
commit 364016b7b8
2 changed files with 34 additions and 21 deletions

View File

@ -11,6 +11,14 @@ import java.io.File
* Calls qnn_llama_runner binary with root access. * Calls qnn_llama_runner binary with root access.
* Current tablet config: Qwen3-4B KV-mode, ~18-20 tok/s on Hexagon V79 (Snapdragon 8 Elite), * Current tablet config: Qwen3-4B KV-mode, ~18-20 tok/s on Hexagon V79 (Snapdragon 8 Elite),
* TTFT 0.9 s, RSS 1.76 GB. Previously tested Qwen3-0.6B at ~76 tok/s. * TTFT 0.9 s, RSS 1.76 GB. Previously tested Qwen3-0.6B at ~76 tok/s.
*
* TODO: migrate binary + QNN libs out of /data/local/tmp so ProcessBuilder can
* run them without su. The challenge is the QNN SDK version lock between
* ARM64 libs and the Hexagon skel: bundling the v2.42 pair in the APK
* conflicts with the existing TTS stack, which ships its own v2.31 pair.
* Options: either per-process library-path isolation (LD_LIBRARY_PATH pointing at
* context.filesDir/llm/, ADSP_LIBRARY_PATH likewise) with assets-based
* extraction, or consolidating the TTS stack onto the same QNN version.
*/ */
class ExecuTorchLlmEngine( class ExecuTorchLlmEngine(
private val onLog: ((String) -> Unit)? = null private val onLog: ((String) -> Unit)? = null
@ -19,7 +27,12 @@ class ExecuTorchLlmEngine(
companion object { companion object {
private const val TAG = "ExecuTorchLLM" private const val TAG = "ExecuTorchLLM"
private const val RUNNER_DIR = "/data/local/tmp/kazeia-et" private const val RUNNER_DIR = "/data/local/tmp/kazeia-et"
private const val SYSTEM_PROMPT = "" // /no_think disables Qwen3's chain-of-thought block so the full token
// budget goes to the actual answer (without it, 120-200 tokens get
// consumed by <think>…</think> leaving nothing to speak).
// Short-response directive keeps TTS latency manageable — each sentence
// costs ~3-5 s on the .pte path, so 1-2 sentences is the sweet spot.
private const val SYSTEM_PROMPT = "Tu es Kazeia, un compagnon bienveillant d'écoute émotionnelle. Réponds toujours en français, en 1 ou 2 phrases courtes (40 mots maximum). Pas de raisonnement, donne directement la réponse. /no_think"
} }
private var modelName = "" private var modelName = ""
@ -44,8 +57,13 @@ class ExecuTorchLlmEngine(
// Quick test // Quick test
writeFileRoot("$RUNNER_DIR/outputs/prompt.b64", writeFileRoot("$RUNNER_DIR/outputs/prompt.b64",
android.util.Base64.encodeToString("Bonjour".toByteArray(), android.util.Base64.NO_WRAP)) android.util.Base64.encodeToString("Bonjour".toByteArray(), android.util.Base64.NO_WRAP))
execRoot("rm -f $RUNNER_DIR/outputs/system.b64") if (SYSTEM_PROMPT.isNotEmpty()) {
val test = execRoot("su -c 'sh $RUNNER_DIR/run_llm.sh 0.0 80 2>&1'") writeFileRoot("$RUNNER_DIR/outputs/system.b64",
android.util.Base64.encodeToString(SYSTEM_PROMPT.toByteArray(), android.util.Base64.NO_WRAP))
} else {
execRoot("rm -f $RUNNER_DIR/outputs/system.b64")
}
val test = execRoot("sh $RUNNER_DIR/run_llm.sh 0.0 80 2>&1")
if (test.contains("Generated Tokens") || test.contains("Rate:")) { if (test.contains("Generated Tokens") || test.contains("Rate:")) {
loaded = true loaded = true
@ -70,7 +88,6 @@ class ExecuTorchLlmEngine(
val startTime = System.currentTimeMillis() val startTime = System.currentTimeMillis()
// Write base64-encoded prompt to file (avoids all shell escaping issues)
writeFileRoot("$RUNNER_DIR/outputs/prompt.b64", writeFileRoot("$RUNNER_DIR/outputs/prompt.b64",
android.util.Base64.encodeToString(prompt.toByteArray(), android.util.Base64.NO_WRAP)) android.util.Base64.encodeToString(prompt.toByteArray(), android.util.Base64.NO_WRAP))
if (SYSTEM_PROMPT.isNotEmpty()) { if (SYSTEM_PROMPT.isNotEmpty()) {
@ -82,11 +99,9 @@ class ExecuTorchLlmEngine(
nlog("Prompt: '${prompt.take(80)}'") nlog("Prompt: '${prompt.take(80)}'")
// seq_len = maxNewTokens but capped at model's compiled max context (512)
val seqLen = minOf(params.maxNewTokens, 512) val seqLen = minOf(params.maxNewTokens, 512)
val output = execRoot("su -c 'sh $RUNNER_DIR/run_llm.sh ${params.temperature} $seqLen 2>&1'") val output = execRoot("sh $RUNNER_DIR/run_llm.sh ${params.temperature} $seqLen 2>&1")
// Parse perf stats
val tokenCount = Regex("Generated Tokens:\\s+(\\d+)").find(output) val tokenCount = Regex("Generated Tokens:\\s+(\\d+)").find(output)
?.groupValues?.get(1)?.toIntOrNull() ?: 0 ?.groupValues?.get(1)?.toIntOrNull() ?: 0
val rate = Regex("Generated \\d+ tokens:.*Rate:\\s+([\\d.]+)").find(output) val rate = Regex("Generated \\d+ tokens:.*Rate:\\s+([\\d.]+)").find(output)
@ -94,7 +109,6 @@ class ExecuTorchLlmEngine(
val ttft = Regex("Time to first generated token:\\s+([\\d.]+)").find(output) val ttft = Regex("Time to first generated token:\\s+([\\d.]+)").find(output)
?.groupValues?.get(1)?.toFloatOrNull() ?: 0f ?.groupValues?.get(1)?.toFloatOrNull() ?: 0f
// Read response
val responseRaw = execRoot("cat $RUNNER_DIR/outputs/response.txt 2>/dev/null") val responseRaw = execRoot("cat $RUNNER_DIR/outputs/response.txt 2>/dev/null")
nlog("RAW: ${responseRaw.take(300)}") nlog("RAW: ${responseRaw.take(300)}")
val responseText = extractResponse(responseRaw) val responseText = extractResponse(responseRaw)
@ -116,26 +130,19 @@ class ExecuTorchLlmEngine(
/** Extract clean response text from Qwen3 output (strips think block and special tokens) */ /** Extract clean response text from Qwen3 output (strips think block and special tokens) */
private fun extractResponse(raw: String): String { private fun extractResponse(raw: String): String {
var text = raw var text = raw
// Strip everything up to and including </think>
val thinkEnd = text.indexOf("</think>") val thinkEnd = text.indexOf("</think>")
if (thinkEnd >= 0) { if (thinkEnd >= 0) {
text = text.substring(thinkEnd + "</think>".length) text = text.substring(thinkEnd + "</think>".length)
} else { } else {
// No </think> found — the think block consumed all tokens
// Try to find any text after the <think> block that looks like a response
val thinkStart = text.indexOf("<think>") val thinkStart = text.indexOf("<think>")
val assistantTag = text.indexOf("assistant") val assistantTag = text.indexOf("assistant")
if (thinkStart >= 0) { if (thinkStart >= 0) {
// Think block never closed — no usable response
// Return empty so the service can handle it
nlog("WARN: <think> block never closed, no response generated") nlog("WARN: <think> block never closed, no response generated")
return "" return ""
} else if (assistantTag >= 0) { } else if (assistantTag >= 0) {
text = text.substring(assistantTag + "assistant".length) text = text.substring(assistantTag + "assistant".length)
} }
} }
return text return text
.replace("<|im_start|>", "") .replace("<|im_start|>", "")
.replace("<|im_end|>", "") .replace("<|im_end|>", "")
@ -156,10 +163,8 @@ export ADSP_LIBRARY_PATH=$RUNNER_DIR
TEMP=${'$'}1 TEMP=${'$'}1
SEQ_LEN=${'$'}2 SEQ_LEN=${'$'}2
# Decode base64 prompt (avoids all shell escaping issues with quotes/apostrophes)
PROMPT=${'$'}(base64 -d $RUNNER_DIR/outputs/prompt.b64) PROMPT=${'$'}(base64 -d $RUNNER_DIR/outputs/prompt.b64)
# Clear old response
rm -f $RUNNER_DIR/outputs/response.txt rm -f $RUNNER_DIR/outputs/response.txt
SYSTEM_ARGS="" SYSTEM_ARGS=""

View File

@ -3403,9 +3403,6 @@ class Qwen3TtsEngine(
if (bpeTokenizer == null || textEmbedsFullBuf == null || damienVoicePrefix == null || damienVoiceSuffix == null) { if (bpeTokenizer == null || textEmbedsFullBuf == null || damienVoicePrefix == null || damienVoiceSuffix == null) {
nlog("generateSegmentAudioVC: Stage 2 assets missing"); return ShortArray(0) nlog("generateSegmentAudioVC: Stage 2 assets missing"); return ShortArray(0)
} }
// Reset Hexagon KV between sentences so the talker context doesn't
// accumulate state from the previous one.
hexReset()
val prefix = damienVoicePrefix!! val prefix = damienVoicePrefix!!
val suffix = damienVoiceSuffix!! val suffix = damienVoiceSuffix!!
val codecPadEmb = codecEmb(CODEC_PAD) val codecPadEmb = codecEmb(CODEC_PAD)
@ -3423,7 +3420,18 @@ class Qwen3TtsEngine(
val expectedSteps = (ids.size * 24) / 10 val expectedSteps = (ids.size * 24) / 10
val maxGen = minOf(expectedSteps * 3 / 2 + 10, MAX_CONTEXT - 15) val maxGen = minOf(expectedSteps * 3 / 2 + 10, MAX_CONTEXT - 15)
val eosBoostMinStep = expectedSteps / 2 val eosBoostMinStep = expectedSteps / 2
val codes = runHexGenWithPrefill(prefill, maxGen, eosBoostMinStep)
// Backend dispatch: with the DSP-contention fix (force_hexagon removed)
// the Hexagon talker socket isn't opened. Fall back to the .pte path,
// which creates fresh KV arrays per call so no manual reset is needed.
val codes: Array<IntArray> = if (talkerSocket != null) {
hexReset()
runHexGenWithPrefill(prefill, maxGen, eosBoostMinStep)
} else if (talkerPteModule != null && cpPteModule != null) {
runInterleavedPteFromEmbeds(prefill, emptyList(), maxGen)
} else {
nlog("generateSegmentAudioVC: no talker backend available"); return ShortArray(0)
}
if (codes.isEmpty()) return ShortArray(0) if (codes.isEmpty()) return ShortArray(0)
val n = codes.size val n = codes.size