From 364016b7b8a4a432d31bf24c9e283de077d9e254 Mon Sep 17 00:00:00 2001 From: Kazeia Team Date: Tue, 14 Apr 2026 00:17:08 +0200 Subject: [PATCH] LLM+TTS: short-response system prompt, PTE streaming fallback - ExecuTorchLlmEngine: system prompt forces French, 1-2 short sentences, /no_think so the full budget goes to the answer (Qwen3 was consuming 120+ tokens on <think>); eval_mode 0 matches our kv-mode export. - Qwen3TtsEngine.generateSegmentAudioVC: when the Hexagon talker socket isn't open, fall back to runInterleavedPteFromEmbeds so the Stage 3 streaming session still produces audio. Without this the session opened, accepted sentences, and silently emitted empty PCM. Documents the QNN SDK version-skew pitfall in ExecuTorchLlmEngine.kt ahead of the upcoming migration to a unified v2.42 toolchain. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../com/kazeia/llm/ExecuTorchLlmEngine.kt | 39 +++++++++++-------- .../java/com/kazeia/tts/Qwen3TtsEngine.kt | 16 ++++++-- 2 files changed, 34 insertions(+), 21 deletions(-) diff --git a/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt b/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt index 29bb0b0..26a27f5 100644 --- a/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt +++ b/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt @@ -11,6 +11,14 @@ import java.io.File * Calls qnn_llama_runner binary with root access. * Current tablet config: Qwen3-4B KV-mode, ~18-20 tok/s on Hexagon V79 (Snapdragon 8 Elite), * TTFT 0.9 s, RSS 1.76 GB. Previously tested Qwen3-0.6B at ~76 tok/s. + * + * TODO: migrate binary + QNN libs out of /data/local/tmp so ProcessBuilder can + * run them without su. The challenge is the QNN SDK version lock between + * ARM64 libs and the Hexagon skel — bundling the v2.42 pair in the APK + * conflicts with the existing TTS stack which ships its own v2.31 pair. 
+ * Either per-process library-path isolation (LD_LIBRARY_PATH pointing at + * context.filesDir/llm/, ADSP_LIBRARY_PATH likewise) with assets-based + * extraction, or consolidating the TTS stack onto the same QNN version. */ class ExecuTorchLlmEngine( private val onLog: ((String) -> Unit)? = null @@ -19,7 +27,12 @@ class ExecuTorchLlmEngine( companion object { private const val TAG = "ExecuTorchLLM" private const val RUNNER_DIR = "/data/local/tmp/kazeia-et" - private const val SYSTEM_PROMPT = "" + // /no_think disables Qwen3's chain-of-thought block so the full token + // budget goes to the actual answer (without it, 120-200 tokens get + // consumed by <think> leaving nothing to speak). + // Short-response directive keeps TTS latency manageable — each sentence + // costs ~3-5 s on the .pte path, so 1-2 sentences is the sweet spot. + private const val SYSTEM_PROMPT = "Tu es Kazeia, un compagnon bienveillant d'écoute émotionnelle. Réponds toujours en français, en 1 ou 2 phrases courtes (40 mots maximum). Pas de raisonnement, donne directement la réponse. 
/no_think" } private var modelName = "" @@ -44,8 +57,13 @@ class ExecuTorchLlmEngine( // Quick test writeFileRoot("$RUNNER_DIR/outputs/prompt.b64", android.util.Base64.encodeToString("Bonjour".toByteArray(), android.util.Base64.NO_WRAP)) - execRoot("rm -f $RUNNER_DIR/outputs/system.b64") - val test = execRoot("su -c 'sh $RUNNER_DIR/run_llm.sh 0.0 80 2>&1'") + if (SYSTEM_PROMPT.isNotEmpty()) { + writeFileRoot("$RUNNER_DIR/outputs/system.b64", + android.util.Base64.encodeToString(SYSTEM_PROMPT.toByteArray(), android.util.Base64.NO_WRAP)) + } else { + execRoot("rm -f $RUNNER_DIR/outputs/system.b64") + } + val test = execRoot("sh $RUNNER_DIR/run_llm.sh 0.0 80 2>&1") if (test.contains("Generated Tokens") || test.contains("Rate:")) { loaded = true @@ -70,7 +88,6 @@ class ExecuTorchLlmEngine( val startTime = System.currentTimeMillis() - // Write base64-encoded prompt to file (avoids all shell escaping issues) writeFileRoot("$RUNNER_DIR/outputs/prompt.b64", android.util.Base64.encodeToString(prompt.toByteArray(), android.util.Base64.NO_WRAP)) if (SYSTEM_PROMPT.isNotEmpty()) { @@ -82,11 +99,9 @@ class ExecuTorchLlmEngine( nlog("Prompt: '${prompt.take(80)}'") - // seq_len = maxNewTokens but capped at model's compiled max context (512) val seqLen = minOf(params.maxNewTokens, 512) - val output = execRoot("su -c 'sh $RUNNER_DIR/run_llm.sh ${params.temperature} $seqLen 2>&1'") + val output = execRoot("sh $RUNNER_DIR/run_llm.sh ${params.temperature} $seqLen 2>&1") - // Parse perf stats val tokenCount = Regex("Generated Tokens:\\s+(\\d+)").find(output) ?.groupValues?.get(1)?.toIntOrNull() ?: 0 val rate = Regex("Generated \\d+ tokens:.*Rate:\\s+([\\d.]+)").find(output) @@ -94,7 +109,6 @@ class ExecuTorchLlmEngine( val ttft = Regex("Time to first generated token:\\s+([\\d.]+)").find(output) ?.groupValues?.get(1)?.toFloatOrNull() ?: 0f - // Read response val responseRaw = execRoot("cat $RUNNER_DIR/outputs/response.txt 2>/dev/null") nlog("RAW: ${responseRaw.take(300)}") val 
responseText = extractResponse(responseRaw) @@ -116,26 +130,19 @@ class ExecuTorchLlmEngine( /** Extract clean response text from Qwen3 output (strips think block and special tokens) */ private fun extractResponse(raw: String): String { var text = raw - - // Strip everything up to and including val thinkEnd = text.indexOf("") if (thinkEnd >= 0) { text = text.substring(thinkEnd + "".length) } else { - // No found — the think block consumed all tokens - // Try to find any text after the block that looks like a response val thinkStart = text.indexOf("") val assistantTag = text.indexOf("assistant") if (thinkStart >= 0) { - // Think block never closed — no usable response - // Return empty so the service can handle it nlog("WARN: block never closed, no response generated") return "" } else if (assistantTag >= 0) { text = text.substring(assistantTag + "assistant".length) } } - return text .replace("<|im_start|>", "") .replace("<|im_end|>", "") @@ -156,10 +163,8 @@ export ADSP_LIBRARY_PATH=$RUNNER_DIR TEMP=${'$'}1 SEQ_LEN=${'$'}2 -# Decode base64 prompt (avoids all shell escaping issues with quotes/apostrophes) PROMPT=${'$'}(base64 -d $RUNNER_DIR/outputs/prompt.b64) -# Clear old response rm -f $RUNNER_DIR/outputs/response.txt SYSTEM_ARGS="" diff --git a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt index 30c8833..461f10d 100644 --- a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt +++ b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt @@ -3403,9 +3403,6 @@ class Qwen3TtsEngine( if (bpeTokenizer == null || textEmbedsFullBuf == null || damienVoicePrefix == null || damienVoiceSuffix == null) { nlog("generateSegmentAudioVC: Stage 2 assets missing"); return ShortArray(0) } - // Reset Hexagon KV between sentences so the talker context doesn't - // accumulate state from the previous one. - hexReset() val prefix = damienVoicePrefix!! 
val suffix = damienVoiceSuffix!! val codecPadEmb = codecEmb(CODEC_PAD) @@ -3423,7 +3420,18 @@ class Qwen3TtsEngine( val expectedSteps = (ids.size * 24) / 10 val maxGen = minOf(expectedSteps * 3 / 2 + 10, MAX_CONTEXT - 15) val eosBoostMinStep = expectedSteps / 2 - val codes = runHexGenWithPrefill(prefill, maxGen, eosBoostMinStep) + + // Backend dispatch: with the DSP-contention fix (force_hexagon removed) + // the Hexagon talker socket isn't opened. Fall back to the .pte path, + // which creates fresh KV arrays per call so no manual reset is needed. + val codes: Array = if (talkerSocket != null) { + hexReset() + runHexGenWithPrefill(prefill, maxGen, eosBoostMinStep) + } else if (talkerPteModule != null && cpPteModule != null) { + runInterleavedPteFromEmbeds(prefill, emptyList(), maxGen) + } else { + nlog("generateSegmentAudioVC: no talker backend available"); return ShortArray(0) + } if (codes.isEmpty()) return ShortArray(0) val n = codes.size