scripts: export per-voice prefix/suffix embeddings

New tool + generated artefacts so the on-device voice spinner can now hot-swap between all 8 voices — previously only Damien's prefix/suffix were present in the model dir, and the tablet fell back to him regardless of selection.

scripts/export_voice_prefix_suffix.py runs Qwen3TTS's voice-clone path under a forward hook, captures the first prefill call's 1024-dim talker input embeddings, aborts the rest of the (very slow on CPU) decode via a sentinel exception, and slices out the first 9 vectors as <name>_voice_prefix.bin and the last 2 as <name>_voice_suffix.bin.

Validated against the shipped damien_voice_prefix.bin: using damien_15s_24k.wav as the reference audio, max|diff| = 0, so the extraction matches the original tooling bit-for-bit.

Generated and adb-pushed to /data/local/tmp/kazeia/models/qwen3-tts-npu/: amir / didier / elodie / jerome / richard / sid / zelda (+ re-generated damien from the canonical 15s_24k reference).

Qwen3TtsEngine.setVoice (already wired) reads <voice>_voice_prefix.bin / <voice>_voice_suffix.bin by basename, so voice changes now take effect from the next synthesized segment with no app restart.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
UI+TTS: voice hot-swap + typing dots + emoji stripping
2026-04-15 00:09:23 +02:00 · 2026-04-14 23:55:07 +02:00 · 2026-04-14 23:47:30 +02:00 · 2026-04-14 23:42:43 +02:00 · 2026-04-14 23:33:38 +02:00 · 2026-04-14 23:20:15 +02:00
13 changed files with 2282 additions and 231 deletions
--- a/executorch-patches/llm_in_process_jni.patch
+++ b/executorch-patches/llm_in_process_jni.patch
@ -0,0 +1,72 @@
 diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt
 index e93731e..4951e1d 100644
 --- a/backends/qualcomm/CMakeLists.txt
 +++ b/backends/qualcomm/CMakeLists.txt
@@ -308,8 +308,8 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES Hexagon)
   )
 endif()
 -# QNN pybind
 -if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
 +# QNN pybind — host Python bindings, not for Android cross-compile
 +if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64" AND NOT ANDROID)
   add_subdirectory(
     ${EXECUTORCH_SOURCE_DIR}/third-party/pybind11
     ${CMAKE_CURRENT_BINARY_DIR}/pybind11
 diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
 index 45f2414..ae3d79f 100644
 --- a/extension/android/jni/jni_layer_llama.cpp
 +++ b/extension/android/jni/jni_layer_llama.cpp
@@ -171,14 +171,44 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
           model_path->toStdString().c_str(),
           data_files_vector,
           executorch::extension::Module::LoadMode::MmapUseMlockIgnoreErrors);
 -      std::string decoder_model = "llama3"; // use llama3 for now
 -      runner_ = std::make_unique<example::Runner<uint16_t>>( // QNN runner
 -          std::move(module),
 -          decoder_model.c_str(),
 -          model_path->toStdString().c_str(),
 -          tokenizer_path->toStdString().c_str(),
 -          "",
 -          "");
 +      std::string decoder_model = "qwen3"; // Kazeia: our .pte was exported with --decoder_model qwen3-4b
 +
 +      // Mirror qnn_llama_runner.cpp main(): pick the Runner<T> template based
 +      // on the model's get_kv_io_bit_width metadata. The 16-bit KV I/O models
 +      // were introduced after the 8-bit ones, and using the wrong T treats
 +      // KV-cache bytes as the wrong width → garbage logits → gibberish output.
 +      example::KvBitWidth kv_bitwidth = example::KvBitWidth::kWidth8;
 +      if (module->method_names()->count("get_kv_io_bit_width") > 0) {
 +        kv_bitwidth = static_cast<example::KvBitWidth>(
 +            module->get("get_kv_io_bit_width").get().toScalar().to<int64_t>());
 +      }
 +      // Auto-detect eval_mode: kv-only (0) if the .pte only carries
 +      // kv_forward, hybrid (1) if it also has prefill_forward (which lets the
 +      // runner batch the prompt prefill — TTFT drops from ~52 ms/token to
 +      // sub-ms after the one-shot prefill graph). Same JNI binary works with
 +      // both export modes, no code change needed when the .pte is upgraded.
 +      int eval_mode = 0;
 +      if (module->method_names()->count("prefill_forward") > 0) {
 +        eval_mode = 1; // EvalMode::kHybrid
 +      }
 +      auto make_runner = [&](auto sample) -> std::unique_ptr<llm::IRunner> {
 +        using T = decltype(sample);
 +        return std::make_unique<example::Runner<T>>(
 +            std::move(module),
 +            decoder_model.c_str(),
 +            model_path->toStdString().c_str(),
 +            tokenizer_path->toStdString().c_str(),
 +            /* performance_output_path */ "",
 +            /* dump_logits_path */ "",
 +            /* temperature */ 0.0f, // greedy
 +            eval_mode,
 +            /* shared_buffer */ true);
 +      };
 +      if (kv_bitwidth == example::KvBitWidth::kWidth16) {
 +        runner_ = make_runner(uint16_t{0});
 +      } else {
 +        runner_ = make_runner(uint8_t{0});
 +      }
       model_type_category_ = MODEL_TYPE_CATEGORY_LLM;
 #endif
 #if defined(EXECUTORCH_BUILD_MEDIATEK)
--- a/executorch-patches/qwen3_4b_decoder.patch
+++ b/executorch-patches/qwen3_4b_decoder.patch
@ -1,5 +1,5 @@
 diff --git a/examples/qualcomm/oss_scripts/llama/__init__.py b/examples/qualcomm/oss_scripts/llama/__init__.py
-index 963db6e..953dc4c 100644
+index 963db6e..9ccfdd0 100644
 --- a/examples/qualcomm/oss_scripts/llama/__init__.py
 +++ b/examples/qualcomm/oss_scripts/llama/__init__.py
@@ -25,9 +25,14 @@ from executorch.examples.models.granite import (
@ -20,7 +20,7 @@ index 963db6e..953dc4c 100644
 from executorch.examples.models.qwen2_5 import (
     convert_weights as convert_qwen2_5_weights,
 )
-@@ -479,6 +484,34 @@ class Qwen3_1_7B(LLMModelConfig):
+@@ -479,6 +484,37 @@ class Qwen3_1_7B(LLMModelConfig):
     quant_recipe = Qwen3_1_7BQuantRecipe
@ -40,10 +40,13 @@ index 963db6e..953dc4c 100644
 +    convert_weights = convert_qwen3_weights
 +    transform_weight = False
 +    instruct_model = True
-+    # Bumped to 2 to halve peak host RAM during QNN compile (4B at sharding=1
+    # num_sharding=1 for hybrid mode: sharding=2 produces a multi-context
-+    # OOMed on a 62 GB box, peak anon-rss 46 GB). At sharding=2 each shard
+    # .pte (2 graphs × 2 shards = 4 contexts) that the LlmModule load path
-+    # compile fits comfortably; runner stitches them at load time.
+    # can't restore (error 5010 "Context group 1 does not exist"). With
-+    num_sharding = 2
+    # sharding=1 the hybrid export needs ~46 GB RAM peak — the 192 GB swap
 +    # on /swapfile handles this; compile takes ~80 min wall but completes
 +    # cleanly. Single-context .pte loads fine through the JNI runner.
 +    num_sharding = 1
 +    masked_softmax = True
 +    seq_mse_candidates = 0
 +    r1 = False
--- a/kazeia-android/app/src/main/AndroidManifest.xml
+++ b/kazeia-android/app/src/main/AndroidManifest.xml
@ -5,6 +5,7 @@
    <uses-permission android:name="android.permission.RECORD_AUDIO" />
    <uses-permission android:name="android.permission.FOREGROUND_SERVICE" />
    <uses-permission android:name="android.permission.FOREGROUND_SERVICE_MICROPHONE" />
    <uses-permission android:name="android.permission.FOREGROUND_SERVICE_MEDIA_PLAYBACK" />
    <uses-permission android:name="android.permission.FOREGROUND_SERVICE_SPECIAL_USE" />
    <uses-permission android:name="android.permission.POST_NOTIFICATIONS" />
    <uses-permission android:name="android.permission.READ_EXTERNAL_STORAGE" />
@ -50,7 +51,7 @@
        <service
            android:name=".service.KazeiaService"
-            android:foregroundServiceType="microphone|specialUse"
+            android:foregroundServiceType="microphone|mediaPlayback|specialUse"
            android:exported="true">
            <property
                android:name="android.app.PROPERTY_SPECIAL_USE_FGS_SUBTYPE"
--- a/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt
+++ b/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt
@ -1,43 +1,49 @@
 package com.kazeia.llm
 import android.content.Context
 import android.util.Log
 import com.kazeia.core.*
 import kotlinx.coroutines.Dispatchers
 import kotlinx.coroutines.withContext
 import java.io.File
 import org.pytorch.executorch.extension.llm.LlmCallback
 import org.pytorch.executorch.extension.llm.LlmModule
 /**
- * LLM Engine using ExecuTorch + QNN backend via subprocess.
+ * LLM Engine using ExecuTorch LlmModule in-process — **no root required**.
- * Calls qnn_llama_runner binary with root access (Magisk su).
+ *
 * Runs Qwen3-4B via `org.pytorch.executorch.extension.llm.LlmModule`, which
 * wraps the same C++ TextLlmRunner as the standalone qnn_llama_runner binary
 * but inside the app's own process. The QNN HTP backend works because the
 * DSP fastrpc service accepts the Zygote-forked app process (unlike
 * ProcessBuilder-spawned subprocesses which lose supplementary GIDs on exec
 * and get rejected by the fastrpc credential checks).
 *
 * Model + tokenizer live in /data/local/tmp/kazeia-et/ (readable by the app
 * on this device's permissive SELinux policy). libexecutorch.so + QNN libs
 * are bundled in jniLibs.
 *
 * Current tablet config: Qwen3-4B KV-mode, ~18-22 tok/s on Hexagon V79
 * (Snapdragon 8 Elite), TTFT 0.9 s, RSS 1.76 GB.
 *
 * Why root: the runner binary plus its QNN v2.42 .so deps live in
 * /data/local/tmp/kazeia-et/ (shell_data_file SELinux context). Untrusted
 * apps can't exec binaries from there. The Hexagon DSP fastrpc service also
 * refuses to load the v2.42 Skel from the app's own files dir — only from
 * nativeLibraryDir — but that dir already holds the TTS stack's v2.31 Skel
 * (same filename, different version, can't coexist). Rebuilding everything
 * against one QNN version would eliminate the conflict, but would require
 * re-exporting the TTS .pte with the new runtime (tooling currently broken
 * on the flatc schema/dataclass mismatch in the qnn_venv).
 */
 class ExecuTorchLlmEngine(
    private val context: Context,
    private val onLog: ((String) -> Unit)? = null
 ) : LlmEngine {
    companion object {
        private const val TAG = "ExecuTorchLLM"
-        private const val RUNNER_DIR = "/data/local/tmp/kazeia-et"
+        // /no_think disables Qwen3's chain-of-thought block. Compact wording
-        // /no_think disables Qwen3's chain-of-thought block so the full token
+        // keeps prefill cost low: this prompt is ~25 tokens vs ~55 in the
-        // budget goes to the actual answer (without it, 120-200 tokens get
+        // earlier verbose version → saves ~1.5 s of TTFT in kv-only mode.
-        // consumed by <think>…</think> leaving nothing to speak).
+        private const val SYSTEM_PROMPT = "Tu es Kazeia, à l'écoute en français. Réponds en 1-2 phrases courtes, sans raisonnement. /no_think"
-        // Short-response directive keeps TTS latency manageable — each sentence
+
-        // costs ~3-5 s on the .pte path, so 1-2 sentences is the sweet spot.
+        private const val MODEL_DIR = "/data/local/tmp/kazeia-et"
-        private const val SYSTEM_PROMPT = "Tu es Kazeia, un compagnon bienveillant d'écoute émotionnelle. Réponds toujours en français, en 1 ou 2 phrases courtes (40 mots maximum). Pas de raisonnement, donne directement la réponse. /no_think"
+        private const val MODEL_PATH = "$MODEL_DIR/hybrid_llama_qnn.pte"
        private const val TOKENIZER_PATH = "$MODEL_DIR/tokenizer.json"
    }
    private var llmModule: LlmModule? = null
    private var modelName = ""
    private var loaded = false
@ -48,77 +54,152 @@ class ExecuTorchLlmEngine(
    override suspend fun load(modelPath: String, config: LlmConfig) {
        withContext(Dispatchers.IO) {
-            val check = execRoot("ls $RUNNER_DIR/qnn_llama_runner $RUNNER_DIR/hybrid_llama_qnn.pte $RUNNER_DIR/tokenizer.json 2>&1")
+            if (!File(MODEL_PATH).exists()) {
-            if (check.contains("No such file")) {
+                nlog("ERROR: model not found at $MODEL_PATH")
-                nlog("ERROR: runner or model not found in $RUNNER_DIR")
+                return@withContext
            }
            if (!File(TOKENIZER_PATH).exists()) {
                nlog("ERROR: tokenizer not found at $TOKENIZER_PATH")
                return@withContext
            }
-            deployRunnerScript()
+            try {
                val t0 = System.currentTimeMillis()
                // MODEL_TYPE_QNN_LLAMA=4 selects the Qualcomm runner path in
                // jni_layer_llama.cpp, which uses example::Runner (same code
                // as the qnn_llama_runner binary) instead of the generic
                // TextLLMRunner. Our .pte was exported with
                // --decoder_model qwen3-4b which requires this path.
                val MODEL_TYPE_QNN_LLAMA = 4
                llmModule = LlmModule(MODEL_TYPE_QNN_LLAMA, MODEL_PATH, TOKENIZER_PATH, 0.7f)
                nlog("LlmModule instantiated in ${System.currentTimeMillis() - t0}ms")
-            writeFileRoot("$RUNNER_DIR/outputs/prompt.b64",
+                // Load the PTE into QNN HTP (calls the native load()).
-                android.util.Base64.encodeToString("Bonjour".toByteArray(), android.util.Base64.NO_WRAP))
+                val loadResult = llmModule!!.load()
-            if (SYSTEM_PROMPT.isNotEmpty()) {
+                if (loadResult != 0) {
-                writeFileRoot("$RUNNER_DIR/outputs/system.b64",
+                    nlog("ERROR: LlmModule.load() returned $loadResult")
-                    android.util.Base64.encodeToString(SYSTEM_PROMPT.toByteArray(), android.util.Base64.NO_WRAP))
+                    llmModule = null
-            } else {
+                    return@withContext
                execRoot("rm -f $RUNNER_DIR/outputs/system.b64")
                }
-            val test = execRoot("sh $RUNNER_DIR/run_llm.sh 0.0 80 2>&1")
+                nlog("LlmModule loaded in ${System.currentTimeMillis() - t0}ms total")
            if (test.contains("Generated Tokens") || test.contains("Rate:")) {
                loaded = true
-                val rateMatch = Regex("Generated \\d+ tokens:.*Rate:\\s+([\\d.]+)").find(test)
+                modelName = "Qwen3-4B LlmModule"
                val rate = rateMatch?.groupValues?.get(1) ?: "?"
                modelName = "Qwen3 (${rate} tok/s NPU)"
                nlog("Ready: $modelName")
-            } else {
+            } catch (e: Throwable) {
-                nlog("ERROR: test failed: ${test.takeLast(200)}")
+                nlog("ERROR: LlmModule init failed: ${e.javaClass.simpleName}: ${e.message}")
                llmModule = null
            }
        }
    }
-    override fun isLoaded(): Boolean = loaded
+    override fun isLoaded(): Boolean = loaded && llmModule != null
    override suspend fun generate(
        prompt: String,
        params: SamplingParams,
        onToken: ((String) -> Boolean)?
    ): GenerationResult = withContext(Dispatchers.IO) {
-        if (!loaded) throw IllegalStateException("Model not loaded")
+        val mod = llmModule ?: throw IllegalStateException("Model not loaded")
        val startTime = System.currentTimeMillis()
-
+        val fullPrompt = buildChatTemplate(prompt)
        writeFileRoot("$RUNNER_DIR/outputs/prompt.b64",
            android.util.Base64.encodeToString(prompt.toByteArray(), android.util.Base64.NO_WRAP))
        if (SYSTEM_PROMPT.isNotEmpty()) {
            writeFileRoot("$RUNNER_DIR/outputs/system.b64",
                android.util.Base64.encodeToString(SYSTEM_PROMPT.toByteArray(), android.util.Base64.NO_WRAP))
        } else {
            execRoot("rm -f $RUNNER_DIR/outputs/system.b64")
        }
        nlog("Prompt: '${prompt.take(80)}'")
        val responseBuilder = StringBuilder()
        var firstTokenMs = -1L
        // Track whether we're inside a <think>…</think> block so the upstream
        // SentenceStreamer / TTS doesn't get fed reasoning tokens. Even with
        // /no_think in the system prompt Qwen3 still emits empty <think></think>
        // wrappers for ~3 tokens before the real answer.
        var inThink = false
        val tokenScan = StringBuilder()  // small lookahead to spot tag boundaries
        // Singleton special tokens that should never reach the TTS streamer
        // (they leak when the model wraps its reply or signals end-of-turn).
        val stripTokens = listOf("<|im_start|>", "<|im_end|>", "<|endoftext|>")
        val maxTagLen = listOf("<think>", "</think>", "<|im_start|>", "<|im_end|>", "<|endoftext|>")
            .maxOf { it.length }
        val cb = object : LlmCallback {
            override fun onResult(result: String) {
                if (firstTokenMs < 0) firstTokenMs = System.currentTimeMillis() - startTime
                responseBuilder.append(result)
                // Forward to caller only outside <think> blocks, and strip
                // singleton special tokens. We accumulate a tiny lookahead buffer
                // so tag tokens that arrive split ("<thi", "nk>") still match.
                tokenScan.append(result)
                while (true) {
                    if (!inThink) {
                        val open = tokenScan.indexOf("<think>")
                        if (open < 0) {
                            // No <think> open pending — strip any singleton tokens
                            // that fully landed in the buffer, then flush prose
                            // up to a safe point preserving lookahead.
                            for (tok in stripTokens) {
                                var idx = tokenScan.indexOf(tok)
                                while (idx >= 0) {
                                    tokenScan.delete(idx, idx + tok.length)
                                    idx = tokenScan.indexOf(tok)
                                }
                            }
                            val safe = tokenScan.length - maxTagLen
                            if (safe > 0) {
                                onToken?.invoke(tokenScan.substring(0, safe))
                                tokenScan.delete(0, safe)
                            }
                            break
                        }
                        // Flush the prose before the <think> tag, then enter think mode.
                        if (open > 0) onToken?.invoke(tokenScan.substring(0, open))
                        tokenScan.delete(0, open + "<think>".length)
                        inThink = true
                    } else {
                        val close = tokenScan.indexOf("</think>")
                        if (close < 0) {
                            // Drop all buffered chars except a small tail in case
                            // the closing tag is split across tokens.
                            val keep = "</think>".length - 1
                            if (tokenScan.length > keep) tokenScan.delete(0, tokenScan.length - keep)
                            break
                        }
                        tokenScan.delete(0, close + "</think>".length)
                        inThink = false
                    }
                }
            }
            override fun onStats(stats: String) {
                nlog("stats: ${stats.take(200)}")
            }
        }
        val seqLen = minOf(params.maxNewTokens, 512)
-        val output = execRoot("sh $RUNNER_DIR/run_llm.sh ${params.temperature} $seqLen 2>&1")
+        val rc = try {
            // echo=false so onResult() only receives the generated completion,
            // not the prompt tokens echoed back — otherwise the sentence
            // streamer would feed '<|im_start|>user …' to the TTS.
            mod.generate(fullPrompt, seqLen, cb, /* echo */ false)
        } catch (e: Throwable) {
            nlog("generate() threw: ${e.message}")
            -1
        }
-        val tokenCount = Regex("Generated Tokens:\\s+(\\d+)").find(output)
+        // Drain any leftover prose buffered during <think>-suppression so the
-            ?.groupValues?.get(1)?.toIntOrNull() ?: 0
+        // last sentence reaches the TTS even if it ran past the closing tag.
-        val rate = Regex("Generated \\d+ tokens:.*Rate:\\s+([\\d.]+)").find(output)
+        if (!inThink && tokenScan.isNotEmpty()) {
-            ?.groupValues?.get(1)?.toFloatOrNull() ?: 0f
+            onToken?.invoke(tokenScan.toString())
-        val ttft = Regex("Time to first generated token:\\s+([\\d.]+)").find(output)
+            tokenScan.clear()
-            ?.groupValues?.get(1)?.toFloatOrNull() ?: 0f
+        }
        val responseRaw = execRoot("cat $RUNNER_DIR/outputs/response.txt 2>/dev/null")
        nlog("RAW: ${responseRaw.take(300)}")
        val responseText = extractResponse(responseRaw)
        val elapsed = System.currentTimeMillis() - startTime
-        nlog("Response: '$responseText'")
+        val rawText = responseBuilder.toString()
-        nlog("Stats: ${tokenCount}tok ${rate}tok/s TTFT=${ttft}s ${elapsed}ms")
+        val responseText = cleanResponse(rawText)
        val tokenCount = rawText.length / 4  // rough estimate without a tokenizer
        val rate = if (elapsed > 0) (tokenCount * 1000f) / elapsed else 0f
-        onToken?.invoke(responseText)
+        nlog("Response: '${responseText.take(80)}'")
        nlog("Stats: rc=$rc ~${tokenCount}tok ~${"%.1f".format(rate)}tok/s TTFT=${firstTokenMs}ms total=${elapsed}ms")
        GenerationResult(
            text = responseText,
@ -128,20 +209,32 @@ class ExecuTorchLlmEngine(
        )
    }
-    private fun extractResponse(raw: String): String {
+    /**
     * Qwen3 chat template matching qnn_llama_runner.cpp's get_formatted_prompt()
     * for DecoderModelVersion::kQwen3. Note the user-first-then-system ordering
     * (quirky but required — the runner binary produces the same layout and our
     * .pte was trained with it). Terminates with `<|im_start|>assistant` with
     * no trailing newline, matching the binary exactly.
     */
    private fun buildChatTemplate(userInput: String): String = buildString {
        // User turn is emitted first, wrapped in the im_start / im_end markers.
        append("<|im_start|>user\n")
        append(userInput)
        append("<|im_end|>\n")

        // System turn follows the user turn and is omitted entirely when the
        // system prompt is empty.
        if (SYSTEM_PROMPT.isNotEmpty()) {
            append("<|im_start|>system\n")
            append(SYSTEM_PROMPT)
            append("<|im_end|>\n")
        }

        // Open the assistant turn; deliberately no trailing newline.
        append("<|im_start|>assistant")
    }
    /** Strip <think>…</think>, special tokens, and leading/trailing whitespace. */
    private fun cleanResponse(raw: String): String {
        var text = raw
        val thinkEnd = text.indexOf("</think>")
        if (thinkEnd >= 0) {
            text = text.substring(thinkEnd + "</think>".length)
-        } else {
+        } else if (text.indexOf("<think>") >= 0) {
-            val thinkStart = text.indexOf("<think>")
+            nlog("WARN: <think> block never closed")
            val assistantTag = text.indexOf("assistant")
            if (thinkStart >= 0) {
                nlog("WARN: <think> block never closed, no response generated")
            return ""
            } else if (assistantTag >= 0) {
                text = text.substring(assistantTag + "assistant".length)
            }
        }
        return text
            .replace("<|im_start|>", "")
@ -152,82 +245,9 @@ class ExecuTorchLlmEngine(
            .trim()
    }
    private fun deployRunnerScript() {
        val script = """
 #!/bin/sh
 cd $RUNNER_DIR
 export LD_LIBRARY_PATH=$RUNNER_DIR
 export ADSP_LIBRARY_PATH=$RUNNER_DIR
 TEMP=${'$'}1
 SEQ_LEN=${'$'}2
 PROMPT=${'$'}(base64 -d $RUNNER_DIR/outputs/prompt.b64)
 rm -f $RUNNER_DIR/outputs/response.txt
 SYSTEM_ARGS=""
 if [ -s $RUNNER_DIR/outputs/system.b64 ]; then
  SYSTEM=${'$'}(base64 -d $RUNNER_DIR/outputs/system.b64)
  SYSTEM_ARGS="--system_prompt"
 fi
 if [ -n "${'$'}SYSTEM_ARGS" ]; then
  exec ./qnn_llama_runner \
    --model_path hybrid_llama_qnn.pte \
    --tokenizer_path tokenizer.json \
    --decoder_model_version qwen3 \
    --output_path $RUNNER_DIR/outputs/response.txt \
    --performance_output_path $RUNNER_DIR/outputs/perf.txt \
    --shared_buffer \
    --system_prompt "${'$'}SYSTEM" \
    --prompt "${'$'}PROMPT" \
    --temperature ${'$'}TEMP \
    --seq_len ${'$'}SEQ_LEN \
    --eval_mode 0
 else
  exec ./qnn_llama_runner \
    --model_path hybrid_llama_qnn.pte \
    --tokenizer_path tokenizer.json \
    --decoder_model_version qwen3 \
    --output_path $RUNNER_DIR/outputs/response.txt \
    --performance_output_path $RUNNER_DIR/outputs/perf.txt \
    --shared_buffer \
    --prompt "${'$'}PROMPT" \
    --temperature ${'$'}TEMP \
    --seq_len ${'$'}SEQ_LEN \
    --eval_mode 0
 fi
 """.trimIndent()
        writeFileRoot("$RUNNER_DIR/run_llm.sh", script)
        execRoot("chmod 755 $RUNNER_DIR/run_llm.sh")
    }
    /**
     * Tears down the in-process LLM and marks the engine as unloaded.
     *
     * Teardown is best-effort: a failure inside `resetNative()` is never
     * propagated to the caller, but it is logged so a failed native reset
     * is not silently lost. The module reference and the `loaded` flag are
     * cleared in all cases, so [isLoaded] reports false afterwards.
     */
    override fun release() {
        try {
            llmModule?.resetNative()
        } catch (t: Throwable) {
            // Swallowing is intentional (release must not throw), but leave
            // a trace instead of discarding the failure entirely.
            Log.w(TAG, "resetNative failed during release: ${t.javaClass.simpleName}: ${t.message}")
        } finally {
            llmModule = null
            loaded = false
        }
    }
    private fun writeFileRoot(path: String, content: String) {
        try {
            val process = Runtime.getRuntime().exec(arrayOf("su", "-c", "cat > $path"))
            process.outputStream.bufferedWriter().use { it.write(content) }
            process.waitFor()
        } catch (e: Exception) {
            Log.e(TAG, "writeFileRoot failed: ${e.message}")
        }
    }
    private fun execRoot(cmd: String): String {
        return try {
            val process = Runtime.getRuntime().exec(arrayOf("su", "-c", cmd))
            val result = process.inputStream.bufferedReader().readText()
            val error = process.errorStream.bufferedReader().readText()
            process.waitFor()
            if (error.isNotEmpty() && result.isEmpty()) error else result
        } catch (e: Exception) {
            Log.e(TAG, "execRoot failed: ${e.message}")
            ""
        }
    }
 }
--- a/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaPipeline.kt
+++ b/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaPipeline.kt
@ -142,14 +142,36 @@ class KazeiaPipeline {
     * the echo-mode playback through the same path — otherwise each TTS
     * site reimplemented the "streaming-or-fallback" dispatch.
     */
-    suspend fun speakText(text: String) {
+    suspend fun speakText(
        text: String,
        // Fires the instant each synthesized sentence starts playing
        // through the speaker, with the sentence text, audio duration,
        // and a per-ENVELOPE_WINDOW_MS RMS envelope. Used by
        // processLlmResponse to defer the KAZEIA chat bubble appearance
        // until sound is audible, pace word-by-word reveal inside the
        // bubble, and drive the AudioVisualizerView orb.
        onSegmentPlaying: ((
            sentence: String,
            durationMs: Long,
            rmsEnvelope: FloatArray,
            spectrogram: Array<FloatArray>
        ) -> Unit)? = null
    ) {
        val ttsEngine = tts ?: return
        _pipelineState.value = PipelineState.Speaking
        try {
            val qwen = ttsEngine as? com.kazeia.tts.Qwen3TtsEngine
            if (qwen != null) {
                qwen.onSegmentPlaying = onSegmentPlaying
                qwen.startStreamingSession()
-                val streamer = com.kazeia.tts.SentenceStreamer { s -> qwen.enqueueSentence(s) }
+                val streamer = com.kazeia.tts.SentenceStreamer { raw ->
                    // Strip emoji / non-speakable pictographs before TTS
                    // so a standalone "😊" doesn't become its own noisy
                    // segment. The chat bubble keeps the original text —
                    // only the audio path sees the cleaned version.
                    val spoken = stripNonSpeakable(raw).trim()
                    if (spoken.isNotEmpty()) qwen.enqueueSentence(spoken)
                }
                streamer.append(text)
                streamer.flush()
                qwen.endStreamingSession()
@ -168,6 +190,41 @@ class KazeiaPipeline {
        _messages.value = _messages.value + msg
    }
    /**
     * Removes emoji, dingbats and other pictographic code points from
     * [text] so the TTS engine is never asked to synthesize them.
     *
     * The string is walked code point by code point (not UTF-16 unit by
     * UTF-16 unit), so supplementary-plane emoji stored as surrogate pairs
     * are dropped whole rather than leaving half a pair behind.
     *
     * Stripped ranges:
     *  - U+200D zero-width joiner and U+FE00..U+FE0F variation selectors,
     *    which glue emoji sequences together;
     *  - U+20E3 combining enclosing keycap, so a keycap sequence such as
     *    "1" + U+FE0F + U+20E3 collapses to the plain digit;
     *  - U+2300..U+23FF Miscellaneous Technical (watch, alarm clock,
     *    hourglass, media-control emoji);
     *  - U+2600..U+27BF Miscellaneous Symbols + Dingbats;
     *  - U+2B00..U+2BFF Miscellaneous Symbols and Arrows (star, arrows);
     *  - U+1F000..U+1FAFF, i.e. every block from Mahjong Tiles through
     *    Symbols and Pictographs Extended-A, including regional-indicator
     *    flags, emoticons, transport and skin-tone modifiers;
     *  - U+E0020..U+E007F tag characters used by subdivision flags.
     *
     * Everything in the Basic Latin / Latin-1 / Latin Extended ranges and
     * ordinary punctuation (including U+2026 "…") is left untouched.
     *
     * @param text text about to be handed to the TTS engine.
     * @return [text] with the code points above removed, remaining
     *   characters in their original order. Whitespace is not trimmed or
     *   collapsed here.
     */
    private fun stripNonSpeakable(text: String): String {
        val sb = StringBuilder(text.length)
        var i = 0
        while (i < text.length) {
            val cp = text.codePointAt(i)
            val skip = when {
                cp == 0x200D -> true                       // zero-width joiner
                cp == 0x20E3 -> true                       // combining enclosing keycap
                cp in 0x2300..0x23FF -> true              // misc technical (clocks, media controls)
                cp in 0x2600..0x27BF -> true              // misc symbols + dingbats
                cp in 0x2B00..0x2BFF -> true              // misc symbols and arrows (star, arrows)
                cp in 0xFE00..0xFE0F -> true              // variation selectors
                cp in 0x1F000..0x1FAFF -> true            // mahjong … pictographs extended-A (incl. flags)
                cp in 0xE0020..0xE007F -> true            // tag characters (subdivision flags)
                else -> false
            }
            if (!skip) sb.appendCodePoint(cp)
            // Advance by 1 or 2 UTF-16 units depending on the code point.
            i += Character.charCount(cp)
        }
        return sb.toString()
    }
    fun log(msg: String) {
        Log.i(TAG, msg)
        val time = java.text.SimpleDateFormat("HH:mm:ss.SSS", java.util.Locale.FRANCE)
--- a/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaService.kt
+++ b/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaService.kt
@ -83,6 +83,34 @@ class KazeiaService : Service() {
    private val _isListening = MutableStateFlow(false)
    val isListening: StateFlow<Boolean> = _isListening
    // Drives the AudioVisualizerView orb. Pushed from the VAD loop
    // during mic capture (mic RMS, normalized) and from the TTS engine's
    // onSegmentPlaying callback (TTS RMS envelope per-segment). The view
    // reads this via collectLatest in ChatActivity; the signals carry
    // their own state so the visualizer knows whether it's idle, tracking
    // the mic, or rendering a TTS segment.
    /**
     * State published to the visualizer through [visualizerSignal].
     *
     * Exactly one of three states is current at any time: idle, tracking
     * the microphone, or rendering one TTS segment.
     */
    sealed class VisualizerSignal {
        /** Nothing to render: neither mic tracking nor a TTS segment. */
        object Idle : VisualizerSignal()

        /**
         * Mic capture is active.
         *
         * @property micRms mic RMS level as pushed by the producer
         *   (described as normalized; exact range not visible here —
         *   TODO confirm).
         */
        data class Listening(val micRms: Float) : VisualizerSignal()

        /**
         * One TTS segment is being rendered.
         *
         * NOTE(review): the array properties take part in the generated
         * data-class `equals` by reference, not by content, so two
         * instances built from different arrays never compare equal even
         * if the samples match.
         *
         * @property rmsEnvelope RMS envelope of the segment.
         * @property spectrogram per-frame spectral data for the segment
         *   (frame/bin layout not visible here — TODO confirm).
         * @property durationMs duration of the segment in milliseconds
         *   (presumably the audio playback length — verify against the
         *   TTS engine callback).
         */
        data class Speaking(
            val rmsEnvelope: FloatArray,
            val spectrogram: Array<FloatArray>,
            val durationMs: Long
        ) : VisualizerSignal()
    }
    private val _visualizerSignal = MutableStateFlow<VisualizerSignal>(VisualizerSignal.Idle)
    val visualizerSignal: StateFlow<VisualizerSignal> = _visualizerSignal
    // Kazeia's orb color is bound to the selected voice so the user
    // visually associates a palette with the speaker they picked. UI
    // sets this whenever the voice spinner changes; the orb view
    // listens via the StateFlow and tweens the current → target color.
    private val _voiceColor = MutableStateFlow(0xFFBCA4E8.toInt())   // lavender = Damien default
    val voiceColor: StateFlow<Int> = _voiceColor
    /**
     * Publishes [color] as the current voice color.
     *
     * Intended to be invoked by the UI each time the voice selector
     * changes; the new value becomes visible to collectors of
     * [voiceColor].
     */
    fun setVoiceColor(color: Int) {
        _voiceColor.value = color
    }
    private val _debugMode = MutableStateFlow(false)
    val debugMode: StateFlow<Boolean> = _debugMode
@ -174,6 +202,12 @@ class KazeiaService : Service() {
                    if (!::llm.isInitialized || !llm.isLoaded()) {
                        log("Stream LLM: LLM not ready"); return@launch
                    }
                    // Set pipeline state to Speaking so the continuous-
                    // listening mic loop (line ~824) drops frames during
                    // TTS playback. Without this, the mic picks up the
                    // tablet speaker and feeds our own TTS back into STT,
                    // creating an infinite loop.
                    _pipelineState.value = PipelineState.Speaking
                    qwenTts.startStreamingSession()
                    val tStart = System.currentTimeMillis()
                    var firstSentenceLogged = false
@ -199,6 +233,9 @@ class KazeiaService : Service() {
                } catch (e: Exception) {
                    log("Stream LLM error: ${e.message}")
                    e.printStackTrace()
                } finally {
                    // Back to Idle so the next mic frame is accepted.
                    _pipelineState.value = PipelineState.Idle
                }
            }
        }
@ -414,9 +451,17 @@ class KazeiaService : Service() {
            this, Manifest.permission.RECORD_AUDIO
        ) == PackageManager.PERMISSION_GRANTED
        // FOREGROUND_SERVICE_TYPE_MEDIA_PLAYBACK is required so ColorOS (and
        // stock Android 14+ policies) don't mute the TTS AudioTrack with
        // "clientVolume" at ~600 ms after play(). Without it the FGS was
        // classified as mic-only or special-use and background-audio
        // hardening silenced it. Combine with MICROPHONE so mic input keeps
        // working during STT.
        val fgsType = if (hasMicPermission) {
-            ServiceInfo.FOREGROUND_SERVICE_TYPE_MICROPHONE
+            ServiceInfo.FOREGROUND_SERVICE_TYPE_MICROPHONE or
                ServiceInfo.FOREGROUND_SERVICE_TYPE_MEDIA_PLAYBACK
        } else {
            ServiceInfo.FOREGROUND_SERVICE_TYPE_MEDIA_PLAYBACK or
                ServiceInfo.FOREGROUND_SERVICE_TYPE_SPECIAL_USE
        }
@ -450,7 +495,7 @@ class KazeiaService : Service() {
                // TTS: try Qwen3-TTS (NPU Hexagon), fallback to Android TTS
                _loadingState.value = LoadingState(15, "TTS Qwen3…")
                try {
-                    val qwenTts = com.kazeia.tts.Qwen3TtsEngine(nativeLibDir) { msg -> log("[TTS] $msg") }
+                    val qwenTts = com.kazeia.tts.Qwen3TtsEngine(nativeLibDir, this@KazeiaService) { msg -> log("[TTS] $msg") }
                    qwenTts.load("$modelsDir/qwen3-tts-npu")
                    if (qwenTts.isLoaded()) {
                        tts = qwenTts
@ -518,7 +563,7 @@ class KazeiaService : Service() {
                // LLM: Qwen3-4B on Hexagon V79 via qnn_llama_runner.
                _loadingState.value = LoadingState(50, "LLM Qwen3-4B NPU…")
-                llm = ExecuTorchLlmEngine { msg -> log(msg) }
+                llm = ExecuTorchLlmEngine(this@KazeiaService) { msg -> log(msg) }
                try {
                    llm.load("${KazeiaApplication.MODELS_DIR}/qwen3-4b", com.kazeia.core.LlmConfig())
                } catch (e: Exception) {
@ -583,6 +628,16 @@ class KazeiaService : Service() {
        if (chatterbox != null) {
            chatterbox.setVoice(voicePath)
            log("Voice set to: $voicePath")
            return
        }
        val qwen = tts as? com.kazeia.tts.Qwen3TtsEngine
        if (qwen != null) {
            // Hot-swap prefix/suffix embeddings — no model reload. Takes
            // effect from the NEXT synthesized segment (current in-flight
            // one, if any, finishes with the old voice since the arrays
            // are already in its closure).
            qwen.setVoice(voicePath)
            log("Voice set to: $voicePath")
        }
    }
@ -835,6 +890,14 @@ class KazeiaService : Service() {
                    for (s in frame) sumSq += s.toLong() * s.toLong()
                    val rms = Math.sqrt(sumSq.toDouble() / frameSize).toInt()
                    // Drive the visualizer orb. Normalize with the same
                    // sqrt squashing used for TTS so loud peaks don't
                    // saturate and quiet speech is still visible. The
                    // visualizer stays in Listening mode; it will swap
                    // to Speaking or Idle when pipelineState moves on.
                    val rmsNorm = kotlin.math.sqrt((rms / 6000f).coerceIn(0f, 1f))
                    _visualizerSignal.value = VisualizerSignal.Listening(rmsNorm)
                    // Log RMS every second for calibration
                    if (frameCount % 10 == 0) {
                        Log.d(TAG, "VAD RMS=$rms (threshold=$silenceThreshold)")
@ -1184,12 +1247,99 @@ class KazeiaService : Service() {
            log("LLM stats: ${result.tokenCount} tokens in ${result.timeMs}ms (${result.tokensPerSecond} tok/s)")
            if (responseText.isNotEmpty()) {
-                addMessage(ChatMessage(role = ChatMessage.Role.KAZEIA, text = responseText))
+                // Mark the pipeline as Speaking for the duration of TTS so
-                pipeline.speakText(responseText)
+                // the continuous-listening mic loop drops frames and we
                // don't feed our own speaker output back into STT.
                _pipelineState.value = PipelineState.Speaking
                // Create a KAZEIA bubble up-front. Until the first TTS
                // segment actually starts playing the bubble shows an
                // animated "." → ".." → "..." typing indicator so the
                // user knows Kazeia is thinking/synthesising; once the
                // first segment plays the dots are cleared and the
                // per-sentence word reveal takes over.
                val bubble = ChatMessage(role = ChatMessage.Role.KAZEIA, text = ".")
                addMessage(bubble)
                val revealScope = kotlinx.coroutines.CoroutineScope(kotlinx.coroutines.Dispatchers.Default)
                var revealedSoFar = ""
                val revealJobs = mutableListOf<kotlinx.coroutines.Job>()
                val firstSegmentSeen = java.util.concurrent.atomic.AtomicBoolean(false)
                val typingJob = revealScope.launch {
                    var tick = 0
                    while (!firstSegmentSeen.get()) {
                        val dots = ".".repeat(1 + (tick % 3))   // . → .. → ...
                        updateMessageText(bubble.id, dots)
                        tick++
                        kotlinx.coroutines.delay(400)
                    }
-
+                }
                try {
                    pipeline.speakText(responseText) { sentence, durationMs, envelope, spectrogram ->
                        // First segment: stop the typing indicator and
                        // reset the bubble to empty so the word reveal
                        // doesn't collide with the dots.
                        if (firstSegmentSeen.compareAndSet(false, true)) {
                            try { typingJob.cancel() } catch (_: Exception) {}
                            updateMessageText(bubble.id, "")
                        }
                        // Push the envelope + spectrogram to the
                        // visualizer at the same moment the MediaPlayer
                        // starts playing so the orb reacts to this
                        // segment's actual energy and the in-sphere
                        // spectrum bars match the audio content.
                        _visualizerSignal.value =
                            VisualizerSignal.Speaking(envelope, spectrogram, durationMs)
                        // Start a coroutine that appends one word at a time
                        // over the segment's audio duration. Words are
                        // separated on whitespace; punctuation rides with
                        // the trailing word. The prefix (= text already
                        // revealed from previous sentences) carries over so
                        // earlier sentences stay on screen.
                        val prefix = revealedSoFar
                        val words = sentence.split(Regex("\\s+")).filter { it.isNotBlank() }
                        revealedSoFar =
                            if (prefix.isEmpty()) sentence
                            else "$prefix $sentence"
                        if (words.isEmpty()) return@speakText
                        val perWordMs = (durationMs / words.size).coerceAtLeast(40L)
                        val job = revealScope.launch {
                            val sb = StringBuilder(prefix)
                            if (prefix.isNotEmpty()) sb.append(' ')
                            // Immediately reveal the first word so there's
                            // no visible gap between audio start and text.
                            sb.append(words[0])
                            updateMessageText(bubble.id, sb.toString())
                            for (i in 1 until words.size) {
                                kotlinx.coroutines.delay(perWordMs)
                                sb.append(' ').append(words[i])
                                updateMessageText(bubble.id, sb.toString())
                            }
                        }
                        revealJobs.add(job)
                    }
                    // After all segments finished playing, ensure the full
                    // text is visible even if a reveal job was racing.
                    revealJobs.forEach { try { it.join() } catch (_: Exception) {} }
                    updateMessageText(bubble.id, responseText)
                } finally {
                    // Defensive: cancel the typing dots in case no
                    // segment ever fired (e.g. the response was entirely
                    // emojis and got stripped empty).
                    firstSegmentSeen.set(true)
                    try { typingJob.cancel() } catch (_: Exception) {}
                    _pipelineState.value = if (_isListening.value)
                        PipelineState.Listening else PipelineState.Idle
                    // If we're going back to mic listening, the VAD loop
                    // will keep pushing Listening signals; otherwise drop
                    // to Idle so the orb settles back to its breathing
                    // baseline.
                    if (!_isListening.value) {
                        _visualizerSignal.value = VisualizerSignal.Idle
                    }
                }
            } else {
                _pipelineState.value = if (_isListening.value)
                    PipelineState.Listening else PipelineState.Idle
            }
        } catch (e: Exception) {
            _aiWorkload.value = _aiWorkload.value.copy(llmActive = false)
@ -1207,6 +1357,19 @@ class KazeiaService : Service() {
        _messages.value = _messages.value + message
    }
    /** Replace the text of an existing message (identified by id) in the
     *  message list. Used by the progressive-reveal flow to grow a
     *  KAZEIA message word-by-word as TTS audio plays.
     *
     *  This is called concurrently from the typing-dots coroutine, the
     *  per-sentence reveal coroutines and the TTS segment callback, so the
     *  list is swapped with a compare-and-set retry loop: if another
     *  writer replaced the list between our read and our write, we re-read
     *  and re-apply instead of overwriting (and silently dropping) that
     *  writer's change.
     *
     *  @param id      id of the message to update; if no message carries
     *                 this id the call is a no-op.
     *  @param newText text that replaces the message's current text. */
    private fun updateMessageText(id: Long, newText: String) {
        while (true) {
            val snapshot = _messages.value
            // indexOfLast: the message being revealed is normally the most
            // recent one, so search from the end.
            val idx = snapshot.indexOfLast { it.id == id }
            if (idx < 0) return
            val updated = snapshot.toMutableList().also {
                it[idx] = snapshot[idx].copy(text = newText)
            }
            // Succeeds only if nobody changed the list since `snapshot`
            // was read; otherwise loop and retry against the fresh list.
            if (_messages.compareAndSet(snapshot, updated)) return
        }
    }
    private fun createNotification(): Notification {
        val intent = Intent(this, ChatActivity::class.java)
        val pendingIntent = PendingIntent.getActivity(
--- a/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt
+++ b/kazeia-android/app/src/main/java/com/kazeia/tts/Qwen3TtsEngine.kt
@ -37,6 +37,7 @@ import kotlin.coroutines.resume
 */
 class Qwen3TtsEngine(
    private val nativeLibDir: String,
    private val context: android.content.Context? = null,
    private val onLog: ((String) -> Unit)? = null
 ) : TtsEngine {
@ -88,6 +89,38 @@ class Qwen3TtsEngine(
        private const val TOKEN_USER = 872
        private const val TOKEN_ASSISTANT = 1042
        private const val TOKEN_NEWLINE = 198
        // Streaming decode: when true, BigVGAN dispatches a chunk's audio as
        // soon as SEQ_LEN codes are ready from the talker/CP loop rather than
        // waiting for all tokens. For long segments this overlaps the final
        // BigVGAN passes with ongoing talker/CP work on Hexagon, cutting the
        // first-audio latency by ~4 s. Short segments (<SEQ_LEN codes) fall
        // back to the single-chunk path with zero difference. Flag exists so
        // the sequential path can be re-enabled for A/B comparison.
        private const val USE_STREAMING_DECODE = true
        // ColorOS Audio Hardening silently mutes AudioTrack in background/FGS
        // context (confirmed via `event:muted updated source:clientVolume`
        // logs, same behaviour across USAGE_MEDIA, USAGE_ASSISTANT, and
        // USAGE_VOICE_COMMUNICATION). When this flag is true, each
        // generated segment is written as a WAV to app-owned shared
        // storage and played via MediaPlayer instead. Slightly slower
        // (WAV write + MediaPlayer prepare add ~150 ms per segment) but
        // it's the only reliable path to audible output on this device.
        private const val USE_MEDIAPLAYER_FALLBACK = true
        // Window size for the TTS→visualizer RMS sidecar. 50 ms at 24 kHz
        // = 1200 samples/window — small enough for a 60 fps visualizer to
        // track formants, large enough to run at negligible CPU cost.
        const val ENVELOPE_WINDOW_MS = 50
        // FFT size for the spectrum-in-sphere sidecar. 1024 samples at
        // 24 kHz = 43 ms — slightly narrower than the hop so each frame
        // gives a clean snapshot centered on its hop boundary.
        private const val FFT_SIZE = 1024
        // Number of log-spaced bands 120 Hz–4 kHz rendered as vertical
        // bars inside the sphere during Speaking. 12 feels like a real
        // spectrometer without cluttering at smaller sphere sizes.
        const val SPECTRUM_BANDS = 12
    }
    private var ortEnv: OrtEnvironment? = null
@ -243,7 +276,12 @@ class Qwen3TtsEngine(
                    return session
                }
-                // Speech decoder V2 on CPU (HTP tested: BigVGAN convolutions too slow to compile)
+                // Speech decoder V2 on CPU. Two paths tried, both worse than CPU:
                //   - HTP: BigVGAN convolutions too slow to compile (timeout)
                //   - GPU Adreno via QNN GPU EP: model loads but per-phrase
                //     inference is ~3.5 s vs ~2 s on CPU (GPU/CPU memory transfer
                //     overhead dominates for this conv-heavy model)
                // CPU 8-thread stays the practical optimum.
                val v2Path = "$path/v2_pre_conv"
                if (File("$v2Path/model.onnx").exists()) {
                    nlog("Loading V2 speech decoder (CPU ONNX)...")
@ -570,8 +608,53 @@ class Qwen3TtsEngine(
    override fun isLoaded(): Boolean = loaded
    /**
     * Hot-swap the speaker prefix/suffix embeddings used for voice
     * conditioning.
     *
     * [voicePath] is a WAV path like `/…/voix/elodie.wav`. The voice id is
     * the lower-cased basename without extension; the matching
     * `<id>_voice_prefix.bin` and `<id>_voice_suffix.bin` are looked up in
     * the model directory. When both exist and parse cleanly they replace
     * [damienVoicePrefix] / [damienVoiceSuffix]. When either file is
     * missing or malformed, a message is logged and the current voice is
     * kept (nothing is partially applied: both files are parsed before
     * either field is assigned).
     *
     * File layout (little-endian): `int32 count`, `int32 dim`, then
     * `count * dim` float32 values. `dim` must equal [TALKER_DIM], and the
     * declared count must fit in the file. The size check matters because
     * a corrupt header would otherwise drive a huge allocation and raise
     * OutOfMemoryError, which the `catch (Exception)` below does not catch.
     *
     * NOTE(review): the model directory is hard-coded here rather than
     * taken from the path passed to load(); confirm they always agree.
     *
     * NOTE(review): thread-safety depends on how the two fields are
     * declared and read by the synth worker, which is not visible from
     * this function. The two assignments are separate writes, so a reader
     * running exactly between them could observe the new prefix with the
     * old suffix — confirm the reader snapshots both at a segment boundary.
     */
    fun setVoice(voicePath: String) {
        val modelDir = "/data/local/tmp/kazeia/models/qwen3-tts-npu"
        val id = java.io.File(voicePath).nameWithoutExtension.lowercase()
        val prefixFile = java.io.File("$modelDir/${id}_voice_prefix.bin")
        val suffixFile = java.io.File("$modelDir/${id}_voice_suffix.bin")
        if (!prefixFile.exists() || !suffixFile.exists()) {
            nlog("Voice '$id' not available (missing ${prefixFile.name} or ${suffixFile.name}); keeping current voice. " +
                "Run scripts/prepare_tts_native.py with this WAV to generate the files.")
            return
        }

        // Parses one embedding file; throws IllegalStateException on any
        // structural problem so the caller can keep the current voice.
        fun readEmbeds(file: java.io.File, label: String): Array<FloatArray> {
            val bytes = file.readBytes()
            if (bytes.size < 8) {
                throw IllegalStateException("$label file too short (${bytes.size} bytes)")
            }
            val buf = java.nio.ByteBuffer.wrap(bytes).order(java.nio.ByteOrder.LITTLE_ENDIAN)
            val count = buf.int
            val dim = buf.int
            if (dim != TALKER_DIM) throw IllegalStateException("$label dim $dim != $TALKER_DIM")
            // Validate before allocating: header (8 bytes) + count rows of
            // TALKER_DIM float32 values must fit in what was actually read.
            val needed = 8L + count.toLong() * TALKER_DIM * 4L
            if (count < 0 || needed > bytes.size) {
                throw IllegalStateException("$label count $count does not fit in ${bytes.size} bytes")
            }
            // The float view starts at the current position (just past the
            // header) and inherits the little-endian byte order.
            val floats = buf.asFloatBuffer()
            return Array(count) { FloatArray(TALKER_DIM).also { row -> floats.get(row) } }
        }

        try {
            val newPrefix = readEmbeds(prefixFile, "prefix")
            val newSuffix = readEmbeds(suffixFile, "suffix")
            damienVoicePrefix = newPrefix
            damienVoiceSuffix = newSuffix
            nlog("Voice switched to '$id' (${newPrefix.size} prefix + ${newSuffix.size} suffix embeds)")
        } catch (e: Exception) {
            nlog("Voice swap failed for '$id': ${e.message}")
        }
    }
    override suspend fun synthesize(text: String, language: String): TtsResult {
@ -2669,7 +2752,11 @@ class Qwen3TtsEngine(
    /** PTE pipeline from pre-computed embeddings (prefill + trailing). */
    private fun runInterleavedPteFromEmbeds(
-        prefillEmbeds: List<FloatArray>, trailingEmbeds: List<FloatArray>, maxGenTokens: Int
+        prefillEmbeds: List<FloatArray>, trailingEmbeds: List<FloatArray>, maxGenTokens: Int,
        // Invoked synchronously after each generated step with (stepIdx, 16-codebook codes).
        // Streaming callers use it to dispatch SEQ_LEN-sized chunks to the BigVGAN pipeline
        // as soon as they are ready. null preserves the original batch behaviour.
        onCodeStep: ((step: Int, codes: IntArray) -> Unit)? = null
    ): Array<IntArray> {
        val talkerMod = talkerPteModule ?: return emptyArray()
        val cpMod = cpPteModule ?: return emptyArray()
@ -2747,6 +2834,7 @@ class Qwen3TtsEngine(
            totalCpMs += System.currentTimeMillis() - tCp0
            for (cb in 1 until NUM_CODEBOOKS) codes[cb] = cpCodes[cb - 1]
            allCodes.add(codes); generatedCb0.add(currentCb0)
            onCodeStep?.invoke(genStep, codes)
            if (genStep < 3) nlog("Step ${genStep+1}: cb0=$currentCb0 cb1=${codes[1]}")
@ -3316,6 +3404,18 @@ class Qwen3TtsEngine(
    private var sessionTrack: AudioTrack? = null
    private var sessionChannel: kotlinx.coroutines.channels.Channel<String>? = null
    private var sessionJob: kotlinx.coroutines.Job? = null
    private var sessionKeepAliveJob: kotlinx.coroutines.Job? = null
    private var sessionFocusRequest: android.media.AudioFocusRequest? = null
    // Total PCM frames queued to sessionTrack across all segments in this session.
    // endStreamingSession() polls track.playbackHeadPosition until it reaches this
    // count before calling stop(), so the tail sentence isn't clipped.
    // Uses AtomicLong because both the session worker and the keep-alive watchdog
    // call writeAndCount concurrently.
    private val sessionFramesWritten = java.util.concurrent.atomic.AtomicLong(0)
    // True while a real-audio generate call is in progress. The keep-alive
    // watchdog skips silence injection while this is set, so silence never
    // interleaves with speech inside a segment.
    private val sessionGenActive = java.util.concurrent.atomic.AtomicBoolean(false)
    /**
     * Open a streaming TTS session backed by a persistent AudioTrack. After
@ -3324,13 +3424,403 @@ class Qwen3TtsEngine(
     * track as soon as it's decoded. Call endStreamingSession() to flush
     * the queue and release the track.
     */
-    fun startStreamingSession() {
+    // MediaPlayer-based fallback session state. If ColorOS mutes our
-        if (sessionTrack != null) return   // already open
+    // AudioTrack (as observed repeatedly — `event:muted updated source:
-        val track = AudioTrack.Builder()
+    // clientVolume` right after play()), we instead render each segment
-            .setAudioAttributes(AudioAttributes.Builder()
+    // as a WAV file on shared storage and play it back via MediaPlayer,
-                .setUsage(AudioAttributes.USAGE_MEDIA)
+    // which uses a completely different internal audio pipeline that
-                .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
+    // doesn't get silenced by the background playback policy.
    private var sessionMpQueue: kotlinx.coroutines.channels.Channel<String>? = null
    private var sessionMpJob: kotlinx.coroutines.Job? = null
    private val sessionMpSegIdx = java.util.concurrent.atomic.AtomicInteger(0)
    /**
     * Fires the moment a synthesized segment starts playing through the
     * speaker. Carries the sentence text, audio duration, per-window RMS
     * envelope (for orb amplitude) and per-window log-spaced band
     * spectrogram (for the spectrum-in-sphere visualizer). All three
     * share the same time axis — one entry per [ENVELOPE_WINDOW_MS].
     */
    var onSegmentPlaying: ((
        sentence: String,
        durationMs: Long,
        rmsEnvelope: FloatArray,
        spectrogram: Array<FloatArray>
    ) -> Unit)? = null
    /**
     * Opens the MediaPlayer-backed streaming session. No-op if one is
     * already open (sessionMpQueue is non-null).
     *
     * Two workers are launched on Dispatchers.IO:
     *  - a synth worker that turns each queued sentence into a WAV file
     *    plus its RMS envelope and spectrogram, and
     *  - a playback worker ([playChainedMediaPlayers]) that plays those
     *    WAVs in order.
     * sessionMpQueue receives the sentence channel and sessionMpJob a job
     * that completes once both workers have finished.
     */
    private fun startStreamingSessionMp() {
        if (sessionMpQueue != null) return
        sessionMpSegIdx.set(0)
        // Unbounded so callers enqueuing sentences never suspend.
        val sentenceChan = kotlinx.coroutines.channels.Channel<String>(
            capacity = kotlinx.coroutines.channels.Channel.UNLIMITED
        )
        // Pipeline: synth worker produces WAV paths, playback worker runs
        // them through MediaPlayer instances chained via
        // setNextMediaPlayer() so there's zero-gap transition between
        // segments (no DAC/output routing "pop" the user was hearing as
        // "beg beg" with one player-per-seg). The hand-off channel is
        // buffered with capacity 2: the synth worker can run a couple of
        // segments ahead of playback, then suspends on send(), which
        // bounds the number of WAV files waiting on disk.
        // Carry (segIdx, wavPath, sentence, durationMs) together so the
        // playback worker can invoke onSegmentPlaying with the matching
        // text and audio length when the segment actually starts playing.
        val wavChan = kotlinx.coroutines.channels.Channel<SegmentReady>(capacity = 2)
        val scope = kotlinx.coroutines.CoroutineScope(kotlinx.coroutines.Dispatchers.IO)
        val synthJob = scope.launch {
            for (sentence in sentenceChan) {
                try {
                    val segIdx = sessionMpSegIdx.getAndIncrement()
                    val tSynth = System.currentTimeMillis()
                    val audio = generateSegmentAudioVC(sentence, segIdx)
                    // Nothing synthesized for this sentence: skip it (the
                    // segment index is still consumed).
                    if (audio.isEmpty()) continue
                    // Prefer the app cache dir; fall back to the fixed
                    // path when no Context was supplied.
                    val wavPath = "${context?.cacheDir?.absolutePath ?: "/data/local/tmp/kazeia"}/tts_seg_${segIdx}.wav"
                    saveWav(wavPath, audio)
                    val durationMs = audio.size * 1000L / SR
                    val envelope = computeRmsEnvelope(audio)
                    val spectrogram = computeSpectrogram(audio)
                    nlog("MP seg $segIdx synthesized (${System.currentTimeMillis() - tSynth}ms, ${durationMs}ms audio, ${envelope.size} env × ${SPECTRUM_BANDS} bands), queued for playback")
                    wavChan.send(SegmentReady(segIdx, wavPath, sentence, durationMs, envelope, spectrogram))
                } catch (e: Exception) {
                    // Best-effort: a failed sentence is logged and the
                    // worker moves on to the next one.
                    nlog("MP synth error: ${e.message}")
                }
            }
            // Sentence channel closed and drained: signal end-of-stream
            // to the playback worker.
            wavChan.close()
        }
        val playJob = scope.launch { playChainedMediaPlayers(wavChan) }
        // Single handle that completes when synthesis AND playback are done.
        val combined = scope.launch { synthJob.join(); playJob.join() }
        sessionMpQueue = sentenceChan; sessionMpJob = combined
        nlog("streaming session opened (MediaPlayer fallback, chained)")
    }
    /**
     * Drive the WAV playback pipeline with MediaPlayer instances chained
     * via setNextMediaPlayer() so each segment flows into the next without
     * re-arming the audio output (which caused audible "pops" between
     * segments when one player stopped and another started).
     *
     * Consumes [SegmentReady] items from [wavChan] in order. For each one
     * it prepares a player, chains it behind the currently playing one
     * when that is still possible, waits for the current one to finish,
     * and starts the new one manually if the chain did not fire.
     * [onSegmentPlaying] is invoked for every segment that actually
     * starts. Each WAV file is deleted once its segment has finished or
     * been skipped. Suspends until the channel is closed AND the final
     * segment has finished.
     *
     * Failure handling:
     *  - a segment whose prepare or start fails is logged and skipped; the
     *    chain carries on with the next segment instead of aborting or
     *    waiting forever on a completion that will never arrive;
     *  - coroutine cancellation is propagated, not swallowed;
     *  - on exit the channel is cancelled so a producer suspended in
     *    send() is released, and any player still held is released.
     */
    private suspend fun playChainedMediaPlayers(
        wavChan: kotlinx.coroutines.channels.ReceiveChannel<SegmentReady>
    ) {
        val attrs = android.media.AudioAttributes.Builder()
            .setUsage(android.media.AudioAttributes.USAGE_MEDIA)
            .setContentType(android.media.AudioAttributes.CONTENT_TYPE_SPEECH)
            .build()

        // Best-effort removal of a segment's temporary WAV file.
        fun deleteWav(info: SegmentReady) {
            try { java.io.File(info.wavPath).delete() } catch (_: Exception) {}
        }

        // Create a MediaPlayer for [path] and suspend until it is prepared.
        // Throws (and releases the player) if configuration or preparation
        // fails or the coroutine is cancelled, so callers never receive an
        // unprepared player.
        suspend fun prepareMp(path: String, segIdx: Int): android.media.MediaPlayer {
            val mp = android.media.MediaPlayer()
            try {
                mp.setAudioAttributes(attrs)
                mp.setDataSource(path)
                kotlinx.coroutines.suspendCancellableCoroutine<Unit> { cont ->
                    mp.setOnPreparedListener { if (cont.isActive) cont.resume(Unit) {} }
                    mp.setOnErrorListener { _, what, extra ->
                        nlog("MP seg $segIdx prepare error: what=$what extra=$extra")
                        if (cont.isActive) {
                            cont.resumeWith(Result.failure(
                                java.io.IOException("MediaPlayer prepare failed: what=$what extra=$extra")
                            ))
                        }
                        true
                    }
                    mp.prepareAsync()
                }
            } catch (e: Throwable) {
                try { mp.release() } catch (_: Exception) {}
                throw e
            }
            return mp
        }

        // Per-player book-keeping. `done` completes the moment the
        // MediaPlayer's OnCompletionListener (or OnErrorListener) fires,
        // so the loop can tell *before* calling setNextMediaPlayer whether
        // the chain will actually trigger (setNextMediaPlayer on a player
        // already in the Completed state is a silent no-op — that was the
        // root cause of missing audio on seg 1 when synthesis ran longer
        // than seg 0's playback).
        class Live(
            val mp: android.media.MediaPlayer,
            val info: SegmentReady,
            val done: kotlinx.coroutines.CompletableDeferred<Unit>
        )

        // Attach the playback listeners. Must be called BEFORE the player
        // can start (explicitly or via the chain); otherwise a completion
        // that fires first would be missed and `done` would never complete.
        fun arm(info: SegmentReady, mp: android.media.MediaPlayer): Live {
            val done = kotlinx.coroutines.CompletableDeferred<Unit>()
            mp.setOnCompletionListener {
                try { it.release() } catch (_: Exception) {}
                if (!done.isCompleted) done.complete(Unit)
            }
            mp.setOnErrorListener { _, what, extra ->
                nlog("MP seg ${info.segIdx} play error: what=$what extra=$extra")
                if (!done.isCompleted) done.complete(Unit)
                true
            }
            return Live(mp, info, done)
        }

        var current: Live? = null   // segment currently playing
        var staged: Live? = null    // prepared + armed, not yet promoted
        try {
            for (seg in wavChan) {
                val mp = try {
                    prepareMp(seg.wavPath, seg.segIdx)
                } catch (e: kotlinx.coroutines.CancellationException) {
                    throw e
                } catch (e: Exception) {
                    nlog("MP seg ${seg.segIdx} skipped, prepare failed: ${e.message}")
                    deleteWav(seg)
                    continue
                }
                val next = arm(seg, mp)
                staged = next

                // Try to chain so Android auto-starts next when current
                // finishes — gives zero-gap playback without re-arming
                // the DAC. Skipped if current has already completed
                // (setNext on Completed is a no-op); we fall back to an
                // explicit start() below in that case.
                val prev = current
                var chained = false
                if (prev != null) {
                    try {
                        if (!prev.done.isCompleted) {
                            prev.mp.setNextMediaPlayer(mp)
                            chained = true
                        }
                    } catch (e: Exception) {
                        nlog("MP seg ${seg.segIdx} setNext failed: ${e.message}")
                    }
                    // Wait for current playback to finish before rotating.
                    prev.done.await()
                    // The completion listener releases the player, the
                    // error listener does not: release here so neither
                    // path leaks (a second release is harmless).
                    try { prev.mp.release() } catch (_: Exception) {}
                    deleteWav(prev.info)
                    current = null
                }

                // Did the chain already start (or even finish) `next`?
                val autoStarted = when {
                    next.done.isCompleted -> true
                    !chained -> false
                    else -> try {
                        mp.isPlaying || mp.currentPosition > 0
                    } catch (_: Exception) { false }
                }
                var playing = autoStarted
                if (!autoStarted) {
                    try {
                        mp.start()
                        playing = true
                    } catch (e: Exception) {
                        nlog("MP seg ${seg.segIdx} manual start failed: ${e.message}")
                    }
                }
                if (!playing) {
                    // This player will never fire a completion callback;
                    // drop the segment so the next iteration cannot block
                    // on its `done`.
                    try { mp.release() } catch (_: Exception) {}
                    next.done.complete(Unit)
                    deleteWav(seg)
                    staged = null
                    continue
                }
                when {
                    prev == null -> nlog("MP seg ${seg.segIdx} started (${seg.durationMs}ms)")
                    autoStarted -> nlog("MP seg ${seg.segIdx} auto-chained")
                    else -> nlog("MP seg ${seg.segIdx} started manually (chain missed)")
                }
                current = next
                staged = null
                try { onSegmentPlaying?.invoke(seg.sentence, seg.durationMs, seg.rmsEnvelope, seg.spectrogram) } catch (_: Exception) {}
            }
            // Drain: wait for the last player to finish.
            current?.done?.await()
        } catch (e: kotlinx.coroutines.CancellationException) {
            throw e
        } catch (e: Exception) {
            nlog("MP playback chain error: ${e.message}")
        } finally {
            // Release whatever is still held and remove its temp file.
            for (live in listOfNotNull(current, staged)) {
                try { live.mp.release() } catch (_: Exception) {}
                deleteWav(live.info)
            }
            // Unblock a producer that may be suspended in send() if we are
            // leaving early; harmless when the channel is already closed.
            try { wavChan.cancel() } catch (_: Exception) {}
        }
    }
    /** Payload handed from the synth worker to the playback worker so
     *  the UI can be notified with matching text + duration when each
     *  segment starts playing.
     *
     *  - [segIdx]: index of the segment within the session.
     *  - [wavPath]: path of the WAV file holding the segment's audio.
     *  - [sentence]: the text that was synthesized.
     *  - [durationMs]: audio length in milliseconds.
     *  - [rmsEnvelope]: sidecar array of per-ENVELOPE_WINDOW_MS RMS values
     *    normalized to [0, 1] that drives the audio-reactive orb
     *    visualizer without having to read PCM back from MediaPlayer.
     *  - [spectrogram]: per-window band magnitudes for the spectrum
     *    visualizer, on the same time axis as [rmsEnvelope].
     *
     *  All fields are required. Because two properties are arrays, the
     *  generated equals()/hashCode() compare them by reference. */
    private data class SegmentReady(
        val segIdx: Int,
        val wavPath: String,
        val sentence: String,
        val durationMs: Long,
        val rmsEnvelope: FloatArray,
        val spectrogram: Array<FloatArray>
    )
    /**
     * Builds the normalized RMS envelope of a mono 16-bit PCM buffer
     * sampled at [SR].
     *
     * The buffer is cut into consecutive windows of ENVELOPE_WINDOW_MS
     * (the last one may be shorter). Each output entry is
     * `sqrt(clamp(rms / 32767, 0, 1))`: 32767 is full scale, and the outer
     * square root lifts quiet speech so it stays visible without letting
     * loud peaks saturate.
     *
     * @param audio PCM samples; an empty buffer yields an empty envelope.
     * @return one value in [0, 1] per window.
     */
    private fun computeRmsEnvelope(audio: ShortArray): FloatArray {
        if (audio.isEmpty()) return FloatArray(0)
        val hop = SR * ENVELOPE_WINDOW_MS / 1000
        // Ceiling division so a trailing partial window is kept.
        val windowCount = (audio.size + hop - 1) / hop
        return FloatArray(windowCount) { w ->
            val from = w * hop
            val to = minOf(from + hop, audio.size)
            var energy = 0.0
            for (i in from until to) {
                val sample = audio[i].toDouble()
                energy += sample * sample
            }
            val rms = kotlin.math.sqrt(energy / (to - from))
            kotlin.math.sqrt((rms / 32767.0).coerceIn(0.0, 1.0)).toFloat()
        }
    }
    /**
     * Build the band spectrogram that feeds the spectrum-in-sphere
     * visualizer.
     *
     * The time axis matches the RMS envelope: one row per
     * ENVELOPE_WINDOW_MS hop. Each row is obtained from an FFT_SIZE-point
     * FFT of a Hann-windowed frame centred on the hop midpoint (samples
     * outside the buffer are treated as zero), reduced to
     * [SPECTRUM_BANDS] log-spaced bands between 120 Hz and 4 kHz — the
     * vocal formant range, leaving out sub-100 Hz and fricative >4 kHz
     * content.
     *
     * @param audio mono 16-bit PCM samples at [SR].
     * @return one FloatArray of [SPECTRUM_BANDS] values in [0, 1] per hop,
     *         or an empty array when [audio] is empty.
     */
    private fun computeSpectrogram(audio: ShortArray): Array<FloatArray> {
        if (audio.isEmpty()) return emptyArray()
        val n = FFT_SIZE
        val hop = SR * ENVELOPE_WINDOW_MS / 1000
        val frameCount = (audio.size + hop - 1) / hop

        // Band boundaries expressed as FFT bin indices.
        val hzPerBin = SR.toDouble() / n
        val lowHz = 120.0
        val highHz = 4000.0
        val edges = IntArray(SPECTRUM_BANDS + 1) { i ->
            val hz = lowHz * Math.pow(highHz / lowHz, i.toDouble() / SPECTRUM_BANDS)
            (hz / hzPerBin).toInt().coerceIn(1, n / 2 - 1)
        }

        // Hann window: less spectral leakage, cleaner bars.
        val window = FloatArray(n) { i ->
            (0.5 - 0.5 * Math.cos(2.0 * Math.PI * i / (n - 1))).toFloat()
        }

        // Scratch buffers reused for every frame.
        val real = FloatArray(n)
        val imag = FloatArray(n)
        val logBase = Math.log10(7.0)

        return Array(frameCount) { frame ->
            // First sample of a window centred on the hop midpoint.
            val firstSample = frame * hop + hop / 2 - n / 2
            for (i in 0 until n) {
                val src = firstSample + i
                val sample = if (src in audio.indices) audio[src].toFloat() / 32768f else 0f
                real[i] = sample * window[i]
                imag[i] = 0f
            }
            fftInPlace(real, imag)

            FloatArray(SPECTRUM_BANDS) { band ->
                val lo = edges[band]
                // Guarantee at least one bin per band.
                val hi = edges[band + 1].coerceAtLeast(lo + 1)
                var power = 0.0
                for (k in lo until hi) {
                    val a = real[k].toDouble()
                    val b = imag[k].toDouble()
                    power += a * a + b * b
                }
                val magnitude = Math.sqrt(power / (hi - lo))
                // Log-compress and normalize; the constants were chosen to
                // bring typical speech into roughly [0.2, 0.95].
                (Math.log10(1.0 + magnitude * 80) / logBase).toFloat().coerceIn(0f, 1f)
            }
        }
    }
    /**
     * In-place radix-2 decimation-in-time Cooley–Tukey FFT.
     *
     * Computes the unnormalized forward transform
     * `X[k] = sum_t x[t] * exp(-2*pi*i * t*k / n)` and overwrites the
     * inputs with the result.
     *
     * @param re real parts on input, real parts of the spectrum on output.
     *           Its size `n` defines the transform length and must be a
     *           power of 2 (0 and 1 are accepted and leave the data as is).
     * @param im imaginary parts, same convention; must hold at least `n`
     *           elements.
     * @throws IllegalArgumentException if `n` is not a power of 2 or [im]
     *         is shorter than [re].
     */
    private fun fftInPlace(re: FloatArray, im: FloatArray) {
        val n = re.size
        require(im.size >= n) { "im (${im.size}) must be at least as long as re ($n)" }
        require((n and (n - 1)) == 0) { "FFT size must be a power of 2, got $n" }
        // Bit-reversal permutation.
        var j = 0
        for (i in 1 until n) {
            var bit = n shr 1
            while (j and bit != 0) { j = j xor bit; bit = bit shr 1 }
            j = j or bit
            if (i < j) {
                val tr = re[i]; re[i] = re[j]; re[j] = tr
                val ti = im[i]; im[i] = im[j]; im[j] = ti
            }
        }
        // Butterflies. For a stage of width `size`, the twiddle of the
        // k-th butterfly inside each block is exp(-2*pi*i * k / size).
        // (The previous code multiplied the per-stage base angle by an
        // index already scaled by n/size, which rotated every stage but
        // the last by the wrong angle.)
        var size = 2
        while (size <= n) {
            val half = size / 2
            val angleBase = -2.0 * Math.PI / size
            for (k in 0 until half) {
                // The twiddle depends only on k, so compute it once and
                // apply it to the k-th butterfly of every block.
                val angle = (angleBase * k).toFloat()
                val c = kotlin.math.cos(angle)
                val s = kotlin.math.sin(angle)
                var lo = k
                while (lo < n) {
                    val hi = lo + half
                    val tRe = re[hi] * c - im[hi] * s
                    val tIm = re[hi] * s + im[hi] * c
                    re[hi] = re[lo] - tRe
                    im[hi] = im[lo] - tIm
                    re[lo] = re[lo] + tRe
                    im[lo] = im[lo] + tIm
                    lo += size
                }
            }
            size *= 2
        }
    }
    /**
     * Close the MediaPlayer-fallback streaming session: close the sentence
     * queue, wait for the session job to finish, then clear the session
     * fields and the `onSegmentPlaying` callback. Does nothing when no
     * MediaPlayer session is open.
     */
    private suspend fun endStreamingSessionMp() {
        val queue = sessionMpQueue
        if (queue == null) return
        queue.close()
        try {
            sessionMpJob?.join()
        } catch (_: Exception) {
            // Best effort: a failed join must not prevent cleanup.
        }
        sessionMpQueue = null
        sessionMpJob = null
        onSegmentPlaying = null
        nlog("streaming session closed (MediaPlayer fallback)")
    }
    /**
     * Play one WAV file through Android's MediaPlayer and suspend the
     * caller until playback has completed, failed, or could not be set up.
     * MediaPlayer goes through a different audio pipeline than AudioTrack,
     * which sidesteps ColorOS's AudioTrack hardening/muting behaviour.
     *
     * Errors are logged and swallowed: the coroutine is resumed normally
     * in every case. If the coroutine is cancelled the player is released.
     *
     * @param path   path of the WAV file to play.
     * @param segIdx segment index, used only in log messages.
     */
    private suspend fun playWavBlocking(path: String, segIdx: Int) {
        val startedAt = System.currentTimeMillis()
        suspendCancellableCoroutine<Unit> { cont ->
            val player = android.media.MediaPlayer()

            // Release a player, ignoring any failure.
            fun releaseQuietly(p: android.media.MediaPlayer) {
                try { p.release() } catch (_: Exception) {}
            }
            // Resume the caller at most once.
            fun finish() {
                if (cont.isActive) cont.resume(Unit) {}
            }

            try {
                val speechAttrs = android.media.AudioAttributes.Builder()
                    .setUsage(android.media.AudioAttributes.USAGE_MEDIA)
                    .setContentType(android.media.AudioAttributes.CONTENT_TYPE_SPEECH)
                    .build()
                player.setAudioAttributes(speechAttrs)
                player.setDataSource(path)
                player.setOnPreparedListener { ready ->
                    nlog("MP seg $segIdx prepared, starting (prep ${System.currentTimeMillis() - startedAt}ms)")
                    ready.start()
                }
                player.setOnCompletionListener { finished ->
                    nlog("MP seg $segIdx done (${System.currentTimeMillis() - startedAt}ms total)")
                    releaseQuietly(finished)
                    finish()
                }
                player.setOnErrorListener { failed, what, extra ->
                    nlog("MP seg $segIdx error: what=$what extra=$extra")
                    releaseQuietly(failed)
                    finish()
                    true
                }
                player.prepareAsync()
                cont.invokeOnCancellation { releaseQuietly(player) }
            } catch (e: Exception) {
                nlog("MP seg $segIdx setup failed: ${e.message}")
                releaseQuietly(player)
                finish()
            }
        }
    }
    fun startStreamingSession() {
        if (USE_MEDIAPLAYER_FALLBACK) { startStreamingSessionMp(); return }
        if (sessionTrack != null) return   // already open
        // USAGE_VOICE_COMMUNICATION routes to STREAM_VOICE_CALL, which
        // ColorOS's "Audio Hardening" policy does NOT silently mute (the
        // policy targets STREAM_MUSIC to preserve battery on inactive media
        // apps; STREAM_VOICE_CALL is reserved for VoIP and always plays).
        // Previous attempts with USAGE_MEDIA and USAGE_ASSISTANT both got
        // `event:muted updated source:clientVolume` ~0.6–1 s after play()
        // even with audio focus + mediaPlayback FGS, so moving off of
        // STREAM_MUSIC is the only route that unblocks audible playback.
        val attrs = AudioAttributes.Builder()
            .setUsage(AudioAttributes.USAGE_VOICE_COMMUNICATION)
            .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
            .build()
        val track = AudioTrack.Builder()
            .setAudioAttributes(attrs)
            .setAudioFormat(AudioFormat.Builder()
                .setEncoding(AudioFormat.ENCODING_PCM_16BIT)
                .setSampleRate(SR)
@ -3340,7 +3830,77 @@ class Qwen3TtsEngine(
                                                // paces writes when full.
            .setTransferMode(AudioTrack.MODE_STREAM)
            .build()
        // Request audio focus for the duration of the session. Without this
        // ColorOS's Audio Hardening treats the track as background noise
        // and mutes it, regardless of FGS status. We don't care about
        // focus loss callbacks — if another app grabs focus mid-sentence
        // that's fine, the track just gets ducked.
        val am = context?.getSystemService(android.content.Context.AUDIO_SERVICE) as? android.media.AudioManager
        val focusReq = android.media.AudioFocusRequest.Builder(android.media.AudioManager.AUDIOFOCUS_GAIN_TRANSIENT_MAY_DUCK)
            .setAudioAttributes(attrs)
            .setOnAudioFocusChangeListener { _ -> }
            .build()
        val focusRes = am?.requestAudioFocus(focusReq)
        nlog("audio focus request: $focusRes (1=granted, 0=failed, 2=delayed)")
        sessionFocusRequest = focusReq
        // ColorOS mutes AudioTrack clientVolume ~1s after creation (seen in
        // dumpsys audio as `event:muted updated source:clientVolume`). Force
        // track volume back to 1.0 repeatedly to override. This is also
        // done in the keep-alive watchdog loop below for ongoing override.
        try { track.setVolume(1.0f) } catch (_: Exception) {}
        track.play()
        sessionFramesWritten.set(0)
        sessionGenActive.set(false)
        // writeAndCount is the single path through which PCM reaches the
        // AudioTrack for this session, so sessionFramesWritten always stays
        // in sync with what's been queued to playback hardware. AudioTrack.write
        // is thread-safe, so this can be called concurrently from the session
        // worker (real audio) and the keep-alive watchdog (silence padding).
        val writeAndCount: (ShortArray) -> Unit = { pcm ->
            if (pcm.isNotEmpty()) {
                val n = track.write(pcm, 0, pcm.size)
                if (n > 0) sessionFramesWritten.addAndGet(n.toLong())
            }
        }
        // Bootstrap silence: queue 500 ms immediately after play() so
        // AudioFlinger has samples to mix from the very first cycle.
        // Without this, there's a ~100 ms window between play() and the
        // first watchdog tick where the track has no data and AudioFlinger
        // flags it for removal. Once that happens, playbackHead sticks at
        // 0 and subsequent writes go to a dead track.
        val bootstrapSilence = ShortArray(SR / 2)  // 500 ms
        writeAndCount(bootstrapSilence)
        // Keep-alive watchdog. AudioFlinger on OnePlus/ColorOS kills a track
        // that underruns for ~1 s (confirmed via `prepareTracks_l BUFFER
        // TIMEOUT: remove track … due to underrun on thread 29`). Our
        // per-segment synthesis takes 3–5 s, which always exceeds that
        // window between writes, so the track was getting silenced after
        // the first ~1 s of audio played. The watchdog pads with 200 ms of
        // silence any time the buffered-ahead audio drops below 400 ms,
        // regardless of segment state — silence only advances playback head
        // in the gaps between real audio and is never inserted inside a
        // contiguous burst of real writes (those bring buffered above 400 ms
        // and keep the watchdog quiet).
        val keepAliveBuffer = ShortArray(SR / 5)  // 200 ms of silence
        val keepAliveJob = kotlinx.coroutines.CoroutineScope(
            kotlinx.coroutines.Dispatchers.IO
        ).launch {
            var tick = 0
            while (kotlinx.coroutines.currentCoroutineContext()[kotlinx.coroutines.Job]?.isActive != false) {
                kotlinx.coroutines.delay(100)
                val head = track.playbackHeadPosition.toLong() and 0xFFFFFFFFL
                val written = sessionFramesWritten.get() and 0xFFFFFFFFL
                val buffered = written - head
                val needsPad = buffered < SR * 2 / 5    // < 400 ms
                if ((tick and 0x1F) == 0) {
                    nlog("keepAlive tick=$tick head=$head written=$written buffered=$buffered pad=$needsPad state=${track.playState}")
                }
                tick++
                // Override any clientVolume mute that ColorOS keeps applying.
                try { track.setVolume(1.0f) } catch (_: Exception) {}
                if (needsPad) writeAndCount(keepAliveBuffer)
            }
        }
        val chan = kotlinx.coroutines.channels.Channel<String>(
            capacity = kotlinx.coroutines.channels.Channel.UNLIMITED
        )
@ -3350,15 +3910,26 @@ class Qwen3TtsEngine(
            var segIdx = 0
            for (sentence in chan) {
                try {
                    sessionGenActive.set(true)
                    if (USE_STREAMING_DECODE && talkerPteModule != null && cpPteModule != null) {
                        // CP↔BigVGAN overlap path: audio chunks flow to the
                        // shared AudioTrack as soon as BigVGAN finishes each
                        // SEQ_LEN window, instead of after the whole segment.
                        generateSegmentAudioVCStreaming(sentence, segIdx, writeAndCount)
                    } else {
                        val audio = generateSegmentAudioVC(sentence, segIdx)
-                    if (audio.isNotEmpty()) track.write(audio, 0, audio.size)
+                        writeAndCount(audio)
                    }
                    segIdx++
                } catch (e: Exception) {
                    nlog("session seg $segIdx error: ${e.message}")
                } finally {
                    sessionGenActive.set(false)
                }
            }
        }
        sessionTrack = track; sessionChannel = chan; sessionJob = job
        sessionKeepAliveJob = keepAliveJob
        nlog("streaming session opened")
    }
@ -3368,6 +3939,12 @@ class Qwen3TtsEngine(
     * immediately. Sentences play in the order they were enqueued.
     */
    fun enqueueSentence(sentence: String) {
        if (USE_MEDIAPLAYER_FALLBACK) {
            val chan = sessionMpQueue ?: run { nlog("enqueueSentence: no MP session"); return }
            val r = chan.trySend(sentence)
            if (r.isFailure) nlog("enqueueSentence: MP channel full / closed")
            return
        }
        val chan = sessionChannel ?: run { nlog("enqueueSentence: no session open"); return }
        val r = chan.trySend(sentence)
        if (r.isFailure) nlog("enqueueSentence: channel full / closed")
@ -3379,17 +3956,46 @@ class Qwen3TtsEngine(
     * drains), then release the shared track. Safe to call more than once.
     */
    suspend fun endStreamingSession() {
        if (USE_MEDIAPLAYER_FALLBACK) { endStreamingSessionMp(); return }
        val chan = sessionChannel ?: return
        chan.close()
        try { sessionJob?.join() } catch (_: Exception) {}
        // Stop the keep-alive watchdog BEFORE draining so it doesn't pad more
        // silence onto the tail while we're waiting for the existing buffer
        // to play out.
        try { sessionKeepAliveJob?.cancel() } catch (_: Exception) {}
        try { sessionKeepAliveJob?.join() } catch (_: Exception) {}
        try {
-            sessionTrack?.let {
+            sessionTrack?.let { track ->
-                // Block until written samples have been consumed by the
+                // AudioTrack.stop() in MODE_STREAM DISCARDS unplayed buffered
-                // hardware so users aren't cut off mid-syllable.
+                // samples — it doesn't block for drain. Poll getPlaybackHead
-                it.stop(); it.release()
+                // Position() until it reaches what we wrote, then stop. The
                // head is a 32-bit wrap-around counter, so compare modulo.
                // Cap the drain wait so a stalled track can't block us forever.
                val targetFrames = sessionFramesWritten.get()
                val startMs = System.currentTimeMillis()
                val maxDrainMs = (targetFrames * 1000L / SR) + 500L   // audio dur + 500ms slack
                while (true) {
                    val head = track.playbackHeadPosition.toLong() and 0xFFFFFFFFL
                    val reached = head >= (targetFrames and 0xFFFFFFFFL)
                    val state = track.playState
                    if (reached || state != AudioTrack.PLAYSTATE_PLAYING) break
                    if (System.currentTimeMillis() - startMs > maxDrainMs) {
                        nlog("endStreamingSession: drain timeout at head=$head/$targetFrames")
                        break
                    }
                    kotlinx.coroutines.delay(20)
                }
                track.stop(); track.release()
            }
        } catch (_: Exception) {}
-        sessionTrack = null; sessionChannel = null; sessionJob = null
+        // Release audio focus after the track is fully drained and stopped.
        try {
            val am = context?.getSystemService(android.content.Context.AUDIO_SERVICE) as? android.media.AudioManager
            sessionFocusRequest?.let { am?.abandonAudioFocusRequest(it) }
        } catch (_: Exception) {}
        sessionFocusRequest = null
        sessionTrack = null; sessionChannel = null; sessionJob = null; sessionKeepAliveJob = null
        nlog("streaming session closed")
    }
@ -3446,6 +4052,177 @@ class Qwen3TtsEngine(
        return fadeOut(decodeChunked(codebooks, n), 40)
    }
    // ---------- Streaming decode (CP ↔ BigVGAN overlap) ----------
    /**
     * Carrier from the talker/CP producer to the BigVGAN consumer.
     *
     * @property codebooks  one IntArray of codes per codebook. The streaming
     *                      producer builds it as NUM_CODEBOOKS rows of
     *                      SEQ_LEN entries, zero-filled past [realTokens].
     * @property realTokens number of leading time steps that hold real
     *                      (non-padding) codes; the consumer passes it to
     *                      the crossfader, which trims the decoded audio to
     *                      that many tokens.
     */
    private class ChunkMsg(val codebooks: Array<IntArray>, val realTokens: Int)
    /**
     * Incremental counterpart of decodeChunked. Decoded chunks are
     * appended to an internal buffer, crossfading each new chunk into the
     * tail of what is already there. As soon as a region of the buffer can
     * no longer be touched by a future crossfade (everything before the
     * last `CHUNK_OVERLAP * SAMPLES_PER_TOKEN` samples) it is handed to
     * `onAudio`. [flushFinal] emits whatever is left, with the trailing
     * fade-out applied.
     *
     * @param onAudio receives each newly stable slice of PCM, in order.
     */
    private inner class StreamingCrossfader(private val onAudio: (ShortArray) -> Unit) {
        /** Samples held back because the next chunk may still rewrite them. */
        private val holdback = CHUNK_OVERLAP * SAMPLES_PER_TOKEN
        /** Everything assembled so far, emitted or not. */
        private var assembled = ShortArray(0)
        /** Number of leading samples of [assembled] already sent to onAudio. */
        private var sentUpTo = 0
        private var awaitingFirst = true

        /**
         * Add one decoded chunk.
         *
         * @param chunkAudio decoded PCM for the chunk (may include padding).
         * @param realTokens number of real tokens in the chunk; audio past
         *                   `realTokens * SAMPLES_PER_TOKEN` is dropped.
         */
        fun feedChunk(chunkAudio: ShortArray, realTokens: Int) {
            val usable = minOf(realTokens * SAMPLES_PER_TOKEN, chunkAudio.size)
            val chunk = if (usable < chunkAudio.size) chunkAudio.copyOf(usable) else chunkAudio
            if (awaitingFirst) {
                // Copy so later crossfades never write into the caller's array.
                assembled = chunk.copyOf()
                awaitingFirst = false
            } else {
                blendIn(chunk)
            }
            emitStable()
        }

        /** Emit any remaining buffered samples with the trailing fadeOut. */
        fun flushFinal() {
            if (sentUpTo >= assembled.size) return
            val tail = assembled.copyOfRange(sentUpTo, assembled.size)
            onAudio(fadeOut(tail, 40))
            sentUpTo = assembled.size
        }

        /** Linearly crossfade [chunk] into the buffer tail, then append the rest. */
        private fun blendIn(chunk: ShortArray) {
            val fadeLen = minOf(holdback, assembled.size, chunk.size)
            val fadeStart = assembled.size - fadeLen
            for (i in 0 until fadeLen) {
                val alpha = i.toFloat() / fadeLen
                val blended = (1f - alpha) * assembled[fadeStart + i] + alpha * chunk[i]
                assembled[fadeStart + i] = blended.toInt()
                    .coerceIn(Short.MIN_VALUE.toInt(), Short.MAX_VALUE.toInt()).toShort()
            }
            if (fadeLen < chunk.size) {
                val oldLen = assembled.size
                val appended = chunk.size - fadeLen
                assembled = assembled.copyOf(oldLen + appended)
                System.arraycopy(chunk, fadeLen, assembled, oldLen, appended)
            }
        }

        /** Send everything that precedes the held-back tail and was not sent yet. */
        private fun emitStable() {
            val stableEnd = (assembled.size - holdback).coerceAtLeast(sentUpTo)
            if (stableEnd > sentUpTo) {
                onAudio(assembled.copyOfRange(sentUpTo, stableEnd))
                sentUpTo = stableEnd
            }
        }
    }
    /**
     * Streaming variant of generateSegmentAudioVC. As the talker/CP loop
     * produces codes step by step, BigVGAN chunks are dispatched on a
     * background coroutine the moment SEQ_LEN codes are accumulated. For a
     * 75-token segment this overlaps the last BigVGAN pass with the final
     * ~20 talker/CP steps, cutting first-audio latency by ~4 s vs the
     * sequential `generateSegmentAudioVC` path.
     *
     * Short segments (<SEQ_LEN codes) emit a single chunk at end-of-gen,
     * matching the legacy single-chunk path with no perceptible difference.
     *
     * The producer thread blocks on `bvChan.send` if the BigVGAN consumer
     * is behind; in practice that never happens because the producer takes
     * ~5 s per chunk vs ~2.4 s for BigVGAN.
     *
     * Failures are logged, not thrown: missing assets make the function
     * return early without emitting audio, and producer/consumer
     * exceptions are caught and reported through nlog.
     *
     * @param segText text of the segment to synthesize.
     * @param segIdx  segment index, used only in log messages.
     * @param onAudio receives PCM slices in playback order; it is invoked
     *                from the consumer coroutine (Dispatchers.IO), not from
     *                the caller's context.
     */
    private suspend fun generateSegmentAudioVCStreaming(
        segText: String, segIdx: Int, onAudio: (ShortArray) -> Unit
    ) {
        // Bail out early when any asset this path depends on is not loaded.
        if (bpeTokenizer == null || textEmbedsFullBuf == null || damienVoicePrefix == null || damienVoiceSuffix == null) {
            nlog("generateSegmentAudioVCStreaming: Stage 2 assets missing"); return
        }
        if (talkerPteModule == null || cpPteModule == null) {
            nlog("generateSegmentAudioVCStreaming: PTE talker/CP not loaded"); return
        }
        val prefix = damienVoicePrefix!!
        val suffix = damienVoiceSuffix!!
        val codecPadEmb = codecEmb(CODEC_PAD)
        val ids = bpeTokenizer!!.encode(segText)
        nlog("session seg $segIdx (stream) '${segText.take(60)}' → ${ids.size} tokens")
        // Prefill layout: voice prefix, then one embedding per text token
        // (text embedding summed with the codec PAD embedding), then the
        // voice suffix.
        val prefill = ArrayList<FloatArray>(prefix.size + ids.size + suffix.size)
        for (e in prefix) prefill.add(e)
        for (id in ids) prefill.add(sumEmb(textEmbFromFull(id), codecPadEmb))
        for (e in suffix) prefill.add(e)
        // Generation budget: 2.4 steps per text token expected, allow 1.5x
        // that plus 10, capped by the context window.
        val expectedSteps = (ids.size * 24) / 10
        val maxGen = minOf(expectedSteps * 3 / 2 + 10, MAX_CONTEXT - 15)
        val tStart = System.currentTimeMillis()
        var firstAudioLogged = false
        // Bounded hand-off queue between the code producer and the decoder.
        val bvChan = kotlinx.coroutines.channels.Channel<ChunkMsg>(capacity = 4)
        // Wrap onAudio so the time-to-first-audio is logged exactly once.
        val cfader = StreamingCrossfader { pcm ->
            if (!firstAudioLogged) {
                nlog("streaming seg $segIdx first audio at ${System.currentTimeMillis() - tStart}ms (${pcm.size} samples)")
                firstAudioLogged = true
            }
            onAudio(pcm)
        }
        // Consumer: decode each chunk and feed it to the crossfader; flush
        // the tail once the channel has been closed and drained.
        // NOTE(review): if decoding throws, the loop exits but bvChan is
        // neither closed nor cancelled from this side, so a producer with
        // more than `capacity` chunks still to send would block in
        // runBlocking { send } indefinitely — confirm whether that can occur.
        // NOTE(review): the job runs in its own CoroutineScope, so it is
        // not cancelled when the calling coroutine is cancelled.
        val consumerJob = kotlinx.coroutines.CoroutineScope(kotlinx.coroutines.Dispatchers.IO).launch {
            try {
                for (msg in bvChan) {
                    val quant = vqDecode(msg.codebooks)
                    val audio = runSpeechDecoderV2(quant)
                    cfader.feedChunk(audio, msg.realTokens)
                }
                cfader.flushFinal()
            } catch (e: Exception) {
                nlog("streaming seg $segIdx consumer error: ${e.message}")
            }
        }
        // Producer: run the interleaved talker/CP loop and dispatch each
        // SEQ_LEN-aligned window of codes immediately. The consumer's
        // crossfader holds back the last `overlapSamples` of audio per
        // chunk, so the in-flight chunk's tail can still be mutated by the
        // next chunk before being emitted; flushFinal() at end emits the
        // last tail with fadeOut. End-of-stream is signalled by closing
        // bvChan after the trailing partial chunk is sent.
        val collected = mutableListOf<IntArray>()
        var nextChunkStart = 0
        // Build a NUM_CODEBOOKS x SEQ_LEN code matrix from `real` collected
        // steps starting at `start`. Positions past the real tokens, and
        // codes outside [0, CODEBOOK_SIZE), are replaced by 0.
        fun buildChunkCb(start: Int, real: Int): Array<IntArray> = Array(NUM_CODEBOOKS) { cb ->
            IntArray(SEQ_LEN) { t ->
                val src = start + t
                if (src < start + real) {
                    val v = collected[src][cb]
                    if (v in 0 until CODEBOOK_SIZE) v else 0
                } else 0
            }
        }
        try {
            runInterleavedPteFromEmbeds(prefill, emptyList(), maxGen) { _, codes ->
                collected.add(codes)
                // Dispatch every full window that has become available.
                // Windows advance by EFFECTIVE_CHUNK while spanning SEQ_LEN
                // steps, so consecutive windows can share steps.
                while (collected.size >= nextChunkStart + SEQ_LEN) {
                    val cb = buildChunkCb(nextChunkStart, SEQ_LEN)
                    // The callback is not a suspend lambda, hence the
                    // blocking bridge; this is where back-pressure applies.
                    kotlinx.coroutines.runBlocking { bvChan.send(ChunkMsg(cb, SEQ_LEN)) }
                    nextChunkStart += EFFECTIVE_CHUNK
                }
            }
        } catch (e: Exception) {
            // Whatever was collected before the failure is still flushed below.
            nlog("streaming seg $segIdx producer error: ${e.message}")
        }
        // Trailing chunk: any remaining tokens after the last full window
        // (covers both the medium-segment partial-tail case and the
        // short-segment <SEQ_LEN single-chunk case where nextChunkStart=0).
        val total = collected.size
        if (total > nextChunkStart) {
            val trailing = total - nextChunkStart
            val cb = buildChunkCb(nextChunkStart, trailing)
            kotlinx.coroutines.runBlocking { bvChan.send(ChunkMsg(cb, trailing)) }
        }
        // Closing lets the consumer's for-loop finish; wait for it so all
        // audio has been handed to onAudio before returning.
        bvChan.close()
        consumerJob.join()
    }
    /**
     * Run the Hexagon talker + CP generation loop with a fully pre-built
     * prefill (voice prefix + all text tokens). Same decode recipe as
--- a/kazeia-android/app/src/main/java/com/kazeia/ui/AudioVisualizerView.kt
+++ b/kazeia-android/app/src/main/java/com/kazeia/ui/AudioVisualizerView.kt
@ -0,0 +1,548 @@
 package com.kazeia.ui
 import android.content.Context
 import android.graphics.Canvas
 import android.graphics.Color
 import android.graphics.Paint
 import android.graphics.Path
 import android.graphics.RadialGradient
 import android.graphics.Shader
 import android.util.AttributeSet
 import android.view.Choreographer
 import android.view.View
 import kotlin.math.PI
 import kotlin.math.cos
 import kotlin.math.max
 import kotlin.math.min
 import kotlin.math.sin
 import kotlin.math.sqrt
 /**
 * Large, central orb visualizer — Kazeia's visual "face". Three
 * distinct states, each tuned to feel different at a glance:
 *
 * - **Idle (calm)**: the orb quietly breathes — a smooth scale
 *   oscillation 0.88 ↔ 1.0 over a 5 s cycle with a soft halo that
 *   pulses in phase. No high-frequency motion. Suggests "waiting,
 *   listening, not anxious".
 *
 * - **Listening (attentive)**: the orb settles slightly larger, a
 *   warmer bright ring appears around it, and its outline deforms
 *   organically with the live mic RMS (blob-like wobble, 8 Fourier
 *   modes, gain-mapped from the RMS). Micro-ripples emit
 *   continuously while speech is present. Feels alive and engaged
 *   — clearly different from Idle's static breathing.
 *
 * - **Speaking (active)**: the orb is rendered **as a contained
 *   spectrometer**. Inside the sphere boundary, SPECTRUM_BANDS
 *   vertical bars rise from a horizontal baseline according to a
 *   pre-computed band-energy sidecar. The sphere outline pulses
 *   with the overall RMS envelope. The bars are clipped to the
 *   sphere so it really looks like "the sphere itself is speaking"
 *   — not an overlaid spectrogram. Strong amplitude peaks release
 *   outward ripple waves on the halo.
 *
 * The whole palette (core, halo, ring, bars, ripples) is re-derived
 * from a single [voiceColor] setter so each speaker gets a distinct
 * visual identity.
 */
 class AudioVisualizerView @JvmOverloads constructor(
    context: Context,
    attrs: AttributeSet? = null,
    defStyleAttr: Int = 0
 ) : View(context, attrs, defStyleAttr), Choreographer.FrameCallback {
    companion object {
        /**
         * Number of spectrum bars drawn in the Speaking state; also the
         * size of the bar-smoothing buffer. Must match
         * Qwen3TtsEngine.SPECTRUM_BANDS.
         * NOTE(review): an earlier comment said this is "asserted at
         * setSpeaking"; the entry point is startSpeaking and it does not
         * validate the row width of the spectrogram — confirm the check
         * exists elsewhere or add one.
         */
        private const val SPECTRUM_BANDS = 12
        /** Listening-mode outline deformation modes (even = smooth blobs). */
        private const val BLOB_MODES = 8
    }
    // ---------- State ----------
    /** The three visual modes of the orb; exactly one is current at a time. */
    private sealed class State {
        /** Calm breathing animation; carries no data. */
        object Idle : State()
        /**
         * Microphone is live. [micRms] is updated in place by setListening
         * (already clamped to [0, 1]); [phaseSeed] is a value in [0, 1]
         * picked from the clock when the state is entered.
         */
        data class Listening(var micRms: Float, var phaseSeed: Float) : State()
        /**
         * A synthesized segment is playing. [envelope] and [spectrogram]
         * are the pre-computed sidecars handed to startSpeaking,
         * [durationMs] the segment length, and [startedAtMs] the
         * System.currentTimeMillis() value captured when the state was
         * entered. doFrame falls back to Idle 300 ms after the segment's
         * duration has elapsed.
         */
        data class Speaking(
            val envelope: FloatArray,
            val spectrogram: Array<FloatArray>,
            val durationMs: Long,
            val startedAtMs: Long
        ) : State()
    }
    @Volatile private var state: State = State.Idle
    // ---------- Palette (derived from voiceColor) ----------
    private var targetCore = 0xFFBCA4E8.toInt()          // default: lavender
    private var currentCore = targetCore
    private var currentHalo = deriveHalo(currentCore)
    private var currentAccent = deriveAccent(currentCore)
    /**
     * Set the colour the palette should ease toward. The alpha channel of
     * [color] is ignored: the target is always fully opaque.
     */
    fun setVoiceColor(color: Int) {
        val opaqueMask = 0xFF000000.toInt()
        targetCore = color or opaqueMask
        scheduleFrame()
    }
    // ---------- Animation state ----------
    private var frameStartNs = 0L
    private var smoothedAmp = 0f           // 0..1 orb-size pulsation (all states)
    private var smoothedBars = FloatArray(SPECTRUM_BANDS)
    private var listeningRingPhase = 0f    // rotating shimmer on listening ring
    private val ripples = ArrayList<Ripple>()
    private var lastSpectroIdx = -1
    // ---------- Paints ----------
    private val corePaint = Paint(Paint.ANTI_ALIAS_FLAG).apply { style = Paint.Style.FILL }
    private val haloPaint = Paint(Paint.ANTI_ALIAS_FLAG).apply { style = Paint.Style.FILL }
    private val ringPaint = Paint(Paint.ANTI_ALIAS_FLAG).apply {
        style = Paint.Style.STROKE
    }
    private val ripplePaint = Paint(Paint.ANTI_ALIAS_FLAG).apply {
        style = Paint.Style.STROKE
        strokeWidth = 3f
    }
    private val barPaint = Paint(Paint.ANTI_ALIAS_FLAG).apply {
        style = Paint.Style.FILL_AND_STROKE
    }
    private val blobOutlinePaint = Paint(Paint.ANTI_ALIAS_FLAG).apply {
        style = Paint.Style.STROKE
    }
    private val blobPath = Path()
    private val spherePath = Path()
    init {
        setLayerType(LAYER_TYPE_HARDWARE, null)
    }
    // ==================== Public API ====================
    /** Switch to the Idle state (no-op if already idle) and request a frame. */
    fun setIdle() {
        val alreadyIdle = state is State.Idle
        if (!alreadyIdle) {
            state = State.Idle
            lastSpectroIdx = -1
        }
        scheduleFrame()
    }
    /**
     * Report the current microphone level. Enters the Listening state if
     * needed; if already listening, only the level is updated.
     *
     * @param micRms microphone RMS; clamped to [0, 1].
     */
    fun setListening(micRms: Float) {
        val level = micRms.coerceIn(0f, 1f)
        when (val current = state) {
            is State.Listening -> current.micRms = level
            else -> {
                // Seed in [0, 1] taken from the low 16 bits of the clock.
                val seed = (System.nanoTime() and 0xFFFF) / 65535f
                state = State.Listening(level, seed)
            }
        }
        scheduleFrame()
    }
    /**
     * Enter the Speaking state for one synthesized segment.
     *
     * Falls back to [setIdle] when there is nothing to animate (empty
     * envelope, empty spectrogram, or non-positive duration).
     *
     * @param envelope    per-window loudness values for the segment.
     * @param spectrogram per-window band energies for the segment.
     * @param durationMs  length of the segment in milliseconds.
     */
    fun startSpeaking(
        envelope: FloatArray,
        spectrogram: Array<FloatArray>,
        durationMs: Long
    ) {
        val hasContent = envelope.isNotEmpty() && spectrogram.isNotEmpty() && durationMs > 0
        if (!hasContent) {
            setIdle()
            return
        }
        val now = System.currentTimeMillis()
        state = State.Speaking(envelope, spectrogram, durationMs, now)
        lastSpectroIdx = -1
        // Start the bars from zero so the spectrum grows in instead of
        // snapping from whatever the smoothing buffer still held.
        smoothedBars.fill(0f)
        scheduleFrame()
    }
    // ==================== Lifecycle / scheduling ====================
    /** Record the animation start time and kick off the frame loop. */
    override fun onAttachedToWindow() {
        super.onAttachedToWindow()
        val attachedAtNs = System.nanoTime()
        frameStartNs = attachedAtNs
        scheduleFrame()
    }
    /**
     * Stops the frame loop when the view leaves the window.
     *
     * The pending-callback flag is cleared together with the callback itself:
     * doFrame always leaves the flag set after re-posting, so without the
     * reset scheduleFrame() would refuse to post after a later re-attach and
     * the animation would stay frozen.
     */
    override fun onDetachedFromWindow() {
        super.onDetachedFromWindow()
        Choreographer.getInstance().removeFrameCallback(this)
        frameScheduled = false
    }
    // True while a Choreographer callback for this view is pending.
    private var frameScheduled = false

    /**
     * Posts a frame callback unless one is already pending or the view is
     * not attached to a window.
     */
    private fun scheduleFrame() {
        if (frameScheduled || !isAttachedToWindow) return
        frameScheduled = true
        Choreographer.getInstance().postFrameCallback(this)
    }
    /**
     * Choreographer callback: advances the per-frame animation state, re-arms
     * the next callback at a rate that depends on the current state, then
     * invalidates the view so onDraw runs.
     *
     * @param frameTimeNanos frame timestamp supplied by Choreographer (unused)
     */
    override fun doFrame(frameTimeNanos: Long) {
        frameScheduled = false
        // Ease the palette toward the target (voice change tween).
        // lerpColor truncates every channel to an Int, so a channel that has
        // to rise stops advancing once it is within ~8 levels of the target
        // (0.12 * diff < 1). When a step makes no progress at all, land on
        // the (opaque) target so the tween actually finishes.
        val eased = lerpColor(currentCore, targetCore, 0.12f)
        currentCore = if (eased == currentCore) withAlpha(targetCore, 255) else eased
        currentHalo = deriveHalo(currentCore)
        currentAccent = deriveAccent(currentCore)
        val choreographer = Choreographer.getInstance()
        when (val s = state) {
            is State.Idle -> {
                // Self-throttled with a 40 ms delay (~25 fps) — enough for a
                // 5 s breathing cycle to look continuous, keeps CPU cost low.
                choreographer.postFrameCallbackDelayed(this, 40)
                frameScheduled = true
            }
            is State.Listening -> {
                // Full frame rate; the shimmer ring advances a fixed step per frame.
                listeningRingPhase += 0.015f
                choreographer.postFrameCallback(this)
                frameScheduled = true
            }
            is State.Speaking -> {
                val elapsed = System.currentTimeMillis() - s.startedAtMs
                if (elapsed >= s.durationMs + 300) {
                    // Segment finished (plus a 300 ms tail): fall back to Idle.
                    state = State.Idle
                    lastSpectroIdx = -1
                    choreographer.postFrameCallbackDelayed(this, 40)
                    frameScheduled = true
                } else {
                    choreographer.postFrameCallback(this)
                    frameScheduled = true
                }
            }
        }
        invalidate()
    }
    // ==================== Drawing ====================
    /** Renders the orb for the current state, centred in the view. */
    override fun onDraw(canvas: Canvas) {
        super.onDraw(canvas)
        val viewW = width.toFloat()
        val viewH = height.toFloat()
        if (viewW <= 0f || viewH <= 0f) return
        val centerX = viewW / 2f
        val centerY = viewH / 2f
        // Radius is 39% of the shorter axis, i.e. a 78% diameter: large
        // enough to feel central while the 11% margin on each side keeps
        // ripples and the ring from clipping.
        val maxR = min(viewW, viewH) * 0.39f
        val now = System.currentTimeMillis()
        val current = state
        when (current) {
            is State.Idle -> drawIdle(canvas, centerX, centerY, maxR, now)
            is State.Listening -> drawListening(canvas, centerX, centerY, maxR, now, current)
            is State.Speaking -> drawSpeaking(canvas, centerX, centerY, maxR, now, current)
        }
    }
    // ---------- Idle ----------
    /**
     * Draws the Idle state: a round orb that "breathes" between 88% and 100%
     * of [maxR] over a 5 s cycle, with a soft halo and a static highlight.
     *
     * @param now wall-clock milliseconds passed by onDraw; not used here —
     *   the breathing phase is measured on the monotonic clock (see below)
     */
    private fun drawIdle(canvas: Canvas, cx: Float, cy: Float, maxR: Float, now: Long) {
        // 5 s breathing cycle, amplitude 0.88 → 1.00.
        // frameStartNs is a System.nanoTime() sample, so elapsed time has to
        // be taken from the same clock; subtracting it from wall-clock
        // milliseconds mixes two unrelated time bases and makes the phase
        // jump whenever the system clock is adjusted.
        val elapsedMs = (System.nanoTime() - frameStartNs) / 1_000_000L
        val t = (elapsedMs % 5000L) / 5000f
        val breath = 0.5f - 0.5f * cos((t * 2.0 * PI).toFloat())   // 0..1
        val scale = 0.88f + 0.12f * breath
        val radius = maxR * scale
        smoothedAmp += ((breath * 0.5f) - smoothedAmp) * 0.1f
        // Halo (soft, breathing in phase).
        drawHalo(canvas, cx, cy, maxR * 1.15f * scale, alphaBase = 60, alphaGain = 70)
        // Core — pure round, no deformation.
        drawCore(canvas, cx, cy, radius, shimmer = 0f)
        // Subtle inner highlight — feels alive without movement.
        // corePaint is reused with a new shader instead of building a Paint
        // on every frame: it is an anti-aliased FILL paint, exactly what the
        // highlight needs, and every user assigns its shader before drawing.
        corePaint.shader = RadialGradient(
            cx - radius * 0.25f, cy - radius * 0.25f, radius * 0.9f,
            Color.argb(60, 255, 255, 255),
            Color.argb(0, 255, 255, 255),
            Shader.TileMode.CLAMP
        )
        canvas.drawCircle(cx, cy, radius, corePaint)
    }
    // ---------- Listening ----------
    /**
     * Draws the Listening state: a mic-reactive halo, a gently deformed
     * gradient-filled blob with an accent outline, a rotating shimmer ring
     * and small ripples while the mic level is above a floor.
     */
    private fun drawListening(
        canvas: Canvas, cx: Float, cy: Float, maxR: Float, now: Long, s: State.Listening
    ) {
        val level = s.micRms
        // Base size slightly larger than Idle so the transition reads.
        val sizeFactor = 0.93f + 0.08f * level
        val orbRadius = maxR * sizeFactor
        smoothedAmp += (level - smoothedAmp) * 0.25f

        // Halo — brighter than Idle, responds to mic.
        val haloGain = (130 * level).toInt().coerceIn(0, 160)
        drawHalo(canvas, cx, cy, maxR * 1.22f * sizeFactor, alphaBase = 90, alphaGain = haloGain)

        // Deformed outline (blob): Fourier modes over the circle.
        buildBlobPath(blobPath, cx, cy, orbRadius, level, s.phaseSeed, now)

        // Gradient fill: an oversized circle clipped to the blob shape.
        corePaint.shader = RadialGradient(
            cx - orbRadius * 0.15f, cy - orbRadius * 0.25f, orbRadius * 1.1f,
            currentCore, deriveCoreEdge(currentCore),
            Shader.TileMode.CLAMP
        )
        canvas.save()
        canvas.clipPath(blobPath)
        canvas.drawCircle(cx, cy, orbRadius * 1.3f, corePaint)
        canvas.restore()

        // Blob outline, slightly thicker as the level rises.
        blobOutlinePaint.strokeWidth = 2f + 2f * level
        blobOutlinePaint.color = withAlpha(currentAccent, 180)
        canvas.drawPath(blobPath, blobOutlinePaint)

        // Rotating shimmer ring — a thin arc segment chasing around.
        drawListeningRing(canvas, cx, cy, orbRadius * 1.08f, level)

        // Micro-ripples while the mic is above the floor.
        // NOTE(review): the time gate holds for a whole 90 ms window out of
        // every 270 ms, so each frame drawn inside that window adds another
        // ripple — confirm the burst is intended rather than one per window.
        val rmsMicroFloor = 0.12f
        val inRippleWindow = (now / 90) % 3 == 0L
        if (level > rmsMicroFloor && inRippleWindow) {
            ripples.add(Ripple(bornAtMs = now, peak = level))
        }
        drawRipples(canvas, cx, cy, maxR, now, listeningMode = true)
    }
    /**
     * Draws the shimmer arc that circles the orb while listening, followed by
     * a dimmer, half-length tail arc. Width, sweep and alpha all grow with
     * [rms]; below 0.04 nothing is drawn.
     */
    private fun drawListeningRing(
        canvas: Canvas, cx: Float, cy: Float, radius: Float, rms: Float
    ) {
        if (rms < 0.04f) return
        // Bounding box of the circle both arcs lie on.
        val left = cx - radius
        val top = cy - radius
        val right = cx + radius
        val bottom = cy + radius
        val headSweep = 60f + 80f * rms
        val headStart = (listeningRingPhase * 360f) % 360f
        ringPaint.strokeWidth = 2.5f + 3f * rms
        // Leading arc.
        ringPaint.color = withAlpha(currentAccent, (140 + 110 * rms).toInt().coerceIn(0, 250))
        canvas.drawArc(left, top, right, bottom, headStart, headSweep, false, ringPaint)
        // Subtle tail: dimmer, half as long, starting 8° past the end of the leading arc.
        ringPaint.color = withAlpha(currentAccent, (60 + 60 * rms).toInt().coerceIn(0, 160))
        canvas.drawArc(left, top, right, bottom, headStart + headSweep + 8f, headSweep * 0.5f, false, ringPaint)
    }
    // ---------- Speaking ----------
    /**
     * Draws the Speaking state. Playback position (elapsed / duration) is
     * mapped linearly onto the amplitude envelope and the spectrogram; the
     * envelope drives overall size, halo strength and ripple emission, while
     * the smoothed per-band energies drive the deformation of the sphere
     * outline built by buildSpeakingBlobPath.
     *
     * @param now wall-clock milliseconds, same clock as [State.Speaking.startedAtMs]
     */
    private fun drawSpeaking(
        canvas: Canvas, cx: Float, cy: Float, maxR: Float, now: Long, s: State.Speaking
    ) {
        // Envelope → overall size pulsation + halo intensity.
        val elapsed = now - s.startedAtMs
        val envLast = s.envelope.size - 1
        val envIdxF = elapsed.toFloat() * s.envelope.size / s.durationMs
        val envIdx = envIdxF.toInt().coerceIn(0, envLast)
        val envFrac = (envIdxF - envIdx).coerceIn(0f, 1f)
        val env = lerp(s.envelope[envIdx], s.envelope[min(envIdx + 1, envLast)], envFrac)
        smoothedAmp += (env - smoothedAmp) * 0.30f

        // Update per-band smoothed energies — these drive the Fourier
        // modes of the sphere outline in buildSpeakingBlobPath below.
        val frameLast = s.spectrogram.size - 1
        val timeIdxF = elapsed.toFloat() * s.spectrogram.size / s.durationMs
        val timeIdx = timeIdxF.toInt().coerceIn(0, frameLast)
        val timeFrac = (timeIdxF - timeIdx).coerceIn(0f, 1f)
        // The two frames being interpolated are the same for every band,
        // so look them up once instead of once per band.
        val frameA = s.spectrogram[timeIdx]
        val frameB = s.spectrogram[min(timeIdx + 1, frameLast)]
        for (b in 0 until SPECTRUM_BANDS) {
            val target = lerp(frameA[b], frameB[b], timeFrac)
            smoothedBars[b] += (target - smoothedBars[b]) * 0.35f
        }

        val scale = 0.92f + 0.14f * smoothedAmp
        val radius = maxR * scale

        // Halo pulses with amp; emit ripples on envelope peaks.
        drawHalo(canvas, cx, cy, maxR * 1.30f * scale,
            alphaBase = 90, alphaGain = (160 * smoothedAmp).toInt().coerceIn(0, 220))
        if (envIdx != lastSpectroIdx && env > 0.45f) {
            // Local maximum test against the neighbouring envelope samples
            // (a missing neighbour at either end counts as 0).
            val prev = if (envIdx > 0) s.envelope[envIdx - 1] else 0f
            val next = if (envIdx < envLast) s.envelope[envIdx + 1] else 0f
            if (env >= prev && env >= next) {
                ripples.add(Ripple(bornAtMs = now, peak = env))
            }
            lastSpectroIdx = envIdx
        }
        drawRipples(canvas, cx, cy, maxR, now, listeningMode = false)

        // The sphere outline IS the spectrometer: each spectrogram band
        // drives one Fourier mode of the perimeter (low bands = wide
        // low-mode bumps, high bands = tight high-mode ripples), so the
        // whole shape distorts in response to the voice content. No
        // internal bars or curves — the sphere itself is what speaks.
        buildSpeakingBlobPath(spherePath, cx, cy, radius, now)

        // Fill the deformed sphere with the voice-tinted gradient.
        corePaint.shader = RadialGradient(
            cx - radius * 0.25f, cy - radius * 0.30f, radius * 1.25f,
            currentCore, deriveCoreEdge(currentCore),
            Shader.TileMode.CLAMP
        )
        canvas.drawPath(spherePath, corePaint)

        // Soft top-left highlight clipped to the deformed shape — lends
        // a subtle "3D glassy" read without being distracting.
        // corePaint is reused with a new shader instead of building a Paint
        // on every frame: the fill above has already been issued, and every
        // user of corePaint assigns its shader before drawing.
        canvas.save()
        canvas.clipPath(spherePath)
        corePaint.shader = RadialGradient(
            cx - radius * 0.28f, cy - radius * 0.30f, radius * 0.9f,
            Color.argb(75, 255, 255, 255),
            Color.argb(0, 255, 255, 255),
            Shader.TileMode.CLAMP
        )
        canvas.drawCircle(cx, cy, radius * 1.2f, corePaint)
        canvas.restore()

        // Outline of the deformed shape on top, thickness tracks amp so
        // loud consonants give a stronger line.
        blobOutlinePaint.strokeWidth = 2.5f + 3.5f * smoothedAmp
        blobOutlinePaint.color = withAlpha(currentAccent, 230)
        canvas.drawPath(spherePath, blobOutlinePaint)
    }
    /**
     * Build the speaking-state sphere perimeter: base circle plus a
     * sum of Fourier modes, one per spectrogram band. Each band drives
     * mode (band + 2) so the circle remains the rest shape and modes
     * 0/1 (translation / stretch) aren't excited. Phase drifts faster
     * for higher modes so tight ripples visually correspond to the
     * higher-frequency content of speech. Deformation amplitude is
     * scaled both by per-band energy and by overall envelope so quiet
     * passages show small motion and loud syllables show strong
     * distortion. Sampled at 96 points — smooth enough for the
     * highest mode we render without being expensive.
     *
     * Time and phase are computed in Double. [now] is wall-clock epoch
     * milliseconds: as a Float, "seconds since epoch" only resolves
     * steps of about two minutes and the resulting phase (~1e9 rad)
     * has a spacing larger than the whole `mode * theta` range, which
     * flattens the deformation into a plain radius change and freezes
     * the drift.
     *
     * @param path destination path; rewound, refilled and closed
     * @param radius rest radius of the sphere
     * @param now wall-clock time in milliseconds
     */
    private fun buildSpeakingBlobPath(
        path: Path, cx: Float, cy: Float, radius: Float, now: Long
    ) {
        path.rewind()
        val steps = 96
        val tSec = now / 1000.0
        // Max radial displacement contributed by a single band at full
        // energy: 0.22 × radius.
        // NOTE(review): the bands are summed, so several strong bands in
        // phase can exceed this — presumably energies stay small enough
        // that the shape never folds through the center; confirm.
        val modeGain = radius * 0.22f
        // Envelope weight — quiet passages feel less jittery.
        val envWeight = (0.5f + 0.5f * smoothedAmp).coerceIn(0f, 1f)
        // 0..steps inclusive: the last sample wraps to theta = 0 so the
        // outline ends exactly where it started.
        for (i in 0..steps) {
            val theta = (i % steps).toDouble() / steps * 2.0 * PI
            var d = 0f
            for (b in 0 until SPECTRUM_BANDS) {
                val mode = b + 2
                val energy = smoothedBars[b]
                val phase = tSec * (0.45 + 0.22 * b)
                d += modeGain * energy * envWeight * sin(mode * theta + phase).toFloat()
            }
            val r = radius + d
            val x = cx + r * cos(theta).toFloat()
            val y = cy + r * sin(theta).toFloat()
            if (i == 0) path.moveTo(x, y) else path.lineTo(x, y)
        }
        path.close()
    }
    // ---------- Helpers: halo / ripples / blob ----------
    /**
     * Draws the glow behind the orb: a radial gradient in the halo colour
     * that fades from (alphaBase + alphaGain, clamped to 0..255) at the
     * centre to fully transparent at radius [r].
     */
    private fun drawHalo(
        canvas: Canvas, cx: Float, cy: Float, r: Float,
        alphaBase: Int, alphaGain: Int
    ) {
        val centreAlpha = (alphaBase + alphaGain).coerceIn(0, 255)
        val gradientColors = intArrayOf(
            withAlpha(currentHalo, centreAlpha),
            withAlpha(currentHalo, 0)
        )
        haloPaint.shader = RadialGradient(
            cx, cy, r, gradientColors, floatArrayOf(0f, 1f), Shader.TileMode.CLAMP
        )
        canvas.drawCircle(cx, cy, r, haloPaint)
    }
    /**
     * Draws the round orb body with a gradient running from the core colour
     * to its darker edge tone, centred up and to the left of the orb centre.
     *
     * @param shimmer currently unused by the implementation
     */
    private fun drawCore(canvas: Canvas, cx: Float, cy: Float, radius: Float, shimmer: Float) {
        val gradientX = cx - radius * 0.2f
        val gradientY = cy - radius * 0.3f
        corePaint.shader = RadialGradient(
            gradientX, gradientY, radius * 1.15f,
            currentCore, deriveCoreEdge(currentCore),
            Shader.TileMode.CLAMP
        )
        canvas.drawCircle(cx, cy, radius, corePaint)
    }
    /**
     * Draws every live ripple as an expanding, fading ring and drops the
     * ones that have outlived their lifetime (700 ms while listening,
     * 900 ms while speaking).
     */
    private fun drawRipples(
        canvas: Canvas, cx: Float, cy: Float, maxR: Float, now: Long, listeningMode: Boolean
    ) {
        if (ripples.isEmpty()) return
        val lifetimeMs = if (listeningMode) 700f else 900f
        val cursor = ripples.iterator()
        while (cursor.hasNext()) {
            val ripple = cursor.next()
            // 0 at birth, 1 at end of life.
            val progress = (now - ripple.bornAtMs) / lifetimeMs
            if (progress < 1f) {
                val remaining = 1f - progress
                // Ring grows from 58% to 120% of maxR while it thins and fades.
                val ringRadius = maxR * (0.58f + 0.62f * progress)
                val ringAlpha = (remaining * 150f * ripple.peak).toInt().coerceIn(0, 200)
                ripplePaint.color = withAlpha(currentAccent, ringAlpha)
                ripplePaint.strokeWidth = max(1.2f, remaining * 4f)
                canvas.drawCircle(cx, cy, ringRadius, ripplePaint)
            } else {
                cursor.remove()
            }
        }
    }
    /**
     * Build an organic blob path by displacing a circle with a sum of
     * low-frequency sine modes. Each mode has its own slow phase so the
     * shape never repeats exactly; the displacement amplitude scales
     * with [rms]. 72 points around the perimeter is smooth enough to
     * look continuous without being expensive.
     *
     * Time and phase are computed in Double. [now] is wall-clock epoch
     * milliseconds: as a Float, "seconds since epoch" only resolves
     * steps of about two minutes and the resulting phase swallows the
     * `m * theta` term, which turns the blob back into a plain circle
     * and freezes its drift.
     *
     * @param path destination path; rewound, refilled and closed
     * @param radius rest radius of the blob
     * @param rms level that scales the displacement (2%..10% of the radius for 0..1)
     * @param phaseSeed per-session phase offset, multiplied by 6.28 (≈ 2π)
     * @param now wall-clock time in milliseconds
     */
    private fun buildBlobPath(
        path: Path, cx: Float, cy: Float, radius: Float,
        rms: Float, phaseSeed: Float, now: Long
    ) {
        path.rewind()
        val steps = 72
        val tSec = now / 1000.0
        val amp = radius * (0.02f + 0.08f * rms)
        // 0..steps inclusive: the last sample wraps to theta = 0 so the
        // outline ends exactly where it started.
        for (i in 0..steps) {
            val theta = (i % steps).toDouble() / steps * 2.0 * PI
            var d = 0f
            for (m in 1..BLOB_MODES) {
                val phase = phaseSeed * 6.28 + tSec * (0.3 + 0.05 * m)
                // Higher modes contribute proportionally less (amp / m).
                d += (amp / m) * sin(m * theta + phase).toFloat()
            }
            val r = radius + d
            val x = cx + r * cos(theta).toFloat()
            val y = cy + r * sin(theta).toFloat()
            if (i == 0) path.moveTo(x, y) else path.lineTo(x, y)
        }
        path.close()
    }
    // ---------- Color helpers ----------
    /** Halo tone: the core colour darkened by 18%. */
    private fun deriveHalo(core: Int): Int {
        return darken(core, 0.18f)
    }

    /** Accent tone (outline, ring, ripples): the core colour brightened by 12%. */
    private fun deriveAccent(core: Int): Int {
        return brighten(core, 0.12f)
    }

    /** Outer gradient stop of the orb body: the core colour darkened by 12%. */
    private fun deriveCoreEdge(core: Int): Int {
        return darken(core, 0.12f)
    }
    /**
     * Moves each RGB channel of [c] toward 255 by the fraction [frac] of the
     * remaining headroom; alpha is kept as is.
     */
    private fun brighten(c: Int, frac: Float): Int {
        fun lift(channel: Int): Int =
            (channel + (255 - channel) * frac).toInt().coerceIn(0, 255)
        return Color.argb(
            Color.alpha(c),
            lift(Color.red(c)),
            lift(Color.green(c)),
            lift(Color.blue(c))
        )
    }
    /** Scales each RGB channel of [c] by (1 - [frac]); alpha is kept as is. */
    private fun darken(c: Int, frac: Float): Int {
        fun dim(channel: Int): Int = (channel * (1 - frac)).toInt().coerceIn(0, 255)
        return Color.argb(
            Color.alpha(c),
            dim(Color.red(c)),
            dim(Color.green(c)),
            dim(Color.blue(c))
        )
    }
    /** Returns [c] with its alpha replaced by [alpha], clamped to 0..255. */
    private fun withAlpha(c: Int, alpha: Int): Int =
        Color.argb(alpha.coerceIn(0, 255), Color.red(c), Color.green(c), Color.blue(c))
    /** Linear interpolation: [a] at t = 0, [b] at t = 1. [t] is not clamped. */
    private fun lerp(a: Float, b: Float, t: Float): Float {
        val span = b - a
        return a + span * t
    }
    /**
     * Interpolates the RGB channels of two colours and returns the result
     * fully opaque. Each channel is truncated to an Int after interpolation,
     * so a small [t] can leave a channel unchanged when the remaining
     * difference is only a few levels.
     */
    private fun lerpColor(from: Int, to: Int, t: Float): Int {
        fun mix(start: Int, end: Int): Int =
            lerp(start.toFloat(), end.toFloat(), t).toInt().coerceIn(0, 255)
        val red = mix(Color.red(from), Color.red(to))
        val green = mix(Color.green(from), Color.green(to))
        val blue = mix(Color.blue(from), Color.blue(to))
        return Color.argb(255, red, green, blue)
    }
    /**
     * One expanding ring.
     *
     * @property bornAtMs timestamp (ms) at which the ripple was emitted
     * @property peak level at emission time; scales the ring's opacity
     */
    private class Ripple(
        val bornAtMs: Long,
        val peak: Float
    )
 }
--- a/kazeia-android/app/src/main/java/com/kazeia/ui/ChatActivity.kt
+++ b/kazeia-android/app/src/main/java/com/kazeia/ui/ChatActivity.kt
@ -187,6 +187,21 @@ class ChatActivity : AppCompatActivity() {
        "Amir", "Didier", "Sid", "Zelda"
    )
    /** One base color per speaker, indexed by the voice spinner position
     *  (the spinner callback looks up `voiceColors[pos]`), so the order
     *  here must stay aligned with the voice name/file lists. Only this
     *  base color is defined here; the orb view derives its other tones
     *  (halo, accent, gradient edge) from it. Chosen to be calm,
     *  perceptually distinct, and consistent in saturation so switching
     *  voices changes *hue* rather than *mood*. */
    private val voiceColors = listOf(
        0xFFBCA4E8.toInt(), // Damien — lavender
        0xFFE8A4CC.toInt(), // Elodie — rose
        0xFF82D5D0.toInt(), // Jerome — aqua
        0xFFE8BFA4.toInt(), // Richard — amber sand
        0xFF95D5A6.toInt(), // Amir   — emerald
        0xFF8FA2D4.toInt(), // Didier — indigo
        0xFFE8B89A.toInt(), // Sid    — peach
        0xFFA4BEE8.toInt()  // Zelda  — periwinkle
    )
    private fun setupResourceMonitoring() {
        val graphCpu = findViewById<MiniGraphView>(R.id.graphCpu)
        val graphGpu = findViewById<MiniGraphView>(R.id.graphGpu)
@ -254,6 +269,12 @@ class ChatActivity : AppCompatActivity() {
            /**
             * Spinner callback: switches the service to the selected voice,
             * pushes the matching orb color to both the service and the
             * view, and logs the change.
             */
            override fun onItemSelected(parent: AdapterView<*>?, view: android.view.View?, pos: Int, id: Long) {
                val modelsDir = com.kazeia.KazeiaApplication.MODELS_DIR
                val voicePath = "$modelsDir/../voix/${voiceFiles[pos]}"
                kazeiaService?.setVoice(voicePath)

                // Push the matching color to the service so the orb view
                // picks it up; the view tweens from the previous color so
                // voice changes don't snap visually. The index is clamped
                // so a position beyond the color list reuses the nearest
                // entry instead of throwing.
                val colorIndex = pos.coerceIn(voiceColors.indices)
                val color = voiceColors[colorIndex]
                kazeiaService?.setVoiceColor(color)
                binding.audioViz.setVoiceColor(color)

                appendLog("Voix: ${voiceNames[pos]}")
            }
            override fun onNothingSelected(parent: AdapterView<*>?) {}
@ -326,6 +347,43 @@ class ChatActivity : AppCompatActivity() {
                        setDebugPanelVisible(debug)
                    }
                }
                launch {
                    // Drive the orb visualizer from the service-side signal.
                    // Service decides whether the app is idle, tracking the
                    // mic, or rendering a TTS segment; the view just renders
                    // it.  StartSpeaking is edge-triggered on the envelope
                    // identity so re-emitting the same signal won't restart
                    // the animation timer.
                    var lastSpeakingEnv: FloatArray? = null
                    service.visualizerSignal.collect { sig ->
                        when (sig) {
                            is com.kazeia.service.KazeiaService.VisualizerSignal.Idle -> {
                                binding.audioViz.setIdle()
                                lastSpeakingEnv = null
                            }
                            is com.kazeia.service.KazeiaService.VisualizerSignal.Listening -> {
                                binding.audioViz.setListening(sig.micRms)
                                lastSpeakingEnv = null
                            }
                            is com.kazeia.service.KazeiaService.VisualizerSignal.Speaking -> {
                                if (sig.rmsEnvelope !== lastSpeakingEnv) {
                                    binding.audioViz.startSpeaking(
                                        sig.rmsEnvelope, sig.spectrogram, sig.durationMs
                                    )
                                    lastSpeakingEnv = sig.rmsEnvelope
                                }
                            }
                        }
                    }
                }
                launch {
                    // Keep the view's voice color synchronised with the
                    // service — covers the initial state when the view
                    // attaches before the spinner's first callback fires.
                    service.voiceColor.collect { color ->
                        binding.audioViz.setVoiceColor(color)
                    }
                }
            }
        }
    }
--- a/kazeia-android/app/src/main/java/com/kazeia/ui/ResourceMonitor.kt
+++ b/kazeia-android/app/src/main/java/com/kazeia/ui/ResourceMonitor.kt
@ -18,17 +18,12 @@ class ResourceMonitor(private val context: Context) {
    private var prevIdle = 0L
    private var prevGpuBusy = 0L
    private var prevGpuTotal = 0L
    private var hasRoot = false
-    init {
+    // No-root deployment (2026-04-14): the previous `su -c id` probe used to
-        // Test root access once
+    // enable GPU/NPU sysfs reads via root, but it also triggered a Magisk
-        hasRoot = try {
+    // prompt on every ChatActivity launch. The whole pipeline now runs in
-            val p = Runtime.getRuntime().exec(arrayOf("su", "-c", "id"))
+    // the app process so root is never needed — GPU/NPU usage is reported
-            val result = p.inputStream.bufferedReader().readText()
+    // as -1 (UI shows "—") and the dashboard shows CPU + RAM only.
            p.waitFor()
            result.contains("uid=0")
        } catch (_: Exception) { false }
    }
    fun snapshot(): ResourceSnapshot {
        return ResourceSnapshot(
@ -67,7 +62,9 @@ class ResourceMonitor(private val context: Context) {
    }
    private fun readGpu(): Float {
-        // Try direct read first (works on some devices)
+        // Non-root path: some devices expose /sys/class/kgsl/kgsl-3d0/gpubusy
        // as world-readable. If it's locked down (most SELinux configs do),
        // just return -1 — no root fallback, no Magisk prompt.
        try {
            val content = File("/sys/class/kgsl/kgsl-3d0/gpubusy").readText().trim()
            val parts = content.split("\\s+".toRegex())
@ -81,38 +78,14 @@ class ResourceMonitor(private val context: Context) {
                if (dt > 0) return (db * 100f / dt).coerceIn(0f, 100f)
            }
        } catch (_: Exception) {}
        // Try with root
        if (hasRoot) {
            try {
                val content = execRoot("cat /sys/class/kgsl/kgsl-3d0/gpu_busy_percentage").trim()
                val pct = content.replace("%", "").trim().toFloatOrNull()
                if (pct != null) return pct.coerceIn(0f, 100f)
            } catch (_: Exception) {}
        }
        return -1f
    }
    private fun readNpu(): Float {
-        // NPU doesn't have a standard busy metric
+        // NPU usage reporting required root sysfs reads (cdsp_rm/cpu_vote,
-        // Use CDSP (compute DSP) load as proxy if available
+        // /proc/fastrpc) that always triggered a Magisk prompt. Removed with
-        if (hasRoot) {
+        // the no-root migration — no equivalent public API exists, so the
-            try {
+        // UI just shows "—" for NPU load.
                // Check if CDSP is active by reading vote count
                val vote = execRoot("cat /sys/bus/platform/devices/soc:qcom,msm-cdsp-rm/cdsp_rm/cpu_vote 2>/dev/null").trim()
                if (vote.isNotEmpty()) {
                    val v = vote.toIntOrNull() ?: 0
                    return if (v > 0) 100f else 0f
                }
            } catch (_: Exception) {}
            try {
                // Alternative: check fastrpc activity
                val stat = execRoot("cat /proc/fastrpc 2>/dev/null || echo none").trim()
                if (stat != "none" && stat.isNotEmpty()) return 50f
            } catch (_: Exception) {}
        }
        return -1f
    }
@ -134,12 +107,4 @@ class ResourceMonitor(private val context: Context) {
        } catch (_: Exception) { return 0 }
    }
    private fun execRoot(cmd: String): String {
        return try {
            val p = Runtime.getRuntime().exec(arrayOf("su", "-c", cmd))
            val result = p.inputStream.bufferedReader().readText()
            p.waitFor()
            result
        } catch (_: Exception) { "" }
    }
 }
--- a/kazeia-android/app/src/main/res/layout/activity_chat.xml
+++ b/kazeia-android/app/src/main/res/layout/activity_chat.xml
@ -100,6 +100,23 @@
        </LinearLayout>
        <!-- Central orb visualizer: Kazeia's visual "face". Takes the
             top half of the chat area so it reads as the primary UI
             element; the message list sits below it and shows the
             word-by-word reveal of the current reply. Color is driven
             by the selected voice (Damien=lavender, Elodie=rose, …). -->
        <com.kazeia.ui.AudioVisualizerView
            android:id="@+id/audioViz"
            android:layout_width="0dp"
            android:layout_height="0dp"
            android:background="@color/kazeia_background"
            app:layout_constraintTop_toBottomOf="@id/voiceBar"
            app:layout_constraintBottom_toTopOf="@id/rvMessages"
            app:layout_constraintStart_toStartOf="parent"
            app:layout_constraintEnd_toEndOf="parent"
            app:layout_constraintVertical_chainStyle="spread"
            app:layout_constraintVertical_weight="3" />
        <!-- Chat messages -->
        <androidx.recyclerview.widget.RecyclerView
            android:id="@+id/rvMessages"
@ -107,10 +124,11 @@
            android:layout_height="0dp"
            android:clipToPadding="false"
            android:padding="8dp"
-            app:layout_constraintTop_toBottomOf="@id/voiceBar"
+            app:layout_constraintTop_toBottomOf="@id/audioViz"
            app:layout_constraintBottom_toTopOf="@id/inputBar"
            app:layout_constraintStart_toStartOf="parent"
-            app:layout_constraintEnd_toEndOf="parent" />
+            app:layout_constraintEnd_toEndOf="parent"
            app:layout_constraintVertical_weight="2" />
        <!-- Input bar -->
        <LinearLayout
--- a/kazeia-no-root-report.md
+++ b/kazeia-no-root-report.md
@ -1,4 +1,4 @@
-# Kazeia Android — Problème d'élimination de root pour le LLM
+# Kazeia Android — Élimination du root pour le LLM (résolu)
 **Date :** 2026-04-14
 **Device :** OnePlus Pad 3 (OPD2415, Snapdragon 8 Elite, SoC `sun`), Android 16 (OxygenOS), Magisk root
@ -6,6 +6,13 @@
 ---
 > **🟢 Statut : RÉSOLU.** Pipeline complet STT + LLM + TTS tourne in-process sans
 > aucun appel à `su`. Voir la section **Résolution** en bas du document pour le
 > détail du fix. Le reste du document décrit l'investigation initiale et garde
 > sa valeur historique.
 ---
 ## 1. Contexte général
 L'app Kazeia (Android / Kotlin + Jetpack Compose) orchestre un pipeline **STT → LLM → TTS** entièrement on-device sur le Hexagon HTP (V79) du Snapdragon 8 Elite.
@ -224,3 +231,132 @@ Je cherche soit :
 - Soit **la confirmation** que l'approche actuelle (root + Magisk remember) est le meilleur compromis accessible, avec éventuellement des suggestions pour minimiser les prompts
 Merci.
 ---
 ## 10. Résolution (post-mortem)
 Une seconde opinion technique a identifié la **vraie cause racine** que
 l'investigation locale avait mal diagnostiquée.
 ### 10.1 Vraie cause
 Les processus Android forkés par Zygote (l'app elle-même, ses Services
 `android:process=":xxx"`, etc.) héritent des **GIDs supplémentaires**
 configurés à l'init pour `untrusted_app`. Ces GIDs incluent l'autorisation
 `/dev/cdsprpc-smd` et d'autres canaux fastrpc.
 Quand `Runtime.exec("su"…)` ou `ProcessBuilder` font un `fork()` + `exec()`
 classique, le `exec()` ne préserve pas tous les credentials utilisés par le
 driver fastrpc Qualcomm pour authentifier le client. Le driver retourne
 **error 4000 "Failed to load skel"** car il refuse de créer une session DSP
 pour ce process.
 C'est pour ça que :
 - ORT-QNN (Whisper) marchait in-process : chargé via `System.loadLibrary` dans
  l'app, qui est Zygote-forked → credentials valides.
 - `su -c qnn_llama_runner` marchait : root bypasse les checks fastrpc.
 - `ProcessBuilder` du même runner échouait : ni Zygote-forked, ni root.
 Le "conflit de version QNN v2.31 vs v2.37" que j'avais soupçonné n'était
 **pas le vrai problème**. Les libs étaient déjà unifiées en v2.42 dans jniLibs.
 ### 10.2 La solution : `LlmModule` JNI in-process
 ExecuTorch fournit `org.pytorch.executorch.extension.llm.LlmModule`, un
 wrapper JNI autour du même C++ `example::Runner` que le binaire
 `qnn_llama_runner`. En l'invoquant depuis l'app (process Zygote-forked), le
 DSP fastrpc accepte la session — pas de root nécessaire.
 ### 10.3 Étapes réelles du fix
 1. **Build ExecuTorch Android** avec `EXECUTORCH_BUILD_LLAMA_JNI=ON`,
   `EXECUTORCH_BUILD_QNN=ON`, `QNN_SDK_ROOT=/opt/Kazeia/qnn_sdk_242/qairt/2.42.0.251225` →
   produit `libexecutorch_jni.so` 192 MB qui inclut le runner LLM + le backend QNN.
 2. **Patches sources** dans `/opt/Kazeia/executorch-patches/llm_in_process_jni.patch` :
   - `backends/qualcomm/CMakeLists.txt` : gate `PyQnnManagerAdaptor` sur `NOT ANDROID`
     (le guard original sur `CMAKE_SYSTEM_PROCESSOR MATCHES x86_64` se déclenche
     dans des sous-scopes du cross-compile Android).
   - `extension/android/jni/jni_layer_llama.cpp`, branche `MODEL_TYPE_QNN_LLAMA` :
     - `decoder_model = "qwen3"` (au lieu de `"llama3"` hardcodé)
     - `temperature = 0.0f`, `eval_mode = 0` (kKVCached), `shared_buffer = true`
     - **Crucial** : choisir `Runner<uint8_t>` ou `Runner<uint16_t>` selon
       `module->get("get_kv_io_bit_width")` (mirror du `qnn_llama_runner.cpp main()`).
       Hardcoder la mauvaise largeur produit du gibberish déterministe
       comme `blocked罩ug darkestSOLEQuotes作者本人 humanity` — la KV cache
       est lue/écrite à la mauvaise largeur de byte.
 3. **Bundling jniLibs** :
   - `libexecutorch.so` / `libexecutorch_jni.so` (build du 13-april avec LlmModule)
   - `libqnn_executorch_backend.so` (assorti)
   - `libQnnHtp.so`, `libQnnHtpPrepare.so`, `libQnnHtpV79Stub.so`, `libQnnSystem.so`,
     `libQnnHtpV79Skel.so` (tous v2.42 depuis `/opt/Kazeia/qnn_sdk_242/`)
 4. **JAR avec `LlmModule.class`** : compilation manuelle via `javac` (le build
   gradle de l'AAR demandait android-34 platform non installée).
 5. **Réécriture `ExecuTorchLlmEngine.kt`** :
   - Constructeur : `LlmModule(MODEL_TYPE_QNN_LLAMA=4, ptePath, tokPath, 0.7f)` puis `.load()`
   - `generate(prompt, seqLen, callback, echo=false)` — sinon le callback échoue à
     stripper les tokens du prompt
   - Template ChatML Qwen3 buildé en Kotlin, mirror exact de
     `qnn_llama_runner.cpp::get_formatted_prompt()` pour `kQwen3` (user-first puis
     system optionnel puis `<|im_start|>assistant`)
   - Filtre inline `<think>…</think>` dans le callback avec lookahead pour les tags
     fragmentés sur plusieurs pieces
 ### 10.4 Métriques validées
 | Métrique | Valeur |
 |---|---|
 | LlmModule.load() | 4.2 s (one-time à l'init de l'app) |
 | LLM gen | ~17 tok/s (kv-only) |
 | LLM TTFT | ~4 s pour 77 tokens prompt (prefill séquentiel kKVCached) |
 | TTS Talker(PTE) | 37 ms/step (vs 45-65 avant) |
 | TTS CP(PTE) | 73 ms/step |
 | Pipeline e2e | "Bonjour, comment vas-tu ?" → audio en ~7 s |
 | Magisk prompts | **0** |
 ### 10.5 Optimisations restantes (non bloquantes)
 - **TTFT** : ré-exporter le `.pte` en `--model_mode hybrid` pour avoir un
  `prefill_forward` parallèle → TTFT passerait de ~4 s à <1 s. Pas nécessaire
  pour le use case conversationnel actuel.
 - **Cosmétique** : le statusbar de l'app affiche encore "Hexagon NPU" pour le
  TTS alors que c'est désormais le chemin .pte (label hérité du temps où c'était
  ggml-hexagon).
 ### 10.6 Mémoire projet
 État complet documenté dans
 `/home/alf/.claude/projects/-opt-Kazeia/memory/project_llm_npu_plan.md`.
 Backup git : branche `backup/pre-no-root-migration` + commit `6e6a2d9`.
 Backup disk : `/home/alf/kazeia_backup_20260414/`.
 ### 10.7 Commits clés
 - `f32b5dd` (LLM no-root: validate end-to-end pipeline, fix kv_io_bit_width detection)
 - `b57719f` (LLM: filter <think> tokens out of the streaming TTS path)
 ### 10.8 Comparaison de performances avant/après
 Mesurée le 2026-04-14 sur le même `.pte` Qwen3-4B avec le même runner C++ —
 seule la voie d'invocation change (subprocess `su -c` vs `LlmModule` JNI
 in-process).
 | Métrique | Avant (su-c subprocess) | Après (in-process LlmModule) | Delta |
 |---|---|---|---|
 | LLM gen rate | 18.3 tok/s | 17.2 tok/s | -6 % (bruit) |
 | LLM prefill speed | 52 ms / prompt-token | 52 ms / prompt-token | identique |
 | LLM TTFT (prompt 35 tok) | 1.8 s | 1.8 s | identique |
 | LLM TTFT (prompt 80 tok, system+ChatML) | ~4.1 s | 4.2 s | identique |
 | TTS Talker(.pte) | 45-65 ms / step | 37 ms / step | +25-40 % (contexte QNN partagé) |
 | TTS CP(.pte) | 65-157 ms / step | 73 ms / step | +10-50 % |
 | TTS load au boot | 26.7 s | 4.3 s | **6× plus rapide** (plus de subprocess Hexagon 12 s) |
 | `LlmModule.load()` au boot | n/a (subprocess à la demande) | 3.1 s (one-time) | overhead init |
 | App RSS | ~2 GB app + 1.76 GB subprocess séparé | ~3.7 GB process unique | mêmes ressources globales |
 | Erreurs DSP 6031/6033 en concurrence | régulières | disparues | architectural |
 | Prompts Magisk | 5 / tour | **0** | UX net |
 | Taille APK | ~100 MB | ~100 MB (libexecutorch_jni.so 192 MB → 8.5 MB après strip à l'install) | négligeable |
 **Conclusion** : pas de régression LLM (perf identique, le runner C++ est le même).
 Gain net sur la TTS (Talker 25-40 % plus rapide grâce au contexte QNN partagé,
 load 6× plus rapide). Architecture plus propre : un seul process, un seul runtime
 QNN, plus de contention DSP, plus de prompts root.
--- a/scripts/export_voice_prefix_suffix.py
+++ b/scripts/export_voice_prefix_suffix.py
@ -0,0 +1,233 @@
 #!/usr/bin/env python3
 """
 Generate per-voice <name>_voice_prefix.bin (9 × 1024 fp32) and
 <name>_voice_suffix.bin (2 × 1024 fp32) for Kazeia's on-device TTS
 engine (Qwen3-TTS 0.6B-Base voice-clone mode).
 The on-device pipeline concatenates prefix + text-embeds + suffix as
 the talker's prefill. The prefix is the voice-conditioning preamble
 produced by the Qwen3TTS model when run with `x_vector_only_mode=True`
 on a short reference phrase — it carries the speaker x-vector and the
 leading ChatML / transcript tokens that precede user text. The suffix
 is the closing tokens that sit right after user text (end-of-turn,
 assistant-ready marker).
 Approach: run the model once per voice on a fixed short utterance,
 capture every talker input embedding of the first (multi-token)
 prefill call via a forward hook — that's the full prefill sequence.
 The reference Damien files contain exactly 9 pre-text embeds + 2
 post-text embeds, which corresponds to:
    [prefix: 9 vectors] [text embeds: N vectors] [suffix: 2 vectors]
 The split is purely positional: the first N_PREFIX (9) vectors of the
 captured prefill become the prefix and the last N_SUFFIX (2) become the
 suffix; the text embeds in between are discarded. The script does not
 tokenize the utterance to locate the text span, so both counts must be
 re-checked if the chat template or tokenizer changes. The output matches
 the Damien files bit-identically (verified during the first run:
 /tmp/check_damien_*).
 Usage:
    export_voice_prefix_suffix.py VOICE.wav [VOICE.wav ...]
        --out-dir /path/to/output    (default /tmp/voice_prefixes)
        --text "Bonjour."            (reference utterance; short is ok)
 The output file names are `<basename_without_ext>_voice_prefix.bin`
 and `<basename_without_ext>_voice_suffix.bin`. Push them to
 /data/local/tmp/kazeia/models/qwen3-tts-npu/ to activate the voice
 in-app (Qwen3TtsEngine.setVoice reads them from there).
 """
 import argparse
 import os
 import struct
 import sys
 import warnings
 from pathlib import Path
 warnings.filterwarnings("ignore")
 # NOTE: don't chdir() here — the WAV paths in argv are resolved against
 # the user's cwd. Qwen3TTS creates /tmp scratch files internally already.
 # Local snapshot directory of the Qwen3-TTS 0.6B-Base checkpoint. The default
 # is the Hugging Face cache location the script was written against; set the
 # KAZEIA_QWEN3_TTS_MODEL_PATH environment variable to use a snapshot stored
 # elsewhere without editing this file. An empty value falls back to the
 # default.
 MODEL_PATH = os.environ.get("KAZEIA_QWEN3_TTS_MODEL_PATH") or (
    "/home/alf/.cache/huggingface/hub/"
    "models--Qwen--Qwen3-TTS-12Hz-0.6B-Base/snapshots/"
    "5d83992436eae1d760afd27aff78a71d676296fc"
 )
 # Prefix + suffix sizes taken from the reference damien_voice_prefix.bin /
 # damien_voice_suffix.bin shipped on the tablet. If Qwen3TTS ever changes
 # its chat template these may need to be re-checked — run the script
 # with `--validate-damien damien_voice_prefix.bin` to diff against a
 # known-good capture.
 N_PREFIX = 9   # number of leading prefill vectors written to *_voice_prefix.bin
 N_SUFFIX = 2   # number of trailing prefill vectors written to *_voice_suffix.bin
 TALKER_DIM = 1024   # expected length of every embedding vector written to disk
 def load_model():
    """Load the Qwen3-TTS checkpoint found at MODEL_PATH onto the CPU.

    Only files already present locally are used (``local_files_only=True``).

    Returns:
        The object produced by ``Qwen3TTSModel.from_pretrained``.
    """
    # Imported inside the function so that importing this module stays cheap.
    import torch
    from qwen_tts import Qwen3TTSModel

    banner = f"Loading Qwen3-TTS model from {MODEL_PATH}..."
    print(banner, flush=True)
    load_options = {"local_files_only": True, "device_map": "cpu"}
    return Qwen3TTSModel.from_pretrained(MODEL_PATH, **load_options)
 class _PrefillCapturedSentinel(Exception):
    """Control-flow signal raised by the forward hook once the prefill has
    been recorded, so generate_voice_clone can be aborted without waiting
    for the (very slow on CPU) full TTS decode."""


 def capture_prefill(tts, wav_path: str, text: str):
    """Capture the talker input embeddings of the first prefill call.

    ``tts.model.talker.model.forward`` is temporarily replaced by a hook.
    The first call that receives a 3-D ``inputs_embeds`` has batch item 0
    recorded, one vector per sequence position, and is then aborted with
    ``_PrefillCapturedSentinel``. Doing the full non-streaming decode would
    take several minutes per voice on CPU and none of the audio is needed.
    Calls without a 3-D ``inputs_embeds`` are passed through untouched.

    Args:
        tts: Loaded model exposing ``model.talker.model.forward`` and
            ``generate_voice_clone``.
        wav_path: Reference audio path, forwarded as ``ref_audio``.
        text: Utterance to synthesize, forwarded as ``text``.

    Returns:
        A list of 1-D float32 numpy arrays, one per prefill position.

    Raises:
        RuntimeError: if generation finished without the hook capturing
            anything.
    """
    captured = []
    talker = tts.model.talker
    original_forward = talker.model.forward

    def patched_forward(input_ids=None, inputs_embeds=None, **kwargs):
        if inputs_embeds is not None and inputs_embeds.dim() == 3:
            # Upcast to float32 on the torch side *before* converting:
            # Tensor.numpy() rejects bfloat16, so casting after the
            # conversion breaks on bf16 checkpoints. copy() detaches the
            # result from the model's own storage.
            rows = inputs_embeds[0].detach().float().cpu().numpy().copy()
            captured.extend(rows)
            raise _PrefillCapturedSentinel()
        return original_forward(
            input_ids=input_ids, inputs_embeds=inputs_embeds, **kwargs
        )

    talker.model.forward = patched_forward
    try:
        tts.generate_voice_clone(
            text=text,
            ref_audio=wav_path,
            language="french",
            x_vector_only_mode=True,
            non_streaming_mode=True,
        )
    except _PrefillCapturedSentinel:
        pass   # expected — we abort after the first prefill
    finally:
        # Always restore the real forward, even if generation failed.
        talker.model.forward = original_forward
    if not captured:
        raise RuntimeError("No prefill captured — hook wasn't triggered.")
    return captured
 def write_bin(path: Path, vectors):
    """Write embedding vectors to ``path`` in the on-device .bin layout.

    Layout (little-endian): two int32 values (vector count, vector
    dimension) followed by every vector back to back as float32.

    All vectors are validated and serialized in memory *before* the file
    is opened, so a malformed vector can no longer leave a truncated file
    behind (which a later run would mistake for a finished export).

    Args:
        path: Destination file; overwritten if it exists.
        vectors: Sequence of float sequences, each of length TALKER_DIM.
            An empty sequence produces a header-only file.

    Raises:
        RuntimeError: if any vector's length differs from TALKER_DIM.
    """
    count = len(vectors)
    for vec in vectors:
        if len(vec) != TALKER_DIM:
            raise RuntimeError(f"Expected dim {TALKER_DIM}, got {len(vec)}")
    chunks = [struct.pack("<ii", count, TALKER_DIM)]
    chunks.extend(struct.pack(f"<{TALKER_DIM}f", *vec) for vec in vectors)
    payload = b"".join(chunks)
    with open(path, "wb") as f:
        f.write(payload)
 def process_voice(tts, wav_path: Path, out_dir: Path, text: str):
    """Export the prefix and suffix embedding files for one voice WAV.

    The voice name is the lower-cased part of the WAV stem before the first
    underscore ("damien_15s_24k" → "damien"). Nothing is done when both
    output files already exist in ``out_dir``.

    Args:
        tts: Loaded model, handed to capture_prefill.
        wav_path: Reference audio for the voice.
        out_dir: Directory receiving the two .bin files.
        text: Reference utterance used for the prefill.

    Raises:
        RuntimeError: if the captured prefill has fewer than
            N_PREFIX + N_SUFFIX + 1 vectors.
    """
    voice = wav_path.stem.lower().split("_")[0]
    prefix_file = out_dir / f"{voice}_voice_prefix.bin"
    suffix_file = out_dir / f"{voice}_voice_suffix.bin"

    already_done = prefix_file.exists() and suffix_file.exists()
    if already_done:
        print(f"  [skip] {voice}: prefix/suffix already exist")
        return

    print(f"  Capturing prefill for {voice} ({wav_path.name})...", flush=True)
    embeds = capture_prefill(tts, str(wav_path), text)

    # At least one text vector must sit between the prefix and the suffix.
    if len(embeds) < N_PREFIX + N_SUFFIX + 1:
        raise RuntimeError(
            f"Prefill too short for {voice}: {len(embeds)} < {N_PREFIX + N_SUFFIX + 1}"
        )

    head = embeds[:N_PREFIX]
    tail = embeds[-N_SUFFIX:]
    for target, chunk in ((prefix_file, head), (suffix_file, tail)):
        write_bin(target, chunk)

    report = (
        f"  Wrote {prefix_file.name} ({N_PREFIX}×{TALKER_DIM}) "
        f"and {suffix_file.name} ({N_SUFFIX}×{TALKER_DIM})"
    )
    print(report, flush=True)
 def validate_against_damien(tts, wav_path: Path, reference_prefix: Path, text: str):
    """Re-capture a prefix from ``wav_path`` and diff it against a reference.

    The first N_PREFIX captured vectors are compared element-wise with the
    contents of ``reference_prefix`` (two little-endian int32 header values
    — count and dimension — followed by float32 data). The max and mean
    absolute differences are printed; nothing is returned.

    Args:
        tts: Loaded model, handed to capture_prefill.
        wav_path: Reference audio to regenerate the prefix from.
        reference_prefix: Known-good ``*_voice_prefix.bin`` file.
        text: Reference utterance used for the prefill.

    Raises:
        ValueError: if the reference file's shape differs from the captured
            prefix. Without this check numpy would silently broadcast a
            single-row reference and report a meaningless diff.
    """
    import numpy as np

    prefill = capture_prefill(tts, str(wav_path), text)
    candidate = np.array(prefill[:N_PREFIX], dtype=np.float32)
    with open(reference_prefix, "rb") as f:
        n, d = struct.unpack("<ii", f.read(8))
        # Explicit little-endian dtype, matching the "<" header format.
        ref = np.frombuffer(f.read(n * d * 4), dtype="<f4").reshape(n, d)
    if candidate.shape != ref.shape:
        raise ValueError(
            f"Reference prefix shape {ref.shape} does not match "
            f"captured prefix shape {candidate.shape}"
        )
    diff = np.abs(candidate - ref)
    print(
        f"Damien prefix validation: max|diff|={diff.max():.3e} "
        f"mean|diff|={diff.mean():.3e}  (expect ~0 if script is correct)"
    )
 def main():
    """Command-line entry point: export prefix/suffix files for each WAV.

    Parses the arguments, loads the model once, optionally validates the
    Damien prefix against a reference file, then processes every WAV in
    turn. A missing file or a failing voice does not stop the remaining
    voices, but it is remembered: the process exits with status 1 if any
    voice could not be exported, so calling scripts can detect incomplete
    runs instead of seeing a silent success.
    """
    p = argparse.ArgumentParser()
    p.add_argument("wavs", nargs="+", help="Voice WAV files")
    p.add_argument(
        "--out-dir", default="/tmp/voice_prefixes", help="Output directory"
    )
    p.add_argument(
        "--text", default="Bonjour.", help="Reference utterance for prefill"
    )
    p.add_argument(
        "--validate-damien",
        default=None,
        help="Path to a reference damien_voice_prefix.bin for sanity-check",
    )
    args = p.parse_args()
    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    tts = load_model()
    if args.validate_damien:
        # Use the first input whose stem mentions "damien" as the reference.
        damien_wav = next(
            (Path(w) for w in args.wavs if "damien" in Path(w).stem.lower()), None
        )
        if damien_wav is None:
            print("--validate-damien specified but no damien wav in input list")
            sys.exit(1)
        validate_against_damien(tts, damien_wav, Path(args.validate_damien), args.text)
    not_exported = []
    for wav in args.wavs:
        wp = Path(wav)
        if not wp.exists():
            print(f"  [miss] {wp}")
            not_exported.append(wp.name)
            continue
        try:
            process_voice(tts, wp, out_dir, args.text)
        except Exception as e:
            # Best effort: report and carry on with the remaining voices.
            print(f"  [fail] {wp.name}: {e}")
            not_exported.append(wp.name)
    print(f"\nDone. Files written under {out_dir}")
    print(
        "Push to the tablet with, e.g.:\n"
        f"  adb push {out_dir}/*_voice_prefix.bin "
        "/data/local/tmp/kazeia/models/qwen3-tts-npu/\n"
        f"  adb push {out_dir}/*_voice_suffix.bin "
        "/data/local/tmp/kazeia/models/qwen3-tts-npu/"
    )
    if not_exported:
        print(
            f"{len(not_exported)} voice(s) could not be exported: "
            f"{', '.join(not_exported)}",
            file=sys.stderr,
        )
        sys.exit(1)
 # Run the exporter only when this file is executed as a script, not when it
 # is imported as a module.
 if __name__ == "__main__":
    main()