Compare commits

..

No commits in common. "main" and "backup/pre-no-root-migration" have entirely different histories.

13 changed files with 231 additions and 2282 deletions

View File

@ -1,72 +0,0 @@
diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt
index e93731e..4951e1d 100644
--- a/backends/qualcomm/CMakeLists.txt
+++ b/backends/qualcomm/CMakeLists.txt
@@ -308,8 +308,8 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES Hexagon)
)
endif()
-# QNN pybind
-if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
+# QNN pybind — host Python bindings, not for Android cross-compile
+if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64" AND NOT ANDROID)
add_subdirectory(
${EXECUTORCH_SOURCE_DIR}/third-party/pybind11
${CMAKE_CURRENT_BINARY_DIR}/pybind11
diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
index 45f2414..ae3d79f 100644
--- a/extension/android/jni/jni_layer_llama.cpp
+++ b/extension/android/jni/jni_layer_llama.cpp
@@ -171,14 +171,44 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
model_path->toStdString().c_str(),
data_files_vector,
executorch::extension::Module::LoadMode::MmapUseMlockIgnoreErrors);
- std::string decoder_model = "llama3"; // use llama3 for now
- runner_ = std::make_unique<example::Runner<uint16_t>>( // QNN runner
- std::move(module),
- decoder_model.c_str(),
- model_path->toStdString().c_str(),
- tokenizer_path->toStdString().c_str(),
- "",
- "");
+ std::string decoder_model = "qwen3"; // Kazeia: our .pte was exported with --decoder_model qwen3-4b
+
+ // Mirror qnn_llama_runner.cpp main(): pick the Runner<T> template based
+ // on the model's get_kv_io_bit_width metadata. The 16-bit KV I/O models
+ // were introduced after the 8-bit ones, and using the wrong T treats
+ // KV-cache bytes as the wrong width → garbage logits → gibberish output.
+ example::KvBitWidth kv_bitwidth = example::KvBitWidth::kWidth8;
+ if (module->method_names()->count("get_kv_io_bit_width") > 0) {
+ kv_bitwidth = static_cast<example::KvBitWidth>(
+ module->get("get_kv_io_bit_width").get().toScalar().to<int64_t>());
+ }
+ // Auto-detect eval_mode: kv-only (0) if the .pte only carries
+ // kv_forward, hybrid (1) if it also has prefill_forward (which lets the
+ // runner batch the prompt prefill — TTFT drops from ~52 ms/token to
+ // sub-ms after the one-shot prefill graph). Same JNI binary works with
+ // both export modes, no code change needed when the .pte is upgraded.
+ int eval_mode = 0;
+ if (module->method_names()->count("prefill_forward") > 0) {
+ eval_mode = 1; // EvalMode::kHybrid
+ }
+ auto make_runner = [&](auto sample) -> std::unique_ptr<llm::IRunner> {
+ using T = decltype(sample);
+ return std::make_unique<example::Runner<T>>(
+ std::move(module),
+ decoder_model.c_str(),
+ model_path->toStdString().c_str(),
+ tokenizer_path->toStdString().c_str(),
+ /* performance_output_path */ "",
+ /* dump_logits_path */ "",
+ /* temperature */ 0.0f, // greedy
+ eval_mode,
+ /* shared_buffer */ true);
+ };
+ if (kv_bitwidth == example::KvBitWidth::kWidth16) {
+ runner_ = make_runner(uint16_t{0});
+ } else {
+ runner_ = make_runner(uint8_t{0});
+ }
model_type_category_ = MODEL_TYPE_CATEGORY_LLM;
#endif
#if defined(EXECUTORCH_BUILD_MEDIATEK)

View File

@ -1,5 +1,5 @@
diff --git a/examples/qualcomm/oss_scripts/llama/__init__.py b/examples/qualcomm/oss_scripts/llama/__init__.py
index 963db6e..9ccfdd0 100644
index 963db6e..953dc4c 100644
--- a/examples/qualcomm/oss_scripts/llama/__init__.py
+++ b/examples/qualcomm/oss_scripts/llama/__init__.py
@@ -25,9 +25,14 @@ from executorch.examples.models.granite import (
@ -20,7 +20,7 @@ index 963db6e..9ccfdd0 100644
from executorch.examples.models.qwen2_5 import (
convert_weights as convert_qwen2_5_weights,
)
@@ -479,6 +484,37 @@ class Qwen3_1_7B(LLMModelConfig):
@@ -479,6 +484,34 @@ class Qwen3_1_7B(LLMModelConfig):
quant_recipe = Qwen3_1_7BQuantRecipe
@ -40,13 +40,10 @@ index 963db6e..9ccfdd0 100644
+ convert_weights = convert_qwen3_weights
+ transform_weight = False
+ instruct_model = True
+ # num_sharding=1 for hybrid mode: sharding=2 produces a multi-context
+ # .pte (2 graphs × 2 shards = 4 contexts) that the LlmModule load path
+ # can't restore (error 5010 "Context group 1 does not exist"). With
+ # sharding=1 the hybrid export needs ~46 GB RAM peak — the 192 GB swap
+ # on /swapfile handles this; compile takes ~80 min wall but completes
+ # cleanly. Single-context .pte loads fine through the JNI runner.
+ num_sharding = 1
+ # Bumped to 2 to halve peak host RAM during QNN compile (4B at sharding=1
+ # OOMed on a 62 GB box, peak anon-rss 46 GB). At sharding=2 each shard
+ # compile fits comfortably; runner stitches them at load time.
+ num_sharding = 2
+ masked_softmax = True
+ seq_mse_candidates = 0
+ r1 = False

View File

@ -5,7 +5,6 @@
<uses-permission android:name="android.permission.RECORD_AUDIO" />
<uses-permission android:name="android.permission.FOREGROUND_SERVICE" />
<uses-permission android:name="android.permission.FOREGROUND_SERVICE_MICROPHONE" />
<uses-permission android:name="android.permission.FOREGROUND_SERVICE_MEDIA_PLAYBACK" />
<uses-permission android:name="android.permission.FOREGROUND_SERVICE_SPECIAL_USE" />
<uses-permission android:name="android.permission.POST_NOTIFICATIONS" />
<uses-permission android:name="android.permission.READ_EXTERNAL_STORAGE" />
@ -51,7 +50,7 @@
<service
android:name=".service.KazeiaService"
android:foregroundServiceType="microphone|mediaPlayback|specialUse"
android:foregroundServiceType="microphone|specialUse"
android:exported="true">
<property
android:name="android.app.PROPERTY_SPECIAL_USE_FGS_SUBTYPE"

View File

@ -1,49 +1,43 @@
package com.kazeia.llm
import android.content.Context
import android.util.Log
import com.kazeia.core.*
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.withContext
import java.io.File
import org.pytorch.executorch.extension.llm.LlmCallback
import org.pytorch.executorch.extension.llm.LlmModule
/**
* LLM Engine using ExecuTorch LlmModule in-process — **no root required**.
*
* Runs Qwen3-4B via `org.pytorch.executorch.extension.llm.LlmModule`, which
* wraps the same C++ TextLlmRunner as the standalone qnn_llama_runner binary
* but inside the app's own process. The QNN HTP backend works because the
* DSP fastrpc service accepts the Zygote-forked app process (unlike
* ProcessBuilder-spawned subprocesses which lose supplementary GIDs on exec
* and get rejected by the fastrpc credential checks).
*
* Model + tokenizer live in /data/local/tmp/kazeia-et/ (readable by the app
* on this device's permissive SELinux policy). libexecutorch.so + QNN libs
* are bundled in jniLibs.
* LLM Engine using ExecuTorch + QNN backend via subprocess.
* Calls qnn_llama_runner binary with root access (Magisk su).
*
* Current tablet config: Qwen3-4B KV-mode, ~18-22 tok/s on Hexagon V79
* (Snapdragon 8 Elite), TTFT 0.9 s, RSS 1.76 GB.
*
* Why root: the runner binary plus its QNN v2.42 .so deps live in
* /data/local/tmp/kazeia-et/ (shell_data_file SELinux context). Untrusted
* apps can't exec binaries from there. The Hexagon DSP fastrpc service also
* refuses to load the v2.42 Skel from the app's own files dir — only from
* nativeLibraryDir — but that dir already holds the TTS stack's v2.31 Skel
* (same filename, different version, can't coexist). Rebuilding everything
* against one QNN version would eliminate the conflict, but would require
* re-exporting the TTS .pte with the new runtime (tooling currently broken
* on the flatc schema/dataclass mismatch in the qnn_venv).
*/
class ExecuTorchLlmEngine(
private val context: Context,
private val onLog: ((String) -> Unit)? = null
) : LlmEngine {
companion object {
private const val TAG = "ExecuTorchLLM"
// /no_think disables Qwen3's chain-of-thought block. Compact wording
// keeps prefill cost low: this prompt is ~25 tokens vs ~55 in the
// earlier verbose version → saves ~1.5 s of TTFT in kv-only mode.
private const val SYSTEM_PROMPT = "Tu es Kazeia, à l'écoute en français. Réponds en 1-2 phrases courtes, sans raisonnement. /no_think"
private const val MODEL_DIR = "/data/local/tmp/kazeia-et"
private const val MODEL_PATH = "$MODEL_DIR/hybrid_llama_qnn.pte"
private const val TOKENIZER_PATH = "$MODEL_DIR/tokenizer.json"
private const val RUNNER_DIR = "/data/local/tmp/kazeia-et"
// /no_think disables Qwen3's chain-of-thought block so the full token
// budget goes to the actual answer (without it, 120-200 tokens get
// consumed by <think>…</think> leaving nothing to speak).
// Short-response directive keeps TTS latency manageable — each sentence
// costs ~3-5 s on the .pte path, so 1-2 sentences is the sweet spot.
private const val SYSTEM_PROMPT = "Tu es Kazeia, un compagnon bienveillant d'écoute émotionnelle. Réponds toujours en français, en 1 ou 2 phrases courtes (40 mots maximum). Pas de raisonnement, donne directement la réponse. /no_think"
}
private var llmModule: LlmModule? = null
private var modelName = ""
private var loaded = false
@ -54,152 +48,77 @@ class ExecuTorchLlmEngine(
override suspend fun load(modelPath: String, config: LlmConfig) {
withContext(Dispatchers.IO) {
if (!File(MODEL_PATH).exists()) {
nlog("ERROR: model not found at $MODEL_PATH")
return@withContext
}
if (!File(TOKENIZER_PATH).exists()) {
nlog("ERROR: tokenizer not found at $TOKENIZER_PATH")
val check = execRoot("ls $RUNNER_DIR/qnn_llama_runner $RUNNER_DIR/hybrid_llama_qnn.pte $RUNNER_DIR/tokenizer.json 2>&1")
if (check.contains("No such file")) {
nlog("ERROR: runner or model not found in $RUNNER_DIR")
return@withContext
}
try {
val t0 = System.currentTimeMillis()
// MODEL_TYPE_QNN_LLAMA=4 selects the Qualcomm runner path in
// jni_layer_llama.cpp, which uses example::Runner (same code
// as the qnn_llama_runner binary) instead of the generic
// TextLLMRunner. Our .pte was exported with
// --decoder_model qwen3-4b which requires this path.
val MODEL_TYPE_QNN_LLAMA = 4
llmModule = LlmModule(MODEL_TYPE_QNN_LLAMA, MODEL_PATH, TOKENIZER_PATH, 0.7f)
nlog("LlmModule instantiated in ${System.currentTimeMillis() - t0}ms")
deployRunnerScript()
// Load the PTE into QNN HTP (calls the native load()).
val loadResult = llmModule!!.load()
if (loadResult != 0) {
nlog("ERROR: LlmModule.load() returned $loadResult")
llmModule = null
return@withContext
}
nlog("LlmModule loaded in ${System.currentTimeMillis() - t0}ms total")
writeFileRoot("$RUNNER_DIR/outputs/prompt.b64",
android.util.Base64.encodeToString("Bonjour".toByteArray(), android.util.Base64.NO_WRAP))
if (SYSTEM_PROMPT.isNotEmpty()) {
writeFileRoot("$RUNNER_DIR/outputs/system.b64",
android.util.Base64.encodeToString(SYSTEM_PROMPT.toByteArray(), android.util.Base64.NO_WRAP))
} else {
execRoot("rm -f $RUNNER_DIR/outputs/system.b64")
}
val test = execRoot("sh $RUNNER_DIR/run_llm.sh 0.0 80 2>&1")
if (test.contains("Generated Tokens") || test.contains("Rate:")) {
loaded = true
modelName = "Qwen3-4B LlmModule"
val rateMatch = Regex("Generated \\d+ tokens:.*Rate:\\s+([\\d.]+)").find(test)
val rate = rateMatch?.groupValues?.get(1) ?: "?"
modelName = "Qwen3 (${rate} tok/s NPU)"
nlog("Ready: $modelName")
} catch (e: Throwable) {
nlog("ERROR: LlmModule init failed: ${e.javaClass.simpleName}: ${e.message}")
llmModule = null
} else {
nlog("ERROR: test failed: ${test.takeLast(200)}")
}
}
}
override fun isLoaded(): Boolean = loaded && llmModule != null
override fun isLoaded(): Boolean = loaded
override suspend fun generate(
prompt: String,
params: SamplingParams,
onToken: ((String) -> Boolean)?
): GenerationResult = withContext(Dispatchers.IO) {
val mod = llmModule ?: throw IllegalStateException("Model not loaded")
if (!loaded) throw IllegalStateException("Model not loaded")
val startTime = System.currentTimeMillis()
val fullPrompt = buildChatTemplate(prompt)
writeFileRoot("$RUNNER_DIR/outputs/prompt.b64",
android.util.Base64.encodeToString(prompt.toByteArray(), android.util.Base64.NO_WRAP))
if (SYSTEM_PROMPT.isNotEmpty()) {
writeFileRoot("$RUNNER_DIR/outputs/system.b64",
android.util.Base64.encodeToString(SYSTEM_PROMPT.toByteArray(), android.util.Base64.NO_WRAP))
} else {
execRoot("rm -f $RUNNER_DIR/outputs/system.b64")
}
nlog("Prompt: '${prompt.take(80)}'")
val responseBuilder = StringBuilder()
var firstTokenMs = -1L
// Track whether we're inside a <think>…</think> block so the upstream
// SentenceStreamer / TTS doesn't get fed reasoning tokens. Even with
// /no_think in the system prompt Qwen3 still emits empty <think></think>
// wrappers for ~3 tokens before the real answer.
var inThink = false
val tokenScan = StringBuilder() // small lookahead to spot tag boundaries
// Singleton special tokens that should never reach the TTS streamer
// (they leak when the model wraps its reply or signals end-of-turn).
val stripTokens = listOf("<|im_start|>", "<|im_end|>", "<|endoftext|>")
val maxTagLen = listOf("<think>", "</think>", "<|im_start|>", "<|im_end|>", "<|endoftext|>")
.maxOf { it.length }
val cb = object : LlmCallback {
override fun onResult(result: String) {
if (firstTokenMs < 0) firstTokenMs = System.currentTimeMillis() - startTime
responseBuilder.append(result)
// Forward to caller only outside <think> blocks, and strip
// singleton special tokens. We accumulate a tiny lookahead buffer
// so tag tokens that arrive split ("<thi", "nk>") still match.
tokenScan.append(result)
while (true) {
if (!inThink) {
val open = tokenScan.indexOf("<think>")
if (open < 0) {
// No <think> open pending — strip any singleton tokens
// that fully landed in the buffer, then flush prose
// up to a safe point preserving lookahead.
for (tok in stripTokens) {
var idx = tokenScan.indexOf(tok)
while (idx >= 0) {
tokenScan.delete(idx, idx + tok.length)
idx = tokenScan.indexOf(tok)
}
}
val safe = tokenScan.length - maxTagLen
if (safe > 0) {
onToken?.invoke(tokenScan.substring(0, safe))
tokenScan.delete(0, safe)
}
break
}
// Flush the prose before the <think> tag, then enter think mode.
if (open > 0) onToken?.invoke(tokenScan.substring(0, open))
tokenScan.delete(0, open + "<think>".length)
inThink = true
} else {
val close = tokenScan.indexOf("</think>")
if (close < 0) {
// Drop all buffered chars except a small tail in case
// the closing tag is split across tokens.
val keep = "</think>".length - 1
if (tokenScan.length > keep) tokenScan.delete(0, tokenScan.length - keep)
break
}
tokenScan.delete(0, close + "</think>".length)
inThink = false
}
}
}
override fun onStats(stats: String) {
nlog("stats: ${stats.take(200)}")
}
}
val seqLen = minOf(params.maxNewTokens, 512)
val rc = try {
// echo=false so onResult() only receives the generated completion,
// not the prompt tokens echoed back — otherwise the sentence
// streamer would feed '<|im_start|>user …' to the TTS.
mod.generate(fullPrompt, seqLen, cb, /* echo */ false)
} catch (e: Throwable) {
nlog("generate() threw: ${e.message}")
-1
}
val output = execRoot("sh $RUNNER_DIR/run_llm.sh ${params.temperature} $seqLen 2>&1")
// Drain any leftover prose buffered during <think>-suppression so the
// last sentence reaches the TTS even if it ran past the closing tag.
if (!inThink && tokenScan.isNotEmpty()) {
onToken?.invoke(tokenScan.toString())
tokenScan.clear()
}
val tokenCount = Regex("Generated Tokens:\\s+(\\d+)").find(output)
?.groupValues?.get(1)?.toIntOrNull() ?: 0
val rate = Regex("Generated \\d+ tokens:.*Rate:\\s+([\\d.]+)").find(output)
?.groupValues?.get(1)?.toFloatOrNull() ?: 0f
val ttft = Regex("Time to first generated token:\\s+([\\d.]+)").find(output)
?.groupValues?.get(1)?.toFloatOrNull() ?: 0f
val responseRaw = execRoot("cat $RUNNER_DIR/outputs/response.txt 2>/dev/null")
nlog("RAW: ${responseRaw.take(300)}")
val responseText = extractResponse(responseRaw)
val elapsed = System.currentTimeMillis() - startTime
val rawText = responseBuilder.toString()
val responseText = cleanResponse(rawText)
val tokenCount = rawText.length / 4 // rough estimate without a tokenizer
val rate = if (elapsed > 0) (tokenCount * 1000f) / elapsed else 0f
nlog("Response: '$responseText'")
nlog("Stats: ${tokenCount}tok ${rate}tok/s TTFT=${ttft}s ${elapsed}ms")
nlog("Response: '${responseText.take(80)}'")
nlog("Stats: rc=$rc ~${tokenCount}tok ~${"%.1f".format(rate)}tok/s TTFT=${firstTokenMs}ms total=${elapsed}ms")
onToken?.invoke(responseText)
GenerationResult(
text = responseText,
@ -209,32 +128,20 @@ class ExecuTorchLlmEngine(
)
}
/**
 * Wraps [userInput] in the Qwen3 chat markup expected by the model.
 *
 * Layout produced, in order:
 *  1. a `user` turn containing [userInput];
 *  2. a `system` turn containing SYSTEM_PROMPT (only when it is non-empty);
 *  3. an opening `<|im_start|>assistant` tag with no trailing newline.
 *
 * The user-before-system ordering is unusual but deliberate: per the
 * original author's note it mirrors get_formatted_prompt() for
 * DecoderModelVersion::kQwen3 in qnn_llama_runner.cpp, so the in-process
 * path feeds the model the same layout as the runner binary.
 *
 * @param userInput raw user text; inserted verbatim, no escaping.
 * @return the fully formatted prompt string.
 */
private fun buildChatTemplate(userInput: String): String = buildString {
    // User turn always comes first.
    append("<|im_start|>user\n")
    append(userInput)
    append("<|im_end|>\n")

    // Optional system turn.
    if (SYSTEM_PROMPT.isNotEmpty()) {
        append("<|im_start|>system\n")
        append(SYSTEM_PROMPT)
        append("<|im_end|>\n")
    }

    // Leave the assistant turn open so generation continues from here.
    append("<|im_start|>assistant")
}
/** Strip <think>…</think>, special tokens, and leading/trailing whitespace. */
private fun cleanResponse(raw: String): String {
private fun extractResponse(raw: String): String {
var text = raw
val thinkEnd = text.indexOf("</think>")
if (thinkEnd >= 0) {
text = text.substring(thinkEnd + "</think>".length)
} else if (text.indexOf("<think>") >= 0) {
nlog("WARN: <think> block never closed")
return ""
} else {
val thinkStart = text.indexOf("<think>")
val assistantTag = text.indexOf("assistant")
if (thinkStart >= 0) {
nlog("WARN: <think> block never closed, no response generated")
return ""
} else if (assistantTag >= 0) {
text = text.substring(assistantTag + "assistant".length)
}
}
return text
.replace("<|im_start|>", "")
@ -245,9 +152,82 @@ class ExecuTorchLlmEngine(
.trim()
}
/**
 * Generates the `run_llm.sh` launcher script and installs it in RUNNER_DIR.
 *
 * The script text is built from a raw string: `$RUNNER_DIR` is interpolated
 * by Kotlin, while `${'$'}` emits a literal dollar sign for the shell. The
 * result is written through writeFileRoot() and then made executable with
 * `chmod 755` through execRoot().
 *
 * Script contract, as encoded below:
 *  - argument 1 is the sampling temperature, argument 2 the sequence length;
 *  - the prompt is read from `outputs/prompt.b64` and base64-decoded;
 *  - any previous `outputs/response.txt` is removed before the run;
 *  - if `outputs/system.b64` exists and is non-empty it is decoded and
 *    passed as `--system_prompt`, otherwise that flag is omitted;
 *  - the runner is always started with `--eval_mode 0` and `--shared_buffer`.
 */
private fun deployRunnerScript() {
// The two `exec` branches are identical except for the --system_prompt flag.
val script = """
#!/bin/sh
cd $RUNNER_DIR
export LD_LIBRARY_PATH=$RUNNER_DIR
export ADSP_LIBRARY_PATH=$RUNNER_DIR
TEMP=${'$'}1
SEQ_LEN=${'$'}2
PROMPT=${'$'}(base64 -d $RUNNER_DIR/outputs/prompt.b64)
rm -f $RUNNER_DIR/outputs/response.txt
SYSTEM_ARGS=""
if [ -s $RUNNER_DIR/outputs/system.b64 ]; then
SYSTEM=${'$'}(base64 -d $RUNNER_DIR/outputs/system.b64)
SYSTEM_ARGS="--system_prompt"
fi
if [ -n "${'$'}SYSTEM_ARGS" ]; then
exec ./qnn_llama_runner \
--model_path hybrid_llama_qnn.pte \
--tokenizer_path tokenizer.json \
--decoder_model_version qwen3 \
--output_path $RUNNER_DIR/outputs/response.txt \
--performance_output_path $RUNNER_DIR/outputs/perf.txt \
--shared_buffer \
--system_prompt "${'$'}SYSTEM" \
--prompt "${'$'}PROMPT" \
--temperature ${'$'}TEMP \
--seq_len ${'$'}SEQ_LEN \
--eval_mode 0
else
exec ./qnn_llama_runner \
--model_path hybrid_llama_qnn.pte \
--tokenizer_path tokenizer.json \
--decoder_model_version qwen3 \
--output_path $RUNNER_DIR/outputs/response.txt \
--performance_output_path $RUNNER_DIR/outputs/perf.txt \
--shared_buffer \
--prompt "${'$'}PROMPT" \
--temperature ${'$'}TEMP \
--seq_len ${'$'}SEQ_LEN \
--eval_mode 0
fi
""".trimIndent()
// Install the script, then mark it executable.
writeFileRoot("$RUNNER_DIR/run_llm.sh", script)
execRoot("chmod 755 $RUNNER_DIR/run_llm.sh")
}
override fun release() {
try { llmModule?.resetNative() } catch (_: Throwable) {}
llmModule = null
loaded = false
}
/**
 * Writes [content] to [path] with root privileges by piping it into
 * `su -c "cat > <path>"`.
 *
 * Best-effort: failures are logged under TAG and never thrown, so callers
 * cannot observe them directly.
 *
 * @param path destination file; single-quoted for the shell so spaces or
 *   metacharacters in it cannot alter the command that runs as root.
 * @param content text written to the file through the child's stdin.
 */
private fun writeFileRoot(path: String, content: String) {
    try {
        // POSIX single-quote escaping: close the quote, emit \', reopen.
        val quotedPath = "'" + path.replace("'", "'\\''") + "'"
        val process = Runtime.getRuntime().exec(arrayOf("su", "-c", "cat > $quotedPath"))
        // Closing stdin (via use) is what lets `cat` see EOF and exit.
        process.outputStream.bufferedWriter().use { it.write(content) }
        val exitCode = process.waitFor()
        if (exitCode != 0) {
            // Surface su/cat failures (no root, missing directory, ...)
            // instead of silently leaving a stale or missing file behind.
            val stderrText = process.errorStream.bufferedReader().readText().trim()
            Log.e(TAG, "writeFileRoot failed for $path: exit=$exitCode $stderrText")
        }
    } catch (e: Exception) {
        Log.e(TAG, "writeFileRoot failed: ${e.message}")
    }
}
/**
 * Runs [cmd] through `su -c` and returns its textual output.
 *
 * Return contract (unchanged): stdout is returned, unless stdout is empty
 * and stderr is not, in which case stderr is returned. Any exception while
 * launching or reading yields an empty string and an error log.
 *
 * stderr is drained on a helper thread while stdout is read here. Reading
 * the two pipes one after the other can deadlock: a child that fills the
 * stderr pipe blocks on write while this side is still waiting for stdout
 * to reach EOF.
 *
 * @param cmd shell command line handed verbatim to `su -c`.
 * @return captured output as described above, or "" on failure.
 */
private fun execRoot(cmd: String): String {
    return try {
        val process = Runtime.getRuntime().exec(arrayOf("su", "-c", cmd))

        var stderrText = ""
        val stderrDrain = Thread {
            // Swallow read errors here: an uncaught exception on this
            // thread would take the whole app process down.
            stderrText = try {
                process.errorStream.bufferedReader().readText()
            } catch (_: Exception) {
                ""
            }
        }
        stderrDrain.start()

        val stdoutText = process.inputStream.bufferedReader().readText()
        // join() also publishes stderrText to this thread.
        stderrDrain.join()
        process.waitFor()

        if (stderrText.isNotEmpty() && stdoutText.isEmpty()) stderrText else stdoutText
    } catch (e: Exception) {
        Log.e(TAG, "execRoot failed: ${e.message}")
        ""
    }
}
}

View File

@ -142,36 +142,14 @@ class KazeiaPipeline {
* the echo-mode playback through the same path — otherwise each TTS
* site reimplemented the "streaming-or-fallback" dispatch.
*/
suspend fun speakText(
text: String,
// Fires the instant each synthesized sentence starts playing
// through the speaker, with the sentence text, audio duration,
// and a per-ENVELOPE_WINDOW_MS RMS envelope. Used by
// processLlmResponse to defer the KAZEIA chat bubble appearance
// until sound is audible, pace word-by-word reveal inside the
// bubble, and drive the AudioVisualizerView orb.
onSegmentPlaying: ((
sentence: String,
durationMs: Long,
rmsEnvelope: FloatArray,
spectrogram: Array<FloatArray>
) -> Unit)? = null
) {
suspend fun speakText(text: String) {
val ttsEngine = tts ?: return
_pipelineState.value = PipelineState.Speaking
try {
val qwen = ttsEngine as? com.kazeia.tts.Qwen3TtsEngine
if (qwen != null) {
qwen.onSegmentPlaying = onSegmentPlaying
qwen.startStreamingSession()
val streamer = com.kazeia.tts.SentenceStreamer { raw ->
// Strip emoji / non-speakable pictographs before TTS
// so a standalone "😊" doesn't become its own noisy
// segment. The chat bubble keeps the original text —
// only the audio path sees the cleaned version.
val spoken = stripNonSpeakable(raw).trim()
if (spoken.isNotEmpty()) qwen.enqueueSentence(spoken)
}
val streamer = com.kazeia.tts.SentenceStreamer { s -> qwen.enqueueSentence(s) }
streamer.append(text)
streamer.flush()
qwen.endStreamingSession()
@ -190,41 +168,6 @@ class KazeiaPipeline {
_messages.value = _messages.value + msg
}
/**
 * Returns [text] with emoji, dingbats and other pictographic code points
 * removed, so the TTS engine is never asked to synthesize them.
 *
 * Filtering works on whole Unicode code points (surrogate pairs are handled
 * as one unit). A code point is dropped when it is the zero-width joiner
 * (U+200D) or falls in one of the ranges listed in the table below; every
 * other code point is copied through unchanged, in order.
 *
 * @param text input string, possibly containing emoji sequences.
 * @return the filtered string; may be empty.
 */
private fun stripNonSpeakable(text: String): String {
    val blockedRanges = arrayOf(
        0x2600..0x27BF,    // misc symbols + dingbats
        0x1F300..0x1F5FF,  // pictographs
        0x1F600..0x1F64F,  // emoticons
        0x1F680..0x1F6FF,  // transport
        0x1F700..0x1F77F,  // alchemical
        0x1F780..0x1F7FF,  // geometric extended
        0x1F800..0x1F8FF,  // supplemental arrows-c
        0x1F900..0x1F9FF,  // supplemental pictographs
        0x1FA00..0x1FAFF,  // symbols & pictographs extended-A
        0xFE00..0xFE0F,    // variation selectors
        0x1F1E6..0x1F1FF   // regional indicators (flags)
    )
    val zeroWidthJoiner = 0x200D

    return buildString(text.length) {
        var pos = 0
        while (pos < text.length) {
            val codePoint = text.codePointAt(pos)
            val blocked = codePoint == zeroWidthJoiner ||
                blockedRanges.any { codePoint in it }
            if (!blocked) appendCodePoint(codePoint)
            // Advance by 1 or 2 UTF-16 units depending on the code point.
            pos += Character.charCount(codePoint)
        }
    }
}
fun log(msg: String) {
Log.i(TAG, msg)
val time = java.text.SimpleDateFormat("HH:mm:ss.SSS", java.util.Locale.FRANCE)

View File

@ -83,34 +83,6 @@ class KazeiaService : Service() {
private val _isListening = MutableStateFlow(false)
val isListening: StateFlow<Boolean> = _isListening
// Drives the AudioVisualizerView orb. Pushed from the VAD loop
// during mic capture (mic RMS, normalized) and from the TTS engine's
// onSegmentPlaying callback (TTS RMS envelope per-segment). The view
// reads this via collectLatest in ChatActivity; the signals carry
// their own state so the visualizer knows whether it's idle, tracking
// the mic, or rendering a TTS segment.
/**
 * Signal published for the audio visualizer. Exactly one of three variants:
 * [Idle], [Listening] or [Speaking].
 */
sealed class VisualizerSignal {
// No payload.
object Idle : VisualizerSignal()
// Carries one microphone RMS level.
// NOTE(review): presumably already normalized by the producer — confirm.
data class Listening(val micRms: Float) : VisualizerSignal()
// Carries one speech segment's RMS envelope, spectrogram and duration in ms.
// The array properties make the generated equals()/hashCode() compare by
// reference, not by content, so two instances built from equal-valued but
// distinct arrays are not equal.
data class Speaking(
val rmsEnvelope: FloatArray,
val spectrogram: Array<FloatArray>,
val durationMs: Long
) : VisualizerSignal()
}
private val _visualizerSignal = MutableStateFlow<VisualizerSignal>(VisualizerSignal.Idle)
val visualizerSignal: StateFlow<VisualizerSignal> = _visualizerSignal
// Kazeia's orb color is bound to the selected voice so the user
// visually associates a palette with the speaker they picked. UI
// sets this whenever the voice spinner changes; the orb view
// listens via the StateFlow and tweens the current → target color.
private val _voiceColor = MutableStateFlow(0xFFBCA4E8.toInt()) // lavender = Damien default
val voiceColor: StateFlow<Int> = _voiceColor
/**
 * Called by the UI whenever the voice selector changes.
 *
 * Stores [color] in the backing `_voiceColor` flow.
 *
 * @param color new color as an Int.
 *   NOTE(review): presumably a packed ARGB value — confirm with the caller.
 */
fun setVoiceColor(color: Int) { _voiceColor.value = color }
private val _debugMode = MutableStateFlow(false)
val debugMode: StateFlow<Boolean> = _debugMode
@ -202,12 +174,6 @@ class KazeiaService : Service() {
if (!::llm.isInitialized || !llm.isLoaded()) {
log("Stream LLM: LLM not ready"); return@launch
}
// Set pipeline state to Speaking so the continuous-
// listening mic loop (line ~824) drops frames during
// TTS playback. Without this, the mic picks up the
// tablet speaker and feeds our own TTS back into STT,
// creating an infinite loop.
_pipelineState.value = PipelineState.Speaking
qwenTts.startStreamingSession()
val tStart = System.currentTimeMillis()
var firstSentenceLogged = false
@ -233,9 +199,6 @@ class KazeiaService : Service() {
} catch (e: Exception) {
log("Stream LLM error: ${e.message}")
e.printStackTrace()
} finally {
// Back to Idle so the next mic frame is accepted.
_pipelineState.value = PipelineState.Idle
}
}
}
@ -451,18 +414,10 @@ class KazeiaService : Service() {
this, Manifest.permission.RECORD_AUDIO
) == PackageManager.PERMISSION_GRANTED
// FOREGROUND_SERVICE_TYPE_MEDIA_PLAYBACK is required so ColorOS (and
// stock Android 14+ policies) don't mute the TTS AudioTrack with
// "clientVolume" at ~600 ms after play(). Without it the FGS was
// classified as mic-only or special-use and background-audio
// hardening silenced it. Combine with MICROPHONE so mic input keeps
// working during STT.
val fgsType = if (hasMicPermission) {
ServiceInfo.FOREGROUND_SERVICE_TYPE_MICROPHONE or
ServiceInfo.FOREGROUND_SERVICE_TYPE_MEDIA_PLAYBACK
ServiceInfo.FOREGROUND_SERVICE_TYPE_MICROPHONE
} else {
ServiceInfo.FOREGROUND_SERVICE_TYPE_MEDIA_PLAYBACK or
ServiceInfo.FOREGROUND_SERVICE_TYPE_SPECIAL_USE
ServiceInfo.FOREGROUND_SERVICE_TYPE_SPECIAL_USE
}
if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.Q) {
@ -495,7 +450,7 @@ class KazeiaService : Service() {
// TTS: try Qwen3-TTS (NPU Hexagon), fallback to Android TTS
_loadingState.value = LoadingState(15, "TTS Qwen3…")
try {
val qwenTts = com.kazeia.tts.Qwen3TtsEngine(nativeLibDir, this@KazeiaService) { msg -> log("[TTS] $msg") }
val qwenTts = com.kazeia.tts.Qwen3TtsEngine(nativeLibDir) { msg -> log("[TTS] $msg") }
qwenTts.load("$modelsDir/qwen3-tts-npu")
if (qwenTts.isLoaded()) {
tts = qwenTts
@ -563,7 +518,7 @@ class KazeiaService : Service() {
// LLM: Qwen3-4B on Hexagon V79 via qnn_llama_runner.
_loadingState.value = LoadingState(50, "LLM Qwen3-4B NPU…")
llm = ExecuTorchLlmEngine(this@KazeiaService) { msg -> log(msg) }
llm = ExecuTorchLlmEngine { msg -> log(msg) }
try {
llm.load("${KazeiaApplication.MODELS_DIR}/qwen3-4b", com.kazeia.core.LlmConfig())
} catch (e: Exception) {
@ -628,16 +583,6 @@ class KazeiaService : Service() {
if (chatterbox != null) {
chatterbox.setVoice(voicePath)
log("Voice set to: $voicePath")
return
}
val qwen = tts as? com.kazeia.tts.Qwen3TtsEngine
if (qwen != null) {
// Hot-swap prefix/suffix embeddings — no model reload. Takes
// effect from the NEXT synthesized segment (current in-flight
// one, if any, finishes with the old voice since the arrays
// are already in its closure).
qwen.setVoice(voicePath)
log("Voice set to: $voicePath")
}
}
@ -890,14 +835,6 @@ class KazeiaService : Service() {
for (s in frame) sumSq += s.toLong() * s.toLong()
val rms = Math.sqrt(sumSq.toDouble() / frameSize).toInt()
// Drive the visualizer orb. Normalize with the same
// sqrt squashing used for TTS so loud peaks don't
// saturate and quiet speech is still visible. The
// visualizer stays in Listening mode; it will swap
// to Speaking or Idle when pipelineState moves on.
val rmsNorm = kotlin.math.sqrt((rms / 6000f).coerceIn(0f, 1f))
_visualizerSignal.value = VisualizerSignal.Listening(rmsNorm)
// Log RMS every second for calibration
if (frameCount % 10 == 0) {
Log.d(TAG, "VAD RMS=$rms (threshold=$silenceThreshold)")
@ -1247,100 +1184,13 @@ class KazeiaService : Service() {
log("LLM stats: ${result.tokenCount} tokens in ${result.timeMs}ms (${result.tokensPerSecond} tok/s)")
if (responseText.isNotEmpty()) {
// Mark the pipeline as Speaking for the duration of TTS so
// the continuous-listening mic loop drops frames and we
// don't feed our own speaker output back into STT.
_pipelineState.value = PipelineState.Speaking
// Create a KAZEIA bubble up-front. Until the first TTS
// segment actually starts playing the bubble shows an
// animated "." → ".." → "..." typing indicator so the
// user knows Kazeia is thinking/synthesising; once the
// first segment plays the dots are cleared and the
// per-sentence word reveal takes over.
val bubble = ChatMessage(role = ChatMessage.Role.KAZEIA, text = ".")
addMessage(bubble)
val revealScope = kotlinx.coroutines.CoroutineScope(kotlinx.coroutines.Dispatchers.Default)
var revealedSoFar = ""
val revealJobs = mutableListOf<kotlinx.coroutines.Job>()
val firstSegmentSeen = java.util.concurrent.atomic.AtomicBoolean(false)
val typingJob = revealScope.launch {
var tick = 0
while (!firstSegmentSeen.get()) {
val dots = ".".repeat(1 + (tick % 3)) // . → .. → ...
updateMessageText(bubble.id, dots)
tick++
kotlinx.coroutines.delay(400)
}
}
try {
pipeline.speakText(responseText) { sentence, durationMs, envelope, spectrogram ->
// First segment: stop the typing indicator and
// reset the bubble to empty so the word reveal
// doesn't collide with the dots.
if (firstSegmentSeen.compareAndSet(false, true)) {
try { typingJob.cancel() } catch (_: Exception) {}
updateMessageText(bubble.id, "")
}
// Push the envelope + spectrogram to the
// visualizer at the same moment the MediaPlayer
// starts playing so the orb reacts to this
// segment's actual energy and the in-sphere
// spectrum bars match the audio content.
_visualizerSignal.value =
VisualizerSignal.Speaking(envelope, spectrogram, durationMs)
// Start a coroutine that appends one word at a time
// over the segment's audio duration. Words are
// separated on whitespace; punctuation rides with
// the trailing word. The prefix (= text already
// revealed from previous sentences) carries over so
// earlier sentences stay on screen.
val prefix = revealedSoFar
val words = sentence.split(Regex("\\s+")).filter { it.isNotBlank() }
revealedSoFar =
if (prefix.isEmpty()) sentence
else "$prefix $sentence"
if (words.isEmpty()) return@speakText
val perWordMs = (durationMs / words.size).coerceAtLeast(40L)
val job = revealScope.launch {
val sb = StringBuilder(prefix)
if (prefix.isNotEmpty()) sb.append(' ')
// Immediately reveal the first word so there's
// no visible gap between audio start and text.
sb.append(words[0])
updateMessageText(bubble.id, sb.toString())
for (i in 1 until words.size) {
kotlinx.coroutines.delay(perWordMs)
sb.append(' ').append(words[i])
updateMessageText(bubble.id, sb.toString())
}
}
revealJobs.add(job)
}
// After all segments finished playing, ensure the full
// text is visible even if a reveal job was racing.
revealJobs.forEach { try { it.join() } catch (_: Exception) {} }
updateMessageText(bubble.id, responseText)
} finally {
// Defensive: cancel the typing dots in case no
// segment ever fired (e.g. the response was entirely
// emojis and got stripped empty).
firstSegmentSeen.set(true)
try { typingJob.cancel() } catch (_: Exception) {}
_pipelineState.value = if (_isListening.value)
PipelineState.Listening else PipelineState.Idle
// If we're going back to mic listening, the VAD loop
// will keep pushing Listening signals; otherwise drop
// to Idle so the orb settles back to its breathing
// baseline.
if (!_isListening.value) {
_visualizerSignal.value = VisualizerSignal.Idle
}
}
} else {
_pipelineState.value = if (_isListening.value)
PipelineState.Listening else PipelineState.Idle
addMessage(ChatMessage(role = ChatMessage.Role.KAZEIA, text = responseText))
pipeline.speakText(responseText)
}
_pipelineState.value = if (_isListening.value)
PipelineState.Listening else PipelineState.Idle
} catch (e: Exception) {
_aiWorkload.value = _aiWorkload.value.copy(llmActive = false)
log("ERROR: LLM generation error: ${e.message}")
@ -1357,19 +1207,6 @@ class KazeiaService : Service() {
_messages.value = _messages.value + message
}
/**
 * Swap in [newText] as the text of the message whose id equals [id] and
 * publish the resulting list through `_messages`. The progressive-reveal
 * flow calls this repeatedly to grow a bubble as words are appended.
 *
 * When several messages carry the same id, the last one in the list is the
 * one rewritten; when none match, the list is left untouched.
 */
private fun updateMessageText(id: Long, newText: String) {
    val snapshot = _messages.value
    val position = snapshot.indexOfLast { msg -> msg.id == id }
    if (position == -1) return
    // Work on a copy so the previously published list is never mutated.
    val updated = snapshot.toMutableList()
    updated[position] = snapshot[position].copy(text = newText)
    _messages.value = updated
}
private fun createNotification(): Notification {
val intent = Intent(this, ChatActivity::class.java)
val pendingIntent = PendingIntent.getActivity(

View File

@ -37,7 +37,6 @@ import kotlin.coroutines.resume
*/
class Qwen3TtsEngine(
private val nativeLibDir: String,
private val context: android.content.Context? = null,
private val onLog: ((String) -> Unit)? = null
) : TtsEngine {
@ -89,38 +88,6 @@ class Qwen3TtsEngine(
private const val TOKEN_USER = 872
private const val TOKEN_ASSISTANT = 1042
private const val TOKEN_NEWLINE = 198
// Streaming decode: when true, BigVGAN dispatches a chunk's audio as
// soon as SEQ_LEN codes are ready from the talker/CP loop rather than
// waiting for all tokens. For long segments this overlaps the final
// BigVGAN passes with ongoing talker/CP work on Hexagon, cutting the
// first-audio latency by ~4 s. Short segments (<SEQ_LEN codes) fall
// back to the single-chunk path with zero difference. Flag exists so
// the sequential path can be re-enabled for A/B comparison.
private const val USE_STREAMING_DECODE = true
// ColorOS Audio Hardening silently mutes AudioTrack in background/FGS
// context (confirmed via `event:muted updated source:clientVolume`
// logs, same behaviour across USAGE_MEDIA, USAGE_ASSISTANT, and
// USAGE_VOICE_COMMUNICATION). When this flag is true, each
// generated segment is written as a WAV to app-owned shared
// storage and played via MediaPlayer instead. Slightly slower
// (WAV write + MediaPlayer prepare add ~150 ms per segment) but
// it's the only reliable path to audible output on this device.
private const val USE_MEDIAPLAYER_FALLBACK = true
// Window size for the TTS→visualizer RMS sidecar. 50 ms at 24 kHz
// = 1200 samples/window — small enough for a 60 fps visualizer to
// track formants, large enough to run at negligible CPU cost.
const val ENVELOPE_WINDOW_MS = 50
// FFT size for the spectrum-in-sphere sidecar. 1024 samples at
// 24 kHz = 43 ms — slightly narrower than the hop so each frame
// gives a clean snapshot centered on its hop boundary.
private const val FFT_SIZE = 1024
// Number of log-spaced bands (120 Hz – 4 kHz) rendered as vertical
// bars inside the sphere during Speaking. 12 feels like a real
// spectrometer without cluttering at smaller sphere sizes.
const val SPECTRUM_BANDS = 12
}
private var ortEnv: OrtEnvironment? = null
@ -276,12 +243,7 @@ class Qwen3TtsEngine(
return session
}
// Speech decoder V2 on CPU. Two paths tried, both worse than CPU:
// - HTP: BigVGAN convolutions too slow to compile (timeout)
// - GPU Adreno via QNN GPU EP: model loads but per-phrase
// inference is ~3.5 s vs ~2 s on CPU (GPU/CPU memory transfer
// overhead dominates for this conv-heavy model)
// CPU 8-thread stays the practical optimum.
// Speech decoder V2 on CPU (HTP tested: BigVGAN convolutions too slow to compile)
val v2Path = "$path/v2_pre_conv"
if (File("$v2Path/model.onnx").exists()) {
nlog("Loading V2 speech decoder (CPU ONNX)...")
@ -608,53 +570,8 @@ class Qwen3TtsEngine(
override fun isLoaded(): Boolean = loaded
/**
 * Hot-swap the speaker prefix/suffix embeddings used for voice conditioning.
 *
 * [voicePath] is a WAV path such as `.../voix/elodie.wav`. The voice id is
 * the lower-cased basename without extension, and the embeddings are looked
 * up as `<id>_voice_prefix.bin` and `<id>_voice_suffix.bin` in the model
 * directory hard-coded below.
 *
 * - If either file is missing, a warning is logged, the current voice is
 *   kept, and the function returns early.
 * - If both files parse, they replace [damienVoicePrefix] and
 *   [damienVoiceSuffix] together; any parse failure is logged and leaves
 *   both untouched.
 *
 * The per-voice files are generated offline (the log message points at
 * scripts/prepare_tts_native.py) and pushed into the model directory.
 *
 * NOTE(review): the original comment states the two arrays are volatile
 * vars read by the synth worker, so a swap takes effect on the next segment
 * boundary. The declarations are not visible here — confirm.
 */
fun setVoice(voicePath: String) {
    val voiceId = java.io.File(voicePath).nameWithoutExtension.lowercase()
    val modelDir = "/data/local/tmp/kazeia/models/qwen3-tts-npu"
    val prefixFile = java.io.File("$modelDir/${voiceId}_voice_prefix.bin")
    val suffixFile = java.io.File("$modelDir/${voiceId}_voice_suffix.bin")
    val bothPresent = prefixFile.exists() && suffixFile.exists()
    if (!bothPresent) {
        nlog("Voice '$voiceId' not available (missing ${prefixFile.name} or ${suffixFile.name}); keeping current voice. " +
            "Run scripts/prepare_tts_native.py with this WAV to generate the files.")
        return
    }

    // File layout as read here: two little-endian ints (row count, row
    // width) followed by rowCount * TALKER_DIM little-endian floats.
    fun loadEmbeds(file: java.io.File, label: String): Array<FloatArray> {
        val buf = java.nio.ByteBuffer.wrap(file.readBytes()).order(java.nio.ByteOrder.LITTLE_ENDIAN)
        val rows = buf.int
        val dim = buf.int
        if (dim != TALKER_DIM) throw IllegalStateException("$label dim $dim != $TALKER_DIM")
        return Array(rows) {
            val row = FloatArray(TALKER_DIM)
            for (j in 0 until TALKER_DIM) row[j] = buf.float
            row
        }
    }

    try {
        // Parse both files before assigning either, so a bad suffix file
        // cannot leave a new prefix paired with the old suffix.
        val newPrefix = loadEmbeds(prefixFile, "prefix")
        val newSuffix = loadEmbeds(suffixFile, "suffix")
        damienVoicePrefix = newPrefix
        damienVoiceSuffix = newSuffix
        nlog("Voice switched to '$voiceId' (${newPrefix.size} prefix + ${newSuffix.size} suffix embeds)")
    } catch (e: Exception) {
        nlog("Voice swap failed for '$voiceId': ${e.message}")
    }
    nlog("Voice: $voicePath")
}
override suspend fun synthesize(text: String, language: String): TtsResult {
@ -2752,11 +2669,7 @@ class Qwen3TtsEngine(
/** PTE pipeline from pre-computed embeddings (prefill + trailing). */
private fun runInterleavedPteFromEmbeds(
prefillEmbeds: List<FloatArray>, trailingEmbeds: List<FloatArray>, maxGenTokens: Int,
// Invoked synchronously after each generated step with (stepIdx, 16-codebook codes).
// Streaming callers use it to dispatch SEQ_LEN-sized chunks to the BigVGAN pipeline
// as soon as they are ready. null preserves the original batch behaviour.
onCodeStep: ((step: Int, codes: IntArray) -> Unit)? = null
prefillEmbeds: List<FloatArray>, trailingEmbeds: List<FloatArray>, maxGenTokens: Int
): Array<IntArray> {
val talkerMod = talkerPteModule ?: return emptyArray()
val cpMod = cpPteModule ?: return emptyArray()
@ -2834,7 +2747,6 @@ class Qwen3TtsEngine(
totalCpMs += System.currentTimeMillis() - tCp0
for (cb in 1 until NUM_CODEBOOKS) codes[cb] = cpCodes[cb - 1]
allCodes.add(codes); generatedCb0.add(currentCb0)
onCodeStep?.invoke(genStep, codes)
if (genStep < 3) nlog("Step ${genStep+1}: cb0=$currentCb0 cb1=${codes[1]}")
@ -3404,18 +3316,6 @@ class Qwen3TtsEngine(
private var sessionTrack: AudioTrack? = null
private var sessionChannel: kotlinx.coroutines.channels.Channel<String>? = null
private var sessionJob: kotlinx.coroutines.Job? = null
private var sessionKeepAliveJob: kotlinx.coroutines.Job? = null
private var sessionFocusRequest: android.media.AudioFocusRequest? = null
// Total PCM frames queued to sessionTrack across all segments in this session.
// endStreamingSession() polls track.playbackHeadPosition until it reaches this
// count before calling stop(), so the tail sentence isn't clipped.
// Uses AtomicLong because both the session worker and the keep-alive watchdog
// call writeAndCount concurrently.
private val sessionFramesWritten = java.util.concurrent.atomic.AtomicLong(0)
// True while a real-audio generate call is in progress. The keep-alive
// watchdog skips silence injection while this is set, so silence never
// interleaves with speech inside a segment.
private val sessionGenActive = java.util.concurrent.atomic.AtomicBoolean(false)
/**
* Open a streaming TTS session backed by a persistent AudioTrack. After
@ -3424,403 +3324,13 @@ class Qwen3TtsEngine(
* track as soon as it's decoded. Call endStreamingSession() to flush
* the queue and release the track.
*/
// MediaPlayer-based fallback session state. If ColorOS mutes our
// AudioTrack (as observed repeatedly — `event:muted updated source:
// clientVolume` right after play()), we instead render each segment
// as a WAV file on shared storage and play it back via MediaPlayer,
// which uses a completely different internal audio pipeline that
// doesn't get silenced by the background playback policy.
private var sessionMpQueue: kotlinx.coroutines.channels.Channel<String>? = null
private var sessionMpJob: kotlinx.coroutines.Job? = null
private val sessionMpSegIdx = java.util.concurrent.atomic.AtomicInteger(0)
/**
* Fires the moment a synthesized segment starts playing through the
* speaker. Carries the sentence text, audio duration, per-window RMS
* envelope (for orb amplitude) and per-window log-spaced band
* spectrogram (for the spectrum-in-sphere visualizer). The envelope and
* spectrogram share the same time axis: one entry per [ENVELOPE_WINDOW_MS].
*/
var onSegmentPlaying: ((
sentence: String,
durationMs: Long,
rmsEnvelope: FloatArray,
spectrogram: Array<FloatArray>
) -> Unit)? = null
/**
 * Open the MediaPlayer-backed streaming session. Does nothing when a
 * session is already open (i.e. [sessionMpQueue] is non-null).
 *
 * Two workers are launched on Dispatchers.IO:
 *  - a synth worker that turns each queued sentence into a WAV file plus
 *    its RMS envelope and spectrogram, and
 *  - a playback worker ([playChainedMediaPlayers]) that plays those WAVs.
 *
 * They are connected by a bounded channel of capacity 2, so synthesis can
 * run ahead of playback by a couple of segments without piling up files.
 * Each item carries the sentence text, duration and visualizer sidecars so
 * the playback side can report them when the segment actually starts.
 */
private fun startStreamingSessionMp() {
    if (sessionMpQueue != null) return
    sessionMpSegIdx.set(0)

    val sentences = kotlinx.coroutines.channels.Channel<String>(
        capacity = kotlinx.coroutines.channels.Channel.UNLIMITED
    )
    val readySegments = kotlinx.coroutines.channels.Channel<SegmentReady>(capacity = 2)
    val ioScope = kotlinx.coroutines.CoroutineScope(kotlinx.coroutines.Dispatchers.IO)

    val producer = ioScope.launch {
        for (sentence in sentences) {
            try {
                val segIdx = sessionMpSegIdx.getAndIncrement()
                val startedAt = System.currentTimeMillis()
                val pcm = generateSegmentAudioVC(sentence, segIdx)
                // Empty audio: nothing to play for this sentence.
                if (pcm.isNotEmpty()) {
                    val outDir = context?.cacheDir?.absolutePath ?: "/data/local/tmp/kazeia"
                    val wavPath = "$outDir/tts_seg_${segIdx}.wav"
                    saveWav(wavPath, pcm)
                    val durationMs = pcm.size * 1000L / SR
                    val envelope = computeRmsEnvelope(pcm)
                    val spectrogram = computeSpectrogram(pcm)
                    nlog("MP seg $segIdx synthesized (${System.currentTimeMillis() - startedAt}ms, ${durationMs}ms audio, ${envelope.size} env × ${SPECTRUM_BANDS} bands), queued for playback")
                    readySegments.send(
                        SegmentReady(segIdx, wavPath, sentence, durationMs, envelope, spectrogram)
                    )
                }
            } catch (e: Exception) {
                // One failed sentence must not stop the ones behind it.
                nlog("MP synth error: ${e.message}")
            }
        }
        // Sentence queue closed and drained: tell the playback worker.
        readySegments.close()
    }
    val consumer = ioScope.launch { playChainedMediaPlayers(readySegments) }
    // Single handle that completes once both workers have finished.
    val both = ioScope.launch {
        producer.join()
        consumer.join()
    }

    sessionMpQueue = sentences
    sessionMpJob = both
    nlog("streaming session opened (MediaPlayer fallback, chained)")
}
/**
 * Play the WAV segments arriving on [wavChan] back-to-back.
 *
 * Each upcoming segment is prepared in its own MediaPlayer and handed to
 * the currently playing one via setNextMediaPlayer(), so the platform can
 * start it as soon as the current segment ends. If the current player has
 * already finished by the time the next one is ready (or chaining fails),
 * the next player is started explicitly instead.
 *
 * For every segment that starts, [onSegmentPlaying] is invoked with the
 * segment's sentence, duration, RMS envelope and spectrogram. Each WAV file
 * is deleted once its segment has finished (or been skipped).
 *
 * Suspends until [wavChan] is closed AND the last segment has finished.
 *
 * Failure handling:
 *  - A segment that cannot be prepared or started is logged and skipped;
 *    later segments still play.
 *  - Completion/error listeners are attached BEFORE a player can start, so
 *    a very short segment cannot finish unobserved and stall the loop.
 *  - A player that reports a playback error is released and treated as
 *    finished.
 *  - On exit (normal or not) every player still held is released, leftover
 *    WAV files are removed, and [wavChan] is cancelled so a producer
 *    blocked in send() is woken instead of waiting forever.
 */
private suspend fun playChainedMediaPlayers(
    wavChan: kotlinx.coroutines.channels.ReceiveChannel<SegmentReady>
) {
    val attrs = android.media.AudioAttributes.Builder()
        .setUsage(android.media.AudioAttributes.USAGE_MEDIA)
        .setContentType(android.media.AudioAttributes.CONTENT_TYPE_SPEECH)
        .build()

    fun releaseQuietly(mp: android.media.MediaPlayer?) {
        try { mp?.release() } catch (_: Exception) {}
    }

    fun deleteQuietly(path: String) {
        try { java.io.File(path).delete() } catch (_: Exception) {}
    }

    // Prepare a MediaPlayer for [path]. Returns null (after releasing the
    // player) when setup or preparation fails, so callers never start() a
    // player that is in the error state.
    suspend fun prepareMp(path: String, segIdx: Int): android.media.MediaPlayer? {
        val mp = android.media.MediaPlayer()
        val prepared = try {
            mp.setAudioAttributes(attrs)
            mp.setDataSource(path)
            kotlinx.coroutines.suspendCancellableCoroutine<Boolean> { cont ->
                mp.setOnPreparedListener { if (cont.isActive) cont.resume(true) {} }
                mp.setOnErrorListener { _, what, extra ->
                    nlog("MP seg $segIdx prepare error: what=$what extra=$extra")
                    if (cont.isActive) cont.resume(false) {}
                    true
                }
                cont.invokeOnCancellation { releaseQuietly(mp) }
                mp.prepareAsync()
            }
        } catch (e: kotlinx.coroutines.CancellationException) {
            // invokeOnCancellation has already released the player.
            throw e
        } catch (e: Exception) {
            nlog("MP seg $segIdx prepare failed: ${e.message}")
            false
        }
        if (!prepared) {
            releaseQuietly(mp)
            return null
        }
        return mp
    }

    // Per-player book-keeping. `done` completes when the player finishes
    // or errors, which lets the loop tell *before* calling
    // setNextMediaPlayer whether chaining can still trigger (chaining onto
    // a player that has already completed does nothing).
    class Live(
        val mp: android.media.MediaPlayer,
        val info: SegmentReady,
        val done: kotlinx.coroutines.CompletableDeferred<Unit>
    )

    // Attach completion/error listeners. Must be called before the player
    // can start (explicitly or through chaining).
    fun arm(info: SegmentReady, mp: android.media.MediaPlayer): Live {
        val done = kotlinx.coroutines.CompletableDeferred<Unit>()
        mp.setOnCompletionListener {
            releaseQuietly(it)
            if (!done.isCompleted) done.complete(Unit)
        }
        mp.setOnErrorListener { player, what, extra ->
            nlog("MP seg ${info.segIdx} play error: what=$what extra=$extra")
            // Returning true suppresses the completion callback, so the
            // player has to be released here.
            releaseQuietly(player)
            if (!done.isCompleted) done.complete(Unit)
            true
        }
        return Live(mp, info, done)
    }

    var current: Live? = null   // segment currently (or most recently) playing
    var pending: Live? = null   // prepared successor, not yet rotated in
    try {
        while (true) {
            val info = wavChan.receiveCatching().getOrNull() ?: break
            val mp = prepareMp(info.wavPath, info.segIdx)
            if (mp == null) {
                deleteQuietly(info.wavPath)
                continue
            }
            val next = arm(info, mp)
            val playing = current

            if (playing == null) {
                // Bootstrap: nothing is playing yet, start directly.
                try {
                    mp.start()
                } catch (e: Exception) {
                    nlog("MP seg ${info.segIdx} start failed: ${e.message}")
                    releaseQuietly(mp)
                    deleteQuietly(info.wavPath)
                    continue
                }
                current = next
                try { onSegmentPlaying?.invoke(info.sentence, info.durationMs, info.rmsEnvelope, info.spectrogram) } catch (_: Exception) {}
                nlog("MP seg ${info.segIdx} started (${info.durationMs}ms)")
                continue
            }

            pending = next
            // Try to chain so Android auto-starts `next` when `playing`
            // finishes. Skipped if `playing` has already completed; the
            // explicit start() below covers that case.
            var chained = false
            try {
                if (!playing.done.isCompleted) {
                    playing.mp.setNextMediaPlayer(mp)
                    chained = true
                }
            } catch (e: Exception) {
                nlog("MP seg ${info.segIdx} setNext failed: ${e.message}")
            }

            // Wait for current playback to finish before rotating.
            playing.done.await()
            deleteQuietly(playing.info.wavPath)

            // Query the player first: if it has meanwhile finished and
            // been released the query throws, and `done` covers that case.
            val nextAlive = try {
                mp.isPlaying || mp.currentPosition > 0
            } catch (_: Exception) {
                false
            }
            val autoStarted = chained && (next.done.isCompleted || nextAlive)

            var started = autoStarted
            if (autoStarted) {
                nlog("MP seg ${info.segIdx} auto-chained")
            } else {
                try {
                    mp.start()
                    started = true
                    nlog("MP seg ${info.segIdx} started manually (chain missed)")
                } catch (e: Exception) {
                    nlog("MP seg ${info.segIdx} manual start failed: ${e.message}")
                }
            }

            pending = null
            if (!started) {
                // Never rotate in a player that is not running: its `done`
                // would never complete and the loop would hang on it.
                releaseQuietly(mp)
                deleteQuietly(info.wavPath)
                continue
            }
            current = next
            try { onSegmentPlaying?.invoke(info.sentence, info.durationMs, info.rmsEnvelope, info.spectrogram) } catch (_: Exception) {}
        }

        // Drain: wait for the last player to finish.
        current?.let { last ->
            last.done.await()
            deleteQuietly(last.info.wavPath)
        }
    } catch (e: Exception) {
        nlog("MP playback chain error: ${e.message}")
    } finally {
        pending?.let { releaseQuietly(it.mp); deleteQuietly(it.info.wavPath) }
        current?.let { releaseQuietly(it.mp); deleteQuietly(it.info.wavPath) }
        // Remove files for segments that were queued but never played,
        // then cancel the channel so a producer suspended in send() fails
        // fast instead of blocking. Both are no-ops after a normal run,
        // where the channel is already closed and empty.
        while (true) {
            val leftover = wavChan.tryReceive().getOrNull() ?: break
            deleteQuietly(leftover.wavPath)
        }
        wavChan.cancel()
    }
}
/**
 * Payload handed from the synth worker to the playback worker so the UI
 * can be notified with matching text, duration and visualizer data when
 * each segment starts playing. The sidecar arrays are computed from the
 * PCM right after synthesis, so nothing has to be read back from
 * MediaPlayer.
 *
 * NOTE: this is a data class with array properties, so the generated
 * equals()/hashCode() compare [rmsEnvelope] and [spectrogram] by
 * reference, not by content.
 */
private data class SegmentReady(
// Index of the segment within the session.
val segIdx: Int,
// Path of the WAV file holding this segment's audio.
val wavPath: String,
// Text that was synthesized for this segment.
val sentence: String,
// Length of the segment's audio in milliseconds.
val durationMs: Long,
// RMS value per ENVELOPE_WINDOW_MS window (see computeRmsEnvelope).
val rmsEnvelope: FloatArray,
// One row of band magnitudes per ENVELOPE_WINDOW_MS hop (see computeSpectrogram).
val spectrogram: Array<FloatArray>
)
/**
 * Build the amplitude envelope of a 16-bit PCM buffer sampled at [SR]:
 * one value per [ENVELOPE_WINDOW_MS] window, the last window being shorter
 * when the buffer length is not a multiple of the window size.
 *
 * Each value is sqrt(rms / 32767) clamped to [0, 1]. The square root lifts
 * quiet passages so they remain visible without letting loud peaks
 * saturate.
 *
 * @return an empty array for empty input, otherwise
 *         ceil(audio.size / windowSamples) values.
 */
private fun computeRmsEnvelope(audio: ShortArray): FloatArray {
    if (audio.isEmpty()) return FloatArray(0)
    val windowLen = SR * ENVELOPE_WINDOW_MS / 1000
    val windowCount = (audio.size + windowLen - 1) / windowLen
    return FloatArray(windowCount) { w ->
        val from = w * windowLen
        val stop = minOf(from + windowLen, audio.size)
        var energy = 0.0
        var i = from
        while (i < stop) {
            val v = audio[i].toDouble()
            energy += v * v
            i++
        }
        val rms = kotlin.math.sqrt(energy / (stop - from))
        // 32767 is full scale for 16-bit PCM.
        kotlin.math.sqrt((rms / 32767.0).coerceIn(0.0, 1.0)).toFloat()
    }
}
/**
 * Build the band spectrogram for the spectrum-in-sphere visualizer.
 *
 * One row is produced per [ENVELOPE_WINDOW_MS] hop, so the time axis lines
 * up with [computeRmsEnvelope]. For each hop a Hann-windowed frame of
 * [FFT_SIZE] samples, centred on the hop's midpoint and zero-padded where
 * it runs past either end of [audio], is transformed with [fftInPlace].
 * The bins are then pooled into [SPECTRUM_BANDS] bands whose edges are
 * spaced geometrically between 120 Hz and 4 kHz.
 *
 * Each band value is the RMS magnitude of its bins, log-compressed as
 * log10(1 + 80·mag) / log10(7) and clamped to [0, 1].
 *
 * NOTE(review): the FFT is unnormalized, so `mag` scales with FFT_SIZE;
 * confirm the compression constants give the intended bar range on real
 * speech.
 *
 * @return an empty array for empty input, otherwise one FloatArray of
 *         SPECTRUM_BANDS values per hop.
 */
private fun computeSpectrogram(audio: ShortArray): Array<FloatArray> {
    if (audio.isEmpty()) return emptyArray()
    val n = FFT_SIZE
    val hop = SR * ENVELOPE_WINDOW_MS / 1000
    val frameCount = (audio.size + hop - 1) / hop

    // Band edges expressed as FFT bin indices.
    val hzPerBin = SR.toDouble() / n
    val lowHz = 120.0
    val highHz = 4000.0
    val edges = IntArray(SPECTRUM_BANDS + 1) { e ->
        val hz = lowHz * Math.pow(highHz / lowHz, e.toDouble() / SPECTRUM_BANDS)
        (hz / hzPerBin).toInt().coerceIn(1, n / 2 - 1)
    }

    // Hann window to limit spectral leakage.
    val window = FloatArray(n) { i ->
        (0.5 - 0.5 * Math.cos(2.0 * Math.PI * i / (n - 1))).toFloat()
    }

    // Scratch buffers reused for every frame.
    val real = FloatArray(n)
    val imag = FloatArray(n)

    return Array(frameCount) { frame ->
        // First sample of a frame centred on this hop's midpoint.
        val first = frame * hop + hop / 2 - n / 2
        for (i in 0 until n) {
            val src = first + i
            val sample = if (src >= 0 && src < audio.size) audio[src].toFloat() / 32768f else 0f
            real[i] = sample * window[i]
            imag[i] = 0f
        }
        fftInPlace(real, imag)

        FloatArray(SPECTRUM_BANDS) { band ->
            val bandStart = edges[band]
            // Guarantee at least one bin per band.
            val bandEnd = edges[band + 1].coerceAtLeast(bandStart + 1)
            var power = 0.0
            for (k in bandStart until bandEnd) {
                val r = real[k].toDouble()
                val q = imag[k].toDouble()
                power += r * r + q * q
            }
            val mag = Math.sqrt(power / (bandEnd - bandStart))
            (Math.log10(1.0 + mag * 80) / Math.log10(7.0))
                .toFloat().coerceIn(0f, 1f)
        }
    }
}
/**
 * In-place radix-2 Cooley–Tukey FFT (forward transform, unnormalized).
 *
 * On return [re] and [im] hold the real and imaginary parts of
 * X[k] = Σ x[t]·exp(-2πi·k·t / n), where n = re.size.
 *
 * @param re real parts of the input; overwritten with the output.
 * @param im imaginary parts of the input; overwritten with the output.
 * @throws IllegalArgumentException if the arrays differ in length or the
 *         length is not a power of two.
 */
private fun fftInPlace(re: FloatArray, im: FloatArray) {
    val n = re.size
    require(im.size == n) { "re and im must have the same length" }
    require((n and (n - 1)) == 0) { "FFT size must be a power of 2, got $n" }

    // Bit-reversal permutation.
    var j = 0
    for (i in 1 until n) {
        var bit = n shr 1
        while (j and bit != 0) { j = j xor bit; bit = bit shr 1 }
        j = j or bit
        if (i < j) {
            val tr = re[i]; re[i] = re[j]; re[j] = tr
            val ti = im[i]; im[i] = im[j]; im[j] = ti
        }
    }

    // Butterflies, one stage per doubling of the sub-transform size.
    var size = 2
    while (size <= n) {
        val half = size / 2
        // The twiddle for butterfly p of a size-point sub-transform is
        // exp(-2πi·p / size). The previous code multiplied this per-size
        // angle by an index that advanced in steps of n/size, which is
        // only correct for the final stage (size == n).
        val angleStep = -2.0 * Math.PI / size
        for (p in 0 until half) {
            val angle = angleStep * p
            val c = kotlin.math.cos(angle).toFloat()
            val s = kotlin.math.sin(angle).toFloat()
            // The same twiddle applies to butterfly p of every block in
            // this stage, so compute it once and sweep the blocks.
            var lo = p
            while (lo < n) {
                val hi = lo + half
                val tRe = re[hi] * c - im[hi] * s
                val tIm = re[hi] * s + im[hi] * c
                re[hi] = re[lo] - tRe
                im[hi] = im[lo] - tIm
                re[lo] = re[lo] + tRe
                im[lo] = im[lo] + tIm
                lo += size
            }
        }
        size *= 2
    }
}
/**
 * Close the MediaPlayer-backed streaming session, if one is open.
 *
 * Closes the sentence queue, waits for the session job to finish (any
 * exception from the wait is ignored), then clears the session fields and
 * the [onSegmentPlaying] callback. Returns immediately when no session is
 * open.
 */
private suspend fun endStreamingSessionMp() {
    val sentenceQueue = sessionMpQueue
    if (sentenceQueue == null) return
    sentenceQueue.close()
    try {
        sessionMpJob?.join()
    } catch (_: Exception) {
    }
    sessionMpQueue = null
    sessionMpJob = null
    onSegmentPlaying = null
    nlog("streaming session closed (MediaPlayer fallback)")
}
/**
 * Play the WAV file at [path] through a fresh MediaPlayer and suspend the
 * caller until playback ends.
 *
 * The coroutine is resumed when the player completes, when it reports an
 * error, or when setting it up throws; none of these propagate as an
 * exception. The player is released in each of those cases and also when
 * the coroutine is cancelled. [segIdx] is only used in log messages.
 */
private suspend fun playWavBlocking(path: String, segIdx: Int) {
    val startedAt = System.currentTimeMillis()
    suspendCancellableCoroutine<Unit> { cont ->
        val player = android.media.MediaPlayer()

        fun releaseQuietly() {
            try { player.release() } catch (_: Exception) {}
        }

        // Resume at most once, whichever callback gets there first.
        fun finish() {
            if (cont.isActive) cont.resume(Unit) {}
        }

        try {
            val attrs = android.media.AudioAttributes.Builder()
                .setUsage(android.media.AudioAttributes.USAGE_MEDIA)
                .setContentType(android.media.AudioAttributes.CONTENT_TYPE_SPEECH)
                .build()
            player.setAudioAttributes(attrs)
            player.setDataSource(path)
            player.setOnPreparedListener { ready ->
                nlog("MP seg $segIdx prepared, starting (prep ${System.currentTimeMillis() - startedAt}ms)")
                ready.start()
            }
            player.setOnCompletionListener {
                nlog("MP seg $segIdx done (${System.currentTimeMillis() - startedAt}ms total)")
                releaseQuietly()
                finish()
            }
            player.setOnErrorListener { _, what, extra ->
                nlog("MP seg $segIdx error: what=$what extra=$extra")
                releaseQuietly()
                finish()
                true
            }
            player.prepareAsync()
            cont.invokeOnCancellation { releaseQuietly() }
        } catch (e: Exception) {
            nlog("MP seg $segIdx setup failed: ${e.message}")
            releaseQuietly()
            finish()
        }
    }
}
fun startStreamingSession() {
if (USE_MEDIAPLAYER_FALLBACK) { startStreamingSessionMp(); return }
if (sessionTrack != null) return // already open
// USAGE_VOICE_COMMUNICATION routes to STREAM_VOICE_CALL, which
// ColorOS's "Audio Hardening" policy does NOT silently mute (the
// policy targets STREAM_MUSIC to preserve battery on inactive media
// apps; STREAM_VOICE_CALL is reserved for VoIP and always plays).
// Previous attempts with USAGE_MEDIA and USAGE_ASSISTANT both got
// `event:muted updated source:clientVolume` ~0.61 s after play()
// even with audio focus + mediaPlayback FGS, so moving off of
// STREAM_MUSIC is the only route that unblocks audible playback.
val attrs = AudioAttributes.Builder()
.setUsage(AudioAttributes.USAGE_VOICE_COMMUNICATION)
.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
.build()
val track = AudioTrack.Builder()
.setAudioAttributes(attrs)
.setAudioAttributes(AudioAttributes.Builder()
.setUsage(AudioAttributes.USAGE_MEDIA)
.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
.build())
.setAudioFormat(AudioFormat.Builder()
.setEncoding(AudioFormat.ENCODING_PCM_16BIT)
.setSampleRate(SR)
@ -3830,77 +3340,7 @@ class Qwen3TtsEngine(
// paces writes when full.
.setTransferMode(AudioTrack.MODE_STREAM)
.build()
// Request audio focus for the duration of the session. Without this
// ColorOS's Audio Hardening treats the track as background noise
// and mutes it, regardless of FGS status. We don't care about
// focus loss callbacks — if another app grabs focus mid-sentence
// that's fine, the track just gets ducked.
val am = context?.getSystemService(android.content.Context.AUDIO_SERVICE) as? android.media.AudioManager
val focusReq = android.media.AudioFocusRequest.Builder(android.media.AudioManager.AUDIOFOCUS_GAIN_TRANSIENT_MAY_DUCK)
.setAudioAttributes(attrs)
.setOnAudioFocusChangeListener { _ -> }
.build()
val focusRes = am?.requestAudioFocus(focusReq)
nlog("audio focus request: $focusRes (1=granted, 0=failed, 2=delayed)")
sessionFocusRequest = focusReq
// ColorOS mutes AudioTrack clientVolume ~1s after creation (seen in
// dumpsys audio as `event:muted updated source:clientVolume`). Force
// track volume back to 1.0 repeatedly to override. This is also
// done in the keep-alive watchdog loop below for ongoing override.
try { track.setVolume(1.0f) } catch (_: Exception) {}
track.play()
sessionFramesWritten.set(0)
sessionGenActive.set(false)
// writeAndCount is the single path through which PCM reaches the
// AudioTrack for this session, so sessionFramesWritten always stays
// in sync with what's been queued to playback hardware. AudioTrack.write
// is thread-safe, so this can be called concurrently from the session
// worker (real audio) and the keep-alive watchdog (silence padding).
val writeAndCount: (ShortArray) -> Unit = { pcm ->
if (pcm.isNotEmpty()) {
val n = track.write(pcm, 0, pcm.size)
if (n > 0) sessionFramesWritten.addAndGet(n.toLong())
}
}
// Bootstrap silence: queue 500 ms immediately after play() so
// AudioFlinger has samples to mix from the very first cycle.
// Without this, there's a ~100 ms window between play() and the
// first watchdog tick where the track has no data and AudioFlinger
// flags it for removal. Once that happens, playbackHead sticks at
// 0 and subsequent writes go to a dead track.
val bootstrapSilence = ShortArray(SR / 2) // 500 ms
writeAndCount(bootstrapSilence)
// Keep-alive watchdog. AudioFlinger on OnePlus/ColorOS kills a track
// that underruns for ~1 s (confirmed via `prepareTracks_l BUFFER
// TIMEOUT: remove track … due to underrun on thread 29`). Our
// per-segment synthesis takes 3–5 s, which always exceeds that
// window between writes, so the track was getting silenced after
// the first ~1 s of audio played. The watchdog pads with 200 ms of
// silence any time the buffered-ahead audio drops below 400 ms,
// regardless of segment state — silence only advances playback head
// in the gaps between real audio and is never inserted inside a
// contiguous burst of real writes (those bring buffered above 400 ms
// and keep the watchdog quiet).
val keepAliveBuffer = ShortArray(SR / 5) // 200 ms of silence
val keepAliveJob = kotlinx.coroutines.CoroutineScope(
kotlinx.coroutines.Dispatchers.IO
).launch {
var tick = 0
while (kotlinx.coroutines.currentCoroutineContext()[kotlinx.coroutines.Job]?.isActive != false) {
kotlinx.coroutines.delay(100)
val head = track.playbackHeadPosition.toLong() and 0xFFFFFFFFL
val written = sessionFramesWritten.get() and 0xFFFFFFFFL
val buffered = written - head
val needsPad = buffered < SR * 2 / 5 // < 400 ms
if ((tick and 0x1F) == 0) {
nlog("keepAlive tick=$tick head=$head written=$written buffered=$buffered pad=$needsPad state=${track.playState}")
}
tick++
// Override any clientVolume mute that ColorOS keeps applying.
try { track.setVolume(1.0f) } catch (_: Exception) {}
if (needsPad) writeAndCount(keepAliveBuffer)
}
}
val chan = kotlinx.coroutines.channels.Channel<String>(
capacity = kotlinx.coroutines.channels.Channel.UNLIMITED
)
@ -3910,26 +3350,15 @@ class Qwen3TtsEngine(
var segIdx = 0
for (sentence in chan) {
try {
sessionGenActive.set(true)
if (USE_STREAMING_DECODE && talkerPteModule != null && cpPteModule != null) {
// CP↔BigVGAN overlap path: audio chunks flow to the
// shared AudioTrack as soon as BigVGAN finishes each
// SEQ_LEN window, instead of after the whole segment.
generateSegmentAudioVCStreaming(sentence, segIdx, writeAndCount)
} else {
val audio = generateSegmentAudioVC(sentence, segIdx)
writeAndCount(audio)
}
val audio = generateSegmentAudioVC(sentence, segIdx)
if (audio.isNotEmpty()) track.write(audio, 0, audio.size)
segIdx++
} catch (e: Exception) {
nlog("session seg $segIdx error: ${e.message}")
} finally {
sessionGenActive.set(false)
}
}
}
sessionTrack = track; sessionChannel = chan; sessionJob = job
sessionKeepAliveJob = keepAliveJob
nlog("streaming session opened")
}
@ -3939,12 +3368,6 @@ class Qwen3TtsEngine(
* immediately. Sentences play in the order they were enqueued.
*/
fun enqueueSentence(sentence: String) {
if (USE_MEDIAPLAYER_FALLBACK) {
val chan = sessionMpQueue ?: run { nlog("enqueueSentence: no MP session"); return }
val r = chan.trySend(sentence)
if (r.isFailure) nlog("enqueueSentence: MP channel full / closed")
return
}
val chan = sessionChannel ?: run { nlog("enqueueSentence: no session open"); return }
val r = chan.trySend(sentence)
if (r.isFailure) nlog("enqueueSentence: channel full / closed")
@ -3956,46 +3379,17 @@ class Qwen3TtsEngine(
* drains), then release the shared track. Safe to call more than once.
*/
suspend fun endStreamingSession() {
if (USE_MEDIAPLAYER_FALLBACK) { endStreamingSessionMp(); return }
val chan = sessionChannel ?: return
chan.close()
try { sessionJob?.join() } catch (_: Exception) {}
// Stop the keep-alive watchdog BEFORE draining so it doesn't pad more
// silence onto the tail while we're waiting for the existing buffer
// to play out.
try { sessionKeepAliveJob?.cancel() } catch (_: Exception) {}
try { sessionKeepAliveJob?.join() } catch (_: Exception) {}
try {
sessionTrack?.let { track ->
// AudioTrack.stop() in MODE_STREAM DISCARDS unplayed buffered
// samples — it doesn't block for drain. Poll getPlaybackHead
// Position() until it reaches what we wrote, then stop. The
// head is a 32-bit wrap-around counter, so compare modulo.
// Cap the drain wait so a stalled track can't block us forever.
val targetFrames = sessionFramesWritten.get()
val startMs = System.currentTimeMillis()
val maxDrainMs = (targetFrames * 1000L / SR) + 500L // audio dur + 500ms slack
while (true) {
val head = track.playbackHeadPosition.toLong() and 0xFFFFFFFFL
val reached = head >= (targetFrames and 0xFFFFFFFFL)
val state = track.playState
if (reached || state != AudioTrack.PLAYSTATE_PLAYING) break
if (System.currentTimeMillis() - startMs > maxDrainMs) {
nlog("endStreamingSession: drain timeout at head=$head/$targetFrames")
break
}
kotlinx.coroutines.delay(20)
}
track.stop(); track.release()
sessionTrack?.let {
// Block until written samples have been consumed by the
// hardware so users aren't cut off mid-syllable.
it.stop(); it.release()
}
} catch (_: Exception) {}
// Release audio focus after the track is fully drained and stopped.
try {
val am = context?.getSystemService(android.content.Context.AUDIO_SERVICE) as? android.media.AudioManager
sessionFocusRequest?.let { am?.abandonAudioFocusRequest(it) }
} catch (_: Exception) {}
sessionFocusRequest = null
sessionTrack = null; sessionChannel = null; sessionJob = null; sessionKeepAliveJob = null
sessionTrack = null; sessionChannel = null; sessionJob = null
nlog("streaming session closed")
}
@ -4052,177 +3446,6 @@ class Qwen3TtsEngine(
return fadeOut(decodeChunked(codebooks, n), 40)
}
// ---------- Streaming decode (CP ↔ BigVGAN overlap) ----------
/**
 * Carrier from the talker/CP producer to the BigVGAN consumer.
 *
 * NOTE(review): [codebooks] presumably holds one IntArray of codes per
 * codebook for the chunk, and [realTokens] the number of non-padding
 * tokens in it — confirm against the producer.
 */
private class ChunkMsg(val codebooks: Array<IntArray>, val realTokens: Int)
/**
 * Incremental counterpart of decodeChunked. Decoded chunks are appended to an
 * internal buffer, with a linear crossfade over the region where consecutive
 * chunks overlap. Every sample that a later crossfade can no longer touch
 * (everything except the trailing `overlapSamples`) is handed to `onAudio`
 * as soon as it becomes stable. `flushFinal()` delivers whatever is still
 * held back, after applying `fadeOut`.
 */
private inner class StreamingCrossfader(private val onAudio: (ShortArray) -> Unit) {
    private val overlapSamples = CHUNK_OVERLAP * SAMPLES_PER_TOKEN
    private var result = ShortArray(0)
    private var emittedLen = 0
    private var isFirst = true

    /**
     * Append one decoded chunk. Only the first `realTokens` tokens' worth of
     * samples in [chunkAudio] are used; the rest is treated as padding.
     */
    fun feedChunk(chunkAudio: ShortArray, realTokens: Int) {
        val usable = minOf(realTokens * SAMPLES_PER_TOKEN, chunkAudio.size)
        val chunk = if (usable < chunkAudio.size) chunkAudio.copyOf(usable) else chunkAudio
        if (isFirst) {
            result = chunk.copyOf()
            isFirst = false
        } else {
            blendAndAppend(chunk)
        }
        emitStable()
    }

    /** Emit any remaining buffered samples with the trailing fadeOut. */
    fun flushFinal() {
        if (emittedLen >= result.size) return
        onAudio(fadeOut(result.copyOfRange(emittedLen, result.size), 40))
        emittedLen = result.size
    }

    /** Crossfade the head of [chunk] into the buffer tail, then append the rest. */
    private fun blendAndAppend(chunk: ShortArray) {
        val fadeLen = minOf(overlapSamples, result.size, chunk.size)
        val fadeStart = result.size - fadeLen
        var i = 0
        while (i < fadeLen) {
            val alpha = i.toFloat() / fadeLen
            val blended = ((1f - alpha) * result[fadeStart + i] + alpha * chunk[i]).toInt()
                .coerceIn(Short.MIN_VALUE.toInt(), Short.MAX_VALUE.toInt())
            result[fadeStart + i] = blended.toShort()
            i++
        }
        if (fadeLen < chunk.size) {
            result = result + chunk.copyOfRange(fadeLen, chunk.size)
        }
    }

    /**
     * Hold back the last `overlapSamples` so the next chunk's crossfade can
     * still rewrite them; hand everything before that to `onAudio`.
     */
    private fun emitStable() {
        val stableEnd = (result.size - overlapSamples).coerceAtLeast(emittedLen)
        if (stableEnd <= emittedLen) return
        onAudio(result.copyOfRange(emittedLen, stableEnd))
        emittedLen = stableEnd
    }
}
/**
 * Streaming variant of generateSegmentAudioVC. As the talker/CP loop
 * produces codes step by step, BigVGAN chunks are dispatched on a
 * background coroutine the moment SEQ_LEN codes are accumulated, so the
 * vocoder work overlaps with the remaining talker/CP steps instead of
 * running after them.
 *
 * Short segments (<SEQ_LEN codes) emit a single chunk at end-of-gen.
 *
 * The producer blocks on `bvChan.send` if the BigVGAN consumer is behind.
 * If the consumer fails it cancels the channel, so a blocked producer is
 * released (its `send` throws) instead of hanging once the channel buffer
 * fills. The channel is always closed and the consumer always joined
 * before this function returns.
 *
 * @param segText text of the segment to synthesise.
 * @param segIdx segment index, used only in log messages.
 * @param onAudio receives PCM slices in playback order, called from the
 *   consumer coroutine (Dispatchers.IO).
 */
private suspend fun generateSegmentAudioVCStreaming(
    segText: String, segIdx: Int, onAudio: (ShortArray) -> Unit
) {
    if (bpeTokenizer == null || textEmbedsFullBuf == null || damienVoicePrefix == null || damienVoiceSuffix == null) {
        nlog("generateSegmentAudioVCStreaming: Stage 2 assets missing"); return
    }
    if (talkerPteModule == null || cpPteModule == null) {
        nlog("generateSegmentAudioVCStreaming: PTE talker/CP not loaded"); return
    }
    val prefix = damienVoicePrefix!!
    val suffix = damienVoiceSuffix!!
    val codecPadEmb = codecEmb(CODEC_PAD)
    val ids = bpeTokenizer!!.encode(segText)
    nlog("session seg $segIdx (stream) '${segText.take(60)}' → ${ids.size} tokens")

    // Prefill = voice prefix + one embedding per text token + voice suffix.
    val prefill = ArrayList<FloatArray>(prefix.size + ids.size + suffix.size)
    for (e in prefix) prefill.add(e)
    for (id in ids) prefill.add(sumEmb(textEmbFromFull(id), codecPadEmb))
    for (e in suffix) prefill.add(e)

    val expectedSteps = (ids.size * 24) / 10
    val maxGen = minOf(expectedSteps * 3 / 2 + 10, MAX_CONTEXT - 15)
    val tStart = System.currentTimeMillis()
    var firstAudioLogged = false
    val bvChan = kotlinx.coroutines.channels.Channel<ChunkMsg>(capacity = 4)
    val cfader = StreamingCrossfader { pcm ->
        if (!firstAudioLogged) {
            nlog("streaming seg $segIdx first audio at ${System.currentTimeMillis() - tStart}ms (${pcm.size} samples)")
            firstAudioLogged = true
        }
        onAudio(pcm)
    }

    // Consumer: decode each window and feed the crossfader until the
    // channel is closed, then flush the held-back tail.
    val consumerJob = kotlinx.coroutines.CoroutineScope(kotlinx.coroutines.Dispatchers.IO).launch {
        try {
            for (msg in bvChan) {
                val quant = vqDecode(msg.codebooks)
                val audio = runSpeechDecoderV2(quant)
                cfader.feedChunk(audio, msg.realTokens)
            }
            cfader.flushFinal()
        } catch (e: Exception) {
            nlog("streaming seg $segIdx consumer error: ${e.message}")
            // Nobody will receive any more: cancel the channel so a producer
            // blocked in send() wakes up instead of waiting forever.
            bvChan.cancel()
        }
    }

    // Producer: run the interleaved talker/CP loop and dispatch each
    // SEQ_LEN-aligned window of codes immediately. End-of-stream is
    // signalled by closing bvChan after the trailing partial chunk is sent.
    val collected = mutableListOf<IntArray>()
    var nextChunkStart = 0
    // Builds a NUM_CODEBOOKS x SEQ_LEN window starting at `start`; positions
    // past `real` tokens and out-of-range code values are written as 0.
    fun buildChunkCb(start: Int, real: Int): Array<IntArray> = Array(NUM_CODEBOOKS) { cb ->
        IntArray(SEQ_LEN) { t ->
            val src = start + t
            if (src < start + real) {
                val v = collected[src][cb]
                if (v in 0 until CODEBOOK_SIZE) v else 0
            } else 0
        }
    }
    try {
        try {
            runInterleavedPteFromEmbeds(prefill, emptyList(), maxGen) { _, codes ->
                collected.add(codes)
                while (collected.size >= nextChunkStart + SEQ_LEN) {
                    val cb = buildChunkCb(nextChunkStart, SEQ_LEN)
                    kotlinx.coroutines.runBlocking { bvChan.send(ChunkMsg(cb, SEQ_LEN)) }
                    nextChunkStart += EFFECTIVE_CHUNK
                }
            }
        } catch (e: Exception) {
            nlog("streaming seg $segIdx producer error: ${e.message}")
        }
        // Trailing chunk: any remaining tokens after the last full window
        // (covers both the medium-segment partial-tail case and the
        // short-segment <SEQ_LEN single-chunk case where nextChunkStart=0).
        val total = collected.size
        if (total > nextChunkStart) {
            val trailing = total - nextChunkStart
            val cb = buildChunkCb(nextChunkStart, trailing)
            try {
                kotlinx.coroutines.runBlocking { bvChan.send(ChunkMsg(cb, trailing)) }
            } catch (e: Exception) {
                // The channel was cancelled by a failed consumer; nothing
                // can play this chunk any more.
                nlog("streaming seg $segIdx trailing chunk dropped: ${e.message}")
            }
        }
    } finally {
        bvChan.close()
        consumerJob.join()
    }
}
/**
* Run the Hexagon talker + CP generation loop with a fully pre-built
* prefill (voice prefix + all text tokens). Same decode recipe as

View File

@ -1,548 +0,0 @@
package com.kazeia.ui
import android.content.Context
import android.graphics.Canvas
import android.graphics.Color
import android.graphics.Paint
import android.graphics.Path
import android.graphics.RadialGradient
import android.graphics.Shader
import android.util.AttributeSet
import android.view.Choreographer
import android.view.View
import kotlin.math.PI
import kotlin.math.cos
import kotlin.math.max
import kotlin.math.min
import kotlin.math.sin
import kotlin.math.sqrt
/**
 * Large, central orb visualizer — Kazeia's visual "face". Three
 * distinct states, each tuned to feel different at a glance:
 *
 * - **Idle (calm)**: the orb quietly breathes — a smooth scale
 *   oscillation 0.88 → 1.0 over a 5 s cycle — with a soft halo that
 *   pulses in phase. No high-frequency motion. Suggests "waiting,
 *   listening, not anxious".
 *
 * - **Listening (attentive)**: the orb settles slightly larger, a
 *   warmer bright ring appears around it, and its outline deforms
 *   organically with the live mic RMS (blob-like wobble, 8 Fourier
 *   modes, gain-mapped from the RMS). Micro-ripples emit
 *   continuously while speech is present. Feels alive and engaged —
 *   clearly different from Idle's static breathing.
 *
 * - **Speaking (active)**: the orb is rendered **as a contained
 *   spectrometer**. Each of the SPECTRUM_BANDS bands of a
 *   pre-computed band-energy sidecar drives one Fourier mode of the
 *   sphere outline, so the perimeter itself distorts with the voice
 *   content; no bars are drawn inside the sphere. The overall size
 *   pulses with the RMS envelope, so it really looks like "the
 *   sphere itself is speaking" — not an overlaid spectrogram. Strong
 *   amplitude peaks release outward ripple waves on the halo.
 *
 * The whole palette (core, halo, accent used for ring, outline and
 * ripples) is re-derived from a single color set through
 * [setVoiceColor] so each speaker gets a distinct visual identity.
 */
class AudioVisualizerView @JvmOverloads constructor(
context: Context,
attrs: AttributeSet? = null,
defStyleAttr: Int = 0
) : View(context, attrs, defStyleAttr), Choreographer.FrameCallback {
companion object {
    /**
     * Number of spectrogram bands read per frame while speaking. Must match
     * Qwen3TtsEngine.SPECTRUM_BANDS: drawSpeaking indexes every spectrogram
     * row from 0 until this value, so a shorter row would be out of bounds.
     */
    private const val SPECTRUM_BANDS = 12
    /**
     * Listening-mode outline deformation modes (even = smooth blobs);
     * buildBlobPath sums sine modes 1..BLOB_MODES.
     */
    private const val BLOB_MODES = 8
}
// ---------- State ----------
private sealed class State {
object Idle : State()
data class Listening(var micRms: Float, var phaseSeed: Float) : State()
data class Speaking(
val envelope: FloatArray,
val spectrogram: Array<FloatArray>,
val durationMs: Long,
val startedAtMs: Long
) : State()
}
@Volatile private var state: State = State.Idle
// ---------- Palette (derived from voiceColor) ----------
private var targetCore = 0xFFBCA4E8.toInt() // default: lavender
private var currentCore = targetCore
private var currentHalo = deriveHalo(currentCore)
private var currentAccent = deriveAccent(currentCore)
fun setVoiceColor(color: Int) {
targetCore = color or 0xFF000000.toInt() // force opaque
scheduleFrame()
}
// ---------- Animation state ----------
private var frameStartNs = 0L
private var smoothedAmp = 0f // 0..1 orb-size pulsation (all states)
private var smoothedBars = FloatArray(SPECTRUM_BANDS)
private var listeningRingPhase = 0f // rotating shimmer on listening ring
private val ripples = ArrayList<Ripple>()
private var lastSpectroIdx = -1
// ---------- Paints ----------
private val corePaint = Paint(Paint.ANTI_ALIAS_FLAG).apply { style = Paint.Style.FILL }
private val haloPaint = Paint(Paint.ANTI_ALIAS_FLAG).apply { style = Paint.Style.FILL }
private val ringPaint = Paint(Paint.ANTI_ALIAS_FLAG).apply {
style = Paint.Style.STROKE
}
private val ripplePaint = Paint(Paint.ANTI_ALIAS_FLAG).apply {
style = Paint.Style.STROKE
strokeWidth = 3f
}
private val barPaint = Paint(Paint.ANTI_ALIAS_FLAG).apply {
style = Paint.Style.FILL_AND_STROKE
}
private val blobOutlinePaint = Paint(Paint.ANTI_ALIAS_FLAG).apply {
style = Paint.Style.STROKE
}
private val blobPath = Path()
private val spherePath = Path()
init {
setLayerType(LAYER_TYPE_HARDWARE, null)
}
// ==================== Public API ====================
/** Switch to the idle (breathing) state and make sure a frame is scheduled. */
fun setIdle() {
    val alreadyIdle = state is State.Idle
    if (!alreadyIdle) {
        state = State.Idle
        lastSpectroIdx = -1
    }
    scheduleFrame()
}
/**
 * Enter (or stay in) the listening state with the given mic level.
 *
 * @param micRms microphone RMS; clamped to 0..1 before use.
 */
fun setListening(micRms: Float) {
    val level = micRms.coerceIn(0f, 1f)
    when (val current = state) {
        // Already listening: update the level in place so the phase seed is kept.
        is State.Listening -> current.micRms = level
        else -> state = State.Listening(level, (System.nanoTime() and 0xFFFF) / 65535f)
    }
    scheduleFrame()
}
/**
 * Start rendering a speech segment from its pre-computed sidecar data.
 *
 * Falls back to [setIdle] when the data cannot be rendered: an empty
 * envelope or spectrogram, a non-positive duration, or any spectrogram row
 * with fewer than SPECTRUM_BANDS entries. The draw pass indexes every row
 * up to SPECTRUM_BANDS, so a short row would otherwise throw while drawing.
 *
 * @param envelope amplitude envelope sampled evenly across the segment.
 * @param spectrogram per-time-step band energies, one row per step.
 * @param durationMs segment duration; the animation clock starts now.
 */
fun startSpeaking(
    envelope: FloatArray,
    spectrogram: Array<FloatArray>,
    durationMs: Long
) {
    val unusable = envelope.isEmpty() || spectrogram.isEmpty() || durationMs <= 0 ||
        spectrogram.any { row -> row.size < SPECTRUM_BANDS }
    if (unusable) {
        setIdle(); return
    }
    state = State.Speaking(envelope, spectrogram, durationMs, System.currentTimeMillis())
    lastSpectroIdx = -1
    // Soft reset bar heights so the spectrum grows from zero rather
    // than snapping to the idle smoothing residue.
    for (i in smoothedBars.indices) smoothedBars[i] = 0f
    scheduleFrame()
}
// ==================== Lifecycle / scheduling ====================
override fun onAttachedToWindow() {
super.onAttachedToWindow()
frameStartNs = System.nanoTime()
scheduleFrame()
}
/**
 * Stop the frame loop while the view is off-window.
 *
 * The pending-callback flag is cleared together with the callback itself.
 * doFrame always leaves the flag set after re-posting, so if it were left
 * set, scheduleFrame() would be a no-op after re-attach and the animation
 * would never restart.
 */
override fun onDetachedFromWindow() {
    super.onDetachedFromWindow()
    Choreographer.getInstance().removeFrameCallback(this)
    frameScheduled = false
}
private var frameScheduled = false
/** Post a frame callback unless one is already pending or the view is detached. */
private fun scheduleFrame() {
    if (frameScheduled || !isAttachedToWindow) return
    frameScheduled = true
    Choreographer.getInstance().postFrameCallback(this)
}
/**
 * Choreographer tick: eases the palette toward the target color, advances
 * per-state animation, re-posts itself and invalidates the view.
 *
 * Idle (including the frame on which a finished speaking segment falls back
 * to idle) re-posts with a 40 ms delay; listening and speaking re-post for
 * the next frame.
 */
override fun doFrame(frameTimeNanos: Long) {
    frameScheduled = false
    // Ease the palette toward the target (voice change tween).
    currentCore = lerpColor(currentCore, targetCore, 0.12f)
    currentHalo = deriveHalo(currentCore)
    currentAccent = deriveAccent(currentCore)

    val throttled: Boolean
    val snapshot = state
    if (snapshot is State.Listening) {
        listeningRingPhase += 0.015f
        throttled = false
    } else if (snapshot is State.Speaking) {
        // Keep animating for 300 ms past the segment end, then go idle.
        val finished = System.currentTimeMillis() - snapshot.startedAtMs >= snapshot.durationMs + 300
        if (finished) {
            state = State.Idle
            lastSpectroIdx = -1
        }
        throttled = finished
    } else {
        // Idle: self-throttled, enough for the slow breathing cycle.
        throttled = true
    }

    val choreographer = Choreographer.getInstance()
    if (throttled) {
        choreographer.postFrameCallbackDelayed(this, 40)
    } else {
        choreographer.postFrameCallback(this)
    }
    frameScheduled = true
    invalidate()
}
// ==================== Drawing ====================
/** Render the current state centered in the view; nothing is drawn for a zero-sized view. */
override fun onDraw(canvas: Canvas) {
    super.onDraw(canvas)
    val viewW = width.toFloat()
    val viewH = height.toFloat()
    if (viewW <= 0f || viewH <= 0f) return
    val centerX = viewW / 2f
    val centerY = viewH / 2f
    // Orb radius is 39% of the shorter axis, leaving a margin so the
    // ripples and ring are not clipped.
    val orbRadius = min(viewW, viewH) * 0.39f
    val nowMs = System.currentTimeMillis()
    val current = state
    if (current is State.Listening) {
        drawListening(canvas, centerX, centerY, orbRadius, nowMs, current)
    } else if (current is State.Speaking) {
        drawSpeaking(canvas, centerX, centerY, orbRadius, nowMs, current)
    } else {
        drawIdle(canvas, centerX, centerY, orbRadius, nowMs)
    }
}
// ---------- Idle ----------
/**
 * Draw the idle orb: a round core with a halo, both scaled by a 5 s
 * breathing cycle (scale 0.88 → 1.00), plus a static inner highlight.
 *
 * The breathing phase is measured on the System.nanoTime() clock, the same
 * clock frameStartNs is taken from. [now] is a wall-clock millisecond value
 * and cannot be compared with frameStartNs, so it is not used for the phase;
 * this also keeps the cycle unaffected by wall-clock adjustments.
 */
@Suppress("UNUSED_PARAMETER")
private fun drawIdle(canvas: Canvas, cx: Float, cy: Float, maxR: Float, now: Long) {
    // 5 s breathing cycle, amplitude 0.88 → 1.00.
    val elapsedMs = (System.nanoTime() - frameStartNs) / 1_000_000L
    // Normalise to 0..4999 even if the difference were ever negative.
    val cycleMs = ((elapsedMs % 5000L) + 5000L) % 5000L
    val t = cycleMs / 5000f
    val breath = 0.5f - 0.5f * cos((t * 2.0 * PI).toFloat()) // 0..1
    val scale = 0.88f + 0.12f * breath
    val radius = maxR * scale
    smoothedAmp += ((breath * 0.5f) - smoothedAmp) * 0.1f
    // Halo (soft, breathing in phase).
    drawHalo(canvas, cx, cy, maxR * 1.15f * scale, alphaBase = 60, alphaGain = 70)
    // Core — pure round, no deformation.
    drawCore(canvas, cx, cy, radius, shimmer = 0f)
    // Subtle inner highlight — feels alive without movement.
    val hl = Paint(Paint.ANTI_ALIAS_FLAG).apply {
        style = Paint.Style.FILL
        shader = RadialGradient(
            cx - radius * 0.25f, cy - radius * 0.25f, radius * 0.9f,
            Color.argb(60, 255, 255, 255),
            Color.argb(0, 255, 255, 255),
            Shader.TileMode.CLAMP
        )
    }
    canvas.drawCircle(cx, cy, radius, hl)
}
// ---------- Listening ----------
/**
 * Draw the listening orb: halo, a blob-shaped core whose outline is deformed
 * by buildBlobPath, a rotating shimmer ring and micro-ripples, all scaled by
 * the current mic level in [s].
 *
 * @param now wall-clock milliseconds, used for blob phase and ripple timing.
 */
private fun drawListening(
    canvas: Canvas, cx: Float, cy: Float, maxR: Float, now: Long, s: State.Listening
) {
    // Base size slightly larger than Idle so the transition reads.
    val baseScale = 0.93f + 0.08f * s.micRms
    val radius = maxR * baseScale
    // Low-pass the mic level into the shared amplitude state.
    smoothedAmp += (s.micRms - smoothedAmp) * 0.25f
    // Halo — brighter than Idle, responds to mic.
    drawHalo(canvas, cx, cy, maxR * 1.22f * baseScale,
        alphaBase = 90, alphaGain = (130 * s.micRms).toInt().coerceIn(0, 160))
    // Deformed outline (blob): Fourier modes over the circle.
    buildBlobPath(blobPath, cx, cy, radius, s.micRms, s.phaseSeed, now)
    // Filled core with a radial gradient inside the blob path.
    corePaint.shader = RadialGradient(
        cx - radius * 0.15f, cy - radius * 0.25f, radius * 1.1f,
        currentCore, deriveCoreEdge(currentCore),
        Shader.TileMode.CLAMP
    )
    // The circle is drawn larger than the blob and clipped to it, so the
    // fill reaches the outline wherever it bulges outward.
    canvas.save()
    canvas.clipPath(blobPath)
    canvas.drawCircle(cx, cy, radius * 1.3f, corePaint)
    canvas.restore()
    // Outline of the blob, slightly thicker as RMS rises.
    blobOutlinePaint.strokeWidth = 2f + 2f * s.micRms
    blobOutlinePaint.color = withAlpha(currentAccent, 180)
    canvas.drawPath(blobPath, blobOutlinePaint)
    // Rotating shimmer ring — a thin arc segment chasing around.
    drawListeningRing(canvas, cx, cy, radius * 1.08f, s.micRms)
    // Continuous micro-ripples while listening.
    // A ripple is added on every draw call that falls in one 90 ms window
    // out of each 270 ms, provided the mic level is above the floor.
    val rmsMicroFloor = 0.12f
    if (s.micRms > rmsMicroFloor && ((now / 90) % 3 == 0L)) {
        ripples.add(Ripple(bornAtMs = now, peak = s.micRms))
    }
    drawRipples(canvas, cx, cy, maxR, now, listeningMode = true)
}
/**
 * Draw the rotating shimmer arc around the listening orb, followed by a
 * dimmer, shorter tail arc. Nothing is drawn below a mic level of 0.04;
 * stroke width, sweep and alpha all grow with [rms].
 */
private fun drawListeningRing(
    canvas: Canvas, cx: Float, cy: Float, radius: Float, rms: Float
) {
    if (rms < 0.04f) return

    val left = cx - radius
    val top = cy - radius
    val right = cx + radius
    val bottom = cy + radius
    val sweep = 60f + 80f * rms
    val start = (listeningRingPhase * 360f) % 360f

    ringPaint.strokeWidth = 2.5f + 3f * rms

    // Leading arc.
    val leadAlpha = (140 + 110 * rms).toInt().coerceIn(0, 250)
    ringPaint.color = withAlpha(currentAccent, leadAlpha)
    canvas.drawArc(left, top, right, bottom, start, sweep, false, ringPaint)

    // Subtle tail: half the sweep, starting 8 degrees after the lead ends.
    val tailAlpha = (60 + 60 * rms).toInt().coerceIn(0, 160)
    ringPaint.color = withAlpha(currentAccent, tailAlpha)
    canvas.drawArc(left, top, right, bottom, start + sweep + 8f, sweep * 0.5f, false, ringPaint)
}
// ---------- Speaking ----------
/**
 * Draw the speaking orb for the segment described by [s].
 *
 * The elapsed time since `s.startedAtMs` is mapped linearly onto the
 * envelope and spectrogram arrays; both are linearly interpolated between
 * neighbouring samples and then low-pass filtered into `smoothedAmp` and
 * `smoothedBars`. Those smoothed values drive the orb size, halo alpha,
 * outline deformation and outline thickness.
 *
 * @param now wall-clock milliseconds for this draw pass.
 */
private fun drawSpeaking(
    canvas: Canvas, cx: Float, cy: Float, maxR: Float, now: Long, s: State.Speaking
) {
    // Envelope → overall size pulsation + halo intensity.
    val elapsed = now - s.startedAtMs
    val envIdxF = elapsed.toFloat() * s.envelope.size / s.durationMs
    // Indices are clamped, so times before the start or past the end
    // reuse the first or last sample.
    val envIdx = envIdxF.toInt().coerceIn(0, s.envelope.size - 1)
    val envFrac = (envIdxF - envIdx).coerceIn(0f, 1f)
    val env = lerp(
        s.envelope[envIdx],
        s.envelope[min(envIdx + 1, s.envelope.size - 1)],
        envFrac
    )
    smoothedAmp += (env - smoothedAmp) * 0.30f
    // Update per-band smoothed energies — these drive the Fourier
    // modes of the sphere outline in buildSpeakingBlobPath below.
    val timeIdxF = elapsed.toFloat() * s.spectrogram.size / s.durationMs
    val timeIdx = timeIdxF.toInt().coerceIn(0, s.spectrogram.size - 1)
    val timeFrac = (timeIdxF - timeIdx).coerceIn(0f, 1f)
    // Reads SPECTRUM_BANDS entries from each row; rows must be at least that wide.
    for (b in 0 until SPECTRUM_BANDS) {
        val a = s.spectrogram[timeIdx][b]
        val c = s.spectrogram[min(timeIdx + 1, s.spectrogram.size - 1)][b]
        val target = lerp(a, c, timeFrac)
        smoothedBars[b] += (target - smoothedBars[b]) * 0.35f
    }
    val scale = 0.92f + 0.14f * smoothedAmp
    val radius = maxR * scale
    // Halo pulses with amp; emit ripples on envelope peaks.
    drawHalo(canvas, cx, cy, maxR * 1.30f * scale,
        alphaBase = 90, alphaGain = (160 * smoothedAmp).toInt().coerceIn(0, 220))
    // lastSpectroIdx remembers the last envelope index tested, so each
    // index above the 0.45 threshold is checked for a local maximum once.
    if (envIdx != lastSpectroIdx && env > 0.45f) {
        val prev = if (envIdx > 0) s.envelope[envIdx - 1] else 0f
        val next = if (envIdx < s.envelope.size - 1) s.envelope[envIdx + 1] else 0f
        if (env >= prev && env >= next) {
            ripples.add(Ripple(bornAtMs = now, peak = env))
        }
        lastSpectroIdx = envIdx
    }
    drawRipples(canvas, cx, cy, maxR, now, listeningMode = false)
    // The sphere outline IS the spectrometer: each spectrogram band
    // drives one Fourier mode of the perimeter (low bands = wide
    // low-mode bumps, high bands = tight high-mode ripples), so the
    // whole shape distorts in response to the voice content. No
    // internal bars or curves — the sphere itself is what speaks.
    buildSpeakingBlobPath(spherePath, cx, cy, radius, now)
    // Fill the deformed sphere with the voice-tinted gradient.
    corePaint.shader = RadialGradient(
        cx - radius * 0.25f, cy - radius * 0.30f, radius * 1.25f,
        currentCore, deriveCoreEdge(currentCore),
        Shader.TileMode.CLAMP
    )
    canvas.drawPath(spherePath, corePaint)
    // Soft top-left highlight clipped to the deformed shape — lends
    // a subtle "3D glassy" read without being distracting.
    canvas.save()
    canvas.clipPath(spherePath)
    // NOTE(review): this Paint and its gradient are created on every draw call.
    val hl = Paint(Paint.ANTI_ALIAS_FLAG).apply {
        style = Paint.Style.FILL
        shader = RadialGradient(
            cx - radius * 0.28f, cy - radius * 0.30f, radius * 0.9f,
            Color.argb(75, 255, 255, 255),
            Color.argb(0, 255, 255, 255),
            Shader.TileMode.CLAMP
        )
    }
    canvas.drawCircle(cx, cy, radius * 1.2f, hl)
    canvas.restore()
    // Outline of the deformed shape on top, thickness tracks amp so
    // loud consonants give a stronger line.
    blobOutlinePaint.strokeWidth = 2.5f + 3.5f * smoothedAmp
    blobOutlinePaint.color = withAlpha(currentAccent, 230)
    canvas.drawPath(spherePath, blobOutlinePaint)
}
/**
 * Rebuild [path] as the speaking-state perimeter: a circle of [radius]
 * around (cx, cy) whose radius is displaced by a sum of sine modes, one per
 * spectrogram band. Band `b` drives mode `b + 2`, so modes 0 and 1 are never
 * excited and the rest shape stays a circle. Each band's phase drifts at its
 * own rate (faster for higher bands). The displacement is weighted by the
 * band's smoothed energy and by the smoothed overall amplitude. The
 * perimeter is sampled at 96 points and closed.
 */
private fun buildSpeakingBlobPath(
    path: Path, cx: Float, cy: Float, radius: Float, now: Long
) {
    val steps = 96
    val tSec = now / 1000f
    // Max radial displacement contributed by a single band at full energy.
    val modeGain = radius * 0.22f
    // Envelope weight: 0.5 at silence up to 1.0 at full amplitude.
    val envWeight = (0.5f + 0.5f * smoothedAmp).coerceIn(0f, 1f)

    // Radial offset of the perimeter at angle theta.
    fun displacementAt(theta: Float): Float {
        var total = 0f
        for (band in 0 until SPECTRUM_BANDS) {
            val mode = band + 2
            val energy = smoothedBars[band]
            val phase = tSec * (0.45f + 0.22f * band)
            total += modeGain * energy * envWeight *
                sin((mode * theta + phase).toDouble()).toFloat()
        }
        return total
    }

    path.rewind()
    for (step in 0..steps) {
        // The last step wraps back to angle 0 so the outline ends where it began.
        val theta = (step % steps).toFloat() / steps * 2f * PI.toFloat()
        val r = radius + displacementAt(theta)
        val px = cx + r * cos(theta.toDouble()).toFloat()
        val py = cy + r * sin(theta.toDouble()).toFloat()
        if (step == 0) {
            path.moveTo(px, py)
        } else {
            path.lineTo(px, py)
        }
    }
    path.close()
}
// ---------- Helpers: halo / ripples / blob ----------
/**
 * Draw the halo: a radial gradient of the halo color that fades from
 * `alphaBase + alphaGain` (clamped to 0..255) at the center to fully
 * transparent at radius [r].
 */
private fun drawHalo(
    canvas: Canvas, cx: Float, cy: Float, r: Float,
    alphaBase: Int, alphaGain: Int
) {
    val centerAlpha = (alphaBase + alphaGain).coerceIn(0, 255)
    val colors = intArrayOf(withAlpha(currentHalo, centerAlpha), withAlpha(currentHalo, 0))
    val stops = floatArrayOf(0f, 1f)
    haloPaint.shader = RadialGradient(cx, cy, r, colors, stops, Shader.TileMode.CLAMP)
    canvas.drawCircle(cx, cy, r, haloPaint)
}
/**
 * Draw the round core with an off-center radial gradient from the core
 * color to its darker edge color. [shimmer] is accepted but not used here.
 */
private fun drawCore(canvas: Canvas, cx: Float, cy: Float, radius: Float, shimmer: Float) {
    val gradientX = cx - radius * 0.2f
    val gradientY = cy - radius * 0.3f
    val edgeColor = deriveCoreEdge(currentCore)
    corePaint.shader = RadialGradient(
        gradientX, gradientY, radius * 1.15f,
        currentCore, edgeColor,
        Shader.TileMode.CLAMP
    )
    canvas.drawCircle(cx, cy, radius, corePaint)
}
/**
 * Draw every live ripple as an expanding, fading ring and drop the ones that
 * have outlived their lifetime (700 ms while listening, 900 ms otherwise).
 */
private fun drawRipples(
    canvas: Canvas, cx: Float, cy: Float, maxR: Float, now: Long, listeningMode: Boolean
) {
    if (ripples.isEmpty()) return
    val lifetimeMs = if (listeningMode) 700f else 900f

    // Drop expired ripples first; the survivors are drawn in their original order.
    ripples.removeAll { ripple -> (now - ripple.bornAtMs) / lifetimeMs >= 1f }

    for (ripple in ripples) {
        val age = (now - ripple.bornAtMs) / lifetimeMs
        val ringRadius = maxR * (0.58f + 0.62f * age)
        val ringAlpha = ((1f - age) * 150f * ripple.peak).toInt().coerceIn(0, 200)
        ripplePaint.color = withAlpha(currentAccent, ringAlpha)
        ripplePaint.strokeWidth = max(1.2f, (1f - age) * 4f)
        canvas.drawCircle(cx, cy, ringRadius, ripplePaint)
    }
}
/**
 * Rebuild [path] as an organic blob: a circle of [radius] around (cx, cy)
 * displaced by the sum of sine modes 1..BLOB_MODES. Mode `m` contributes with
 * amplitude `amp / m` and drifts at its own slow rate, offset by [phaseSeed].
 * The base amplitude grows with [rms]. The perimeter is sampled at 72 points
 * and closed.
 */
private fun buildBlobPath(
    path: Path, cx: Float, cy: Float, radius: Float,
    rms: Float, phaseSeed: Float, now: Long
) {
    val steps = 72
    val tSec = now / 1000f
    val amp = radius * (0.02f + 0.08f * rms)

    // Radial offset of the perimeter at angle theta.
    fun displacementAt(theta: Float): Float {
        var total = 0f
        for (mode in 1..BLOB_MODES) {
            val phase = phaseSeed * 6.28f + tSec * (0.3f + 0.05f * mode)
            total += (amp / mode) * sin((mode * theta + phase).toDouble()).toFloat()
        }
        return total
    }

    path.rewind()
    for (step in 0..steps) {
        // The last step wraps back to angle 0 so the outline ends where it began.
        val theta = (step % steps).toFloat() / steps * 2f * PI.toFloat()
        val r = radius + displacementAt(theta)
        val px = cx + r * cos(theta.toDouble()).toFloat()
        val py = cy + r * sin(theta.toDouble()).toFloat()
        if (step == 0) {
            path.moveTo(px, py)
        } else {
            path.lineTo(px, py)
        }
    }
    path.close()
}
// ---------- Color helpers ----------
private fun deriveHalo(core: Int): Int = darken(core, 0.18f)
private fun deriveAccent(core: Int): Int = brighten(core, 0.12f)
private fun deriveCoreEdge(core: Int): Int = darken(core, 0.12f)
/** Move each RGB channel of [c] toward 255 by [frac] of the remaining distance; alpha is kept. */
private fun brighten(c: Int, frac: Float): Int {
    fun lift(channel: Int): Int =
        (channel + (255 - channel) * frac).toInt().coerceIn(0, 255)
    return Color.argb(
        Color.alpha(c),
        lift(Color.red(c)),
        lift(Color.green(c)),
        lift(Color.blue(c))
    )
}
/** Scale each RGB channel of [c] by `1 - frac`; alpha is kept. */
private fun darken(c: Int, frac: Float): Int {
    fun dim(channel: Int): Int =
        (channel * (1 - frac)).toInt().coerceIn(0, 255)
    return Color.argb(
        Color.alpha(c),
        dim(Color.red(c)),
        dim(Color.green(c)),
        dim(Color.blue(c))
    )
}
/** Return [c] with its alpha replaced by [alpha], clamped to 0..255. */
private fun withAlpha(c: Int, alpha: Int): Int =
    Color.argb(alpha.coerceIn(0, 255), Color.red(c), Color.green(c), Color.blue(c))
private fun lerp(a: Float, b: Float, t: Float): Float = a + (b - a) * t
/**
 * Ease an opaque color from [from] toward [to] by fraction [t], per channel.
 *
 * Channels are rounded rather than truncated, and for a positive [t] a
 * channel that differs from its target always moves by at least one level.
 * With plain truncation a small fraction such as 0.12 rounds every remaining
 * upward step below 1 down to zero, so rising channels stall several levels
 * short of the target and a repeated tween never arrives.
 *
 * @return the eased color with alpha forced to 255.
 */
private fun lerpColor(from: Int, to: Int, t: Float): Int {
    fun easeChannel(a: Int, b: Int): Int {
        if (a == b) return a
        val eased = Math.round(lerp(a.toFloat(), b.toFloat(), t)).coerceIn(0, 255)
        // a != b and both are in 0..255, so a single step stays in range.
        return if (eased == a && t > 0f) a + (if (b > a) 1 else -1) else eased
    }
    return Color.argb(
        255,
        easeChannel(Color.red(from), Color.red(to)),
        easeChannel(Color.green(from), Color.green(to)),
        easeChannel(Color.blue(from), Color.blue(to))
    )
}
private class Ripple(val bornAtMs: Long, val peak: Float)
}

View File

@ -187,21 +187,6 @@ class ChatActivity : AppCompatActivity() {
"Amir", "Didier", "Sid", "Zelda"
)
/** One color per speaker derived palette (core + halo + bars) is
* generated inside AudioVisualizerView. Chosen to be calm,
* perceptually distinct, and consistent in saturation so switching
* voices changes *hue* rather than *mood*. */
private val voiceColors = listOf(
0xFFBCA4E8.toInt(), // Damien — lavender
0xFFE8A4CC.toInt(), // Elodie — rose
0xFF82D5D0.toInt(), // Jerome — aqua
0xFFE8BFA4.toInt(), // Richard — amber sand
0xFF95D5A6.toInt(), // Amir — emerald
0xFF8FA2D4.toInt(), // Didier — indigo
0xFFE8B89A.toInt(), // Sid — peach
0xFFA4BEE8.toInt() // Zelda — periwinkle
)
private fun setupResourceMonitoring() {
val graphCpu = findViewById<MiniGraphView>(R.id.graphCpu)
val graphGpu = findViewById<MiniGraphView>(R.id.graphGpu)
@ -269,12 +254,6 @@ class ChatActivity : AppCompatActivity() {
override fun onItemSelected(parent: AdapterView<*>?, view: android.view.View?, pos: Int, id: Long) {
val voicePath = "${com.kazeia.KazeiaApplication.MODELS_DIR}/../voix/${voiceFiles[pos]}"
kazeiaService?.setVoice(voicePath)
// Push the matching color to the service so the orb
// view picks it up; the view tweens from the previous
// color so voice changes don't snap visually.
val color = voiceColors[pos.coerceIn(voiceColors.indices)]
kazeiaService?.setVoiceColor(color)
binding.audioViz.setVoiceColor(color)
appendLog("Voix: ${voiceNames[pos]}")
}
override fun onNothingSelected(parent: AdapterView<*>?) {}
@ -347,43 +326,6 @@ class ChatActivity : AppCompatActivity() {
setDebugPanelVisible(debug)
}
}
launch {
// Drive the orb visualizer from the service-side signal.
// Service decides whether the app is idle, tracking the
// mic, or rendering a TTS segment; the view just renders
// it. StartSpeaking is edge-triggered on the envelope
// identity so re-emitting the same signal won't restart
// the animation timer.
var lastSpeakingEnv: FloatArray? = null
service.visualizerSignal.collect { sig ->
when (sig) {
is com.kazeia.service.KazeiaService.VisualizerSignal.Idle -> {
binding.audioViz.setIdle()
lastSpeakingEnv = null
}
is com.kazeia.service.KazeiaService.VisualizerSignal.Listening -> {
binding.audioViz.setListening(sig.micRms)
lastSpeakingEnv = null
}
is com.kazeia.service.KazeiaService.VisualizerSignal.Speaking -> {
if (sig.rmsEnvelope !== lastSpeakingEnv) {
binding.audioViz.startSpeaking(
sig.rmsEnvelope, sig.spectrogram, sig.durationMs
)
lastSpeakingEnv = sig.rmsEnvelope
}
}
}
}
}
launch {
// Keep the view's voice color synchronised with the
// service — covers the initial state when the view
// attaches before the spinner's first callback fires.
service.voiceColor.collect { color ->
binding.audioViz.setVoiceColor(color)
}
}
}
}
}

View File

@ -18,12 +18,17 @@ class ResourceMonitor(private val context: Context) {
private var prevIdle = 0L
private var prevGpuBusy = 0L
private var prevGpuTotal = 0L
private var hasRoot = false
// No-root deployment (2026-04-14): the previous `su -c id` probe used to
// enable GPU/NPU sysfs reads via root, but it also triggered a Magisk
// prompt on every ChatActivity launch. The whole pipeline now runs in
// the app process so root is never needed — GPU/NPU usage is reported
// as -1 (UI shows "—") and the dashboard shows CPU + RAM only.
init {
// Test root access once
hasRoot = try {
val p = Runtime.getRuntime().exec(arrayOf("su", "-c", "id"))
val result = p.inputStream.bufferedReader().readText()
p.waitFor()
result.contains("uid=0")
} catch (_: Exception) { false }
}
fun snapshot(): ResourceSnapshot {
return ResourceSnapshot(
@ -62,9 +67,7 @@ class ResourceMonitor(private val context: Context) {
}
private fun readGpu(): Float {
// Non-root path: some devices expose /sys/class/kgsl/kgsl-3d0/gpubusy
// as world-readable. If it's locked down (most SELinux configs do),
// just return -1 — no root fallback, no Magisk prompt.
// Try direct read first (works on some devices)
try {
val content = File("/sys/class/kgsl/kgsl-3d0/gpubusy").readText().trim()
val parts = content.split("\\s+".toRegex())
@ -78,14 +81,38 @@ class ResourceMonitor(private val context: Context) {
if (dt > 0) return (db * 100f / dt).coerceIn(0f, 100f)
}
} catch (_: Exception) {}
// Try with root
if (hasRoot) {
try {
val content = execRoot("cat /sys/class/kgsl/kgsl-3d0/gpu_busy_percentage").trim()
val pct = content.replace("%", "").trim().toFloatOrNull()
if (pct != null) return pct.coerceIn(0f, 100f)
} catch (_: Exception) {}
}
return -1f
}
private fun readNpu(): Float {
// NPU usage reporting required root sysfs reads (cdsp_rm/cpu_vote,
// /proc/fastrpc) that always triggered a Magisk prompt. Removed with
// the no-root migration — no equivalent public API exists, so the
// UI just shows "—" for NPU load.
// NPU doesn't have a standard busy metric
// Use CDSP (compute DSP) load as proxy if available
if (hasRoot) {
try {
// Check if CDSP is active by reading vote count
val vote = execRoot("cat /sys/bus/platform/devices/soc:qcom,msm-cdsp-rm/cdsp_rm/cpu_vote 2>/dev/null").trim()
if (vote.isNotEmpty()) {
val v = vote.toIntOrNull() ?: 0
return if (v > 0) 100f else 0f
}
} catch (_: Exception) {}
try {
// Alternative: check fastrpc activity
val stat = execRoot("cat /proc/fastrpc 2>/dev/null || echo none").trim()
if (stat != "none" && stat.isNotEmpty()) return 50f
} catch (_: Exception) {}
}
return -1f
}
@ -107,4 +134,12 @@ class ResourceMonitor(private val context: Context) {
} catch (_: Exception) { return 0 }
}
/**
 * Run [cmd] through `su -c` and return everything it wrote to stdout.
 *
 * Best-effort: any failure (no `su` binary, I/O error, interruption) yields
 * an empty string. The stdout reader is closed and the process destroyed on
 * every path, so repeated polling does not accumulate open pipes or
 * lingering child processes.
 *
 * @param cmd shell command line passed verbatim to `su -c`.
 */
private fun execRoot(cmd: String): String {
    var process: Process? = null
    return try {
        process = Runtime.getRuntime().exec(arrayOf("su", "-c", cmd))
        val output = process.inputStream.bufferedReader().use { it.readText() }
        process.waitFor()
        output
    } catch (_: Exception) {
        ""
    } finally {
        process?.destroy()
    }
}
}

View File

@ -100,23 +100,6 @@
</LinearLayout>
<!-- Central orb visualizer: Kazeia's visual "face". Takes the
top half of the chat area so it reads as the primary UI
element; the message list sits below it and shows the
word-by-word reveal of the current reply. Color is driven
by the selected voice (Damien=lavender, Elodie=rose, …). -->
<com.kazeia.ui.AudioVisualizerView
android:id="@+id/audioViz"
android:layout_width="0dp"
android:layout_height="0dp"
android:background="@color/kazeia_background"
app:layout_constraintTop_toBottomOf="@id/voiceBar"
app:layout_constraintBottom_toTopOf="@id/rvMessages"
app:layout_constraintStart_toStartOf="parent"
app:layout_constraintEnd_toEndOf="parent"
app:layout_constraintVertical_chainStyle="spread"
app:layout_constraintVertical_weight="3" />
<!-- Chat messages -->
<androidx.recyclerview.widget.RecyclerView
android:id="@+id/rvMessages"
@ -124,11 +107,10 @@
android:layout_height="0dp"
android:clipToPadding="false"
android:padding="8dp"
app:layout_constraintTop_toBottomOf="@id/audioViz"
app:layout_constraintTop_toBottomOf="@id/voiceBar"
app:layout_constraintBottom_toTopOf="@id/inputBar"
app:layout_constraintStart_toStartOf="parent"
app:layout_constraintEnd_toEndOf="parent"
app:layout_constraintVertical_weight="2" />
app:layout_constraintEnd_toEndOf="parent" />
<!-- Input bar -->
<LinearLayout

View File

@ -1,4 +1,4 @@
# Kazeia Android — Élimination du root pour le LLM (résolu)
# Kazeia Android — Problème d'élimination de root pour le LLM
**Date :** 2026-04-14
**Device :** OnePlus Pad 3 (OPD2415, Snapdragon 8 Elite, SoC `sun`), Android 16 (OxygenOS), Magisk root
@ -6,13 +6,6 @@
---
> **🟢 Statut : RÉSOLU.** Pipeline complet STT + LLM + TTS tourne in-process sans
> aucun appel à `su`. Voir la section **Résolution** en bas du document pour le
> détail du fix. Le reste du document décrit l'investigation initiale et garde
> sa valeur historique.
---
## 1. Contexte général
L'app Kazeia (Android / Kotlin + Jetpack Compose) orchestre un pipeline **STT → LLM → TTS** entièrement on-device sur le Hexagon HTP (V79) du Snapdragon 8 Elite.
@ -231,132 +224,3 @@ Je cherche soit :
- Soit **la confirmation** que l'approche actuelle (root + Magisk remember) est le meilleur compromis accessible, avec éventuellement des suggestions pour minimiser les prompts
Merci.
---
## 10. Résolution (post-mortem)
Une seconde opinion technique a identifié la **vraie cause racine** que
l'investigation locale avait mal diagnostiquée.
### 10.1 Vraie cause
Les processus Android forkés par Zygote (l'app elle-même, ses Services
`android:process=":xxx"`, etc.) héritent des **GIDs supplémentaires**
configurés à l'init pour `untrusted_app`. Ces GIDs incluent l'autorisation
d'accès à `/dev/cdsprpc-smd` et à d'autres canaux fastrpc.
Quand `Runtime.exec("su"…)` ou `ProcessBuilder` font un `fork()` + `exec()`
classique, le `exec()` ne préserve pas tous les credentials utilisés par le
driver fastrpc Qualcomm pour authentifier le client. Le driver retourne
**error 4000 "Failed to load skel"** car il refuse de créer une session DSP
pour ce process.
C'est pour ça que :
- ORT-QNN (Whisper) marchait in-process : chargé via `System.loadLibrary` dans
l'app, qui est Zygote-forked → credentials valides.
- `su -c qnn_llama_runner` marchait : root bypasse les checks fastrpc.
- `ProcessBuilder` du même runner échouait : ni Zygote-forked, ni root.
Le "conflit de version QNN v2.31 vs v2.37" que j'avais soupçonné n'était
**pas le vrai problème**. Les libs étaient déjà unifiées en v2.42 dans jniLibs.
### 10.2 La solution : `LlmModule` JNI in-process
ExecuTorch fournit `org.pytorch.executorch.extension.llm.LlmModule`, un
wrapper JNI autour du même C++ `example::Runner` que le binaire
`qnn_llama_runner`. En l'invoquant depuis l'app (process Zygote-forked), le
DSP fastrpc accepte la session — pas de root nécessaire.
### 10.3 Étapes réelles du fix
1. **Build ExecuTorch Android** avec `EXECUTORCH_BUILD_LLAMA_JNI=ON`,
`EXECUTORCH_BUILD_QNN=ON`, `QNN_SDK_ROOT=/opt/Kazeia/qnn_sdk_242/qairt/2.42.0.251225`
produit `libexecutorch_jni.so` 192 MB qui inclut le runner LLM + le backend QNN.
2. **Patches sources** dans `/opt/Kazeia/executorch-patches/llm_in_process_jni.patch` :
- `backends/qualcomm/CMakeLists.txt` : gate `PyQnnManagerAdaptor` sur `NOT ANDROID`
(le guard original sur `CMAKE_SYSTEM_PROCESSOR MATCHES x86_64` se déclenche
dans des sous-scopes du cross-compile Android).
- `extension/android/jni/jni_layer_llama.cpp`, branche `MODEL_TYPE_QNN_LLAMA` :
- `decoder_model = "qwen3"` (au lieu de `"llama3"` hardcodé)
- `temperature = 0.0f`, `eval_mode = 0` (kKVCached), `shared_buffer = true`
- **Crucial** : choisir `Runner<uint8_t>` ou `Runner<uint16_t>` selon
`module->get("get_kv_io_bit_width")` (mirror du `qnn_llama_runner.cpp main()`).
Hardcoder la mauvaise largeur produit du gibberish déterministe
comme `blocked罩ug darkestSOLEQuotes作者本人 humanity` — la KV cache
est lue/écrite à la mauvaise largeur de byte.
3. **Bundling jniLibs** :
- `libexecutorch.so` / `libexecutorch_jni.so` (build du 13 avril avec LlmModule)
- `libqnn_executorch_backend.so` (assorti)
- `libQnnHtp.so`, `libQnnHtpPrepare.so`, `libQnnHtpV79Stub.so`, `libQnnSystem.so`,
`libQnnHtpV79Skel.so` (tous v2.42 depuis `/opt/Kazeia/qnn_sdk_242/`)
4. **JAR avec `LlmModule.class`** : compilation manuelle via `javac` (le build
gradle de l'AAR demandait android-34 platform non installée).
5. **Réécriture `ExecuTorchLlmEngine.kt`** :
- Constructeur : `LlmModule(MODEL_TYPE_QNN_LLAMA=4, ptePath, tokPath, 0.7f)` puis `.load()`
- `generate(prompt, seqLen, callback, echo=false)` — sinon le callback échoue à
stripper les tokens du prompt
- Template ChatML Qwen3 buildé en Kotlin, mirror exact de
`qnn_llama_runner.cpp::get_formatted_prompt()` pour `kQwen3` (user-first puis
system optionnel puis `<|im_start|>assistant`)
- Filtre inline `<think>…</think>` dans le callback avec lookahead pour les tags
fragmentés sur plusieurs pieces
### 10.4 Métriques validées
| Métrique | Valeur |
|---|---|
| LlmModule.load() | 4.2 s (one-time à l'init de l'app) |
| LLM gen | ~17 tok/s (kv-only) |
| LLM TTFT | ~4 s pour 77 tokens prompt (prefill séquentiel kKVCached) |
| TTS Talker(PTE) | 37 ms/step (vs 45-65 avant) |
| TTS CP(PTE) | 73 ms/step |
| Pipeline e2e | "Bonjour, comment vas-tu ?" → audio en ~7 s |
| Magisk prompts | **0** |
### 10.5 Optimisations restantes (non bloquantes)
- **TTFT** : ré-exporter le `.pte` en `--model_mode hybrid` pour avoir un
`prefill_forward` parallèle → TTFT passerait de ~4 s à <1 s. Pas nécessaire
pour le use case conversationnel actuel.
- **Cosmétique** : le statusbar de l'app affiche encore "Hexagon NPU" pour le
TTS alors que c'est désormais le chemin .pte (label hérité du temps où c'était
ggml-hexagon).
### 10.6 Mémoire projet
État complet documenté dans
`/home/alf/.claude/projects/-opt-Kazeia/memory/project_llm_npu_plan.md`.
Backup git : branche `backup/pre-no-root-migration` + commit `6e6a2d9`.
Backup disk : `/home/alf/kazeia_backup_20260414/`.
### 10.7 Commits clés
- `f32b5dd` (LLM no-root: validate end-to-end pipeline, fix kv_io_bit_width detection)
- `b57719f` (LLM: filter <think> tokens out of the streaming TTS path)
### 10.8 Comparaison de performances avant/après
Mesurée le 2026-04-14 sur le même `.pte` Qwen3-4B avec le même runner C++ —
seule la voie d'invocation change (subprocess `su -c` vs `LlmModule` JNI
in-process).
| Métrique | Avant (su-c subprocess) | Après (in-process LlmModule) | Delta |
|---|---|---|---|
| LLM gen rate | 18.3 tok/s | 17.2 tok/s | -6 % (bruit) |
| LLM prefill speed | 52 ms / prompt-token | 52 ms / prompt-token | identique |
| LLM TTFT (prompt 35 tok) | 1.8 s | 1.8 s | identique |
| LLM TTFT (prompt 80 tok, system+ChatML) | ~4.1 s | 4.2 s | identique |
| TTS Talker(.pte) | 45-65 ms / step | 37 ms / step | +25-40 % (contexte QNN partagé) |
| TTS CP(.pte) | 65-157 ms / step | 73 ms / step | +10-50 % |
| TTS load au boot | 26.7 s | 4.3 s | **6× plus rapide** (plus de subprocess Hexagon 12 s) |
| `LlmModule.load()` au boot | n/a (subprocess à la demande) | 3.1 s (one-time) | overhead init |
| App RSS | ~2 GB app + 1.76 GB subprocess séparé | ~3.7 GB process unique | mêmes ressources globales |
| Erreurs DSP 6031/6033 en concurrence | régulières | disparues | architectural |
| Prompts Magisk | 5 / tour | **0** | UX net |
| Taille APK | ~100 MB | ~100 MB (libexecutorch_jni.so 192 MB → 8.5 MB après strip à l'install) | négligeable |
**Conclusion** : pas de régression LLM (perf identique, le runner C++ est le même).
Gain net sur la TTS (Talker 25-40 % plus rapide grâce au contexte QNN partagé,
load 6× plus rapide). Architecture plus propre : un seul process, un seul runtime
QNN, plus de contention DSP, plus de prompts root.

View File

@ -1,233 +0,0 @@
#!/usr/bin/env python3
"""
Generate per-voice <name>_voice_prefix.bin (9 × 1024 fp32) and
<name>_voice_suffix.bin (2 × 1024 fp32) for Kazeia's on-device TTS
engine (Qwen3-TTS 0.6B-Base voice-clone mode).
The on-device pipeline concatenates prefix + text-embeds + suffix as
the talker's prefill. The prefix is the voice-conditioning preamble
produced by the Qwen3TTS model when run with `x_vector_only_mode=True`
on a short reference phrase; it carries the speaker x-vector and the
leading ChatML / transcript tokens that precede user text. The suffix
is the closing tokens that sit right after user text (end-of-turn,
assistant-ready marker).
Approach: run the model once per voice on a fixed short utterance,
capture every talker input embedding of the first (multi-token)
prefill call via a forward hook; that is the full prefill sequence.
The reference Damien files contain exactly 9 pre-text embeds + 2
post-text embeds, which corresponds to:
[prefix: 9 vectors] [text embeds: N vectors] [suffix: 2 vectors]
The script does not tokenize the utterance to locate the text span:
it keeps the first N_PREFIX (9) and the last N_SUFFIX (2) vectors of
the captured prefill, whatever the number of text embeds in between.
These fixed counts must be re-checked if the chat template changes.
The split was reported to match the Damien files bit-identically
(verified during the first run: /tmp/check_damien_*).
Usage:
export_voice_prefix_suffix.py VOICE.wav [VOICE.wav ...]
--out-dir /path/to/output (default /tmp/voice_prefixes)
--text "Bonjour." (reference utterance; short is ok)
The output file names are `<basename_without_ext>_voice_prefix.bin`
and `<basename_without_ext>_voice_suffix.bin`. Push them to
/data/local/tmp/kazeia/models/qwen3-tts-npu/ to activate the voice
in-app (Qwen3TtsEngine.setVoice reads them from there).
"""
import argparse
import os
import struct
import sys
import warnings
from pathlib import Path
# Suppress every Python warning for the rest of the process (presumably to
# keep the script's own progress lines readable — the model libraries are
# imported later, inside load_model()).
warnings.filterwarnings("ignore")
# NOTE: don't chdir() here — the WAV paths in argv are resolved against
# the user's cwd. Qwen3TTS creates /tmp scratch files internally already.
# Absolute path of the locally cached Hugging Face snapshot opened by
# load_model(). It is machine-specific (user home directory) and pinned to
# one snapshot hash; edit it when running on another host.
MODEL_PATH = (
"/home/alf/.cache/huggingface/hub/"
"models--Qwen--Qwen3-TTS-12Hz-0.6B-Base/snapshots/"
"5d83992436eae1d760afd27aff78a71d676296fc"
)
# Prefix + suffix sizes taken from the reference damien_voice_prefix.bin /
# damien_voice_suffix.bin shipped on the tablet. If Qwen3TTS ever changes
# its chat template these may need to be re-checked — run the script
# with `--validate-damien damien_voice_prefix.bin` to diff against a
# known-good capture.
# Number of leading prefill vectors written to <name>_voice_prefix.bin.
N_PREFIX = 9
# Number of trailing prefill vectors written to <name>_voice_suffix.bin.
N_SUFFIX = 2
# Expected length of one embedding vector; write_bin() raises if the first
# vector it is given has a different length.
TALKER_DIM = 1024
def load_model():
    """Load the Qwen3-TTS checkpoint found at MODEL_PATH and return it.

    torch and qwen_tts are imported here rather than at module level. The
    checkpoint is opened with ``local_files_only=True`` and
    ``device_map="cpu"``; a progress line is printed first.
    """
    import torch  # noqa: F401 -- not referenced by name in this function
    from qwen_tts import Qwen3TTSModel

    print(f"Loading Qwen3-TTS model from {MODEL_PATH}...", flush=True)
    load_options = {"local_files_only": True, "device_map": "cpu"}
    return Qwen3TTSModel.from_pretrained(MODEL_PATH, **load_options)
class _PrefillCapturedSentinel(Exception):
    """Internal control-flow signal: raised by the patched forward as soon
    as the prefill embeddings are recorded, so generate_voice_clone stops
    instead of running the (very slow on CPU) full TTS decode."""


def capture_prefill(tts, wav_path: str, text: str):
    """Record the talker input embeddings of the first prefill call.

    ``tts.model.talker.model.forward`` is temporarily replaced by a wrapper.
    Calls that do not carry a 3-D ``inputs_embeds`` are passed through
    unchanged. On the first call that does, every sequence position of
    batch item 0 is copied out as a float32 NumPy vector and the wrapper
    raises to abort ``generate_voice_clone``: none of the audio is needed,
    and the full non-streaming decode would take several minutes per voice
    on CPU.

    Args:
        tts: object exposing ``model.talker.model.forward`` and
            ``generate_voice_clone``.
        wav_path: reference audio path, forwarded as ``ref_audio``.
        text: reference utterance, forwarded as ``text``.

    Returns:
        The list of captured vectors, one per sequence position.

    Raises:
        RuntimeError: if nothing was captured.

    The original ``forward`` is restored whether or not generation fails.
    """
    import numpy as np

    recorded = []
    talker = tts.model.talker
    real_forward = talker.model.forward

    def recording_forward(input_ids=None, inputs_embeds=None, **kwargs):
        if inputs_embeds is None or inputs_embeds.dim() != 3:
            return real_forward(
                input_ids=input_ids, inputs_embeds=inputs_embeds, **kwargs
            )
        # Only batch item 0 is kept: one vector per sequence position.
        recorded.extend(
            row.detach().cpu().numpy().astype(np.float32)
            for row in inputs_embeds[0]
        )
        raise _PrefillCapturedSentinel()

    talker.model.forward = recording_forward
    try:
        tts.generate_voice_clone(
            text=text,
            ref_audio=wav_path,
            language="french",
            x_vector_only_mode=True,
            non_streaming_mode=True,
        )
    except _PrefillCapturedSentinel:
        pass  # expected: the wrapper aborts generation after the first prefill
    finally:
        talker.model.forward = real_forward

    if not recorded:
        raise RuntimeError("No prefill captured — hook wasn't triggered.")
    return recorded
def write_bin(path: Path, vectors):
    """Write ``vectors`` to ``path`` in the voice prefix/suffix .bin layout.

    Layout (little-endian): int32 vector count, int32 vector length, then
    ``count * length`` float32 values, vector after vector.

    Args:
        path: destination file; overwritten if it exists.
        vectors: sequence of equal-length float sequences. An empty sequence
            produces a header-only file with length TALKER_DIM.

    Raises:
        RuntimeError: if any vector's length differs from TALKER_DIM.
    """
    n = len(vectors)
    dim = len(vectors[0]) if n else TALKER_DIM
    if dim != TALKER_DIM:
        raise RuntimeError(f"Expected dim {TALKER_DIM}, got {dim}")
    # Validate every row before the file is opened. Otherwise a ragged row
    # would fail inside struct.pack after the header and earlier rows were
    # already written, leaving a truncated file that process_voice() later
    # mistakes for a finished one and skips.
    for index, v in enumerate(vectors):
        if len(v) != dim:
            raise RuntimeError(
                f"Expected dim {dim}, got {len(v)} for vector {index}"
            )
    with open(path, "wb") as f:
        f.write(struct.pack("<ii", n, dim))
        for v in vectors:
            f.write(struct.pack(f"<{dim}f", *v))
def process_voice(tts, wav_path: Path, out_dir: Path, text: str):
    """Produce ``<name>_voice_prefix.bin`` / ``<name>_voice_suffix.bin``.

    ``<name>`` is the lower-cased WAV stem up to its first underscore. When
    both output files already exist the voice is skipped. Otherwise the
    prefill is captured and its first N_PREFIX and last N_SUFFIX vectors
    are written to the two files.

    Raises:
        RuntimeError: if the captured prefill has fewer than
            ``N_PREFIX + N_SUFFIX + 1`` vectors.
    """
    # "damien_15s_24k" → "damien"
    name = wav_path.stem.lower().partition("_")[0]
    prefix_path = out_dir / f"{name}_voice_prefix.bin"
    suffix_path = out_dir / f"{name}_voice_suffix.bin"

    if all(target.exists() for target in (prefix_path, suffix_path)):
        print(f" [skip] {name}: prefix/suffix already exist")
        return

    print(f" Capturing prefill for {name} ({wav_path.name})...", flush=True)
    prefill = capture_prefill(tts, str(wav_path), text)
    if len(prefill) < N_PREFIX + N_SUFFIX + 1:
        raise RuntimeError(
            f"Prefill too short for {name}: {len(prefill)} < {N_PREFIX + N_SUFFIX + 1}"
        )

    outputs = (
        (prefix_path, prefill[:N_PREFIX]),
        (suffix_path, prefill[-N_SUFFIX:]),
    )
    for target, rows in outputs:
        write_bin(target, rows)

    print(
        f" Wrote {prefix_path.name} ({N_PREFIX}×{TALKER_DIM}) "
        f"and {suffix_path.name} ({N_SUFFIX}×{TALKER_DIM})",
        flush=True,
    )
def validate_against_damien(tts, wav_path: Path, reference_prefix: Path, text: str):
    """Regenerate Damien's prefix from ``wav_path`` and diff it against a
    reference ``damien_voice_prefix.bin``.

    The reference is read in the layout produced by write_bin(): two
    little-endian int32 values (count, length) followed by that many
    float32 values. The max and mean absolute differences are printed;
    values near zero indicate that this script's slicing reproduces the
    reference. Only the prefix is compared, not the suffix.

    Raises:
        ValueError: if the reference's (count, length) differs from the
            shape of the regenerated prefix, or the file holds fewer values
            than its header announces.
    """
    import numpy as np

    prefill = capture_prefill(tts, str(wav_path), text)
    candidate = np.array(prefill[:N_PREFIX], dtype=np.float32)
    with open(reference_prefix, "rb") as f:
        n, d = struct.unpack("<ii", f.read(8))
        # Explicit little-endian float32, matching the "<ii" header.
        ref = np.frombuffer(f.read(n * d * 4), dtype=np.dtype("<f4")).reshape(n, d)
    # Without this check NumPy would silently broadcast e.g. a 1-row
    # reference against the full prefix and report a meaningless diff.
    if ref.shape != candidate.shape:
        raise ValueError(
            f"Reference prefix shape {ref.shape} does not match "
            f"regenerated prefix shape {candidate.shape}"
        )
    diff = np.abs(candidate - ref)
    print(
        f"Damien prefix validation: max|diff|={diff.max():.3e} "
        f"mean|diff|={diff.mean():.3e} (expect ~0 if script is correct)"
    )
def main():
    """Command-line entry point.

    Parses the WAV list and options, creates the output directory, loads
    the model once, optionally validates against a reference Damien prefix,
    then processes every WAV. Missing files and per-voice failures are
    reported on stdout and do not stop the run. Exits with status 1 only
    when ``--validate-damien`` is given without a WAV whose stem contains
    "damien".
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("wavs", nargs="+", help="Voice WAV files")
    parser.add_argument(
        "--out-dir", default="/tmp/voice_prefixes", help="Output directory"
    )
    parser.add_argument(
        "--text", default="Bonjour.", help="Reference utterance for prefill"
    )
    parser.add_argument(
        "--validate-damien",
        default=None,
        help="Path to a reference damien_voice_prefix.bin for sanity-check",
    )
    args = parser.parse_args()

    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    tts = load_model()

    if args.validate_damien:
        # First input whose stem mentions "damien", if any.
        damien_wav = None
        for candidate in map(Path, args.wavs):
            if "damien" in candidate.stem.lower():
                damien_wav = candidate
                break
        if damien_wav is None:
            print("--validate-damien specified but no damien wav in input list")
            sys.exit(1)
        validate_against_damien(tts, damien_wav, Path(args.validate_damien), args.text)

    for wav in args.wavs:
        wp = Path(wav)
        if wp.exists():
            try:
                process_voice(tts, wp, out_dir, args.text)
            except Exception as e:
                # Best effort: one bad voice must not abort the others.
                print(f" [fail] {wp.name}: {e}")
        else:
            print(f" [miss] {wp}")

    print(f"\nDone. Files written under {out_dir}")
    print(
        "Push to the tablet with, e.g.:\n"
        f" adb push {out_dir}/*_voice_prefix.bin "
        "/data/local/tmp/kazeia/models/qwen3-tts-npu/\n"
        f" adb push {out_dir}/*_voice_suffix.bin "
        "/data/local/tmp/kazeia/models/qwen3-tts-npu/"
    )
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()