Compare commits
No commits in common. "main" and "backup/pre-no-root-migration" have entirely different histories.
main
...
backup/pre
|
|
@ -1,72 +0,0 @@
|
|||
diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt
|
||||
index e93731e..4951e1d 100644
|
||||
--- a/backends/qualcomm/CMakeLists.txt
|
||||
+++ b/backends/qualcomm/CMakeLists.txt
|
||||
@@ -308,8 +308,8 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES Hexagon)
|
||||
)
|
||||
endif()
|
||||
|
||||
-# QNN pybind
|
||||
-if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
|
||||
+# QNN pybind — host Python bindings, not for Android cross-compile
|
||||
+if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64" AND NOT ANDROID)
|
||||
add_subdirectory(
|
||||
${EXECUTORCH_SOURCE_DIR}/third-party/pybind11
|
||||
${CMAKE_CURRENT_BINARY_DIR}/pybind11
|
||||
diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
|
||||
index 45f2414..ae3d79f 100644
|
||||
--- a/extension/android/jni/jni_layer_llama.cpp
|
||||
+++ b/extension/android/jni/jni_layer_llama.cpp
|
||||
@@ -171,14 +171,44 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
|
||||
model_path->toStdString().c_str(),
|
||||
data_files_vector,
|
||||
executorch::extension::Module::LoadMode::MmapUseMlockIgnoreErrors);
|
||||
- std::string decoder_model = "llama3"; // use llama3 for now
|
||||
- runner_ = std::make_unique<example::Runner<uint16_t>>( // QNN runner
|
||||
- std::move(module),
|
||||
- decoder_model.c_str(),
|
||||
- model_path->toStdString().c_str(),
|
||||
- tokenizer_path->toStdString().c_str(),
|
||||
- "",
|
||||
- "");
|
||||
+ std::string decoder_model = "qwen3"; // Kazeia: our .pte was exported with --decoder_model qwen3-4b
|
||||
+
|
||||
+ // Mirror qnn_llama_runner.cpp main(): pick the Runner<T> template based
|
||||
+ // on the model's get_kv_io_bit_width metadata. The 16-bit KV I/O models
|
||||
+ // were introduced after the 8-bit ones, and using the wrong T treats
|
||||
+ // KV-cache bytes as the wrong width → garbage logits → gibberish output.
|
||||
+ example::KvBitWidth kv_bitwidth = example::KvBitWidth::kWidth8;
|
||||
+ if (module->method_names()->count("get_kv_io_bit_width") > 0) {
|
||||
+ kv_bitwidth = static_cast<example::KvBitWidth>(
|
||||
+ module->get("get_kv_io_bit_width").get().toScalar().to<int64_t>());
|
||||
+ }
|
||||
+ // Auto-detect eval_mode: kv-only (0) if the .pte only carries
|
||||
+ // kv_forward, hybrid (1) if it also has prefill_forward (which lets the
|
||||
+ // runner batch the prompt prefill — TTFT drops from ~52 ms/token to
|
||||
+ // sub-ms after the one-shot prefill graph). Same JNI binary works with
|
||||
+ // both export modes, no code change needed when the .pte is upgraded.
|
||||
+ int eval_mode = 0;
|
||||
+ if (module->method_names()->count("prefill_forward") > 0) {
|
||||
+ eval_mode = 1; // EvalMode::kHybrid
|
||||
+ }
|
||||
+ auto make_runner = [&](auto sample) -> std::unique_ptr<llm::IRunner> {
|
||||
+ using T = decltype(sample);
|
||||
+ return std::make_unique<example::Runner<T>>(
|
||||
+ std::move(module),
|
||||
+ decoder_model.c_str(),
|
||||
+ model_path->toStdString().c_str(),
|
||||
+ tokenizer_path->toStdString().c_str(),
|
||||
+ /* performance_output_path */ "",
|
||||
+ /* dump_logits_path */ "",
|
||||
+ /* temperature */ 0.0f, // greedy
|
||||
+ eval_mode,
|
||||
+ /* shared_buffer */ true);
|
||||
+ };
|
||||
+ if (kv_bitwidth == example::KvBitWidth::kWidth16) {
|
||||
+ runner_ = make_runner(uint16_t{0});
|
||||
+ } else {
|
||||
+ runner_ = make_runner(uint8_t{0});
|
||||
+ }
|
||||
model_type_category_ = MODEL_TYPE_CATEGORY_LLM;
|
||||
#endif
|
||||
#if defined(EXECUTORCH_BUILD_MEDIATEK)
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
diff --git a/examples/qualcomm/oss_scripts/llama/__init__.py b/examples/qualcomm/oss_scripts/llama/__init__.py
|
||||
index 963db6e..9ccfdd0 100644
|
||||
index 963db6e..953dc4c 100644
|
||||
--- a/examples/qualcomm/oss_scripts/llama/__init__.py
|
||||
+++ b/examples/qualcomm/oss_scripts/llama/__init__.py
|
||||
@@ -25,9 +25,14 @@ from executorch.examples.models.granite import (
|
||||
|
|
@ -20,7 +20,7 @@ index 963db6e..9ccfdd0 100644
|
|||
from executorch.examples.models.qwen2_5 import (
|
||||
convert_weights as convert_qwen2_5_weights,
|
||||
)
|
||||
@@ -479,6 +484,37 @@ class Qwen3_1_7B(LLMModelConfig):
|
||||
@@ -479,6 +484,34 @@ class Qwen3_1_7B(LLMModelConfig):
|
||||
quant_recipe = Qwen3_1_7BQuantRecipe
|
||||
|
||||
|
||||
|
|
@ -40,13 +40,10 @@ index 963db6e..9ccfdd0 100644
|
|||
+ convert_weights = convert_qwen3_weights
|
||||
+ transform_weight = False
|
||||
+ instruct_model = True
|
||||
+ # num_sharding=1 for hybrid mode: sharding=2 produces a multi-context
|
||||
+ # .pte (2 graphs × 2 shards = 4 contexts) that the LlmModule load path
|
||||
+ # can't restore (error 5010 "Context group 1 does not exist"). With
|
||||
+ # sharding=1 the hybrid export needs ~46 GB RAM peak — the 192 GB swap
|
||||
+ # on /swapfile handles this; compile takes ~80 min wall but completes
|
||||
+ # cleanly. Single-context .pte loads fine through the JNI runner.
|
||||
+ num_sharding = 1
|
||||
+ # Bumped to 2 to halve peak host RAM during QNN compile (4B at sharding=1
|
||||
+ # OOMed on a 62 GB box, peak anon-rss 46 GB). At sharding=2 each shard
|
||||
+ # compile fits comfortably; runner stitches them at load time.
|
||||
+ num_sharding = 2
|
||||
+ masked_softmax = True
|
||||
+ seq_mse_candidates = 0
|
||||
+ r1 = False
|
||||
|
|
|
|||
|
|
@ -5,7 +5,6 @@
|
|||
<uses-permission android:name="android.permission.RECORD_AUDIO" />
|
||||
<uses-permission android:name="android.permission.FOREGROUND_SERVICE" />
|
||||
<uses-permission android:name="android.permission.FOREGROUND_SERVICE_MICROPHONE" />
|
||||
<uses-permission android:name="android.permission.FOREGROUND_SERVICE_MEDIA_PLAYBACK" />
|
||||
<uses-permission android:name="android.permission.FOREGROUND_SERVICE_SPECIAL_USE" />
|
||||
<uses-permission android:name="android.permission.POST_NOTIFICATIONS" />
|
||||
<uses-permission android:name="android.permission.READ_EXTERNAL_STORAGE" />
|
||||
|
|
@ -51,7 +50,7 @@
|
|||
|
||||
<service
|
||||
android:name=".service.KazeiaService"
|
||||
android:foregroundServiceType="microphone|mediaPlayback|specialUse"
|
||||
android:foregroundServiceType="microphone|specialUse"
|
||||
android:exported="true">
|
||||
<property
|
||||
android:name="android.app.PROPERTY_SPECIAL_USE_FGS_SUBTYPE"
|
||||
|
|
|
|||
|
|
@ -1,49 +1,43 @@
|
|||
package com.kazeia.llm
|
||||
|
||||
import android.content.Context
|
||||
import android.util.Log
|
||||
import com.kazeia.core.*
|
||||
import kotlinx.coroutines.Dispatchers
|
||||
import kotlinx.coroutines.withContext
|
||||
import java.io.File
|
||||
import org.pytorch.executorch.extension.llm.LlmCallback
|
||||
import org.pytorch.executorch.extension.llm.LlmModule
|
||||
|
||||
/**
|
||||
* LLM Engine using ExecuTorch LlmModule in-process — **no root required**.
|
||||
*
|
||||
* Runs Qwen3-4B via `org.pytorch.executorch.extension.llm.LlmModule`, which
|
||||
* wraps the same C++ TextLlmRunner as the standalone qnn_llama_runner binary
|
||||
* but inside the app's own process. The QNN HTP backend works because the
|
||||
* DSP fastrpc service accepts the Zygote-forked app process (unlike
|
||||
* ProcessBuilder-spawned subprocesses which lose supplementary GIDs on exec
|
||||
* and get rejected by the fastrpc credential checks).
|
||||
*
|
||||
* Model + tokenizer live in /data/local/tmp/kazeia-et/ (readable by the app
|
||||
* on this device's permissive SELinux policy). libexecutorch.so + QNN libs
|
||||
* are bundled in jniLibs.
|
||||
* LLM Engine using ExecuTorch + QNN backend via subprocess.
|
||||
* Calls qnn_llama_runner binary with root access (Magisk su).
|
||||
*
|
||||
* Current tablet config: Qwen3-4B KV-mode, ~18-22 tok/s on Hexagon V79
|
||||
* (Snapdragon 8 Elite), TTFT 0.9 s, RSS 1.76 GB.
|
||||
*
|
||||
* Why root: the runner binary plus its QNN v2.42 .so deps live in
|
||||
* /data/local/tmp/kazeia-et/ (shell_data_file SELinux context). Untrusted
|
||||
* apps can't exec binaries from there. The Hexagon DSP fastrpc service also
|
||||
* refuses to load the v2.42 Skel from the app's own files dir — only from
|
||||
* nativeLibraryDir — but that dir already holds the TTS stack's v2.31 Skel
|
||||
* (same filename, different version, can't coexist). Rebuilding everything
|
||||
* against one QNN version would eliminate the conflict, but would require
|
||||
* re-exporting the TTS .pte with the new runtime (tooling currently broken
|
||||
* on the flatc schema/dataclass mismatch in the qnn_venv).
|
||||
*/
|
||||
class ExecuTorchLlmEngine(
|
||||
private val context: Context,
|
||||
private val onLog: ((String) -> Unit)? = null
|
||||
) : LlmEngine {
|
||||
|
||||
companion object {
|
||||
private const val TAG = "ExecuTorchLLM"
|
||||
// /no_think disables Qwen3's chain-of-thought block. Compact wording
|
||||
// keeps prefill cost low: this prompt is ~25 tokens vs ~55 in the
|
||||
// earlier verbose version → saves ~1.5 s of TTFT in kv-only mode.
|
||||
private const val SYSTEM_PROMPT = "Tu es Kazeia, à l'écoute en français. Réponds en 1-2 phrases courtes, sans raisonnement. /no_think"
|
||||
|
||||
private const val MODEL_DIR = "/data/local/tmp/kazeia-et"
|
||||
private const val MODEL_PATH = "$MODEL_DIR/hybrid_llama_qnn.pte"
|
||||
private const val TOKENIZER_PATH = "$MODEL_DIR/tokenizer.json"
|
||||
private const val RUNNER_DIR = "/data/local/tmp/kazeia-et"
|
||||
// /no_think disables Qwen3's chain-of-thought block so the full token
|
||||
// budget goes to the actual answer (without it, 120-200 tokens get
|
||||
// consumed by <think>…</think> leaving nothing to speak).
|
||||
// Short-response directive keeps TTS latency manageable — each sentence
|
||||
// costs ~3-5 s on the .pte path, so 1-2 sentences is the sweet spot.
|
||||
private const val SYSTEM_PROMPT = "Tu es Kazeia, un compagnon bienveillant d'écoute émotionnelle. Réponds toujours en français, en 1 ou 2 phrases courtes (40 mots maximum). Pas de raisonnement, donne directement la réponse. /no_think"
|
||||
}
|
||||
|
||||
private var llmModule: LlmModule? = null
|
||||
private var modelName = ""
|
||||
private var loaded = false
|
||||
|
||||
|
|
@ -54,152 +48,77 @@ class ExecuTorchLlmEngine(
|
|||
|
||||
override suspend fun load(modelPath: String, config: LlmConfig) {
|
||||
withContext(Dispatchers.IO) {
|
||||
if (!File(MODEL_PATH).exists()) {
|
||||
nlog("ERROR: model not found at $MODEL_PATH")
|
||||
return@withContext
|
||||
}
|
||||
if (!File(TOKENIZER_PATH).exists()) {
|
||||
nlog("ERROR: tokenizer not found at $TOKENIZER_PATH")
|
||||
val check = execRoot("ls $RUNNER_DIR/qnn_llama_runner $RUNNER_DIR/hybrid_llama_qnn.pte $RUNNER_DIR/tokenizer.json 2>&1")
|
||||
if (check.contains("No such file")) {
|
||||
nlog("ERROR: runner or model not found in $RUNNER_DIR")
|
||||
return@withContext
|
||||
}
|
||||
|
||||
try {
|
||||
val t0 = System.currentTimeMillis()
|
||||
// MODEL_TYPE_QNN_LLAMA=4 selects the Qualcomm runner path in
|
||||
// jni_layer_llama.cpp, which uses example::Runner (same code
|
||||
// as the qnn_llama_runner binary) instead of the generic
|
||||
// TextLLMRunner. Our .pte was exported with
|
||||
// --decoder_model qwen3-4b which requires this path.
|
||||
val MODEL_TYPE_QNN_LLAMA = 4
|
||||
llmModule = LlmModule(MODEL_TYPE_QNN_LLAMA, MODEL_PATH, TOKENIZER_PATH, 0.7f)
|
||||
nlog("LlmModule instantiated in ${System.currentTimeMillis() - t0}ms")
|
||||
deployRunnerScript()
|
||||
|
||||
// Load the PTE into QNN HTP (calls the native load()).
|
||||
val loadResult = llmModule!!.load()
|
||||
if (loadResult != 0) {
|
||||
nlog("ERROR: LlmModule.load() returned $loadResult")
|
||||
llmModule = null
|
||||
return@withContext
|
||||
}
|
||||
nlog("LlmModule loaded in ${System.currentTimeMillis() - t0}ms total")
|
||||
writeFileRoot("$RUNNER_DIR/outputs/prompt.b64",
|
||||
android.util.Base64.encodeToString("Bonjour".toByteArray(), android.util.Base64.NO_WRAP))
|
||||
if (SYSTEM_PROMPT.isNotEmpty()) {
|
||||
writeFileRoot("$RUNNER_DIR/outputs/system.b64",
|
||||
android.util.Base64.encodeToString(SYSTEM_PROMPT.toByteArray(), android.util.Base64.NO_WRAP))
|
||||
} else {
|
||||
execRoot("rm -f $RUNNER_DIR/outputs/system.b64")
|
||||
}
|
||||
val test = execRoot("sh $RUNNER_DIR/run_llm.sh 0.0 80 2>&1")
|
||||
|
||||
if (test.contains("Generated Tokens") || test.contains("Rate:")) {
|
||||
loaded = true
|
||||
modelName = "Qwen3-4B LlmModule"
|
||||
val rateMatch = Regex("Generated \\d+ tokens:.*Rate:\\s+([\\d.]+)").find(test)
|
||||
val rate = rateMatch?.groupValues?.get(1) ?: "?"
|
||||
modelName = "Qwen3 (${rate} tok/s NPU)"
|
||||
nlog("Ready: $modelName")
|
||||
} catch (e: Throwable) {
|
||||
nlog("ERROR: LlmModule init failed: ${e.javaClass.simpleName}: ${e.message}")
|
||||
llmModule = null
|
||||
} else {
|
||||
nlog("ERROR: test failed: ${test.takeLast(200)}")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
override fun isLoaded(): Boolean = loaded && llmModule != null
|
||||
override fun isLoaded(): Boolean = loaded
|
||||
|
||||
override suspend fun generate(
|
||||
prompt: String,
|
||||
params: SamplingParams,
|
||||
onToken: ((String) -> Boolean)?
|
||||
): GenerationResult = withContext(Dispatchers.IO) {
|
||||
val mod = llmModule ?: throw IllegalStateException("Model not loaded")
|
||||
if (!loaded) throw IllegalStateException("Model not loaded")
|
||||
|
||||
val startTime = System.currentTimeMillis()
|
||||
val fullPrompt = buildChatTemplate(prompt)
|
||||
|
||||
writeFileRoot("$RUNNER_DIR/outputs/prompt.b64",
|
||||
android.util.Base64.encodeToString(prompt.toByteArray(), android.util.Base64.NO_WRAP))
|
||||
if (SYSTEM_PROMPT.isNotEmpty()) {
|
||||
writeFileRoot("$RUNNER_DIR/outputs/system.b64",
|
||||
android.util.Base64.encodeToString(SYSTEM_PROMPT.toByteArray(), android.util.Base64.NO_WRAP))
|
||||
} else {
|
||||
execRoot("rm -f $RUNNER_DIR/outputs/system.b64")
|
||||
}
|
||||
|
||||
nlog("Prompt: '${prompt.take(80)}'")
|
||||
|
||||
val responseBuilder = StringBuilder()
|
||||
var firstTokenMs = -1L
|
||||
// Track whether we're inside a <think>…</think> block so the upstream
|
||||
// SentenceStreamer / TTS doesn't get fed reasoning tokens. Even with
|
||||
// /no_think in the system prompt Qwen3 still emits empty <think></think>
|
||||
// wrappers for ~3 tokens before the real answer.
|
||||
var inThink = false
|
||||
val tokenScan = StringBuilder() // small lookahead to spot tag boundaries
|
||||
|
||||
// Singleton special tokens that should never reach the TTS streamer
|
||||
// (they leak when the model wraps its reply or signals end-of-turn).
|
||||
val stripTokens = listOf("<|im_start|>", "<|im_end|>", "<|endoftext|>")
|
||||
val maxTagLen = listOf("<think>", "</think>", "<|im_start|>", "<|im_end|>", "<|endoftext|>")
|
||||
.maxOf { it.length }
|
||||
|
||||
val cb = object : LlmCallback {
|
||||
override fun onResult(result: String) {
|
||||
if (firstTokenMs < 0) firstTokenMs = System.currentTimeMillis() - startTime
|
||||
responseBuilder.append(result)
|
||||
|
||||
// Forward to caller only outside <think> blocks, and strip
|
||||
// singleton special tokens. We accumulate a tiny lookahead buffer
|
||||
// so tag tokens that arrive split ("<thi", "nk>") still match.
|
||||
tokenScan.append(result)
|
||||
while (true) {
|
||||
if (!inThink) {
|
||||
val open = tokenScan.indexOf("<think>")
|
||||
if (open < 0) {
|
||||
// No <think> open pending — strip any singleton tokens
|
||||
// that fully landed in the buffer, then flush prose
|
||||
// up to a safe point preserving lookahead.
|
||||
for (tok in stripTokens) {
|
||||
var idx = tokenScan.indexOf(tok)
|
||||
while (idx >= 0) {
|
||||
tokenScan.delete(idx, idx + tok.length)
|
||||
idx = tokenScan.indexOf(tok)
|
||||
}
|
||||
}
|
||||
val safe = tokenScan.length - maxTagLen
|
||||
if (safe > 0) {
|
||||
onToken?.invoke(tokenScan.substring(0, safe))
|
||||
tokenScan.delete(0, safe)
|
||||
}
|
||||
break
|
||||
}
|
||||
// Flush the prose before the <think> tag, then enter think mode.
|
||||
if (open > 0) onToken?.invoke(tokenScan.substring(0, open))
|
||||
tokenScan.delete(0, open + "<think>".length)
|
||||
inThink = true
|
||||
} else {
|
||||
val close = tokenScan.indexOf("</think>")
|
||||
if (close < 0) {
|
||||
// Drop all buffered chars except a small tail in case
|
||||
// the closing tag is split across tokens.
|
||||
val keep = "</think>".length - 1
|
||||
if (tokenScan.length > keep) tokenScan.delete(0, tokenScan.length - keep)
|
||||
break
|
||||
}
|
||||
tokenScan.delete(0, close + "</think>".length)
|
||||
inThink = false
|
||||
}
|
||||
}
|
||||
}
|
||||
override fun onStats(stats: String) {
|
||||
nlog("stats: ${stats.take(200)}")
|
||||
}
|
||||
}
|
||||
|
||||
val seqLen = minOf(params.maxNewTokens, 512)
|
||||
val rc = try {
|
||||
// echo=false so onResult() only receives the generated completion,
|
||||
// not the prompt tokens echoed back — otherwise the sentence
|
||||
// streamer would feed '<|im_start|>user …' to the TTS.
|
||||
mod.generate(fullPrompt, seqLen, cb, /* echo */ false)
|
||||
} catch (e: Throwable) {
|
||||
nlog("generate() threw: ${e.message}")
|
||||
-1
|
||||
}
|
||||
val output = execRoot("sh $RUNNER_DIR/run_llm.sh ${params.temperature} $seqLen 2>&1")
|
||||
|
||||
// Drain any leftover prose buffered during <think>-suppression so the
|
||||
// last sentence reaches the TTS even if it ran past the closing tag.
|
||||
if (!inThink && tokenScan.isNotEmpty()) {
|
||||
onToken?.invoke(tokenScan.toString())
|
||||
tokenScan.clear()
|
||||
}
|
||||
val tokenCount = Regex("Generated Tokens:\\s+(\\d+)").find(output)
|
||||
?.groupValues?.get(1)?.toIntOrNull() ?: 0
|
||||
val rate = Regex("Generated \\d+ tokens:.*Rate:\\s+([\\d.]+)").find(output)
|
||||
?.groupValues?.get(1)?.toFloatOrNull() ?: 0f
|
||||
val ttft = Regex("Time to first generated token:\\s+([\\d.]+)").find(output)
|
||||
?.groupValues?.get(1)?.toFloatOrNull() ?: 0f
|
||||
|
||||
val responseRaw = execRoot("cat $RUNNER_DIR/outputs/response.txt 2>/dev/null")
|
||||
nlog("RAW: ${responseRaw.take(300)}")
|
||||
val responseText = extractResponse(responseRaw)
|
||||
|
||||
val elapsed = System.currentTimeMillis() - startTime
|
||||
val rawText = responseBuilder.toString()
|
||||
val responseText = cleanResponse(rawText)
|
||||
val tokenCount = rawText.length / 4 // rough estimate without a tokenizer
|
||||
val rate = if (elapsed > 0) (tokenCount * 1000f) / elapsed else 0f
|
||||
nlog("Response: '$responseText'")
|
||||
nlog("Stats: ${tokenCount}tok ${rate}tok/s TTFT=${ttft}s ${elapsed}ms")
|
||||
|
||||
nlog("Response: '${responseText.take(80)}'")
|
||||
nlog("Stats: rc=$rc ~${tokenCount}tok ~${"%.1f".format(rate)}tok/s TTFT=${firstTokenMs}ms total=${elapsed}ms")
|
||||
onToken?.invoke(responseText)
|
||||
|
||||
GenerationResult(
|
||||
text = responseText,
|
||||
|
|
@ -209,32 +128,20 @@ class ExecuTorchLlmEngine(
|
|||
)
|
||||
}
|
||||
|
||||
/**
|
||||
* Qwen3 chat template matching qnn_llama_runner.cpp's get_formatted_prompt()
|
||||
* for DecoderModelVersion::kQwen3. Note the user-first-then-system ordering
|
||||
* (quirky but required — the runner binary produces the same layout and our
|
||||
* .pte was trained with it). Terminates with `<|im_start|>assistant` with
|
||||
* no trailing newline, matching the binary exactly.
|
||||
*/
|
||||
private fun buildChatTemplate(userInput: String): String {
|
||||
val sb = StringBuilder()
|
||||
sb.append("<|im_start|>user\n").append(userInput).append("<|im_end|>\n")
|
||||
if (SYSTEM_PROMPT.isNotEmpty()) {
|
||||
sb.append("<|im_start|>system\n").append(SYSTEM_PROMPT).append("<|im_end|>\n")
|
||||
}
|
||||
sb.append("<|im_start|>assistant")
|
||||
return sb.toString()
|
||||
}
|
||||
|
||||
/** Strip <think>…</think>, special tokens, and leading/trailing whitespace. */
|
||||
private fun cleanResponse(raw: String): String {
|
||||
private fun extractResponse(raw: String): String {
|
||||
var text = raw
|
||||
val thinkEnd = text.indexOf("</think>")
|
||||
if (thinkEnd >= 0) {
|
||||
text = text.substring(thinkEnd + "</think>".length)
|
||||
} else if (text.indexOf("<think>") >= 0) {
|
||||
nlog("WARN: <think> block never closed")
|
||||
return ""
|
||||
} else {
|
||||
val thinkStart = text.indexOf("<think>")
|
||||
val assistantTag = text.indexOf("assistant")
|
||||
if (thinkStart >= 0) {
|
||||
nlog("WARN: <think> block never closed, no response generated")
|
||||
return ""
|
||||
} else if (assistantTag >= 0) {
|
||||
text = text.substring(assistantTag + "assistant".length)
|
||||
}
|
||||
}
|
||||
return text
|
||||
.replace("<|im_start|>", "")
|
||||
|
|
@ -245,9 +152,82 @@ class ExecuTorchLlmEngine(
|
|||
.trim()
|
||||
}
|
||||
|
||||
private fun deployRunnerScript() {
|
||||
val script = """
|
||||
#!/bin/sh
|
||||
cd $RUNNER_DIR
|
||||
export LD_LIBRARY_PATH=$RUNNER_DIR
|
||||
export ADSP_LIBRARY_PATH=$RUNNER_DIR
|
||||
|
||||
TEMP=${'$'}1
|
||||
SEQ_LEN=${'$'}2
|
||||
|
||||
PROMPT=${'$'}(base64 -d $RUNNER_DIR/outputs/prompt.b64)
|
||||
|
||||
rm -f $RUNNER_DIR/outputs/response.txt
|
||||
|
||||
SYSTEM_ARGS=""
|
||||
if [ -s $RUNNER_DIR/outputs/system.b64 ]; then
|
||||
SYSTEM=${'$'}(base64 -d $RUNNER_DIR/outputs/system.b64)
|
||||
SYSTEM_ARGS="--system_prompt"
|
||||
fi
|
||||
|
||||
if [ -n "${'$'}SYSTEM_ARGS" ]; then
|
||||
exec ./qnn_llama_runner \
|
||||
--model_path hybrid_llama_qnn.pte \
|
||||
--tokenizer_path tokenizer.json \
|
||||
--decoder_model_version qwen3 \
|
||||
--output_path $RUNNER_DIR/outputs/response.txt \
|
||||
--performance_output_path $RUNNER_DIR/outputs/perf.txt \
|
||||
--shared_buffer \
|
||||
--system_prompt "${'$'}SYSTEM" \
|
||||
--prompt "${'$'}PROMPT" \
|
||||
--temperature ${'$'}TEMP \
|
||||
--seq_len ${'$'}SEQ_LEN \
|
||||
--eval_mode 0
|
||||
else
|
||||
exec ./qnn_llama_runner \
|
||||
--model_path hybrid_llama_qnn.pte \
|
||||
--tokenizer_path tokenizer.json \
|
||||
--decoder_model_version qwen3 \
|
||||
--output_path $RUNNER_DIR/outputs/response.txt \
|
||||
--performance_output_path $RUNNER_DIR/outputs/perf.txt \
|
||||
--shared_buffer \
|
||||
--prompt "${'$'}PROMPT" \
|
||||
--temperature ${'$'}TEMP \
|
||||
--seq_len ${'$'}SEQ_LEN \
|
||||
--eval_mode 0
|
||||
fi
|
||||
""".trimIndent()
|
||||
|
||||
writeFileRoot("$RUNNER_DIR/run_llm.sh", script)
|
||||
execRoot("chmod 755 $RUNNER_DIR/run_llm.sh")
|
||||
}
|
||||
|
||||
override fun release() {
|
||||
try { llmModule?.resetNative() } catch (_: Throwable) {}
|
||||
llmModule = null
|
||||
loaded = false
|
||||
}
|
||||
|
||||
private fun writeFileRoot(path: String, content: String) {
|
||||
try {
|
||||
val process = Runtime.getRuntime().exec(arrayOf("su", "-c", "cat > $path"))
|
||||
process.outputStream.bufferedWriter().use { it.write(content) }
|
||||
process.waitFor()
|
||||
} catch (e: Exception) {
|
||||
Log.e(TAG, "writeFileRoot failed: ${e.message}")
|
||||
}
|
||||
}
|
||||
|
||||
private fun execRoot(cmd: String): String {
|
||||
return try {
|
||||
val process = Runtime.getRuntime().exec(arrayOf("su", "-c", cmd))
|
||||
val result = process.inputStream.bufferedReader().readText()
|
||||
val error = process.errorStream.bufferedReader().readText()
|
||||
process.waitFor()
|
||||
if (error.isNotEmpty() && result.isEmpty()) error else result
|
||||
} catch (e: Exception) {
|
||||
Log.e(TAG, "execRoot failed: ${e.message}")
|
||||
""
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -142,36 +142,14 @@ class KazeiaPipeline {
|
|||
* the echo-mode playback through the same path — otherwise each TTS
|
||||
* site reimplemented the "streaming-or-fallback" dispatch.
|
||||
*/
|
||||
suspend fun speakText(
|
||||
text: String,
|
||||
// Fires the instant each synthesized sentence starts playing
|
||||
// through the speaker, with the sentence text, audio duration,
|
||||
// and a per-ENVELOPE_WINDOW_MS RMS envelope. Used by
|
||||
// processLlmResponse to defer the KAZEIA chat bubble appearance
|
||||
// until sound is audible, pace word-by-word reveal inside the
|
||||
// bubble, and drive the AudioVisualizerView orb.
|
||||
onSegmentPlaying: ((
|
||||
sentence: String,
|
||||
durationMs: Long,
|
||||
rmsEnvelope: FloatArray,
|
||||
spectrogram: Array<FloatArray>
|
||||
) -> Unit)? = null
|
||||
) {
|
||||
suspend fun speakText(text: String) {
|
||||
val ttsEngine = tts ?: return
|
||||
_pipelineState.value = PipelineState.Speaking
|
||||
try {
|
||||
val qwen = ttsEngine as? com.kazeia.tts.Qwen3TtsEngine
|
||||
if (qwen != null) {
|
||||
qwen.onSegmentPlaying = onSegmentPlaying
|
||||
qwen.startStreamingSession()
|
||||
val streamer = com.kazeia.tts.SentenceStreamer { raw ->
|
||||
// Strip emoji / non-speakable pictographs before TTS
|
||||
// so a standalone "😊" doesn't become its own noisy
|
||||
// segment. The chat bubble keeps the original text —
|
||||
// only the audio path sees the cleaned version.
|
||||
val spoken = stripNonSpeakable(raw).trim()
|
||||
if (spoken.isNotEmpty()) qwen.enqueueSentence(spoken)
|
||||
}
|
||||
val streamer = com.kazeia.tts.SentenceStreamer { s -> qwen.enqueueSentence(s) }
|
||||
streamer.append(text)
|
||||
streamer.flush()
|
||||
qwen.endStreamingSession()
|
||||
|
|
@ -190,41 +168,6 @@ class KazeiaPipeline {
|
|||
_messages.value = _messages.value + msg
|
||||
}
|
||||
|
||||
/**
|
||||
* Drop emoji + dingbat + pictographic characters so the TTS engine
|
||||
* doesn't try to synthesize them. Covers the main Unicode emoji
|
||||
* blocks (Miscellaneous Symbols, Dingbats, Emoticons, Transport,
|
||||
* Supplemental Symbols and Pictographs, etc.) plus variation
|
||||
* selectors and zero-width joiners that tag emoji sequences.
|
||||
* Keeps everything in the Basic Latin / Latin-1 / Latin Extended
|
||||
* ranges + common French punctuation untouched.
|
||||
*/
|
||||
private fun stripNonSpeakable(text: String): String {
|
||||
val sb = StringBuilder(text.length)
|
||||
var i = 0
|
||||
while (i < text.length) {
|
||||
val cp = text.codePointAt(i)
|
||||
val skip = when {
|
||||
cp in 0x2600..0x27BF -> true // misc symbols + dingbats
|
||||
cp in 0x1F300..0x1F5FF -> true // pictographs
|
||||
cp in 0x1F600..0x1F64F -> true // emoticons
|
||||
cp in 0x1F680..0x1F6FF -> true // transport
|
||||
cp in 0x1F700..0x1F77F -> true // alchemical
|
||||
cp in 0x1F780..0x1F7FF -> true // geometric extended
|
||||
cp in 0x1F800..0x1F8FF -> true // supplemental arrows-c
|
||||
cp in 0x1F900..0x1F9FF -> true // supplemental pictographs
|
||||
cp in 0x1FA00..0x1FAFF -> true // symbols & pictographs extended-A
|
||||
cp == 0x200D -> true // zero-width joiner
|
||||
cp in 0xFE00..0xFE0F -> true // variation selectors
|
||||
cp in 0x1F1E6..0x1F1FF -> true // regional indicators (flags)
|
||||
else -> false
|
||||
}
|
||||
if (!skip) sb.appendCodePoint(cp)
|
||||
i += Character.charCount(cp)
|
||||
}
|
||||
return sb.toString()
|
||||
}
|
||||
|
||||
fun log(msg: String) {
|
||||
Log.i(TAG, msg)
|
||||
val time = java.text.SimpleDateFormat("HH:mm:ss.SSS", java.util.Locale.FRANCE)
|
||||
|
|
|
|||
|
|
@ -83,34 +83,6 @@ class KazeiaService : Service() {
|
|||
private val _isListening = MutableStateFlow(false)
|
||||
val isListening: StateFlow<Boolean> = _isListening
|
||||
|
||||
// Drives the AudioVisualizerView orb. Pushed from the VAD loop
|
||||
// during mic capture (mic RMS, normalized) and from the TTS engine's
|
||||
// onSegmentPlaying callback (TTS RMS envelope per-segment). The view
|
||||
// reads this via collectLatest in ChatActivity; the signals carry
|
||||
// their own state so the visualizer knows whether it's idle, tracking
|
||||
// the mic, or rendering a TTS segment.
|
||||
sealed class VisualizerSignal {
|
||||
object Idle : VisualizerSignal()
|
||||
data class Listening(val micRms: Float) : VisualizerSignal()
|
||||
data class Speaking(
|
||||
val rmsEnvelope: FloatArray,
|
||||
val spectrogram: Array<FloatArray>,
|
||||
val durationMs: Long
|
||||
) : VisualizerSignal()
|
||||
}
|
||||
private val _visualizerSignal = MutableStateFlow<VisualizerSignal>(VisualizerSignal.Idle)
|
||||
val visualizerSignal: StateFlow<VisualizerSignal> = _visualizerSignal
|
||||
|
||||
// Kazeia's orb color is bound to the selected voice so the user
|
||||
// visually associates a palette with the speaker they picked. UI
|
||||
// sets this whenever the voice spinner changes; the orb view
|
||||
// listens via the StateFlow and tweens the current → target color.
|
||||
private val _voiceColor = MutableStateFlow(0xFFBCA4E8.toInt()) // lavender = Damien default
|
||||
val voiceColor: StateFlow<Int> = _voiceColor
|
||||
|
||||
/** Called by the UI whenever the voice selector changes. */
|
||||
fun setVoiceColor(color: Int) { _voiceColor.value = color }
|
||||
|
||||
private val _debugMode = MutableStateFlow(false)
|
||||
val debugMode: StateFlow<Boolean> = _debugMode
|
||||
|
||||
|
|
@ -202,12 +174,6 @@ class KazeiaService : Service() {
|
|||
if (!::llm.isInitialized || !llm.isLoaded()) {
|
||||
log("Stream LLM: LLM not ready"); return@launch
|
||||
}
|
||||
// Set pipeline state to Speaking so the continuous-
|
||||
// listening mic loop (line ~824) drops frames during
|
||||
// TTS playback. Without this, the mic picks up the
|
||||
// tablet speaker and feeds our own TTS back into STT,
|
||||
// creating an infinite loop.
|
||||
_pipelineState.value = PipelineState.Speaking
|
||||
qwenTts.startStreamingSession()
|
||||
val tStart = System.currentTimeMillis()
|
||||
var firstSentenceLogged = false
|
||||
|
|
@ -233,9 +199,6 @@ class KazeiaService : Service() {
|
|||
} catch (e: Exception) {
|
||||
log("Stream LLM error: ${e.message}")
|
||||
e.printStackTrace()
|
||||
} finally {
|
||||
// Back to Idle so the next mic frame is accepted.
|
||||
_pipelineState.value = PipelineState.Idle
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -451,18 +414,10 @@ class KazeiaService : Service() {
|
|||
this, Manifest.permission.RECORD_AUDIO
|
||||
) == PackageManager.PERMISSION_GRANTED
|
||||
|
||||
// FOREGROUND_SERVICE_TYPE_MEDIA_PLAYBACK is required so ColorOS (and
|
||||
// stock Android 14+ policies) don't mute the TTS AudioTrack with
|
||||
// "clientVolume" at ~600 ms after play(). Without it the FGS was
|
||||
// classified as mic-only or special-use and background-audio
|
||||
// hardening silenced it. Combine with MICROPHONE so mic input keeps
|
||||
// working during STT.
|
||||
val fgsType = if (hasMicPermission) {
|
||||
ServiceInfo.FOREGROUND_SERVICE_TYPE_MICROPHONE or
|
||||
ServiceInfo.FOREGROUND_SERVICE_TYPE_MEDIA_PLAYBACK
|
||||
ServiceInfo.FOREGROUND_SERVICE_TYPE_MICROPHONE
|
||||
} else {
|
||||
ServiceInfo.FOREGROUND_SERVICE_TYPE_MEDIA_PLAYBACK or
|
||||
ServiceInfo.FOREGROUND_SERVICE_TYPE_SPECIAL_USE
|
||||
ServiceInfo.FOREGROUND_SERVICE_TYPE_SPECIAL_USE
|
||||
}
|
||||
|
||||
if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.Q) {
|
||||
|
|
@ -495,7 +450,7 @@ class KazeiaService : Service() {
|
|||
// TTS: try Qwen3-TTS (NPU Hexagon), fallback to Android TTS
|
||||
_loadingState.value = LoadingState(15, "TTS Qwen3…")
|
||||
try {
|
||||
val qwenTts = com.kazeia.tts.Qwen3TtsEngine(nativeLibDir, this@KazeiaService) { msg -> log("[TTS] $msg") }
|
||||
val qwenTts = com.kazeia.tts.Qwen3TtsEngine(nativeLibDir) { msg -> log("[TTS] $msg") }
|
||||
qwenTts.load("$modelsDir/qwen3-tts-npu")
|
||||
if (qwenTts.isLoaded()) {
|
||||
tts = qwenTts
|
||||
|
|
@ -563,7 +518,7 @@ class KazeiaService : Service() {
|
|||
|
||||
// LLM: Qwen3-4B on Hexagon V79 via qnn_llama_runner.
|
||||
_loadingState.value = LoadingState(50, "LLM Qwen3-4B NPU…")
|
||||
llm = ExecuTorchLlmEngine(this@KazeiaService) { msg -> log(msg) }
|
||||
llm = ExecuTorchLlmEngine { msg -> log(msg) }
|
||||
try {
|
||||
llm.load("${KazeiaApplication.MODELS_DIR}/qwen3-4b", com.kazeia.core.LlmConfig())
|
||||
} catch (e: Exception) {
|
||||
|
|
@ -628,16 +583,6 @@ class KazeiaService : Service() {
|
|||
if (chatterbox != null) {
|
||||
chatterbox.setVoice(voicePath)
|
||||
log("Voice set to: $voicePath")
|
||||
return
|
||||
}
|
||||
val qwen = tts as? com.kazeia.tts.Qwen3TtsEngine
|
||||
if (qwen != null) {
|
||||
// Hot-swap prefix/suffix embeddings — no model reload. Takes
|
||||
// effect from the NEXT synthesized segment (current in-flight
|
||||
// one, if any, finishes with the old voice since the arrays
|
||||
// are already in its closure).
|
||||
qwen.setVoice(voicePath)
|
||||
log("Voice set to: $voicePath")
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -890,14 +835,6 @@ class KazeiaService : Service() {
|
|||
for (s in frame) sumSq += s.toLong() * s.toLong()
|
||||
val rms = Math.sqrt(sumSq.toDouble() / frameSize).toInt()
|
||||
|
||||
// Drive the visualizer orb. Normalize with the same
|
||||
// sqrt squashing used for TTS so loud peaks don't
|
||||
// saturate and quiet speech is still visible. The
|
||||
// visualizer stays in Listening mode; it will swap
|
||||
// to Speaking or Idle when pipelineState moves on.
|
||||
val rmsNorm = kotlin.math.sqrt((rms / 6000f).coerceIn(0f, 1f))
|
||||
_visualizerSignal.value = VisualizerSignal.Listening(rmsNorm)
|
||||
|
||||
// Log RMS every second for calibration
|
||||
if (frameCount % 10 == 0) {
|
||||
Log.d(TAG, "VAD RMS=$rms (threshold=$silenceThreshold)")
|
||||
|
|
@ -1247,100 +1184,13 @@ class KazeiaService : Service() {
|
|||
log("LLM stats: ${result.tokenCount} tokens in ${result.timeMs}ms (${result.tokensPerSecond} tok/s)")
|
||||
|
||||
if (responseText.isNotEmpty()) {
|
||||
// Mark the pipeline as Speaking for the duration of TTS so
|
||||
// the continuous-listening mic loop drops frames and we
|
||||
// don't feed our own speaker output back into STT.
|
||||
_pipelineState.value = PipelineState.Speaking
|
||||
// Create a KAZEIA bubble up-front. Until the first TTS
|
||||
// segment actually starts playing the bubble shows an
|
||||
// animated "." → ".." → "..." typing indicator so the
|
||||
// user knows Kazeia is thinking/synthesising; once the
|
||||
// first segment plays the dots are cleared and the
|
||||
// per-sentence word reveal takes over.
|
||||
val bubble = ChatMessage(role = ChatMessage.Role.KAZEIA, text = ".")
|
||||
addMessage(bubble)
|
||||
val revealScope = kotlinx.coroutines.CoroutineScope(kotlinx.coroutines.Dispatchers.Default)
|
||||
var revealedSoFar = ""
|
||||
val revealJobs = mutableListOf<kotlinx.coroutines.Job>()
|
||||
val firstSegmentSeen = java.util.concurrent.atomic.AtomicBoolean(false)
|
||||
val typingJob = revealScope.launch {
|
||||
var tick = 0
|
||||
while (!firstSegmentSeen.get()) {
|
||||
val dots = ".".repeat(1 + (tick % 3)) // . → .. → ...
|
||||
updateMessageText(bubble.id, dots)
|
||||
tick++
|
||||
kotlinx.coroutines.delay(400)
|
||||
}
|
||||
}
|
||||
try {
|
||||
pipeline.speakText(responseText) { sentence, durationMs, envelope, spectrogram ->
|
||||
// First segment: stop the typing indicator and
|
||||
// reset the bubble to empty so the word reveal
|
||||
// doesn't collide with the dots.
|
||||
if (firstSegmentSeen.compareAndSet(false, true)) {
|
||||
try { typingJob.cancel() } catch (_: Exception) {}
|
||||
updateMessageText(bubble.id, "")
|
||||
}
|
||||
// Push the envelope + spectrogram to the
|
||||
// visualizer at the same moment the MediaPlayer
|
||||
// starts playing so the orb reacts to this
|
||||
// segment's actual energy and the in-sphere
|
||||
// spectrum bars match the audio content.
|
||||
_visualizerSignal.value =
|
||||
VisualizerSignal.Speaking(envelope, spectrogram, durationMs)
|
||||
// Start a coroutine that appends one word at a time
|
||||
// over the segment's audio duration. Words are
|
||||
// separated on whitespace; punctuation rides with
|
||||
// the trailing word. The prefix (= text already
|
||||
// revealed from previous sentences) carries over so
|
||||
// earlier sentences stay on screen.
|
||||
val prefix = revealedSoFar
|
||||
val words = sentence.split(Regex("\\s+")).filter { it.isNotBlank() }
|
||||
revealedSoFar =
|
||||
if (prefix.isEmpty()) sentence
|
||||
else "$prefix $sentence"
|
||||
if (words.isEmpty()) return@speakText
|
||||
val perWordMs = (durationMs / words.size).coerceAtLeast(40L)
|
||||
val job = revealScope.launch {
|
||||
val sb = StringBuilder(prefix)
|
||||
if (prefix.isNotEmpty()) sb.append(' ')
|
||||
// Immediately reveal the first word so there's
|
||||
// no visible gap between audio start and text.
|
||||
sb.append(words[0])
|
||||
updateMessageText(bubble.id, sb.toString())
|
||||
for (i in 1 until words.size) {
|
||||
kotlinx.coroutines.delay(perWordMs)
|
||||
sb.append(' ').append(words[i])
|
||||
updateMessageText(bubble.id, sb.toString())
|
||||
}
|
||||
}
|
||||
revealJobs.add(job)
|
||||
}
|
||||
// After all segments finished playing, ensure the full
|
||||
// text is visible even if a reveal job was racing.
|
||||
revealJobs.forEach { try { it.join() } catch (_: Exception) {} }
|
||||
updateMessageText(bubble.id, responseText)
|
||||
} finally {
|
||||
// Defensive: cancel the typing dots in case no
|
||||
// segment ever fired (e.g. the response was entirely
|
||||
// emojis and got stripped empty).
|
||||
firstSegmentSeen.set(true)
|
||||
try { typingJob.cancel() } catch (_: Exception) {}
|
||||
_pipelineState.value = if (_isListening.value)
|
||||
PipelineState.Listening else PipelineState.Idle
|
||||
// If we're going back to mic listening, the VAD loop
|
||||
// will keep pushing Listening signals; otherwise drop
|
||||
// to Idle so the orb settles back to its breathing
|
||||
// baseline.
|
||||
if (!_isListening.value) {
|
||||
_visualizerSignal.value = VisualizerSignal.Idle
|
||||
}
|
||||
}
|
||||
} else {
|
||||
_pipelineState.value = if (_isListening.value)
|
||||
PipelineState.Listening else PipelineState.Idle
|
||||
addMessage(ChatMessage(role = ChatMessage.Role.KAZEIA, text = responseText))
|
||||
pipeline.speakText(responseText)
|
||||
}
|
||||
|
||||
_pipelineState.value = if (_isListening.value)
|
||||
PipelineState.Listening else PipelineState.Idle
|
||||
|
||||
} catch (e: Exception) {
|
||||
_aiWorkload.value = _aiWorkload.value.copy(llmActive = false)
|
||||
log("ERROR: LLM generation error: ${e.message}")
|
||||
|
|
@ -1357,19 +1207,6 @@ class KazeiaService : Service() {
|
|||
_messages.value = _messages.value + message
|
||||
}
|
||||
|
||||
/**
 * Swap the text of one already-published message, looked up by [id].
 *
 * The most recent message carrying that id is replaced with a copy whose
 * text is [newText]; every other message is left as-is. When no message
 * has the id the call does nothing.
 */
private fun updateMessageText(id: Long, newText: String) {
    val snapshot = _messages.value
    val position = snapshot.indexOfLast { it.id == id }
    if (position >= 0) {
        // Publish a fresh list rather than mutating the one already exposed.
        val revised = snapshot.toMutableList()
        revised[position] = snapshot[position].copy(text = newText)
        _messages.value = revised
    }
}
|
||||
|
||||
private fun createNotification(): Notification {
|
||||
val intent = Intent(this, ChatActivity::class.java)
|
||||
val pendingIntent = PendingIntent.getActivity(
|
||||
|
|
|
|||
|
|
@ -37,7 +37,6 @@ import kotlin.coroutines.resume
|
|||
*/
|
||||
class Qwen3TtsEngine(
|
||||
private val nativeLibDir: String,
|
||||
private val context: android.content.Context? = null,
|
||||
private val onLog: ((String) -> Unit)? = null
|
||||
) : TtsEngine {
|
||||
|
||||
|
|
@ -89,38 +88,6 @@ class Qwen3TtsEngine(
|
|||
private const val TOKEN_USER = 872
|
||||
private const val TOKEN_ASSISTANT = 1042
|
||||
private const val TOKEN_NEWLINE = 198
|
||||
|
||||
// Streaming decode: when true, BigVGAN dispatches a chunk's audio as
|
||||
// soon as SEQ_LEN codes are ready from the talker/CP loop rather than
|
||||
// waiting for all tokens. For long segments this overlaps the final
|
||||
// BigVGAN passes with ongoing talker/CP work on Hexagon, cutting the
|
||||
// first-audio latency by ~4 s. Short segments (<SEQ_LEN codes) fall
|
||||
// back to the single-chunk path with zero difference. Flag exists so
|
||||
// the sequential path can be re-enabled for A/B comparison.
|
||||
private const val USE_STREAMING_DECODE = true
|
||||
|
||||
// ColorOS Audio Hardening silently mutes AudioTrack in background/FGS
|
||||
// context (confirmed via `event:muted updated source:clientVolume`
|
||||
// logs, same behaviour across USAGE_MEDIA, USAGE_ASSISTANT, and
|
||||
// USAGE_VOICE_COMMUNICATION). When this flag is true, each
|
||||
// generated segment is written as a WAV to app-owned shared
|
||||
// storage and played via MediaPlayer instead. Slightly slower
|
||||
// (WAV write + MediaPlayer prepare add ~150 ms per segment) but
|
||||
// it's the only reliable path to audible output on this device.
|
||||
private const val USE_MEDIAPLAYER_FALLBACK = true
|
||||
|
||||
// Window size for the TTS→visualizer RMS sidecar. 50 ms at 24 kHz
|
||||
// = 1200 samples/window — small enough for a 60 fps visualizer to
|
||||
// track formants, large enough to run at negligible CPU cost.
|
||||
const val ENVELOPE_WINDOW_MS = 50
|
||||
// FFT size for the spectrum-in-sphere sidecar. 1024 samples at
|
||||
// 24 kHz = 43 ms — slightly narrower than the hop so each frame
|
||||
// gives a clean snapshot centered on its hop boundary.
|
||||
private const val FFT_SIZE = 1024
|
||||
// Number of log-spaced bands 120 Hz–4 kHz rendered as vertical
|
||||
// bars inside the sphere during Speaking. 12 feels like a real
|
||||
// spectrometer without cluttering at smaller sphere sizes.
|
||||
const val SPECTRUM_BANDS = 12
|
||||
}
|
||||
|
||||
private var ortEnv: OrtEnvironment? = null
|
||||
|
|
@ -276,12 +243,7 @@ class Qwen3TtsEngine(
|
|||
return session
|
||||
}
|
||||
|
||||
// Speech decoder V2 on CPU. Two paths tried, both worse than CPU:
|
||||
// - HTP: BigVGAN convolutions too slow to compile (timeout)
|
||||
// - GPU Adreno via QNN GPU EP: model loads but per-phrase
|
||||
// inference is ~3.5 s vs ~2 s on CPU (GPU/CPU memory transfer
|
||||
// overhead dominates for this conv-heavy model)
|
||||
// CPU 8-thread stays the practical optimum.
|
||||
// Speech decoder V2 on CPU (HTP tested: BigVGAN convolutions too slow to compile)
|
||||
val v2Path = "$path/v2_pre_conv"
|
||||
if (File("$v2Path/model.onnx").exists()) {
|
||||
nlog("Loading V2 speech decoder (CPU ONNX)...")
|
||||
|
|
@ -608,53 +570,8 @@ class Qwen3TtsEngine(
|
|||
|
||||
override fun isLoaded(): Boolean = loaded
|
||||
|
||||
/**
 * Hot-swap the speaker prefix/suffix embeddings used for voice
 * conditioning.
 *
 * [voicePath] is a WAV path such as `/…/voix/elodie.wav`. The voice id is
 * the lower-cased basename without extension; the embeddings are read from
 * `<id>_voice_prefix.bin` and `<id>_voice_suffix.bin` in the model
 * directory. Each file is little-endian: an Int32 row count, an Int32
 * embedding dimension (must equal [TALKER_DIM]), then `count * dim`
 * Float32 values.
 *
 * If either file is missing, or either file fails validation, a message is
 * logged and the current voice is kept. Both files are fully parsed before
 * [damienVoicePrefix] / [damienVoiceSuffix] are touched, so a bad suffix
 * file never leaves a new prefix paired with the old suffix.
 *
 * The per-voice files are generated offline with
 * scripts/prepare_tts_native.py (one run per voice WAV) and pushed into
 * the model directory.
 *
 * NOTE(review): the model directory is hard-coded here rather than taken
 * from the path passed to load() — confirm they always match.
 * NOTE(review): prefix and suffix are assigned as two separate writes; a
 * concurrent reader could observe the new prefix with the old suffix.
 * Whether the backing fields are volatile is not visible from here.
 */
fun setVoice(voicePath: String) {
    val modelDir = "/data/local/tmp/kazeia/models/qwen3-tts-npu"
    val id = java.io.File(voicePath).nameWithoutExtension.lowercase()
    val prefixFile = java.io.File("$modelDir/${id}_voice_prefix.bin")
    val suffixFile = java.io.File("$modelDir/${id}_voice_suffix.bin")
    if (!prefixFile.exists() || !suffixFile.exists()) {
        nlog("Voice '$id' not available (missing ${prefixFile.name} or ${suffixFile.name}); keeping current voice. " +
            "Run scripts/prepare_tts_native.py with this WAV to generate the files.")
        return
    }

    // Parse one embedding file. The header is validated against the actual
    // payload size BEFORE allocating, so a corrupt count cannot trigger a
    // huge allocation (an OutOfMemoryError would not be caught below).
    fun readEmbeds(file: java.io.File, label: String): Array<FloatArray> {
        val bytes = file.readBytes()
        if (bytes.size < 8) throw IllegalStateException("$label file too short (${bytes.size} bytes)")
        val buf = java.nio.ByteBuffer.wrap(bytes).order(java.nio.ByteOrder.LITTLE_ENDIAN)
        val count = buf.int
        val dim = buf.int
        if (dim != TALKER_DIM) throw IllegalStateException("$label dim $dim != $TALKER_DIM")
        val neededBytes = count.toLong() * TALKER_DIM * 4L
        if (count < 0 || neededBytes > buf.remaining()) {
            throw IllegalStateException("$label count $count does not fit in ${buf.remaining()} payload bytes")
        }
        // The float view inherits the little-endian order set above.
        val floats = buf.asFloatBuffer()
        return Array(count) { _ ->
            val row = FloatArray(TALKER_DIM)
            floats.get(row)
            row
        }
    }

    try {
        val newPrefix = readEmbeds(prefixFile, "prefix")
        val newSuffix = readEmbeds(suffixFile, "suffix")

        damienVoicePrefix = newPrefix
        damienVoiceSuffix = newSuffix
        nlog("Voice switched to '$id' (${newPrefix.size} prefix + ${newSuffix.size} suffix embeds)")
    } catch (e: Exception) {
        nlog("Voice swap failed for '$id': ${e.message}")
    }
    nlog("Voice: $voicePath")
}
|
||||
|
||||
override suspend fun synthesize(text: String, language: String): TtsResult {
|
||||
|
|
@ -2752,11 +2669,7 @@ class Qwen3TtsEngine(
|
|||
|
||||
/** PTE pipeline from pre-computed embeddings (prefill + trailing). */
|
||||
private fun runInterleavedPteFromEmbeds(
|
||||
prefillEmbeds: List<FloatArray>, trailingEmbeds: List<FloatArray>, maxGenTokens: Int,
|
||||
// Invoked synchronously after each generated step with (stepIdx, 16-codebook codes).
|
||||
// Streaming callers use it to dispatch SEQ_LEN-sized chunks to the BigVGAN pipeline
|
||||
// as soon as they are ready. null preserves the original batch behaviour.
|
||||
onCodeStep: ((step: Int, codes: IntArray) -> Unit)? = null
|
||||
prefillEmbeds: List<FloatArray>, trailingEmbeds: List<FloatArray>, maxGenTokens: Int
|
||||
): Array<IntArray> {
|
||||
val talkerMod = talkerPteModule ?: return emptyArray()
|
||||
val cpMod = cpPteModule ?: return emptyArray()
|
||||
|
|
@ -2834,7 +2747,6 @@ class Qwen3TtsEngine(
|
|||
totalCpMs += System.currentTimeMillis() - tCp0
|
||||
for (cb in 1 until NUM_CODEBOOKS) codes[cb] = cpCodes[cb - 1]
|
||||
allCodes.add(codes); generatedCb0.add(currentCb0)
|
||||
onCodeStep?.invoke(genStep, codes)
|
||||
|
||||
if (genStep < 3) nlog("Step ${genStep+1}: cb0=$currentCb0 cb1=${codes[1]}")
|
||||
|
||||
|
|
@ -3404,18 +3316,6 @@ class Qwen3TtsEngine(
|
|||
private var sessionTrack: AudioTrack? = null
|
||||
private var sessionChannel: kotlinx.coroutines.channels.Channel<String>? = null
|
||||
private var sessionJob: kotlinx.coroutines.Job? = null
|
||||
private var sessionKeepAliveJob: kotlinx.coroutines.Job? = null
|
||||
private var sessionFocusRequest: android.media.AudioFocusRequest? = null
|
||||
// Total PCM frames queued to sessionTrack across all segments in this session.
|
||||
// endStreamingSession() polls track.playbackHeadPosition until it reaches this
|
||||
// count before calling stop(), so the tail sentence isn't clipped.
|
||||
// Uses AtomicLong because both the session worker and the keep-alive watchdog
|
||||
// call writeAndCount concurrently.
|
||||
private val sessionFramesWritten = java.util.concurrent.atomic.AtomicLong(0)
|
||||
// True while a real-audio generate call is in progress. The keep-alive
|
||||
// watchdog skips silence injection while this is set, so silence never
|
||||
// interleaves with speech inside a segment.
|
||||
private val sessionGenActive = java.util.concurrent.atomic.AtomicBoolean(false)
|
||||
|
||||
/**
|
||||
* Open a streaming TTS session backed by a persistent AudioTrack. After
|
||||
|
|
@ -3424,403 +3324,13 @@ class Qwen3TtsEngine(
|
|||
* track as soon as it's decoded. Call endStreamingSession() to flush
|
||||
* the queue and release the track.
|
||||
*/
|
||||
// MediaPlayer-based fallback session state. If ColorOS mutes our
|
||||
// AudioTrack (as observed repeatedly — `event:muted updated source:
|
||||
// clientVolume` right after play()), we instead render each segment
|
||||
// as a WAV file on shared storage and play it back via MediaPlayer,
|
||||
// which uses a completely different internal audio pipeline that
|
||||
// doesn't get silenced by the background playback policy.
|
||||
private var sessionMpQueue: kotlinx.coroutines.channels.Channel<String>? = null
|
||||
private var sessionMpJob: kotlinx.coroutines.Job? = null
|
||||
private val sessionMpSegIdx = java.util.concurrent.atomic.AtomicInteger(0)
|
||||
|
||||
/**
|
||||
* Fires the moment a synthesized segment starts playing through the
|
||||
* speaker. Carries the sentence text, audio duration, per-window RMS
|
||||
* envelope (for orb amplitude) and per-window log-spaced band
|
||||
* spectrogram (for the spectrum-in-sphere visualizer). All three
|
||||
* share the same time axis — one entry per [ENVELOPE_WINDOW_MS].
|
||||
*/
|
||||
var onSegmentPlaying: ((
|
||||
sentence: String,
|
||||
durationMs: Long,
|
||||
rmsEnvelope: FloatArray,
|
||||
spectrogram: Array<FloatArray>
|
||||
) -> Unit)? = null
|
||||
|
||||
/**
 * Open the MediaPlayer-backed streaming session. Does nothing when a
 * session is already open.
 *
 * Two workers are launched on Dispatchers.IO:
 *  - a synth worker that turns each queued sentence into a WAV file plus
 *    its duration, RMS envelope and spectrogram, and
 *  - a playback worker ([playChainedMediaPlayers]) that plays those files.
 *
 * The workers are joined by a buffered channel of capacity 2, so synthesis
 * can run ahead of playback by a bounded number of segments.
 */
private fun startStreamingSessionMp() {
    if (sessionMpQueue != null) return
    sessionMpSegIdx.set(0)

    val ioScope = kotlinx.coroutines.CoroutineScope(kotlinx.coroutines.Dispatchers.IO)
    val sentences = kotlinx.coroutines.channels.Channel<String>(
        capacity = kotlinx.coroutines.channels.Channel.UNLIMITED
    )
    // Each payload carries (segIdx, wavPath, sentence, durationMs, envelope,
    // spectrogram) so the playback side can fire onSegmentPlaying with data
    // that matches the audio actually starting.
    val readySegments = kotlinx.coroutines.channels.Channel<SegmentReady>(capacity = 2)

    // Synthesize one sentence and hand it to the playback worker. Returns
    // early (after consuming a segment index) when no audio was produced.
    suspend fun synthesizeOne(sentence: String) {
        val idx = sessionMpSegIdx.getAndIncrement()
        val startedAt = System.currentTimeMillis()
        val pcm = generateSegmentAudioVC(sentence, idx)
        if (pcm.isEmpty()) return
        val baseDir = context?.cacheDir?.absolutePath ?: "/data/local/tmp/kazeia"
        val target = "$baseDir/tts_seg_${idx}.wav"
        saveWav(target, pcm)
        val lengthMs = pcm.size * 1000L / SR
        val rms = computeRmsEnvelope(pcm)
        val bands = computeSpectrogram(pcm)
        nlog("MP seg $idx synthesized (${System.currentTimeMillis() - startedAt}ms, ${lengthMs}ms audio, ${rms.size} env × ${SPECTRUM_BANDS} bands), queued for playback")
        readySegments.send(SegmentReady(idx, target, sentence, lengthMs, rms, bands))
    }

    val producer = ioScope.launch {
        for (sentence in sentences) {
            // A failure on one sentence is logged and the next one is tried.
            try {
                synthesizeOne(sentence)
            } catch (e: Exception) {
                nlog("MP synth error: ${e.message}")
            }
        }
        // No more sentences: let the playback worker drain and finish.
        readySegments.close()
    }
    val consumer = ioScope.launch { playChainedMediaPlayers(readySegments) }
    val both = ioScope.launch {
        producer.join()
        consumer.join()
    }

    sessionMpQueue = sentences
    sessionMpJob = both
    nlog("streaming session opened (MediaPlayer fallback, chained)")
}
|
||||
|
||||
/**
 * Play the WAV segments received from [wavChan] back-to-back.
 *
 * Each segment gets its own MediaPlayer. While one segment plays, the next
 * is prepared and attached with setNextMediaPlayer(); if the current
 * segment has already finished by then, the next one is started explicitly
 * instead. [onSegmentPlaying] is invoked once a segment has been started,
 * and each WAV file is deleted after its segment finishes.
 *
 * Failure handling:
 *  - A segment that fails to prepare or to start is logged, its player is
 *    released, its WAV is deleted and playback moves on to the next one.
 *  - On exit (normal or not) any player still held is released, leftover
 *    buffered segments have their WAV files deleted, and [wavChan] is
 *    cancelled so a producer suspended in send() is released instead of
 *    blocking forever.
 *
 * Suspends until the channel is closed and the last segment has finished.
 */
private suspend fun playChainedMediaPlayers(
    wavChan: kotlinx.coroutines.channels.ReceiveChannel<SegmentReady>
) {
    val attrs = android.media.AudioAttributes.Builder()
        .setUsage(android.media.AudioAttributes.USAGE_MEDIA)
        .setContentType(android.media.AudioAttributes.CONTENT_TYPE_SPEECH)
        .build()

    fun releaseQuietly(mp: android.media.MediaPlayer?) {
        try { mp?.release() } catch (_: Exception) {}
    }

    fun deleteWav(info: SegmentReady) {
        try { java.io.File(info.wavPath).delete() } catch (_: Exception) {}
    }

    // Prepare a MediaPlayer for [path], suspending until it is ready.
    // Throws if configuration or preparation fails; the player is released
    // before the exception propagates so the caller never sees a player in
    // the error state.
    suspend fun prepareMp(path: String, segIdx: Int): android.media.MediaPlayer {
        val mp = android.media.MediaPlayer()
        try {
            mp.setAudioAttributes(attrs)
            mp.setDataSource(path)
            kotlinx.coroutines.suspendCancellableCoroutine<Unit> { cont ->
                mp.setOnPreparedListener { if (cont.isActive) cont.resume(Unit) {} }
                mp.setOnErrorListener { _, what, extra ->
                    nlog("MP seg $segIdx prepare error: what=$what extra=$extra")
                    if (cont.isActive) {
                        cont.resumeWith(
                            Result.failure<Unit>(
                                IllegalStateException("prepare failed: what=$what extra=$extra")
                            )
                        )
                    }
                    true
                }
                mp.prepareAsync()
            }
        } catch (t: Throwable) {
            // Covers setDataSource failures, prepare errors and cancellation.
            releaseQuietly(mp)
            throw t
        }
        return mp
    }

    // Per-player book-keeping. `done` completes when the player's
    // completion (or error) listener fires, so the loop can check before
    // calling setNextMediaPlayer whether the current player has already
    // finished — chaining onto a finished player would not start the next.
    class Live(
        val mp: android.media.MediaPlayer,
        val info: SegmentReady,
        val done: kotlinx.coroutines.CompletableDeferred<Unit>
    )

    fun arm(info: SegmentReady, mp: android.media.MediaPlayer): Live {
        val done = kotlinx.coroutines.CompletableDeferred<Unit>()
        mp.setOnCompletionListener {
            try { it.release() } catch (_: Exception) {}
            if (!done.isCompleted) done.complete(Unit)
        }
        mp.setOnErrorListener { _, what, extra ->
            nlog("MP seg ${info.segIdx} play error: what=$what extra=$extra")
            // The completion listener is not called once this handler
            // returns true, so release here to avoid leaking the player.
            releaseQuietly(mp)
            if (!done.isCompleted) done.complete(Unit)
            true
        }
        return Live(mp, info, done)
    }

    var current: Live? = null
    // A player that has been prepared but not yet armed as `current`.
    var pending: android.media.MediaPlayer? = null

    try {
        while (true) {
            val upcoming = wavChan.receiveCatching().getOrNull() ?: break
            val nextMp = try {
                prepareMp(upcoming.wavPath, upcoming.segIdx)
            } catch (e: kotlinx.coroutines.CancellationException) {
                throw e
            } catch (e: Exception) {
                nlog("MP seg ${upcoming.segIdx} skipped: ${e.message}")
                deleteWav(upcoming)
                continue
            }
            pending = nextMp

            val previous = current
            var chained = false
            if (previous != null) {
                // Try to chain so the platform starts the next player when
                // the current one finishes. Skipped if the current one has
                // already completed; the explicit start() below covers that.
                try {
                    if (!previous.done.isCompleted) {
                        previous.mp.setNextMediaPlayer(nextMp)
                        chained = true
                    }
                } catch (e: Exception) {
                    nlog("MP seg ${upcoming.segIdx} setNext failed: ${e.message}")
                }

                // Wait for current playback to finish before rotating.
                previous.done.await()
                deleteWav(previous.info)
                current = null
            }

            val autoStarted = try {
                chained && (nextMp.isPlaying || nextMp.currentPosition > 0)
            } catch (_: Exception) { false }

            if (autoStarted) {
                nlog("MP seg ${upcoming.segIdx} auto-chained")
            } else {
                try {
                    nextMp.start()
                } catch (e: Exception) {
                    // Arming a player that never started would leave a
                    // latch that never completes, so drop the segment.
                    nlog("MP seg ${upcoming.segIdx} manual start failed: ${e.message}")
                    releaseQuietly(nextMp)
                    pending = null
                    deleteWav(upcoming)
                    continue
                }
                if (previous != null) {
                    nlog("MP seg ${upcoming.segIdx} started manually (chain missed)")
                }
            }

            current = arm(upcoming, nextMp)
            pending = null
            try {
                onSegmentPlaying?.invoke(
                    upcoming.sentence, upcoming.durationMs, upcoming.rmsEnvelope, upcoming.spectrogram
                )
            } catch (_: Exception) {}
            if (previous == null) {
                nlog("MP seg ${upcoming.segIdx} started (${upcoming.durationMs}ms)")
            }
        }

        // Drain: wait for the last player to finish.
        current?.done?.await()
        current?.let { deleteWav(it.info) }
    } catch (e: Exception) {
        nlog("MP playback chain error: ${e.message}")
    } finally {
        releaseQuietly(current?.mp)
        releaseQuietly(pending)
        // Remove WAVs that were queued but will never be played, then
        // cancel the channel so a blocked producer is released.
        while (true) {
            val leftover = wavChan.tryReceive().getOrNull() ?: break
            deleteWav(leftover)
        }
        wavChan.cancel()
    }
}
|
||||
|
||||
/**
 * One synthesized segment, as passed from the synth worker to the playback
 * worker, bundling everything the segment-playing callback reports.
 *
 * Being a data class with array properties, the generated equals/hashCode
 * compare [rmsEnvelope] and [spectrogram] by reference, not by content.
 *
 * @property segIdx index of the segment within the session
 * @property wavPath path of the WAV file holding the segment's audio
 * @property sentence text that was synthesized for this segment
 * @property durationMs length of the segment's audio in milliseconds
 * @property rmsEnvelope RMS envelope values for the segment
 * @property spectrogram band-spectrum values for the segment
 */
private data class SegmentReady(
    val segIdx: Int,
    val wavPath: String,
    val sentence: String,
    val durationMs: Long,
    val rmsEnvelope: FloatArray,
    val spectrogram: Array<FloatArray>
)
|
||||
|
||||
/**
 * Build a loudness envelope for [audio]: one value in [0, 1] per window of
 * ENVELOPE_WINDOW_MS milliseconds at sample rate SR. The final window may
 * be shorter than the others. An empty input yields an empty envelope.
 *
 * Each value is sqrt(rms / 32767), clamped to [0, 1] before the square
 * root, which lifts quiet passages relative to loud ones.
 */
private fun computeRmsEnvelope(audio: ShortArray): FloatArray {
    val total = audio.size
    if (total == 0) return FloatArray(0)

    val hop = SR * ENVELOPE_WINDOW_MS / 1000
    val windowCount = (total + hop - 1) / hop   // ceiling division

    return FloatArray(windowCount) { w ->
        val from = w * hop
        val stop = minOf(from + hop, total)
        var energy = 0.0
        var pos = from
        while (pos < stop) {
            val v = audio[pos].toDouble()
            energy += v * v
            pos++
        }
        val rms = kotlin.math.sqrt(energy / (stop - from))
        // 32767 is treated as full scale.
        kotlin.math.sqrt((rms / 32767.0).coerceIn(0.0, 1.0)).toFloat()
    }
}
|
||||
|
||||
/**
 * Build a band spectrogram for [audio]: one row per hop of
 * ENVELOPE_WINDOW_MS milliseconds, each row holding SPECTRUM_BANDS values
 * in [0, 1]. An empty input yields an empty array.
 *
 * For every hop, FFT_SIZE samples centred on the hop midpoint are taken
 * (positions outside the buffer read as zero), Hann-windowed and passed to
 * [fftInPlace]. Bins are grouped into log-spaced bands between 120 Hz and
 * 4 kHz; each band's RMS magnitude is log-compressed and clamped.
 */
private fun computeSpectrogram(audio: ShortArray): Array<FloatArray> {
    if (audio.isEmpty()) return emptyArray()

    val n = FFT_SIZE
    val hop = SR * ENVELOPE_WINDOW_MS / 1000
    val frameCount = (audio.size + hop - 1) / hop

    // Band boundaries expressed as FFT bin indices.
    val hzPerBin = SR.toDouble() / n
    val lowHz = 120.0
    val highHz = 4000.0
    val edges = IntArray(SPECTRUM_BANDS + 1)
    for (e in edges.indices) {
        val hz = lowHz * Math.pow(highHz / lowHz, e.toDouble() / SPECTRUM_BANDS)
        edges[e] = (hz / hzPerBin).toInt().coerceIn(1, n / 2 - 1)
    }

    // Hann taper applied to every frame before the transform.
    val taper = FloatArray(n)
    for (i in 0 until n) {
        taper[i] = (0.5 - 0.5 * Math.cos(2.0 * Math.PI * i / (n - 1))).toFloat()
    }

    // Scratch buffers reused across frames.
    val real = FloatArray(n)
    val imag = FloatArray(n)

    return Array(frameCount) { frame ->
        val midpoint = frame * hop + hop / 2
        val first = midpoint - n / 2
        for (i in 0 until n) {
            val src = first + i
            val s = if (src in audio.indices) audio[src].toFloat() / 32768f else 0f
            real[i] = s * taper[i]
            imag[i] = 0f
        }
        fftInPlace(real, imag)

        val row = FloatArray(SPECTRUM_BANDS)
        for (band in 0 until SPECTRUM_BANDS) {
            val lo = edges[band]
            // Guarantee at least one bin per band.
            val hi = edges[band + 1].coerceAtLeast(lo + 1)
            var power = 0.0
            var bin = lo
            while (bin < hi) {
                val a = real[bin].toDouble()
                val b = imag[bin].toDouble()
                power += a * a + b * b
                bin++
            }
            val mag = Math.sqrt(power / (hi - lo))
            // Log compression followed by a clamp into [0, 1].
            row[band] = (Math.log10(1.0 + mag * 80) / Math.log10(7.0))
                .toFloat().coerceIn(0f, 1f)
        }
        row
    }
}
|
||||
|
||||
/**
 * In-place radix-2 decimation-in-time FFT (forward transform, no scaling).
 *
 * On return [re] / [im] hold the real and imaginary parts of the DFT of
 * the complex input they held on entry.
 *
 * @param re real parts; overwritten with the result
 * @param im imaginary parts; overwritten with the result
 * @throws IllegalArgumentException if the arrays differ in length or the
 *   length is not a power of two
 */
private fun fftInPlace(re: FloatArray, im: FloatArray) {
    val n = re.size
    require(im.size == n) { "re and im must have the same size" }
    require((n and (n - 1)) == 0) { "FFT size must be a power of 2, got $n" }

    // Bit-reversal permutation.
    var j = 0
    for (i in 1 until n) {
        var bit = n shr 1
        while ((j and bit) != 0) { j = j xor bit; bit = bit shr 1 }
        j = j or bit
        if (i < j) {
            val tr = re[i]; re[i] = re[j]; re[j] = tr
            val ti = im[i]; im[i] = im[j]; im[j] = ti
        }
    }

    // Butterflies. For a stage of width `size`, the twiddle for position k
    // inside a group is exp(-2*pi*i * k / size). The factor depends only on
    // k, so it is computed once per k and reused for every group.
    var size = 2
    while (size <= n) {
        val half = size / 2
        val angleStep = -2.0 * Math.PI / size
        for (k in 0 until half) {
            val angle = angleStep * k
            val c = Math.cos(angle).toFloat()
            val s = Math.sin(angle).toFloat()
            var i = k
            while (i < n) {
                val tRe = re[i + half] * c - im[i + half] * s
                val tIm = re[i + half] * s + im[i + half] * c
                re[i + half] = re[i] - tRe
                im[i + half] = im[i] - tIm
                re[i] = re[i] + tRe
                im[i] = im[i] + tIm
                i += size
            }
        }
        size *= 2
    }
}
|
||||
|
||||
/**
 * Close the MediaPlayer-backed streaming session, if one is open.
 *
 * Closes the sentence queue, waits for the session job to finish, then
 * clears the session state and the [onSegmentPlaying] callback.
 */
private suspend fun endStreamingSessionMp() {
    val queue = sessionMpQueue
    if (queue == null) return

    // No more sentences will be accepted; already-queued ones still run.
    queue.close()

    val worker = sessionMpJob
    if (worker != null) {
        try { worker.join() } catch (_: Exception) {}
    }

    sessionMpQueue = null
    sessionMpJob = null
    onSegmentPlaying = null
    nlog("streaming session closed (MediaPlayer fallback)")
}
|
||||
|
||||
/**
 * Play the WAV file at [path] through a MediaPlayer and suspend until
 * playback has completed, failed, or could not be set up. [segIdx] is used
 * only to label log lines.
 *
 * The player is released on completion, on error, on setup failure and
 * when the calling coroutine is cancelled.
 */
private suspend fun playWavBlocking(path: String, segIdx: Int) {
    val startedAt = System.currentTimeMillis()
    suspendCancellableCoroutine<Unit> { continuation ->
        val player = android.media.MediaPlayer()

        fun releaseQuietly(p: android.media.MediaPlayer) {
            try { p.release() } catch (_: Exception) {}
        }

        // Resume the caller at most once.
        fun finish() {
            if (continuation.isActive) continuation.resume(Unit) {}
        }

        try {
            val attributes = android.media.AudioAttributes.Builder()
                .setUsage(android.media.AudioAttributes.USAGE_MEDIA)
                .setContentType(android.media.AudioAttributes.CONTENT_TYPE_SPEECH)
                .build()
            player.setAudioAttributes(attributes)
            player.setDataSource(path)
            player.setOnPreparedListener { ready ->
                nlog("MP seg $segIdx prepared, starting (prep ${System.currentTimeMillis() - startedAt}ms)")
                ready.start()
            }
            player.setOnCompletionListener { finished ->
                nlog("MP seg $segIdx done (${System.currentTimeMillis() - startedAt}ms total)")
                releaseQuietly(finished)
                finish()
            }
            player.setOnErrorListener { failed, what, extra ->
                nlog("MP seg $segIdx error: what=$what extra=$extra")
                releaseQuietly(failed)
                finish()
                true
            }
            player.prepareAsync()
            continuation.invokeOnCancellation { releaseQuietly(player) }
        } catch (e: Exception) {
            nlog("MP seg $segIdx setup failed: ${e.message}")
            releaseQuietly(player)
            finish()
        }
    }
}
|
||||
|
||||
fun startStreamingSession() {
|
||||
if (USE_MEDIAPLAYER_FALLBACK) { startStreamingSessionMp(); return }
|
||||
if (sessionTrack != null) return // already open
|
||||
// USAGE_VOICE_COMMUNICATION routes to STREAM_VOICE_CALL, which
|
||||
// ColorOS's "Audio Hardening" policy does NOT silently mute (the
|
||||
// policy targets STREAM_MUSIC to preserve battery on inactive media
|
||||
// apps; STREAM_VOICE_CALL is reserved for VoIP and always plays).
|
||||
// Previous attempts with USAGE_MEDIA and USAGE_ASSISTANT both got
|
||||
// `event:muted updated source:clientVolume` ~0.6–1 s after play()
|
||||
// even with audio focus + mediaPlayback FGS, so moving off of
|
||||
// STREAM_MUSIC is the only route that unblocks audible playback.
|
||||
val attrs = AudioAttributes.Builder()
|
||||
.setUsage(AudioAttributes.USAGE_VOICE_COMMUNICATION)
|
||||
.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
|
||||
.build()
|
||||
val track = AudioTrack.Builder()
|
||||
.setAudioAttributes(attrs)
|
||||
.setAudioAttributes(AudioAttributes.Builder()
|
||||
.setUsage(AudioAttributes.USAGE_MEDIA)
|
||||
.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
|
||||
.build())
|
||||
.setAudioFormat(AudioFormat.Builder()
|
||||
.setEncoding(AudioFormat.ENCODING_PCM_16BIT)
|
||||
.setSampleRate(SR)
|
||||
|
|
@ -3830,77 +3340,7 @@ class Qwen3TtsEngine(
|
|||
// paces writes when full.
|
||||
.setTransferMode(AudioTrack.MODE_STREAM)
|
||||
.build()
|
||||
// Request audio focus for the duration of the session. Without this
|
||||
// ColorOS's Audio Hardening treats the track as background noise
|
||||
// and mutes it, regardless of FGS status. We don't care about
|
||||
// focus loss callbacks — if another app grabs focus mid-sentence
|
||||
// that's fine, the track just gets ducked.
|
||||
val am = context?.getSystemService(android.content.Context.AUDIO_SERVICE) as? android.media.AudioManager
|
||||
val focusReq = android.media.AudioFocusRequest.Builder(android.media.AudioManager.AUDIOFOCUS_GAIN_TRANSIENT_MAY_DUCK)
|
||||
.setAudioAttributes(attrs)
|
||||
.setOnAudioFocusChangeListener { _ -> }
|
||||
.build()
|
||||
val focusRes = am?.requestAudioFocus(focusReq)
|
||||
nlog("audio focus request: $focusRes (1=granted, 0=failed, 2=delayed)")
|
||||
sessionFocusRequest = focusReq
|
||||
// ColorOS mutes AudioTrack clientVolume ~1s after creation (seen in
|
||||
// dumpsys audio as `event:muted updated source:clientVolume`). Force
|
||||
// track volume back to 1.0 repeatedly to override. This is also
|
||||
// done in the keep-alive watchdog loop below for ongoing override.
|
||||
try { track.setVolume(1.0f) } catch (_: Exception) {}
|
||||
track.play()
|
||||
sessionFramesWritten.set(0)
|
||||
sessionGenActive.set(false)
|
||||
// writeAndCount is the single path through which PCM reaches the
|
||||
// AudioTrack for this session, so sessionFramesWritten always stays
|
||||
// in sync with what's been queued to playback hardware. AudioTrack.write
|
||||
// is thread-safe, so this can be called concurrently from the session
|
||||
// worker (real audio) and the keep-alive watchdog (silence padding).
|
||||
val writeAndCount: (ShortArray) -> Unit = { pcm ->
|
||||
if (pcm.isNotEmpty()) {
|
||||
val n = track.write(pcm, 0, pcm.size)
|
||||
if (n > 0) sessionFramesWritten.addAndGet(n.toLong())
|
||||
}
|
||||
}
|
||||
// Bootstrap silence: queue 500 ms immediately after play() so
|
||||
// AudioFlinger has samples to mix from the very first cycle.
|
||||
// Without this, there's a ~100 ms window between play() and the
|
||||
// first watchdog tick where the track has no data and AudioFlinger
|
||||
// flags it for removal. Once that happens, playbackHead sticks at
|
||||
// 0 and subsequent writes go to a dead track.
|
||||
val bootstrapSilence = ShortArray(SR / 2) // 500 ms
|
||||
writeAndCount(bootstrapSilence)
|
||||
// Keep-alive watchdog. AudioFlinger on OnePlus/ColorOS kills a track
|
||||
// that underruns for ~1 s (confirmed via `prepareTracks_l BUFFER
|
||||
// TIMEOUT: remove track … due to underrun on thread 29`). Our
|
||||
// per-segment synthesis takes 3–5 s, which always exceeds that
|
||||
// window between writes, so the track was getting silenced after
|
||||
// the first ~1 s of audio played. The watchdog pads with 200 ms of
|
||||
// silence any time the buffered-ahead audio drops below 400 ms,
|
||||
// regardless of segment state — silence only advances playback head
|
||||
// in the gaps between real audio and is never inserted inside a
|
||||
// contiguous burst of real writes (those bring buffered above 400 ms
|
||||
// and keep the watchdog quiet).
|
||||
val keepAliveBuffer = ShortArray(SR / 5) // 200 ms of silence
|
||||
val keepAliveJob = kotlinx.coroutines.CoroutineScope(
|
||||
kotlinx.coroutines.Dispatchers.IO
|
||||
).launch {
|
||||
var tick = 0
|
||||
while (kotlinx.coroutines.currentCoroutineContext()[kotlinx.coroutines.Job]?.isActive != false) {
|
||||
kotlinx.coroutines.delay(100)
|
||||
val head = track.playbackHeadPosition.toLong() and 0xFFFFFFFFL
|
||||
val written = sessionFramesWritten.get() and 0xFFFFFFFFL
|
||||
val buffered = written - head
|
||||
val needsPad = buffered < SR * 2 / 5 // < 400 ms
|
||||
if ((tick and 0x1F) == 0) {
|
||||
nlog("keepAlive tick=$tick head=$head written=$written buffered=$buffered pad=$needsPad state=${track.playState}")
|
||||
}
|
||||
tick++
|
||||
// Override any clientVolume mute that ColorOS keeps applying.
|
||||
try { track.setVolume(1.0f) } catch (_: Exception) {}
|
||||
if (needsPad) writeAndCount(keepAliveBuffer)
|
||||
}
|
||||
}
|
||||
val chan = kotlinx.coroutines.channels.Channel<String>(
|
||||
capacity = kotlinx.coroutines.channels.Channel.UNLIMITED
|
||||
)
|
||||
|
|
@ -3910,26 +3350,15 @@ class Qwen3TtsEngine(
|
|||
var segIdx = 0
|
||||
for (sentence in chan) {
|
||||
try {
|
||||
sessionGenActive.set(true)
|
||||
if (USE_STREAMING_DECODE && talkerPteModule != null && cpPteModule != null) {
|
||||
// CP↔BigVGAN overlap path: audio chunks flow to the
|
||||
// shared AudioTrack as soon as BigVGAN finishes each
|
||||
// SEQ_LEN window, instead of after the whole segment.
|
||||
generateSegmentAudioVCStreaming(sentence, segIdx, writeAndCount)
|
||||
} else {
|
||||
val audio = generateSegmentAudioVC(sentence, segIdx)
|
||||
writeAndCount(audio)
|
||||
}
|
||||
val audio = generateSegmentAudioVC(sentence, segIdx)
|
||||
if (audio.isNotEmpty()) track.write(audio, 0, audio.size)
|
||||
segIdx++
|
||||
} catch (e: Exception) {
|
||||
nlog("session seg $segIdx error: ${e.message}")
|
||||
} finally {
|
||||
sessionGenActive.set(false)
|
||||
}
|
||||
}
|
||||
}
|
||||
sessionTrack = track; sessionChannel = chan; sessionJob = job
|
||||
sessionKeepAliveJob = keepAliveJob
|
||||
nlog("streaming session opened")
|
||||
}
|
||||
|
||||
|
|
@ -3939,12 +3368,6 @@ class Qwen3TtsEngine(
|
|||
* immediately. Sentences play in the order they were enqueued.
|
||||
*/
|
||||
fun enqueueSentence(sentence: String) {
|
||||
if (USE_MEDIAPLAYER_FALLBACK) {
|
||||
val chan = sessionMpQueue ?: run { nlog("enqueueSentence: no MP session"); return }
|
||||
val r = chan.trySend(sentence)
|
||||
if (r.isFailure) nlog("enqueueSentence: MP channel full / closed")
|
||||
return
|
||||
}
|
||||
val chan = sessionChannel ?: run { nlog("enqueueSentence: no session open"); return }
|
||||
val r = chan.trySend(sentence)
|
||||
if (r.isFailure) nlog("enqueueSentence: channel full / closed")
|
||||
|
|
@ -3956,46 +3379,17 @@ class Qwen3TtsEngine(
|
|||
* drains), then release the shared track. Safe to call more than once.
|
||||
*/
|
||||
suspend fun endStreamingSession() {
|
||||
if (USE_MEDIAPLAYER_FALLBACK) { endStreamingSessionMp(); return }
|
||||
val chan = sessionChannel ?: return
|
||||
chan.close()
|
||||
try { sessionJob?.join() } catch (_: Exception) {}
|
||||
// Stop the keep-alive watchdog BEFORE draining so it doesn't pad more
|
||||
// silence onto the tail while we're waiting for the existing buffer
|
||||
// to play out.
|
||||
try { sessionKeepAliveJob?.cancel() } catch (_: Exception) {}
|
||||
try { sessionKeepAliveJob?.join() } catch (_: Exception) {}
|
||||
try {
|
||||
sessionTrack?.let { track ->
|
||||
// AudioTrack.stop() in MODE_STREAM DISCARDS unplayed buffered
|
||||
// samples — it doesn't block for drain. Poll getPlaybackHead
|
||||
// Position() until it reaches what we wrote, then stop. The
|
||||
// head is a 32-bit wrap-around counter, so compare modulo.
|
||||
// Cap the drain wait so a stalled track can't block us forever.
|
||||
val targetFrames = sessionFramesWritten.get()
|
||||
val startMs = System.currentTimeMillis()
|
||||
val maxDrainMs = (targetFrames * 1000L / SR) + 500L // audio dur + 500ms slack
|
||||
while (true) {
|
||||
val head = track.playbackHeadPosition.toLong() and 0xFFFFFFFFL
|
||||
val reached = head >= (targetFrames and 0xFFFFFFFFL)
|
||||
val state = track.playState
|
||||
if (reached || state != AudioTrack.PLAYSTATE_PLAYING) break
|
||||
if (System.currentTimeMillis() - startMs > maxDrainMs) {
|
||||
nlog("endStreamingSession: drain timeout at head=$head/$targetFrames")
|
||||
break
|
||||
}
|
||||
kotlinx.coroutines.delay(20)
|
||||
}
|
||||
track.stop(); track.release()
|
||||
sessionTrack?.let {
|
||||
// Block until written samples have been consumed by the
|
||||
// hardware so users aren't cut off mid-syllable.
|
||||
it.stop(); it.release()
|
||||
}
|
||||
} catch (_: Exception) {}
|
||||
// Release audio focus after the track is fully drained and stopped.
|
||||
try {
|
||||
val am = context?.getSystemService(android.content.Context.AUDIO_SERVICE) as? android.media.AudioManager
|
||||
sessionFocusRequest?.let { am?.abandonAudioFocusRequest(it) }
|
||||
} catch (_: Exception) {}
|
||||
sessionFocusRequest = null
|
||||
sessionTrack = null; sessionChannel = null; sessionJob = null; sessionKeepAliveJob = null
|
||||
sessionTrack = null; sessionChannel = null; sessionJob = null
|
||||
nlog("streaming session closed")
|
||||
}
|
||||
|
||||
|
|
@ -4052,177 +3446,6 @@ class Qwen3TtsEngine(
|
|||
return fadeOut(decodeChunked(codebooks, n), 40)
|
||||
}
|
||||
|
||||
// ---------- Streaming decode (CP ↔ BigVGAN overlap) ----------
|
||||
|
||||
/**
 * Message passed from the talker/CP producer to the BigVGAN consumer.
 *
 * @property codebooks  per-codebook code arrays for one decode window.
 * @property realTokens number of tokens in the window that carry real
 *                      (non-padding) codes; the consumer uses it to trim
 *                      the decoded audio.
 */
private class ChunkMsg(
    val codebooks: Array<IntArray>,
    val realTokens: Int
)
|
||||
|
||||
/**
 * Incremental counterpart of decodeChunked. Decoded chunks are appended to
 * an internal buffer and crossfaded over the last `overlapSamples` samples,
 * exactly as the batch path does. Any samples that can no longer be touched
 * by a later crossfade (everything before the final `overlapSamples`) are
 * handed to `onAudio` as soon as they become stable. `flushFinal()` delivers
 * whatever is left, with the trailing fadeOut applied.
 */
private inner class StreamingCrossfader(private val onAudio: (ShortArray) -> Unit) {
    private val overlapSamples = CHUNK_OVERLAP * SAMPLES_PER_TOKEN
    private var assembled = ShortArray(0)
    private var emitted = 0
    private var awaitingFirst = true

    /**
     * Add one decoded chunk. Only the first `realTokens * SAMPLES_PER_TOKEN`
     * samples of [chunkAudio] are used (padding beyond that is dropped).
     */
    fun feedChunk(chunkAudio: ShortArray, realTokens: Int) {
        val keep = minOf(realTokens * SAMPLES_PER_TOKEN, chunkAudio.size)
        val chunk = if (keep < chunkAudio.size) chunkAudio.copyOf(keep) else chunkAudio

        if (awaitingFirst) {
            assembled = chunk.copyOf()
            awaitingFirst = false
        } else {
            blendIn(chunk)
        }
        emitStablePrefix()
    }

    /** Emit any remaining buffered samples with the trailing fadeOut. */
    fun flushFinal() {
        if (emitted >= assembled.size) return
        val tail = assembled.copyOfRange(emitted, assembled.size)
        onAudio(fadeOut(tail, 40))
        emitted = assembled.size
    }

    /** Linear crossfade of [chunk]'s head into the buffer's tail, then append the rest. */
    private fun blendIn(chunk: ShortArray) {
        val fadeLen = minOf(overlapSamples, assembled.size, chunk.size)
        val base = assembled.size - fadeLen
        for (i in 0 until fadeLen) {
            val alpha = i.toFloat() / fadeLen
            val blended = ((1f - alpha) * assembled[base + i] + alpha * chunk[i]).toInt()
                .coerceIn(Short.MIN_VALUE.toInt(), Short.MAX_VALUE.toInt())
            assembled[base + i] = blended.toShort()
        }
        if (fadeLen < chunk.size) {
            assembled = assembled + chunk.copyOfRange(fadeLen, chunk.size)
        }
    }

    /**
     * Keep the last `overlapSamples` back (the next crossfade may still
     * rewrite them) and deliver everything before that which has not been
     * delivered yet.
     */
    private fun emitStablePrefix() {
        val stableEnd = (assembled.size - overlapSamples).coerceAtLeast(emitted)
        if (stableEnd > emitted) {
            onAudio(assembled.copyOfRange(emitted, stableEnd))
            emitted = stableEnd
        }
    }
}
|
||||
|
||||
/**
 * Streaming variant of generateSegmentAudioVC.
 *
 * The talker/CP loop (producer) collects one code vector per step. Each
 * time a full SEQ_LEN window is available it is sent over a bounded
 * channel to a background coroutine (consumer) that runs vqDecode +
 * runSpeechDecoderV2 and feeds the result to a [StreamingCrossfader], which
 * forwards stable audio to [onAudio]. Windows advance by EFFECTIVE_CHUNK.
 * Whatever codes remain after the last full window — including the whole
 * segment when it is shorter than SEQ_LEN — are sent as one trailing chunk.
 *
 * Returns after the consumer has drained the channel and flushed the tail.
 * Returns early (after logging) when required assets or PTE modules are
 * missing. Producer and consumer failures are logged, not rethrown.
 *
 * If the consumer fails it cancels the channel, so a producer blocked in
 * `send` on a full buffer is released instead of waiting forever.
 *
 * @param segText text of the segment to synthesize.
 * @param segIdx  segment index, used only in log lines.
 * @param onAudio sink for PCM slices; invoked from the consumer coroutine.
 */
private suspend fun generateSegmentAudioVCStreaming(
    segText: String, segIdx: Int, onAudio: (ShortArray) -> Unit
) {
    if (bpeTokenizer == null || textEmbedsFullBuf == null || damienVoicePrefix == null || damienVoiceSuffix == null) {
        nlog("generateSegmentAudioVCStreaming: Stage 2 assets missing"); return
    }
    if (talkerPteModule == null || cpPteModule == null) {
        nlog("generateSegmentAudioVCStreaming: PTE talker/CP not loaded"); return
    }
    val prefix = damienVoicePrefix!!
    val suffix = damienVoiceSuffix!!
    val codecPadEmb = codecEmb(CODEC_PAD)
    val ids = bpeTokenizer!!.encode(segText)
    nlog("session seg $segIdx (stream) '${segText.take(60)}' → ${ids.size} tokens")

    // Prefill = voice prefix + one embedding per text token + voice suffix.
    val prefill = ArrayList<FloatArray>(prefix.size + ids.size + suffix.size)
    for (e in prefix) prefill.add(e)
    for (id in ids) prefill.add(sumEmb(textEmbFromFull(id), codecPadEmb))
    for (e in suffix) prefill.add(e)

    val expectedSteps = (ids.size * 24) / 10
    val maxGen = minOf(expectedSteps * 3 / 2 + 10, MAX_CONTEXT - 15)

    val tStart = System.currentTimeMillis()
    var firstAudioLogged = false
    val bvChan = kotlinx.coroutines.channels.Channel<ChunkMsg>(capacity = 4)
    val cfader = StreamingCrossfader { pcm ->
        if (!firstAudioLogged) {
            nlog("streaming seg $segIdx first audio at ${System.currentTimeMillis() - tStart}ms (${pcm.size} samples)")
            firstAudioLogged = true
        }
        onAudio(pcm)
    }
    val consumerJob = kotlinx.coroutines.CoroutineScope(kotlinx.coroutines.Dispatchers.IO).launch {
        try {
            for (msg in bvChan) {
                val quant = vqDecode(msg.codebooks)
                val audio = runSpeechDecoderV2(quant)
                cfader.feedChunk(audio, msg.realTokens)
            }
            cfader.flushFinal()
        } catch (e: Exception) {
            nlog("streaming seg $segIdx consumer error: ${e.message}")
            // Nobody will receive any more: cancel the channel so a producer
            // suspended in send() fails fast instead of blocking forever once
            // the buffer is full.
            bvChan.cancel()
        }
    }

    // Producer: run the interleaved talker/CP loop and dispatch each
    // SEQ_LEN window of codes as soon as it is complete. End-of-stream is
    // signalled by closing bvChan after the trailing partial chunk.
    val collected = mutableListOf<IntArray>()
    var nextChunkStart = 0

    // Build a [NUM_CODEBOOKS][SEQ_LEN] window starting at `start`; slots
    // past `real` tokens and out-of-range code values are written as 0.
    fun buildChunkCb(start: Int, real: Int): Array<IntArray> = Array(NUM_CODEBOOKS) { cb ->
        IntArray(SEQ_LEN) { t ->
            val src = start + t
            if (src < start + real) {
                val v = collected[src][cb]
                if (v in 0 until CODEBOOK_SIZE) v else 0
            } else 0
        }
    }

    try {
        runInterleavedPteFromEmbeds(prefill, emptyList(), maxGen) { _, codes ->
            collected.add(codes)
            while (collected.size >= nextChunkStart + SEQ_LEN) {
                val cb = buildChunkCb(nextChunkStart, SEQ_LEN)
                // The callback is not a suspend lambda, hence runBlocking.
                kotlinx.coroutines.runBlocking { bvChan.send(ChunkMsg(cb, SEQ_LEN)) }
                nextChunkStart += EFFECTIVE_CHUNK
            }
        }
    } catch (e: Exception) {
        nlog("streaming seg $segIdx producer error: ${e.message}")
    }

    // Trailing chunk: any remaining tokens after the last full window
    // (covers both the partial-tail case and the short-segment case where
    // nextChunkStart is still 0).
    val total = collected.size
    if (total > nextChunkStart) {
        val trailing = total - nextChunkStart
        val cb = buildChunkCb(nextChunkStart, trailing)
        try {
            kotlinx.coroutines.runBlocking { bvChan.send(ChunkMsg(cb, trailing)) }
        } catch (e: Exception) {
            // The channel was cancelled by a failed consumer; nothing left to do.
            nlog("streaming seg $segIdx trailing chunk dropped: ${e.message}")
        }
    }
    bvChan.close()
    consumerJob.join()
}
|
||||
|
||||
/**
|
||||
* Run the Hexagon talker + CP generation loop with a fully pre-built
|
||||
* prefill (voice prefix + all text tokens). Same decode recipe as
|
||||
|
|
|
|||
|
|
@ -1,548 +0,0 @@
|
|||
package com.kazeia.ui
|
||||
|
||||
import android.content.Context
|
||||
import android.graphics.Canvas
|
||||
import android.graphics.Color
|
||||
import android.graphics.Paint
|
||||
import android.graphics.Path
|
||||
import android.graphics.RadialGradient
|
||||
import android.graphics.Shader
|
||||
import android.util.AttributeSet
|
||||
import android.view.Choreographer
|
||||
import android.view.View
|
||||
import kotlin.math.PI
|
||||
import kotlin.math.cos
|
||||
import kotlin.math.max
|
||||
import kotlin.math.min
|
||||
import kotlin.math.sin
|
||||
import kotlin.math.sqrt
|
||||
|
||||
/**
|
||||
* Large, central orb visualizer — Kazeia's visual "face". Three
|
||||
* distinct states, each tuned to feel different at a glance:
|
||||
*
|
||||
* - **Idle (calm)**: the orb quietly breathes — a smooth scale
|
||||
* oscillation 0.88 ↔ 1.0 over a 5 s cycle with a soft halo that
|
||||
* pulses in phase. No high-frequency motion. Suggests "waiting,
|
||||
* listening, not anxious".
|
||||
*
|
||||
* - **Listening (attentive)**: the orb settles slightly larger, a
|
||||
* warmer bright ring appears around it, and its outline deforms
|
||||
* organically with the live mic RMS (blob-like wobble, 8 Fourier
|
||||
* modes, gain-mapped from the RMS). Micro-ripples emit
|
||||
* continuously while speech is present. Feels alive and engaged
|
||||
* — clearly different from Idle's static breathing.
|
||||
*
|
||||
* - **Speaking (active)**: the orb is rendered **as a contained
|
||||
* spectrometer**. Inside the sphere boundary, SPECTRUM_BANDS
|
||||
* vertical bars rise from a horizontal baseline according to a
|
||||
* pre-computed band-energy sidecar. The sphere outline pulses
|
||||
* with the overall RMS envelope. The bars are clipped to the
|
||||
* sphere so it really looks like "the sphere itself is speaking"
|
||||
* — not an overlaid spectrogram. Strong amplitude peaks release
|
||||
* outward ripple waves on the halo.
|
||||
*
|
||||
* The whole palette (core, halo, ring, bars, ripples) is re-derived
|
||||
* from a single [voiceColor] setter so each speaker gets a distinct
|
||||
* visual identity.
|
||||
*/
|
||||
class AudioVisualizerView @JvmOverloads constructor(
|
||||
context: Context,
|
||||
attrs: AttributeSet? = null,
|
||||
defStyleAttr: Int = 0
|
||||
) : View(context, attrs, defStyleAttr), Choreographer.FrameCallback {
|
||||
|
||||
companion object {
|
||||
/** Number of spectrum bands; must match Qwen3TtsEngine.SPECTRUM_BANDS. NOTE(review): the original comment said this is "asserted at setSpeaking", but no such check is visible in startSpeaking — confirm. */
|
||||
private const val SPECTRUM_BANDS = 12
|
||||
/** Listening-mode outline deformation modes (even = smooth blobs). */
|
||||
private const val BLOB_MODES = 8
|
||||
}
|
||||
|
||||
// ---------- State ----------
|
||||
/** Visual mode of the orb; exactly one instance is current at a time. */
private sealed class State {
|
||||
/** Nothing is being captured or played. */
object Idle : State()
|
||||
/** Mic capture is active. `micRms` is the latest level (setListening clamps it to 0..1 and updates it in place); `phaseSeed` is a 0..1 value picked when the state is created. */
data class Listening(var micRms: Float, var phaseSeed: Float) : State()
|
||||
/** Playback is active. `envelope` and `spectrogram` are indexed proportionally to elapsed time over `durationMs`; `startedAtMs` is the System.currentTimeMillis() value taken when the state was created. */
data class Speaking(
|
||||
val envelope: FloatArray,
|
||||
val spectrogram: Array<FloatArray>,
|
||||
val durationMs: Long,
|
||||
val startedAtMs: Long
|
||||
) : State()
|
||||
}
|
||||
|
||||
@Volatile private var state: State = State.Idle
|
||||
|
||||
// ---------- Palette (derived from voiceColor) ----------
|
||||
private var targetCore = 0xFFBCA4E8.toInt() // default: lavender
|
||||
private var currentCore = targetCore
|
||||
private var currentHalo = deriveHalo(currentCore)
|
||||
private var currentAccent = deriveAccent(currentCore)
|
||||
|
||||
/**
 * Set the colour the palette should ease toward. The alpha channel of
 * [color] is overridden to fully opaque, and a frame is requested so the
 * transition starts.
 */
fun setVoiceColor(color: Int) {
    val opaqueMask = 0xFF000000.toInt()
    targetCore = color or opaqueMask
    scheduleFrame()
}
|
||||
|
||||
// ---------- Animation state ----------
|
||||
private var frameStartNs = 0L
|
||||
private var smoothedAmp = 0f // 0..1 orb-size pulsation (all states)
|
||||
private var smoothedBars = FloatArray(SPECTRUM_BANDS)
|
||||
private var listeningRingPhase = 0f // rotating shimmer on listening ring
|
||||
private val ripples = ArrayList<Ripple>()
|
||||
private var lastSpectroIdx = -1
|
||||
|
||||
// ---------- Paints ----------
|
||||
private val corePaint = Paint(Paint.ANTI_ALIAS_FLAG).apply { style = Paint.Style.FILL }
|
||||
private val haloPaint = Paint(Paint.ANTI_ALIAS_FLAG).apply { style = Paint.Style.FILL }
|
||||
private val ringPaint = Paint(Paint.ANTI_ALIAS_FLAG).apply {
|
||||
style = Paint.Style.STROKE
|
||||
}
|
||||
private val ripplePaint = Paint(Paint.ANTI_ALIAS_FLAG).apply {
|
||||
style = Paint.Style.STROKE
|
||||
strokeWidth = 3f
|
||||
}
|
||||
private val barPaint = Paint(Paint.ANTI_ALIAS_FLAG).apply {
|
||||
style = Paint.Style.FILL_AND_STROKE
|
||||
}
|
||||
private val blobOutlinePaint = Paint(Paint.ANTI_ALIAS_FLAG).apply {
|
||||
style = Paint.Style.STROKE
|
||||
}
|
||||
private val blobPath = Path()
|
||||
private val spherePath = Path()
|
||||
|
||||
init {
|
||||
setLayerType(LAYER_TYPE_HARDWARE, null)
|
||||
}
|
||||
|
||||
// ==================== Public API ====================
|
||||
|
||||
/**
 * Switch to the idle state (no-op for the state itself if already idle)
 * and request a frame. Leaving another state also resets the last
 * spectrogram index.
 */
fun setIdle() {
    val alreadyIdle = state is State.Idle
    if (!alreadyIdle) {
        state = State.Idle
        lastSpectroIdx = -1
    }
    scheduleFrame()
}
|
||||
|
||||
/**
 * Report the current mic level. The value is clamped to 0..1. If the view
 * is already listening the level is updated in place; otherwise a new
 * listening state is created with a phase seed taken from the low 16 bits
 * of System.nanoTime(). A frame is requested in both cases.
 */
fun setListening(micRms: Float) {
    val level = micRms.coerceIn(0f, 1f)
    when (val current = state) {
        is State.Listening -> current.micRms = level
        else -> {
            val seed = (System.nanoTime() and 0xFFFF) / 65535f
            state = State.Listening(level, seed)
        }
    }
    scheduleFrame()
}
|
||||
|
||||
/**
 * Enter the speaking state using pre-computed visualisation data.
 *
 * Falls back to [setIdle] when the data cannot be drawn: an empty
 * envelope or spectrogram, a non-positive duration, or any spectrogram
 * row with fewer than SPECTRUM_BANDS entries (the speaking draw path
 * indexes every row up to SPECTRUM_BANDS, so a short row would throw
 * during drawing).
 *
 * @param envelope    amplitude envelope, sampled evenly over the duration.
 * @param spectrogram per-frame band energies, sampled evenly over the
 *                    duration; each row needs at least SPECTRUM_BANDS values.
 * @param durationMs  playback duration in milliseconds.
 */
fun startSpeaking(
    envelope: FloatArray,
    spectrogram: Array<FloatArray>,
    durationMs: Long
) {
    val unusable = envelope.isEmpty() || spectrogram.isEmpty() || durationMs <= 0 ||
        spectrogram.any { it.size < SPECTRUM_BANDS }
    if (unusable) {
        setIdle(); return
    }
    state = State.Speaking(envelope, spectrogram, durationMs, System.currentTimeMillis())
    lastSpectroIdx = -1
    // Soft reset bar heights so the spectrum grows from zero rather
    // than snapping to whatever the smoothing held before.
    smoothedBars.fill(0f)
    scheduleFrame()
}
|
||||
|
||||
// ==================== Lifecycle / scheduling ====================
|
||||
|
||||
/** On attach: record the animation time origin (System.nanoTime()) and request the first frame. */
override fun onAttachedToWindow() {
|
||||
super.onAttachedToWindow()
|
||||
frameStartNs = System.nanoTime()
|
||||
scheduleFrame()
|
||||
}
|
||||
|
||||
/**
 * On detach: stop the frame loop.
 *
 * Besides removing the pending Choreographer callback, the
 * `frameScheduled` flag has to be cleared. doFrame always leaves the flag
 * set because it re-posts itself, so without the reset scheduleFrame()
 * would treat a frame as still pending after a later re-attach and never
 * post again, freezing the animation.
 */
override fun onDetachedFromWindow() {
    super.onDetachedFromWindow()
    Choreographer.getInstance().removeFrameCallback(this)
    frameScheduled = false
}
|
||||
|
||||
private var frameScheduled = false
|
||||
/**
 * Post a Choreographer frame callback for this view unless one is already
 * marked as pending or the view is not attached to a window.
 */
private fun scheduleFrame() {
    if (frameScheduled) return
    if (!isAttachedToWindow) return
    frameScheduled = true
    Choreographer.getInstance().postFrameCallback(this)
}
|
||||
|
||||
/**
 * Choreographer tick. Eases the palette toward the target colour, advances
 * per-state animation values, re-posts itself and invalidates the view.
 *
 * Re-posting is throttled (40 ms delay) while idle — including the tick on
 * which a finished speaking state falls back to idle — and immediate
 * otherwise. A speaking state is considered finished 300 ms after its
 * duration has elapsed.
 */
override fun doFrame(frameTimeNanos: Long) {
    frameScheduled = false
    // Ease the palette toward the target (voice change tween).
    currentCore = lerpColor(currentCore, targetCore, 0.12f)
    currentHalo = deriveHalo(currentCore)
    currentAccent = deriveAccent(currentCore)

    // Decide whether the next tick may be delayed; update state on the way.
    val throttled: Boolean = when (val snapshot = state) {
        is State.Idle -> true
        is State.Listening -> {
            listeningRingPhase += 0.015f
            false
        }
        is State.Speaking -> {
            val elapsed = System.currentTimeMillis() - snapshot.startedAtMs
            val finished = elapsed >= snapshot.durationMs + 300
            if (finished) {
                state = State.Idle
                lastSpectroIdx = -1
            }
            finished
        }
    }

    val choreographer = Choreographer.getInstance()
    if (throttled) {
        choreographer.postFrameCallbackDelayed(this, 40)
    } else {
        choreographer.postFrameCallback(this)
    }
    frameScheduled = true
    invalidate()
}
|
||||
|
||||
// ==================== Drawing ====================
|
||||
|
||||
/**
 * Draw the orb for the current state, centred in the view. The maximum
 * orb radius is 39% of the smaller view dimension (78% diameter), leaving
 * margin around it. Nothing is drawn while the view has no size.
 */
override fun onDraw(canvas: Canvas) {
    super.onDraw(canvas)
    val viewW = width.toFloat()
    val viewH = height.toFloat()
    if (viewW <= 0f || viewH <= 0f) return

    val centerX = viewW / 2f
    val centerY = viewH / 2f
    val maxR = min(viewW, viewH) * 0.39f
    val now = System.currentTimeMillis()

    val snapshot = state
    when (snapshot) {
        is State.Speaking -> drawSpeaking(canvas, centerX, centerY, maxR, now, snapshot)
        is State.Listening -> drawListening(canvas, centerX, centerY, maxR, now, snapshot)
        is State.Idle -> drawIdle(canvas, centerX, centerY, maxR, now)
    }
}
|
||||
|
||||
// ---------- Idle ----------
|
||||
/**
 * Idle rendering: a round orb that "breathes" between 0.88x and 1.00x of
 * [maxR] on a 5 s cycle, with a halo scaled in phase and a soft
 * off-centre white highlight on top.
 *
 * NOTE(review): [now] is passed in as wall-clock milliseconds while
 * `frameStartNs` comes from System.nanoTime(); their difference is only
 * used modulo 5000, so it acts as a constant phase offset.
 */
private fun drawIdle(canvas: Canvas, cx: Float, cy: Float, maxR: Float, now: Long) {
    val cyclePosMs = (now - frameStartNs / 1_000_000) % 5000L
    val t = cyclePosMs / 5000f
    val breath = 0.5f - 0.5f * cos((t * 2.0 * PI).toFloat()) // 0..1
    val scale = 0.88f + 0.12f * breath
    val radius = maxR * scale
    smoothedAmp += ((breath * 0.5f) - smoothedAmp) * 0.1f

    // Halo first, then the undeformed core on top of it.
    drawHalo(canvas, cx, cy, maxR * 1.15f * scale, alphaBase = 60, alphaGain = 70)
    drawCore(canvas, cx, cy, radius, shimmer = 0f)

    // Highlight: white radial gradient fading to transparent, centred
    // up-left of the orb centre.
    val gloss = RadialGradient(
        cx - radius * 0.25f, cy - radius * 0.25f, radius * 0.9f,
        Color.argb(60, 255, 255, 255),
        Color.argb(0, 255, 255, 255),
        Shader.TileMode.CLAMP
    )
    val glossPaint = Paint(Paint.ANTI_ALIAS_FLAG)
    glossPaint.style = Paint.Style.FILL
    glossPaint.shader = gloss
    canvas.drawCircle(cx, cy, radius, glossPaint)
}
|
||||
|
||||
// ---------- Listening ----------
|
||||
/**
 * Listening rendering: halo, a blob-shaped core whose outline is built by
 * buildBlobPath from the mic level, a stroked outline, the rotating
 * shimmer ring, and ripples. New ripples are added while the mic level is
 * above 0.12, during one 90 ms slot out of every three.
 *
 * `s.micRms` is re-read at each use rather than cached, as it is a mutable
 * field of the state object.
 */
private fun drawListening(
    canvas: Canvas, cx: Float, cy: Float, maxR: Float, now: Long, s: State.Listening
) {
    // Slightly larger than idle so the mode change is visible.
    val baseScale = 0.93f + 0.08f * s.micRms
    val radius = maxR * baseScale
    smoothedAmp += (s.micRms - smoothedAmp) * 0.25f

    // Halo: brighter than idle, gain follows the mic level.
    val haloRadius = maxR * 1.22f * baseScale
    drawHalo(canvas, cx, cy, haloRadius,
        alphaBase = 90, alphaGain = (130 * s.micRms).toInt().coerceIn(0, 160))

    // Outline path for this frame.
    buildBlobPath(blobPath, cx, cy, radius, s.micRms, s.phaseSeed, now)

    // Core: radial gradient drawn as an oversized circle clipped to the blob.
    val coreGradient = RadialGradient(
        cx - radius * 0.15f, cy - radius * 0.25f, radius * 1.1f,
        currentCore, deriveCoreEdge(currentCore),
        Shader.TileMode.CLAMP
    )
    corePaint.shader = coreGradient
    canvas.save()
    canvas.clipPath(blobPath)
    canvas.drawCircle(cx, cy, radius * 1.3f, corePaint)
    canvas.restore()

    // Stroke the blob outline; thicker at higher levels.
    blobOutlinePaint.strokeWidth = 2f + 2f * s.micRms
    blobOutlinePaint.color = withAlpha(currentAccent, 180)
    canvas.drawPath(blobPath, blobOutlinePaint)

    // Shimmer ring just outside the orb.
    drawListeningRing(canvas, cx, cy, radius * 1.08f, s.micRms)

    // Micro-ripples while the level is above the floor.
    val rmsMicroFloor = 0.12f
    val loudEnough = s.micRms > rmsMicroFloor
    if (loudEnough && ((now / 90) % 3 == 0L)) {
        ripples.add(Ripple(bornAtMs = now, peak = s.micRms))
    }
    drawRipples(canvas, cx, cy, maxR, now, listeningMode = true)
}
|
||||
|
||||
/**
 * Draw the listening shimmer: a thin arc on the circle of the given
 * [radius] whose start angle follows `listeningRingPhase`, followed by a
 * dimmer arc of half the sweep, 8 degrees further on. Stroke width, sweep
 * and alpha all grow with [rms]; nothing is drawn below 0.04.
 */
private fun drawListeningRing(
    canvas: Canvas, cx: Float, cy: Float, radius: Float, rms: Float
) {
    if (rms < 0.04f) return

    val left = cx - radius
    val top = cy - radius
    val right = cx + radius
    val bottom = cy + radius
    val sweep = 60f + 80f * rms
    val start = (listeningRingPhase * 360f) % 360f

    // Leading arc.
    ringPaint.strokeWidth = 2.5f + 3f * rms
    val leadAlpha = (140 + 110 * rms).toInt().coerceIn(0, 250)
    ringPaint.color = withAlpha(currentAccent, leadAlpha)
    canvas.drawArc(left, top, right, bottom, start, sweep, false, ringPaint)

    // Trailing arc: dimmer, shorter, slightly offset.
    val tailAlpha = (60 + 60 * rms).toInt().coerceIn(0, 160)
    ringPaint.color = withAlpha(currentAccent, tailAlpha)
    canvas.drawArc(left, top, right, bottom, start + sweep + 8f, sweep * 0.5f, false, ringPaint)
}
|
||||
|
||||
// ---------- Speaking ----------
|
||||
/**
 * Renders one frame of the speaking state.
 *
 * The playback position (`now - s.startedAtMs` over `s.durationMs`) is mapped
 * onto the envelope and the spectrogram, linearly interpolating between
 * neighbouring samples. The results are low-pass filtered into [smoothedAmp]
 * and [smoothedBars], which then drive the orb scale, halo, ripples and the
 * deformed outline built by [buildSpeakingBlobPath].
 *
 * Degenerate input is tolerated instead of crashing or poisoning the
 * smoothed state: a non-positive duration is treated as 1 ms, an empty
 * envelope reads as silence, and an empty spectrogram leaves the band
 * energies untouched.
 */
private fun drawSpeaking(
    canvas: Canvas, cx: Float, cy: Float, maxR: Float, now: Long, s: State.Speaking
) {
    val elapsed = now - s.startedAtMs
    // A zero duration would turn the index maths below into Infinity/NaN, and
    // a NaN would stick in smoothedAmp / smoothedBars for every later frame.
    val durationMs = s.durationMs.toFloat().coerceAtLeast(1f)

    // Envelope -> overall size pulsation + halo intensity.
    var envIdx = 0
    var env = 0f
    val envLast = s.envelope.size - 1
    if (envLast >= 0) { // coerceIn(0, -1) would throw on an empty envelope
        val envPos = elapsed.toFloat() * s.envelope.size / durationMs
        envIdx = envPos.toInt().coerceIn(0, envLast)
        val envFrac = (envPos - envIdx).coerceIn(0f, 1f)
        env = lerp(s.envelope[envIdx], s.envelope[min(envIdx + 1, envLast)], envFrac)
    }
    smoothedAmp += (env - smoothedAmp) * 0.30f

    // Per-band smoothed energies: these drive the Fourier modes of the
    // sphere outline in buildSpeakingBlobPath.
    val frameLast = s.spectrogram.size - 1
    if (frameLast >= 0) {
        val framePos = elapsed.toFloat() * s.spectrogram.size / durationMs
        val frameIdx = framePos.toInt().coerceIn(0, frameLast)
        val frameFrac = (framePos - frameIdx).coerceIn(0f, 1f)
        val current = s.spectrogram[frameIdx]
        val following = s.spectrogram[min(frameIdx + 1, frameLast)]
        for (b in 0 until SPECTRUM_BANDS) {
            val target = lerp(current[b], following[b], frameFrac)
            smoothedBars[b] += (target - smoothedBars[b]) * 0.35f
        }
    }

    val scale = 0.92f + 0.14f * smoothedAmp
    val radius = maxR * scale

    // Halo pulses with the amplitude.
    drawHalo(
        canvas, cx, cy, maxR * 1.30f * scale,
        alphaBase = 90, alphaGain = (160 * smoothedAmp).toInt().coerceIn(0, 220)
    )

    // Emit a ripple the first time we land on an envelope sample that is a
    // local peak above 0.45. lastSpectroIdx makes this once-per-sample.
    if (envIdx != lastSpectroIdx && env > 0.45f) {
        val prev = if (envIdx > 0) s.envelope[envIdx - 1] else 0f
        val next = if (envIdx < s.envelope.size - 1) s.envelope[envIdx + 1] else 0f
        if (env >= prev && env >= next) {
            ripples.add(Ripple(bornAtMs = now, peak = env))
        }
        lastSpectroIdx = envIdx
    }
    drawRipples(canvas, cx, cy, maxR, now, listeningMode = false)

    // The sphere outline is the spectrometer: each band drives one Fourier
    // mode of the perimeter, so the whole shape distorts with the voice.
    buildSpeakingBlobPath(spherePath, cx, cy, radius, now)

    // Fill the deformed sphere with the voice-tinted gradient.
    corePaint.shader = RadialGradient(
        cx - radius * 0.25f, cy - radius * 0.30f, radius * 1.25f,
        currentCore, deriveCoreEdge(currentCore),
        Shader.TileMode.CLAMP
    )
    canvas.drawPath(spherePath, corePaint)

    // Soft top-left highlight clipped to the deformed shape.
    // NOTE(review): this Paint and the gradients are allocated on every
    // frame; consider caching them in fields if this shows up in profiling.
    canvas.save()
    canvas.clipPath(spherePath)
    val hl = Paint(Paint.ANTI_ALIAS_FLAG).apply {
        style = Paint.Style.FILL
        shader = RadialGradient(
            cx - radius * 0.28f, cy - radius * 0.30f, radius * 0.9f,
            Color.argb(75, 255, 255, 255),
            Color.argb(0, 255, 255, 255),
            Shader.TileMode.CLAMP
        )
    }
    canvas.drawCircle(cx, cy, radius * 1.2f, hl)
    canvas.restore()

    // Outline on top; thickness tracks the amplitude so loud passages give
    // a stronger line.
    blobOutlinePaint.strokeWidth = 2.5f + 3.5f * smoothedAmp
    blobOutlinePaint.color = withAlpha(currentAccent, 230)
    canvas.drawPath(spherePath, blobOutlinePaint)
}
|
||||
|
||||
/**
 * Build the speaking-state sphere perimeter into [path]: a base circle of
 * [radius] around ([cx], [cy]) plus a sum of sine modes, one per spectrogram
 * band. Band `b` drives mode `b + 2`, so modes 0 and 1 are never excited
 * and the circle remains the rest shape. The phase of each mode drifts with
 * [now], faster for higher bands. The displacement is scaled by the smoothed
 * per-band energy ([smoothedBars]) and by an envelope weight derived from
 * [smoothedAmp], so quiet passages move less than loud ones.
 *
 * The perimeter is sampled at 96 points. Time is kept in Double: a Float
 * second count loses sub-frame resolution once the timestamp grows large,
 * which makes the phase advance in visible jumps. The radius is floored at
 * a quarter of [radius] so that several loud bands adding up cannot push a
 * point through the centre and invert the shape.
 */
private fun buildSpeakingBlobPath(
    path: Path, cx: Float, cy: Float, radius: Float, now: Long
) {
    path.rewind()
    val steps = 96
    val tSec = now / 1000.0

    // Envelope weight: quiet passages feel less jittery.
    val envWeight = (0.5f + 0.5f * smoothedAmp).coerceIn(0f, 1f)
    // Max radial displacement of a single band at full energy (0.22 x radius),
    // pre-multiplied by the envelope weight since neither changes per point.
    val bandGain = radius * 0.22f * envWeight
    val minR = radius * 0.25f

    for (i in 0..steps) {
        // i % steps makes the last sample coincide with the first.
        val theta = (i % steps).toFloat() / steps * 2f * PI.toFloat()
        var d = 0f
        for (b in 0 until SPECTRUM_BANDS) {
            val mode = b + 2
            val phase = tSec * (0.45f + 0.22f * b)
            d += bandGain * smoothedBars[b] * sin(mode * theta + phase).toFloat()
        }
        val r = max(minR, radius + d)
        val x = cx + r * cos(theta.toDouble()).toFloat()
        val y = cy + r * sin(theta.toDouble()).toFloat()
        if (i == 0) path.moveTo(x, y) else path.lineTo(x, y)
    }
    path.close()
}
|
||||
|
||||
|
||||
// ---------- Helpers: halo / ripples / blob ----------
|
||||
/**
 * Paints a radial glow of radius [r] centred at ([cx], [cy]). The glow uses
 * [currentHalo], fading from `alphaBase + alphaGain` (clamped to 0..255) at
 * the centre to fully transparent at the edge.
 */
private fun drawHalo(
    canvas: Canvas, cx: Float, cy: Float, r: Float,
    alphaBase: Int, alphaGain: Int
) {
    val centreAlpha = (alphaBase + alphaGain).coerceIn(0, 255)
    val gradientColors = intArrayOf(
        withAlpha(currentHalo, centreAlpha),
        withAlpha(currentHalo, 0)
    )
    val gradientStops = floatArrayOf(0f, 1f)
    haloPaint.shader =
        RadialGradient(cx, cy, r, gradientColors, gradientStops, Shader.TileMode.CLAMP)
    canvas.drawCircle(cx, cy, r, haloPaint)
}
|
||||
|
||||
/**
 * Fills the orb core: a circle of [radius] at ([cx], [cy]) shaded with a
 * radial gradient from [currentCore] to its darker edge colour, with the
 * gradient centre shifted up and to the left of the circle centre.
 * [shimmer] is accepted but not read by this implementation.
 */
private fun drawCore(canvas: Canvas, cx: Float, cy: Float, radius: Float, shimmer: Float) {
    val focusX = cx - radius * 0.2f
    val focusY = cy - radius * 0.3f
    val gradient = RadialGradient(
        focusX, focusY, radius * 1.15f,
        currentCore, deriveCoreEdge(currentCore),
        Shader.TileMode.CLAMP
    )
    corePaint.shader = gradient
    canvas.drawCircle(cx, cy, radius, corePaint)
}
|
||||
|
||||
/**
 * Draws every live ripple as an expanding, fading ring around ([cx], [cy])
 * and removes the ones that have outlived their lifetime (700 ms in
 * listening mode, 900 ms otherwise). A ripple grows from 0.58 to 1.20 times
 * [maxR] while its alpha and stroke width decay with age.
 */
private fun drawRipples(
    canvas: Canvas, cx: Float, cy: Float, maxR: Float, now: Long, listeningMode: Boolean
) {
    if (ripples.isEmpty()) return

    val lifetimeMs = when {
        listeningMode -> 700f
        else -> 900f
    }
    val cursor = ripples.iterator()
    while (cursor.hasNext()) {
        val ripple = cursor.next()
        val progress = (now - ripple.bornAtMs) / lifetimeMs
        if (progress >= 1f) {
            // Expired: drop it through the iterator so the loop stays valid.
            cursor.remove()
        } else {
            val ringAlpha = ((1f - progress) * 150f * ripple.peak).toInt().coerceIn(0, 200)
            ripplePaint.color = withAlpha(currentAccent, ringAlpha)
            ripplePaint.strokeWidth = max(1.2f, (1f - progress) * 4f)
            canvas.drawCircle(cx, cy, maxR * (0.58f + 0.62f * progress), ripplePaint)
        }
    }
}
|
||||
|
||||
/**
 * Build an organic blob into [path] by displacing a circle of [radius]
 * around ([cx], [cy]) with a sum of sine modes 1..BLOB_MODES. Mode `m`
 * contributes with amplitude `amp / m`, where `amp` grows with [rms], and
 * has its own phase speed; [phaseSeed] offsets all phases.
 *
 * The perimeter is sampled at 72 points. Time is kept in Double: a Float
 * second count loses sub-frame resolution once [now] grows large, which
 * makes the slow phase drift advance in visible jumps.
 */
private fun buildBlobPath(
    path: Path, cx: Float, cy: Float, radius: Float,
    rms: Float, phaseSeed: Float, now: Long
) {
    path.rewind()
    val steps = 72
    val tSec = now / 1000.0
    val amp = radius * (0.02f + 0.08f * rms)
    val seedPhase = phaseSeed * 6.28f
    for (i in 0..steps) {
        // i % steps makes the last sample coincide with the first.
        val theta = (i % steps).toFloat() / steps * 2f * PI.toFloat()
        var d = 0f
        for (m in 1..BLOB_MODES) {
            val phase = seedPhase + tSec * (0.3f + 0.05f * m)
            d += (amp / m) * sin(m * theta + phase).toFloat()
        }
        val r = radius + d
        val x = cx + r * cos(theta.toDouble()).toFloat()
        val y = cy + r * sin(theta.toDouble()).toFloat()
        if (i == 0) path.moveTo(x, y) else path.lineTo(x, y)
    }
    path.close()
}
|
||||
|
||||
// ---------- Color helpers ----------
|
||||
/** Halo colour: [core] darkened by 18 %. */
private fun deriveHalo(core: Int): Int {
    return darken(core, 0.18f)
}

/** Accent colour: [core] brightened by 12 %. */
private fun deriveAccent(core: Int): Int {
    return brighten(core, 0.12f)
}

/** Outer gradient colour of the core: [core] darkened by 12 %. */
private fun deriveCoreEdge(core: Int): Int {
    return darken(core, 0.12f)
}
|
||||
|
||||
/**
 * Moves each RGB channel of [c] towards 255 by the fraction [frac]
 * (0 = unchanged, 1 = white). The alpha channel is kept as is.
 */
private fun brighten(c: Int, frac: Float): Int {
    fun lift(channel: Int): Int =
        (channel + (255 - channel) * frac).toInt().coerceIn(0, 255)

    return Color.argb(
        Color.alpha(c),
        lift(Color.red(c)),
        lift(Color.green(c)),
        lift(Color.blue(c))
    )
}
|
||||
|
||||
/**
 * Scales each RGB channel of [c] by `1 - frac` (0 = unchanged, 1 = black).
 * The alpha channel is kept as is.
 */
private fun darken(c: Int, frac: Float): Int {
    fun dim(channel: Int): Int = (channel * (1 - frac)).toInt().coerceIn(0, 255)

    return Color.argb(
        Color.alpha(c),
        dim(Color.red(c)),
        dim(Color.green(c)),
        dim(Color.blue(c))
    )
}
|
||||
|
||||
/** Returns [c] with its alpha replaced by [alpha], clamped to 0..255. */
private fun withAlpha(c: Int, alpha: Int): Int =
    Color.argb(alpha.coerceIn(0, 255), Color.red(c), Color.green(c), Color.blue(c))
|
||||
|
||||
/** Linear interpolation: returns [a] at `t = 0` and [b] at `t = 1`. [t] is not clamped. */
private fun lerp(a: Float, b: Float, t: Float): Float {
    return a + (b - a) * t
}
|
||||
|
||||
/**
 * Blends the RGB channels of [from] and [to] by [t] and returns the result
 * as a fully opaque colour; the alpha of both inputs is ignored.
 */
private fun lerpColor(from: Int, to: Int, t: Float): Int {
    fun mix(start: Int, end: Int): Int =
        lerp(start.toFloat(), end.toFloat(), t).toInt().coerceIn(0, 255)

    return Color.argb(
        255,
        mix(Color.red(from), Color.red(to)),
        mix(Color.green(from), Color.green(to)),
        mix(Color.blue(from), Color.blue(to))
    )
}
|
||||
|
||||
/**
 * One expanding ring.
 *
 * @property bornAtMs timestamp at which the ripple was emitted; its age is measured from here.
 * @property peak amplitude at emission time; scales the ring's alpha.
 */
private class Ripple(
    val bornAtMs: Long,
    val peak: Float
)
|
||||
}
|
||||
|
|
@ -187,21 +187,6 @@ class ChatActivity : AppCompatActivity() {
|
|||
"Amir", "Didier", "Sid", "Zelda"
|
||||
)
|
||||
|
||||
/** One color per speaker — derived palette (core + halo + bars) is
|
||||
* generated inside AudioVisualizerView. Chosen to be calm,
|
||||
* perceptually distinct, and consistent in saturation so switching
|
||||
* voices changes *hue* rather than *mood*. */
|
||||
private val voiceColors = listOf(
|
||||
0xFFBCA4E8.toInt(), // Damien — lavender
|
||||
0xFFE8A4CC.toInt(), // Elodie — rose
|
||||
0xFF82D5D0.toInt(), // Jerome — aqua
|
||||
0xFFE8BFA4.toInt(), // Richard — amber sand
|
||||
0xFF95D5A6.toInt(), // Amir — emerald
|
||||
0xFF8FA2D4.toInt(), // Didier — indigo
|
||||
0xFFE8B89A.toInt(), // Sid — peach
|
||||
0xFFA4BEE8.toInt() // Zelda — periwinkle
|
||||
)
|
||||
|
||||
private fun setupResourceMonitoring() {
|
||||
val graphCpu = findViewById<MiniGraphView>(R.id.graphCpu)
|
||||
val graphGpu = findViewById<MiniGraphView>(R.id.graphGpu)
|
||||
|
|
@ -269,12 +254,6 @@ class ChatActivity : AppCompatActivity() {
|
|||
override fun onItemSelected(parent: AdapterView<*>?, view: android.view.View?, pos: Int, id: Long) {
|
||||
val voicePath = "${com.kazeia.KazeiaApplication.MODELS_DIR}/../voix/${voiceFiles[pos]}"
|
||||
kazeiaService?.setVoice(voicePath)
|
||||
// Push the matching color to the service so the orb
|
||||
// view picks it up; the view tweens from the previous
|
||||
// color so voice changes don't snap visually.
|
||||
val color = voiceColors[pos.coerceIn(voiceColors.indices)]
|
||||
kazeiaService?.setVoiceColor(color)
|
||||
binding.audioViz.setVoiceColor(color)
|
||||
appendLog("Voix: ${voiceNames[pos]}")
|
||||
}
|
||||
override fun onNothingSelected(parent: AdapterView<*>?) {}
|
||||
|
|
@ -347,43 +326,6 @@ class ChatActivity : AppCompatActivity() {
|
|||
setDebugPanelVisible(debug)
|
||||
}
|
||||
}
|
||||
launch {
|
||||
// Drive the orb visualizer from the service-side signal.
|
||||
// Service decides whether the app is idle, tracking the
|
||||
// mic, or rendering a TTS segment; the view just renders
|
||||
// it. StartSpeaking is edge-triggered on the envelope
|
||||
// identity so re-emitting the same signal won't restart
|
||||
// the animation timer.
|
||||
var lastSpeakingEnv: FloatArray? = null
|
||||
service.visualizerSignal.collect { sig ->
|
||||
when (sig) {
|
||||
is com.kazeia.service.KazeiaService.VisualizerSignal.Idle -> {
|
||||
binding.audioViz.setIdle()
|
||||
lastSpeakingEnv = null
|
||||
}
|
||||
is com.kazeia.service.KazeiaService.VisualizerSignal.Listening -> {
|
||||
binding.audioViz.setListening(sig.micRms)
|
||||
lastSpeakingEnv = null
|
||||
}
|
||||
is com.kazeia.service.KazeiaService.VisualizerSignal.Speaking -> {
|
||||
if (sig.rmsEnvelope !== lastSpeakingEnv) {
|
||||
binding.audioViz.startSpeaking(
|
||||
sig.rmsEnvelope, sig.spectrogram, sig.durationMs
|
||||
)
|
||||
lastSpeakingEnv = sig.rmsEnvelope
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
launch {
|
||||
// Keep the view's voice color synchronised with the
|
||||
// service — covers the initial state when the view
|
||||
// attaches before the spinner's first callback fires.
|
||||
service.voiceColor.collect { color ->
|
||||
binding.audioViz.setVoiceColor(color)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -18,12 +18,17 @@ class ResourceMonitor(private val context: Context) {
|
|||
private var prevIdle = 0L
|
||||
private var prevGpuBusy = 0L
|
||||
private var prevGpuTotal = 0L
|
||||
private var hasRoot = false
|
||||
|
||||
// No-root deployment (2026-04-14): the previous `su -c id` probe used to
|
||||
// enable GPU/NPU sysfs reads via root, but it also triggered a Magisk
|
||||
// prompt on every ChatActivity launch. The whole pipeline now runs in
|
||||
// the app process so root is never needed — GPU/NPU usage is reported
|
||||
// as -1 (UI shows "—") and the dashboard shows CPU + RAM only.
|
||||
init {
|
||||
// Test root access once
|
||||
hasRoot = try {
|
||||
val p = Runtime.getRuntime().exec(arrayOf("su", "-c", "id"))
|
||||
val result = p.inputStream.bufferedReader().readText()
|
||||
p.waitFor()
|
||||
result.contains("uid=0")
|
||||
} catch (_: Exception) { false }
|
||||
}
|
||||
|
||||
fun snapshot(): ResourceSnapshot {
|
||||
return ResourceSnapshot(
|
||||
|
|
@ -62,9 +67,7 @@ class ResourceMonitor(private val context: Context) {
|
|||
}
|
||||
|
||||
private fun readGpu(): Float {
|
||||
// Non-root path: some devices expose /sys/class/kgsl/kgsl-3d0/gpubusy
|
||||
// as world-readable. If it's locked down (most SELinux configs do),
|
||||
// just return -1 — no root fallback, no Magisk prompt.
|
||||
// Try direct read first (works on some devices)
|
||||
try {
|
||||
val content = File("/sys/class/kgsl/kgsl-3d0/gpubusy").readText().trim()
|
||||
val parts = content.split("\\s+".toRegex())
|
||||
|
|
@ -78,14 +81,38 @@ class ResourceMonitor(private val context: Context) {
|
|||
if (dt > 0) return (db * 100f / dt).coerceIn(0f, 100f)
|
||||
}
|
||||
} catch (_: Exception) {}
|
||||
|
||||
// Try with root
|
||||
if (hasRoot) {
|
||||
try {
|
||||
val content = execRoot("cat /sys/class/kgsl/kgsl-3d0/gpu_busy_percentage").trim()
|
||||
val pct = content.replace("%", "").trim().toFloatOrNull()
|
||||
if (pct != null) return pct.coerceIn(0f, 100f)
|
||||
} catch (_: Exception) {}
|
||||
}
|
||||
|
||||
return -1f
|
||||
}
|
||||
|
||||
private fun readNpu(): Float {
|
||||
// NPU usage reporting required root sysfs reads (cdsp_rm/cpu_vote,
|
||||
// /proc/fastrpc) that always triggered a Magisk prompt. Removed with
|
||||
// the no-root migration — no equivalent public API exists, so the
|
||||
// UI just shows "—" for NPU load.
|
||||
// NPU doesn't have a standard busy metric
|
||||
// Use CDSP (compute DSP) load as proxy if available
|
||||
if (hasRoot) {
|
||||
try {
|
||||
// Check if CDSP is active by reading vote count
|
||||
val vote = execRoot("cat /sys/bus/platform/devices/soc:qcom,msm-cdsp-rm/cdsp_rm/cpu_vote 2>/dev/null").trim()
|
||||
if (vote.isNotEmpty()) {
|
||||
val v = vote.toIntOrNull() ?: 0
|
||||
return if (v > 0) 100f else 0f
|
||||
}
|
||||
} catch (_: Exception) {}
|
||||
|
||||
try {
|
||||
// Alternative: check fastrpc activity
|
||||
val stat = execRoot("cat /proc/fastrpc 2>/dev/null || echo none").trim()
|
||||
if (stat != "none" && stat.isNotEmpty()) return 50f
|
||||
} catch (_: Exception) {}
|
||||
}
|
||||
return -1f
|
||||
}
|
||||
|
||||
|
|
@ -107,4 +134,12 @@ class ResourceMonitor(private val context: Context) {
|
|||
} catch (_: Exception) { return 0 }
|
||||
}
|
||||
|
||||
private fun execRoot(cmd: String): String {
|
||||
return try {
|
||||
val p = Runtime.getRuntime().exec(arrayOf("su", "-c", cmd))
|
||||
val result = p.inputStream.bufferedReader().readText()
|
||||
p.waitFor()
|
||||
result
|
||||
} catch (_: Exception) { "" }
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -100,23 +100,6 @@
|
|||
|
||||
</LinearLayout>
|
||||
|
||||
<!-- Central orb visualizer: Kazeia's visual "face". Takes the
|
||||
top half of the chat area so it reads as the primary UI
|
||||
element; the message list sits below it and shows the
|
||||
word-by-word reveal of the current reply. Color is driven
|
||||
by the selected voice (Damien=lavender, Elodie=rose, …). -->
|
||||
<com.kazeia.ui.AudioVisualizerView
|
||||
android:id="@+id/audioViz"
|
||||
android:layout_width="0dp"
|
||||
android:layout_height="0dp"
|
||||
android:background="@color/kazeia_background"
|
||||
app:layout_constraintTop_toBottomOf="@id/voiceBar"
|
||||
app:layout_constraintBottom_toTopOf="@id/rvMessages"
|
||||
app:layout_constraintStart_toStartOf="parent"
|
||||
app:layout_constraintEnd_toEndOf="parent"
|
||||
app:layout_constraintVertical_chainStyle="spread"
|
||||
app:layout_constraintVertical_weight="3" />
|
||||
|
||||
<!-- Chat messages -->
|
||||
<androidx.recyclerview.widget.RecyclerView
|
||||
android:id="@+id/rvMessages"
|
||||
|
|
@ -124,11 +107,10 @@
|
|||
android:layout_height="0dp"
|
||||
android:clipToPadding="false"
|
||||
android:padding="8dp"
|
||||
app:layout_constraintTop_toBottomOf="@id/audioViz"
|
||||
app:layout_constraintTop_toBottomOf="@id/voiceBar"
|
||||
app:layout_constraintBottom_toTopOf="@id/inputBar"
|
||||
app:layout_constraintStart_toStartOf="parent"
|
||||
app:layout_constraintEnd_toEndOf="parent"
|
||||
app:layout_constraintVertical_weight="2" />
|
||||
app:layout_constraintEnd_toEndOf="parent" />
|
||||
|
||||
<!-- Input bar -->
|
||||
<LinearLayout
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
# Kazeia Android — Élimination du root pour le LLM (résolu)
|
||||
# Kazeia Android — Problème d'élimination de root pour le LLM
|
||||
|
||||
**Date :** 2026-04-14
|
||||
**Device :** OnePlus Pad 3 (OPD2415, Snapdragon 8 Elite, SoC `sun`), Android 16 (OxygenOS), Magisk root
|
||||
|
|
@ -6,13 +6,6 @@
|
|||
|
||||
---
|
||||
|
||||
> **🟢 Statut : RÉSOLU.** Pipeline complet STT + LLM + TTS tourne in-process sans
|
||||
> aucun appel à `su`. Voir la section **Résolution** en bas du document pour le
|
||||
> détail du fix. Le reste du document décrit l'investigation initiale et garde
|
||||
> sa valeur historique.
|
||||
|
||||
---
|
||||
|
||||
## 1. Contexte général
|
||||
|
||||
L'app Kazeia (Android / Kotlin + Jetpack Compose) orchestre un pipeline **STT → LLM → TTS** entièrement on-device sur le Hexagon HTP (V79) du Snapdragon 8 Elite.
|
||||
|
|
@ -231,132 +224,3 @@ Je cherche soit :
|
|||
- Soit **la confirmation** que l'approche actuelle (root + Magisk remember) est le meilleur compromis accessible, avec éventuellement des suggestions pour minimiser les prompts
|
||||
|
||||
Merci.
|
||||
|
||||
---
|
||||
|
||||
## 10. Résolution (post-mortem)
|
||||
|
||||
Une seconde opinion technique a identifié la **vraie cause racine** que
|
||||
l'investigation locale avait mal diagnostiquée.
|
||||
|
||||
### 10.1 Vraie cause
|
||||
|
||||
Les processus Android forkés par Zygote (l'app elle-même, ses Services
|
||||
`android:process=":xxx"`, etc.) héritent des **GIDs supplémentaires**
|
||||
configurés à l'init pour `untrusted_app`. Ces GIDs donnent notamment accès à
|
||||
`/dev/cdsprpc-smd` et d'autres canaux fastrpc.
|
||||
|
||||
Quand `Runtime.exec("su"…)` ou `ProcessBuilder` font un `fork()` + `exec()`
|
||||
classique, le `exec()` ne préserve pas tous les credentials utilisés par le
|
||||
driver fastrpc Qualcomm pour authentifier le client. Le driver retourne
|
||||
**error 4000 "Failed to load skel"** car il refuse de créer une session DSP
|
||||
pour ce process.
|
||||
|
||||
C'est pour ça que :
|
||||
- ORT-QNN (Whisper) marchait in-process : chargé via `System.loadLibrary` dans
|
||||
l'app, qui est Zygote-forked → credentials valides.
|
||||
- `su -c qnn_llama_runner` marchait : root bypasse les checks fastrpc.
|
||||
- `ProcessBuilder` du même runner échouait : ni Zygote-forked, ni root.
|
||||
|
||||
Le "conflit de version QNN v2.31 vs v2.37" que j'avais soupçonné n'était
|
||||
**pas le vrai problème**. Les libs étaient déjà unifiées en v2.42 dans jniLibs.
|
||||
|
||||
### 10.2 La solution : `LlmModule` JNI in-process
|
||||
|
||||
ExecuTorch fournit `org.pytorch.executorch.extension.llm.LlmModule`, un
|
||||
wrapper JNI autour du même C++ `example::Runner` que le binaire
|
||||
`qnn_llama_runner`. En l'invoquant depuis l'app (process Zygote-forked), le
|
||||
DSP fastrpc accepte la session — pas de root nécessaire.
|
||||
|
||||
### 10.3 Étapes réelles du fix
|
||||
|
||||
1. **Build ExecuTorch Android** avec `EXECUTORCH_BUILD_LLAMA_JNI=ON`,
|
||||
`EXECUTORCH_BUILD_QNN=ON`, `QNN_SDK_ROOT=/opt/Kazeia/qnn_sdk_242/qairt/2.42.0.251225` →
|
||||
produit `libexecutorch_jni.so` 192 MB qui inclut le runner LLM + le backend QNN.
|
||||
2. **Patches sources** dans `/opt/Kazeia/executorch-patches/llm_in_process_jni.patch` :
|
||||
- `backends/qualcomm/CMakeLists.txt` : gate `PyQnnManagerAdaptor` sur `NOT ANDROID`
|
||||
(le guard original sur `CMAKE_SYSTEM_PROCESSOR MATCHES x86_64` se déclenche
|
||||
dans des sous-scopes du cross-compile Android).
|
||||
- `extension/android/jni/jni_layer_llama.cpp`, branche `MODEL_TYPE_QNN_LLAMA` :
|
||||
- `decoder_model = "qwen3"` (au lieu de `"llama3"` hardcodé)
|
||||
- `temperature = 0.0f`, `eval_mode = 0` (kKVCached), `shared_buffer = true`
|
||||
- **Crucial** : choisir `Runner<uint8_t>` ou `Runner<uint16_t>` selon
|
||||
`module->get("get_kv_io_bit_width")` (mirror du `qnn_llama_runner.cpp main()`).
|
||||
Hardcoder la mauvaise largeur produit du gibberish déterministe
|
||||
comme `blocked罩ug darkestSOLEQuotes作者本人 humanity` — la KV cache
|
||||
est lue/écrite à la mauvaise largeur de byte.
|
||||
3. **Bundling jniLibs** :
|
||||
- `libexecutorch.so` / `libexecutorch_jni.so` (build du 13 avril avec LlmModule)
|
||||
- `libqnn_executorch_backend.so` (assorti)
|
||||
- `libQnnHtp.so`, `libQnnHtpPrepare.so`, `libQnnHtpV79Stub.so`, `libQnnSystem.so`,
|
||||
`libQnnHtpV79Skel.so` (tous v2.42 depuis `/opt/Kazeia/qnn_sdk_242/`)
|
||||
4. **JAR avec `LlmModule.class`** : compilation manuelle via `javac` (le build
|
||||
gradle de l'AAR demandait android-34 platform non installée).
|
||||
5. **Réécriture `ExecuTorchLlmEngine.kt`** :
|
||||
- Constructeur : `LlmModule(MODEL_TYPE_QNN_LLAMA=4, ptePath, tokPath, 0.7f)` puis `.load()`
|
||||
- `generate(prompt, seqLen, callback, echo=false)` — sinon le callback échoue à
|
||||
stripper les tokens du prompt
|
||||
- Template ChatML Qwen3 construit en Kotlin, miroir exact de
|
||||
`qnn_llama_runner.cpp::get_formatted_prompt()` pour `kQwen3` (user-first puis
|
||||
system optionnel puis `<|im_start|>assistant`)
|
||||
- Filtre inline `<think>…</think>` dans le callback avec lookahead pour les tags
|
||||
fragmentés sur plusieurs pieces
|
||||
|
||||
### 10.4 Métriques validées
|
||||
|
||||
| Métrique | Valeur |
|
||||
|---|---|
|
||||
| LlmModule.load() | 4.2 s (one-time à l'init de l'app) |
|
||||
| LLM gen | ~17 tok/s (kv-only) |
|
||||
| LLM TTFT | ~4 s pour 77 tokens prompt (prefill séquentiel kKVCached) |
|
||||
| TTS Talker(PTE) | 37 ms/step (vs 45-65 avant) |
|
||||
| TTS CP(PTE) | 73 ms/step |
|
||||
| Pipeline e2e | "Bonjour, comment vas-tu ?" → audio en ~7 s |
|
||||
| Magisk prompts | **0** |
|
||||
|
||||
### 10.5 Optimisations restantes (non bloquantes)
|
||||
|
||||
- **TTFT** : ré-exporter le `.pte` en `--model_mode hybrid` pour avoir un
|
||||
`prefill_forward` parallèle → TTFT passerait de ~4 s à <1 s. Pas nécessaire
|
||||
pour le use case conversationnel actuel.
|
||||
- **Cosmétique** : le statusbar de l'app affiche encore "Hexagon NPU" pour le
|
||||
TTS alors que c'est désormais le chemin .pte (label hérité du temps où c'était
|
||||
ggml-hexagon).
|
||||
|
||||
### 10.6 Mémoire projet
|
||||
|
||||
État complet documenté dans
|
||||
`/home/alf/.claude/projects/-opt-Kazeia/memory/project_llm_npu_plan.md`.
|
||||
Backup git : branche `backup/pre-no-root-migration` + commit `6e6a2d9`.
|
||||
Backup disk : `/home/alf/kazeia_backup_20260414/`.
|
||||
|
||||
### 10.7 Commits clés
|
||||
|
||||
- `f32b5dd` (LLM no-root: validate end-to-end pipeline, fix kv_io_bit_width detection)
|
||||
- `b57719f` (LLM: filter <think> tokens out of the streaming TTS path)
|
||||
|
||||
### 10.8 Comparaison de performances avant/après
|
||||
|
||||
Mesurée le 2026-04-14 sur le même `.pte` Qwen3-4B avec le même runner C++ —
|
||||
seule la voie d'invocation change (subprocess `su -c` vs `LlmModule` JNI
|
||||
in-process).
|
||||
|
||||
| Métrique | Avant (subprocess `su -c`) | Après (in-process LlmModule) | Delta |
|
||||
|---|---|---|---|
|
||||
| LLM gen rate | 18.3 tok/s | 17.2 tok/s | -6 % (bruit) |
|
||||
| LLM prefill speed | 52 ms / prompt-token | 52 ms / prompt-token | identique |
|
||||
| LLM TTFT (prompt 35 tok) | 1.8 s | 1.8 s | identique |
|
||||
| LLM TTFT (prompt 80 tok, system+ChatML) | ~4.1 s | 4.2 s | identique |
|
||||
| TTS Talker(.pte) | 45-65 ms / step | 37 ms / step | +25-40 % (contexte QNN partagé) |
|
||||
| TTS CP(.pte) | 65-157 ms / step | 73 ms / step | +10-50 % |
|
||||
| TTS load au boot | 26.7 s | 4.3 s | **6× plus rapide** (plus de subprocess Hexagon 12 s) |
|
||||
| `LlmModule.load()` au boot | n/a (subprocess à la demande) | 3.1 s (one-time) | overhead init |
|
||||
| App RSS | ~2 GB app + 1.76 GB subprocess séparé | ~3.7 GB process unique | mêmes ressources globales |
|
||||
| Erreurs DSP 6031/6033 en concurrence | régulières | disparues | architectural |
|
||||
| Prompts Magisk | 5 / tour | **0** | UX net |
|
||||
| Taille APK | ~100 MB | ~100 MB (libexecutorch_jni.so 192 MB → 8.5 MB après strip à l'install) | négligeable |
|
||||
|
||||
**Conclusion** : pas de régression LLM (perf identique, le runner C++ est le même).
|
||||
Gain net sur la TTS (Talker 25-40 % plus rapide grâce au contexte QNN partagé,
|
||||
load 6× plus rapide). Architecture plus propre : un seul process, un seul runtime
|
||||
QNN, plus de contention DSP, plus de prompts root.
|
||||
|
|
|
|||
|
|
@ -1,233 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Generate per-voice <name>_voice_prefix.bin (9 × 1024 fp32) and
|
||||
<name>_voice_suffix.bin (2 × 1024 fp32) for Kazeia's on-device TTS
|
||||
engine (Qwen3-TTS 0.6B-Base voice-clone mode).
|
||||
|
||||
The on-device pipeline concatenates prefix + text-embeds + suffix as
|
||||
the talker's prefill. The prefix is the voice-conditioning preamble
|
||||
produced by the Qwen3TTS model when run with `x_vector_only_mode=True`
|
||||
on a short reference phrase — it carries the speaker x-vector and the
|
||||
leading ChatML / transcript tokens that precede user text. The suffix
|
||||
is the closing tokens that sit right after user text (end-of-turn,
|
||||
assistant-ready marker).
|
||||
|
||||
Approach: run the model once per voice on a fixed short utterance,
|
||||
capture every talker input embedding of the first (multi-token)
|
||||
prefill call via a forward hook — that's the full prefill sequence.
|
||||
The reference Damien files contain exactly 9 pre-text embeds + 2
|
||||
post-text embeds, which corresponds to:
|
||||
|
||||
[prefix: 9 vectors] [text embeds: N vectors] [suffix: 2 vectors]
|
||||
|
||||
The split is purely positional: the first N_PREFIX (9) vectors of the
|
||||
captured prefill are written as the prefix and the last N_SUFFIX (2)
|
||||
as the suffix; everything in between is the text embeddings. The
|
||||
result therefore does not depend on how many tokens the reference
|
||||
utterance produces, and it matches the Damien files
|
||||
bit-identically (verified during the first run: /tmp/check_damien_*).
|
||||
|
||||
Usage:
|
||||
export_voice_prefix_suffix.py VOICE.wav [VOICE.wav ...]
|
||||
--out-dir /path/to/output (default /tmp/voice_prefixes)
|
||||
--text "Bonjour." (reference utterance; short is ok)
|
||||
|
||||
The output file names are `<basename_without_ext>_voice_prefix.bin`
|
||||
and `<basename_without_ext>_voice_suffix.bin`. Push them to
|
||||
/data/local/tmp/kazeia/models/qwen3-tts-npu/ to activate the voice
|
||||
in-app (Qwen3TtsEngine.setVoice reads them from there).
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
|
||||
warnings.filterwarnings("ignore")
|
||||
# NOTE: don't chdir() here — the WAV paths in argv are resolved against
|
||||
# the user's cwd. Qwen3TTS creates /tmp scratch files internally already.
|
||||
|
||||
MODEL_PATH = (
|
||||
"/home/alf/.cache/huggingface/hub/"
|
||||
"models--Qwen--Qwen3-TTS-12Hz-0.6B-Base/snapshots/"
|
||||
"5d83992436eae1d760afd27aff78a71d676296fc"
|
||||
)
|
||||
|
||||
# Prefix + suffix sizes taken from the reference damien_voice_prefix.bin /
|
||||
# damien_voice_suffix.bin shipped on the tablet. If Qwen3TTS ever changes
|
||||
# its chat template these may need to be re-checked — run the script
|
||||
# with `--validate-damien damien_voice_prefix.bin` to diff against a
|
||||
# known-good capture.
|
||||
N_PREFIX = 9
|
||||
N_SUFFIX = 2
|
||||
TALKER_DIM = 1024
|
||||
|
||||
|
||||
def load_model():
    """Load the Qwen3-TTS model from the local snapshot at MODEL_PATH.

    The heavy imports are done here rather than at module level. The model
    is loaded with ``local_files_only=True`` and ``device_map="cpu"``.

    Returns:
        The ``Qwen3TTSModel`` instance returned by ``from_pretrained``.
    """
    import torch  # noqa: F401  (imported but not referenced in this function)
    from qwen_tts import Qwen3TTSModel

    print(f"Loading Qwen3-TTS model from {MODEL_PATH}...", flush=True)
    return Qwen3TTSModel.from_pretrained(
        MODEL_PATH,
        local_files_only=True,
        device_map="cpu",
    )
|
||||
|
||||
|
||||
class _PrefillCapturedSentinel(Exception):
    """Control-flow signal raised by the forward hook once the prefill has
    been recorded, so generate_voice_clone stops before the slow decode."""


def capture_prefill(tts, wav_path: str, text: str):
    """Return the talker input embeddings of the first prefill call.

    ``tts.model.talker.model.forward`` is temporarily replaced by a hook.
    The first time the hook receives a 3-D ``inputs_embeds`` it copies every
    position of batch item 0 into a float32 numpy vector and aborts the
    generation by raising ``_PrefillCapturedSentinel``. Any other call is
    passed through to the original forward. The original forward is always
    restored, whether generation aborted, finished or failed.

    Args:
        tts: loaded model wrapper exposing ``model.talker.model.forward``
            and ``generate_voice_clone``.
        wav_path: reference audio, passed as ``ref_audio``.
        text: reference utterance to synthesise.

    Returns:
        List of 1-D float32 numpy arrays, one per prefill position.

    Raises:
        RuntimeError: if generation ended without the hook ever seeing a
            3-D ``inputs_embeds``.
    """
    import numpy as np

    talker = tts.model.talker
    saved_forward = talker.model.forward
    rows = []

    def recording_forward(input_ids=None, inputs_embeds=None, **kwargs):
        is_prefill = inputs_embeds is not None and inputs_embeds.dim() == 3
        if not is_prefill:
            return saved_forward(
                input_ids=input_ids, inputs_embeds=inputs_embeds, **kwargs
            )
        rows.extend(
            inputs_embeds[0, pos, :].detach().cpu().numpy().astype(np.float32)
            for pos in range(inputs_embeds.shape[1])
        )
        raise _PrefillCapturedSentinel()

    talker.model.forward = recording_forward
    try:
        tts.generate_voice_clone(
            text=text,
            ref_audio=wav_path,
            language="french",
            x_vector_only_mode=True,
            non_streaming_mode=True,
        )
    except _PrefillCapturedSentinel:
        pass  # expected: the hook aborts right after the first prefill
    finally:
        talker.model.forward = saved_forward

    if not rows:
        raise RuntimeError("No prefill captured — hook wasn't triggered.")
    return rows
|
||||
|
||||
|
||||
def write_bin(path: Path, vectors):
    """Serialize embedding vectors to a little-endian binary file.

    File layout: two int32 values (vector count, vector dimension) followed by
    ``count * dimension`` float32 values, written vector by vector. An empty
    input produces a header-only file with dimension ``TALKER_DIM``.

    Args:
        path: Destination file.
        vectors: Iterable of equal-length float sequences; every one must have
            exactly ``TALKER_DIM`` elements.

    Raises:
        RuntimeError: If any vector's length differs from ``TALKER_DIM``.
    """
    rows = list(vectors)

    # Check every vector before touching the disk. Looking only at the first
    # one let a ragged later vector fail mid-write and leave a truncated file.
    for row in rows:
        dim = len(row)
        if dim != TALKER_DIM:
            raise RuntimeError(f"Expected dim {TALKER_DIM}, got {dim}")

    # Write to a sibling temp file and rename into place, so an interrupted
    # run never leaves a partial file that an existence check would accept.
    target = Path(path)
    scratch = target.with_name(target.name + ".tmp")
    try:
        with open(scratch, "wb") as f:
            f.write(struct.pack("<ii", len(rows), TALKER_DIM))
            for row in rows:
                f.write(struct.pack(f"<{TALKER_DIM}f", *row))
        scratch.replace(target)
    except BaseException:
        try:
            scratch.unlink()
        except OSError:
            pass
        raise
|
||||
|
||||
|
||||
def process_voice(tts, wav_path: Path, out_dir: Path, text: str):
    """Produce the ``<name>_voice_prefix.bin`` / ``<name>_voice_suffix.bin`` pair
    for one reference WAV.

    The voice name is the lower-cased file stem up to its first underscore.
    Nothing is done when both output files already exist. Otherwise the
    prefill is captured, its first ``N_PREFIX`` vectors become the prefix and
    its last ``N_SUFFIX`` vectors become the suffix.

    Raises:
        RuntimeError: If the captured prefill holds fewer than
            ``N_PREFIX + N_SUFFIX + 1`` vectors.
    """
    # "damien_15s_24k" → "damien"
    name, _, _ = wav_path.stem.lower().partition("_")
    prefix_path = out_dir / f"{name}_voice_prefix.bin"
    suffix_path = out_dir / f"{name}_voice_suffix.bin"

    if all(target.exists() for target in (prefix_path, suffix_path)):
        print(f" [skip] {name}: prefix/suffix already exist")
        return

    print(f" Capturing prefill for {name} ({wav_path.name})...", flush=True)
    prefill = capture_prefill(tts, str(wav_path), text)

    captured_len = len(prefill)
    if captured_len < N_PREFIX + N_SUFFIX + 1:
        raise RuntimeError(
            f"Prefill too short for {name}: {len(prefill)} < {N_PREFIX + N_SUFFIX + 1}"
        )

    # Head of the prefill goes to the prefix file, tail to the suffix file.
    write_bin(prefix_path, prefill[:N_PREFIX])
    write_bin(suffix_path, prefill[-N_SUFFIX:])

    print(
        f" Wrote {prefix_path.name} ({N_PREFIX}×{TALKER_DIM}) "
        f"and {suffix_path.name} ({N_SUFFIX}×{TALKER_DIM})",
        flush=True,
    )
|
||||
|
||||
|
||||
def validate_against_damien(tts, wav_path: Path, reference_prefix: Path, text: str):
    """Regenerate Damien's prefix from his WAV and diff it against a reference.

    Reads the reference prefix file (int32 count, int32 dim, then float32
    data, little-endian), recaptures the prefill, and prints the max and mean
    absolute difference between the first ``N_PREFIX`` captured vectors and
    the reference. Only the prefix is compared; the suffix is not checked.

    The reference is read and validated before the capture so a bad file is
    reported without first paying for the slow prefill.

    Raises:
        ValueError: If the reference file is truncated, has a non-positive
            header, or its shape differs from the regenerated prefix.
    """
    import numpy as np

    with open(reference_prefix, "rb") as f:
        header = f.read(8)
        if len(header) != 8:
            raise ValueError(
                f"{reference_prefix}: truncated header "
                f"({len(header)} bytes, expected 8)"
            )
        n, d = struct.unpack("<ii", header)
        if n <= 0 or d <= 0:
            raise ValueError(f"{reference_prefix}: invalid header n={n}, dim={d}")
        expected_bytes = n * d * 4
        payload = f.read(expected_bytes)
    if len(payload) != expected_bytes:
        raise ValueError(
            f"{reference_prefix}: truncated payload "
            f"({len(payload)} bytes, expected {expected_bytes})"
        )
    # Explicit little-endian dtype to match the "<" header format.
    ref = np.frombuffer(payload, dtype="<f4").reshape(n, d)

    prefill = capture_prefill(tts, str(wav_path), text)
    candidate = np.array(prefill[:N_PREFIX], dtype=np.float32)

    # Without this check a 1-row reference would silently broadcast and
    # produce a meaningless diff.
    if candidate.shape != ref.shape:
        raise ValueError(
            f"Shape mismatch: regenerated prefix is {candidate.shape}, "
            f"reference {reference_prefix} is {ref.shape}"
        )

    diff = np.abs(candidate - ref)
    print(
        f"Damien prefix validation: max|diff|={diff.max():.3e} "
        f"mean|diff|={diff.mean():.3e} (expect ~0 if script is correct)"
    )
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: build voice prefix/suffix files for each WAV.

    Parses the arguments, optionally validates the regenerated Damien prefix
    against a reference file, then processes every input WAV. A failure on
    one voice is reported and does not stop the remaining ones.

    Exits with status 1 when ``--validate-damien`` is given without a Damien
    WAV in the input list, or when any input was missing or failed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("wavs", nargs="+", help="Voice WAV files")
    parser.add_argument(
        "--out-dir", default="/tmp/voice_prefixes", help="Output directory"
    )
    parser.add_argument(
        "--text", default="Bonjour.", help="Reference utterance for prefill"
    )
    parser.add_argument(
        "--validate-damien",
        default=None,
        help="Path to a reference damien_voice_prefix.bin for sanity-check",
    )
    args = parser.parse_args()

    # Check the arguments before the slow model load so a usage error is
    # reported immediately.
    damien_wav = None
    if args.validate_damien:
        damien_wav = next(
            (Path(w) for w in args.wavs if "damien" in Path(w).stem.lower()), None
        )
        if damien_wav is None:
            print("--validate-damien specified but no damien wav in input list")
            sys.exit(1)

    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    tts = load_model()

    if damien_wav is not None:
        validate_against_damien(tts, damien_wav, Path(args.validate_damien), args.text)

    problems = 0
    for wav in args.wavs:
        wp = Path(wav)
        if not wp.exists():
            print(f" [miss] {wp}")
            problems += 1
            continue
        try:
            process_voice(tts, wp, out_dir, args.text)
        except Exception as e:
            # Best effort: keep going with the other voices, but remember the
            # failure so the exit status reflects it.
            print(f" [fail] {wp.name}: {e}")
            problems += 1

    print(f"\nDone. Files written under {out_dir}")
    print(
        "Push to the tablet with, e.g.:\n"
        f" adb push {out_dir}/*_voice_prefix.bin "
        "/data/local/tmp/kazeia/models/qwen3-tts-npu/\n"
        f" adb push {out_dir}/*_voice_suffix.bin "
        "/data/local/tmp/kazeia/models/qwen3-tts-npu/"
    )

    if problems:
        # Previously the script exited 0 even when inputs were missing or
        # failed, which hid errors from calling scripts.
        print(f"{problems} input(s) were missing or failed", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in New Issue