LLM no-root: migrate to in-process LlmModule (JNI) — zero su calls
The root cause of the previous su -c requirement was that Qualcomm's FastRPC kernel driver rejects processes spawned via ProcessBuilder fork+exec because they lose supplementary GIDs on exec. Zygote-forked app processes retain the proper init-configured credentials and are accepted by the adsprpcd service, which is why ORT-QNN (Whisper, in-process) worked while the subprocess qnn_llama_runner did not. Running the LLM in-process via ExecuTorch's LlmModule bypasses the fork+exec path entirely. What this commit does: - ExecuTorchLlmEngine now uses org.pytorch.executorch.extension.llm.LlmModule with MODEL_TYPE_QNN_LLAMA=4 (routes to example::Runner in jni_layer_llama.cpp, the same C++ runner that qnn_llama_runner embeds). - All su, ProcessBuilder, file-based prompt/response plumbing, and run_llm.sh are gone. ChatML template is built in Kotlin; tokens stream in via LlmCallback. Supporting changes under executorch-patches/llm_in_process_jni.patch: 1. backends/qualcomm/CMakeLists.txt — gate PyQnnManagerAdaptor on NOT ANDROID. The original guard (CMAKE_SYSTEM_PROCESSOR MATCHES x86_64) misfires in a nested scope during Android cross-compile and tries to build the host Python bindings. 2. extension/android/jni/jni_layer_llama.cpp — hardcode decoder_model="qwen3" (was "llama3") and pass eval_mode=0 (EvalMode::kKVCached) + shared_buffer=true to match our hybrid_llama_qnn.pte, which only contains kv_forward, not prefill_forward. Build: scripts/build_android_library.sh arm64-v8a with QNN_SDK_ROOT pointing to /opt/Kazeia/qnn_sdk_242/qairt/2.42.0.251225 and EXECUTORCH_BUILD_QNN=ON. Produces libexecutorch_jni.so (192 MB) with QNN v2.42 backend + the llama runner code, plus libqnn_executorch_backend.so. Both are staged in jniLibs. Validated on OnePlus Pad 3: LlmModule.load() completes in 4.2 s, no su prompts, Pipeline ready with STT(WhisperHybridEngine) → [VoiceCommands → LLM] → TTS(Qwen3TtsEngine). TTS .pte still loads with the upgraded v2.42 runtime — no regression.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
6e6a2d9f82
commit
809a6d4fed
|
|
@ -0,0 +1,40 @@
|
|||
diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt
|
||||
index e93731e..4951e1d 100644
|
||||
--- a/backends/qualcomm/CMakeLists.txt
|
||||
+++ b/backends/qualcomm/CMakeLists.txt
|
||||
@@ -308,8 +308,8 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES Hexagon)
|
||||
)
|
||||
endif()
|
||||
|
||||
-# QNN pybind
|
||||
-if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
|
||||
+# QNN pybind — host Python bindings, not for Android cross-compile
|
||||
+if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64" AND NOT ANDROID)
|
||||
add_subdirectory(
|
||||
${EXECUTORCH_SOURCE_DIR}/third-party/pybind11
|
||||
${CMAKE_CURRENT_BINARY_DIR}/pybind11
|
||||
diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
|
||||
index 45f2414..e1c2a8f 100644
|
||||
--- a/extension/android/jni/jni_layer_llama.cpp
|
||||
+++ b/extension/android/jni/jni_layer_llama.cpp
|
||||
@@ -171,14 +171,17 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
|
||||
model_path->toStdString().c_str(),
|
||||
data_files_vector,
|
||||
executorch::extension::Module::LoadMode::MmapUseMlockIgnoreErrors);
|
||||
- std::string decoder_model = "llama3"; // use llama3 for now
|
||||
+ std::string decoder_model = "qwen3"; // Kazeia: our .pte was exported with --decoder_model qwen3-4b
|
||||
runner_ = std::make_unique<example::Runner<uint16_t>>( // QNN runner
|
||||
std::move(module),
|
||||
decoder_model.c_str(),
|
||||
model_path->toStdString().c_str(),
|
||||
tokenizer_path->toStdString().c_str(),
|
||||
- "",
|
||||
- "");
|
||||
+ /* performance_output_path */ "",
|
||||
+ /* dump_logits_path */ "",
|
||||
+ /* temperature */ 0.7f,
|
||||
+ /* eval_mode */ 0, // EvalMode::kKVCached (our .pte has only kv_forward, no prefill_forward)
|
||||
+ /* shared_buffer */ true);
|
||||
model_type_category_ = MODEL_TYPE_CATEGORY_LLM;
|
||||
#endif
|
||||
#if defined(EXECUTORCH_BUILD_MEDIATEK)
|
||||
|
|
@ -1,43 +1,49 @@
|
|||
package com.kazeia.llm
|
||||
|
||||
import android.content.Context
|
||||
import android.util.Log
|
||||
import com.kazeia.core.*
|
||||
import kotlinx.coroutines.Dispatchers
|
||||
import kotlinx.coroutines.withContext
|
||||
import java.io.File
|
||||
import org.pytorch.executorch.extension.llm.LlmCallback
|
||||
import org.pytorch.executorch.extension.llm.LlmModule
|
||||
|
||||
/**
|
||||
* LLM Engine using ExecuTorch + QNN backend via subprocess.
|
||||
* Calls qnn_llama_runner binary with root access (Magisk su).
|
||||
* LLM Engine using ExecuTorch LlmModule in-process — **no root required**.
|
||||
*
|
||||
* Runs Qwen3-4B via `org.pytorch.executorch.extension.llm.LlmModule`, which
|
||||
* wraps the same C++ example::Runner as the standalone qnn_llama_runner binary
|
||||
* but inside the app's own process. The QNN HTP backend works because the
|
||||
* DSP fastrpc service accepts the Zygote-forked app process (unlike
|
||||
* ProcessBuilder-spawned subprocesses which lose supplementary GIDs on exec
|
||||
* and get rejected by the fastrpc credential checks).
|
||||
*
|
||||
* Model + tokenizer live in /data/local/tmp/kazeia-et/ (readable by the app
|
||||
* on this device's permissive SELinux policy). libexecutorch.so + QNN libs
|
||||
* are bundled in jniLibs.
|
||||
*
|
||||
* Current tablet config: Qwen3-4B KV-mode, ~18-22 tok/s on Hexagon V79
|
||||
* (Snapdragon 8 Elite), TTFT 0.9 s, RSS 1.76 GB.
|
||||
*
|
||||
* Why root: the runner binary plus its QNN v2.42 .so deps live in
|
||||
* /data/local/tmp/kazeia-et/ (shell_data_file SELinux context). Untrusted
|
||||
* apps can't exec binaries from there. The Hexagon DSP fastrpc service also
|
||||
* refuses to load the v2.42 Skel from the app's own files dir — only from
|
||||
* nativeLibraryDir — but that dir already holds the TTS stack's v2.31 Skel
|
||||
* (same filename, different version, can't coexist). Rebuilding everything
|
||||
* against one QNN version would eliminate the conflict, but would require
|
||||
* re-exporting the TTS .pte with the new runtime (tooling currently broken
|
||||
* on the flatc schema/dataclass mismatch in the qnn_venv).
|
||||
*/
|
||||
class ExecuTorchLlmEngine(
|
||||
private val context: Context,
|
||||
private val onLog: ((String) -> Unit)? = null
|
||||
) : LlmEngine {
|
||||
|
||||
companion object {
|
||||
private const val TAG = "ExecuTorchLLM"
|
||||
private const val RUNNER_DIR = "/data/local/tmp/kazeia-et"
|
||||
// /no_think disables Qwen3's chain-of-thought block so the full token
|
||||
// budget goes to the actual answer (without it, 120-200 tokens get
|
||||
// consumed by <think>…</think> leaving nothing to speak).
|
||||
// Short-response directive keeps TTS latency manageable — each sentence
|
||||
// costs ~3-5 s on the .pte path, so 1-2 sentences is the sweet spot.
|
||||
// budget goes to the actual answer. Short-response directive keeps
|
||||
// TTS latency manageable.
|
||||
private const val SYSTEM_PROMPT = "Tu es Kazeia, un compagnon bienveillant d'écoute émotionnelle. Réponds toujours en français, en 1 ou 2 phrases courtes (40 mots maximum). Pas de raisonnement, donne directement la réponse. /no_think"
|
||||
|
||||
private const val MODEL_DIR = "/data/local/tmp/kazeia-et"
|
||||
private const val MODEL_PATH = "$MODEL_DIR/hybrid_llama_qnn.pte"
|
||||
private const val TOKENIZER_PATH = "$MODEL_DIR/tokenizer.json"
|
||||
}
|
||||
|
||||
private var llmModule: LlmModule? = null
|
||||
private var modelName = ""
|
||||
private var loaded = false
|
||||
|
||||
|
|
@ -48,77 +54,88 @@ class ExecuTorchLlmEngine(
|
|||
|
||||
override suspend fun load(modelPath: String, config: LlmConfig) {
|
||||
withContext(Dispatchers.IO) {
|
||||
val check = execRoot("ls $RUNNER_DIR/qnn_llama_runner $RUNNER_DIR/hybrid_llama_qnn.pte $RUNNER_DIR/tokenizer.json 2>&1")
|
||||
if (check.contains("No such file")) {
|
||||
nlog("ERROR: runner or model not found in $RUNNER_DIR")
|
||||
if (!File(MODEL_PATH).exists()) {
|
||||
nlog("ERROR: model not found at $MODEL_PATH")
|
||||
return@withContext
|
||||
}
|
||||
if (!File(TOKENIZER_PATH).exists()) {
|
||||
nlog("ERROR: tokenizer not found at $TOKENIZER_PATH")
|
||||
return@withContext
|
||||
}
|
||||
|
||||
deployRunnerScript()
|
||||
try {
|
||||
val t0 = System.currentTimeMillis()
|
||||
// MODEL_TYPE_QNN_LLAMA=4 selects the Qualcomm runner path in
|
||||
// jni_layer_llama.cpp, which uses example::Runner (same code
|
||||
// as the qnn_llama_runner binary) instead of the generic
|
||||
// TextLLMRunner. Our .pte was exported with
|
||||
// --decoder_model qwen3-4b which requires this path.
|
||||
val MODEL_TYPE_QNN_LLAMA = 4
|
||||
llmModule = LlmModule(MODEL_TYPE_QNN_LLAMA, MODEL_PATH, TOKENIZER_PATH, 0.7f)
|
||||
nlog("LlmModule instantiated in ${System.currentTimeMillis() - t0}ms")
|
||||
|
||||
writeFileRoot("$RUNNER_DIR/outputs/prompt.b64",
|
||||
android.util.Base64.encodeToString("Bonjour".toByteArray(), android.util.Base64.NO_WRAP))
|
||||
if (SYSTEM_PROMPT.isNotEmpty()) {
|
||||
writeFileRoot("$RUNNER_DIR/outputs/system.b64",
|
||||
android.util.Base64.encodeToString(SYSTEM_PROMPT.toByteArray(), android.util.Base64.NO_WRAP))
|
||||
} else {
|
||||
execRoot("rm -f $RUNNER_DIR/outputs/system.b64")
|
||||
}
|
||||
val test = execRoot("sh $RUNNER_DIR/run_llm.sh 0.0 80 2>&1")
|
||||
// Load the PTE into QNN HTP (calls the native load()).
|
||||
val loadResult = llmModule!!.load()
|
||||
if (loadResult != 0) {
|
||||
nlog("ERROR: LlmModule.load() returned $loadResult")
|
||||
llmModule = null
|
||||
return@withContext
|
||||
}
|
||||
nlog("LlmModule loaded in ${System.currentTimeMillis() - t0}ms total")
|
||||
|
||||
if (test.contains("Generated Tokens") || test.contains("Rate:")) {
|
||||
loaded = true
|
||||
val rateMatch = Regex("Generated \\d+ tokens:.*Rate:\\s+([\\d.]+)").find(test)
|
||||
val rate = rateMatch?.groupValues?.get(1) ?: "?"
|
||||
modelName = "Qwen3 (${rate} tok/s NPU)"
|
||||
modelName = "Qwen3-4B LlmModule"
|
||||
nlog("Ready: $modelName")
|
||||
} else {
|
||||
nlog("ERROR: test failed: ${test.takeLast(200)}")
|
||||
} catch (e: Throwable) {
|
||||
nlog("ERROR: LlmModule init failed: ${e.javaClass.simpleName}: ${e.message}")
|
||||
llmModule = null
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
override fun isLoaded(): Boolean = loaded
|
||||
override fun isLoaded(): Boolean = loaded && llmModule != null
|
||||
|
||||
override suspend fun generate(
|
||||
prompt: String,
|
||||
params: SamplingParams,
|
||||
onToken: ((String) -> Boolean)?
|
||||
): GenerationResult = withContext(Dispatchers.IO) {
|
||||
if (!loaded) throw IllegalStateException("Model not loaded")
|
||||
val mod = llmModule ?: throw IllegalStateException("Model not loaded")
|
||||
|
||||
val startTime = System.currentTimeMillis()
|
||||
|
||||
writeFileRoot("$RUNNER_DIR/outputs/prompt.b64",
|
||||
android.util.Base64.encodeToString(prompt.toByteArray(), android.util.Base64.NO_WRAP))
|
||||
if (SYSTEM_PROMPT.isNotEmpty()) {
|
||||
writeFileRoot("$RUNNER_DIR/outputs/system.b64",
|
||||
android.util.Base64.encodeToString(SYSTEM_PROMPT.toByteArray(), android.util.Base64.NO_WRAP))
|
||||
} else {
|
||||
execRoot("rm -f $RUNNER_DIR/outputs/system.b64")
|
||||
}
|
||||
|
||||
val fullPrompt = buildChatTemplate(prompt)
|
||||
nlog("Prompt: '${prompt.take(80)}'")
|
||||
|
||||
val responseBuilder = StringBuilder()
|
||||
var firstTokenMs = -1L
|
||||
|
||||
val cb = object : LlmCallback {
|
||||
override fun onResult(result: String) {
|
||||
if (firstTokenMs < 0) firstTokenMs = System.currentTimeMillis() - startTime
|
||||
responseBuilder.append(result)
|
||||
onToken?.invoke(result)
|
||||
}
|
||||
override fun onStats(stats: String) {
|
||||
nlog("stats: ${stats.take(200)}")
|
||||
}
|
||||
}
|
||||
|
||||
val seqLen = minOf(params.maxNewTokens, 512)
|
||||
val output = execRoot("sh $RUNNER_DIR/run_llm.sh ${params.temperature} $seqLen 2>&1")
|
||||
|
||||
val tokenCount = Regex("Generated Tokens:\\s+(\\d+)").find(output)
|
||||
?.groupValues?.get(1)?.toIntOrNull() ?: 0
|
||||
val rate = Regex("Generated \\d+ tokens:.*Rate:\\s+([\\d.]+)").find(output)
|
||||
?.groupValues?.get(1)?.toFloatOrNull() ?: 0f
|
||||
val ttft = Regex("Time to first generated token:\\s+([\\d.]+)").find(output)
|
||||
?.groupValues?.get(1)?.toFloatOrNull() ?: 0f
|
||||
|
||||
val responseRaw = execRoot("cat $RUNNER_DIR/outputs/response.txt 2>/dev/null")
|
||||
nlog("RAW: ${responseRaw.take(300)}")
|
||||
val responseText = extractResponse(responseRaw)
|
||||
val rc = try {
|
||||
mod.generate(fullPrompt, seqLen, cb)
|
||||
} catch (e: Throwable) {
|
||||
nlog("generate() threw: ${e.message}")
|
||||
-1
|
||||
}
|
||||
|
||||
val elapsed = System.currentTimeMillis() - startTime
|
||||
nlog("Response: '$responseText'")
|
||||
nlog("Stats: ${tokenCount}tok ${rate}tok/s TTFT=${ttft}s ${elapsed}ms")
|
||||
val rawText = responseBuilder.toString()
|
||||
val responseText = cleanResponse(rawText)
|
||||
val tokenCount = rawText.length / 4 // rough estimate without a tokenizer
|
||||
val rate = if (elapsed > 0) (tokenCount * 1000f) / elapsed else 0f
|
||||
|
||||
onToken?.invoke(responseText)
|
||||
nlog("Response: '${responseText.take(80)}'")
|
||||
nlog("Stats: rc=$rc ~${tokenCount}tok ~${"%.1f".format(rate)}tok/s TTFT=${firstTokenMs}ms total=${elapsed}ms")
|
||||
|
||||
GenerationResult(
|
||||
text = responseText,
|
||||
|
|
@ -128,20 +145,31 @@ class ExecuTorchLlmEngine(
|
|||
)
|
||||
}
|
||||
|
||||
private fun extractResponse(raw: String): String {
|
||||
/**
|
||||
* Wrap user input in Qwen3's ChatML template so the instruct model
|
||||
* actually follows the system directive instead of echoing the prompt.
|
||||
* Terminating with `<|im_start|>assistant\n` signals the model to begin
|
||||
* its reply; no trailing tokens.
|
||||
*/
|
||||
private fun buildChatTemplate(userInput: String): String {
|
||||
val sb = StringBuilder()
|
||||
if (SYSTEM_PROMPT.isNotEmpty()) {
|
||||
sb.append("<|im_start|>system\n").append(SYSTEM_PROMPT).append("<|im_end|>\n")
|
||||
}
|
||||
sb.append("<|im_start|>user\n").append(userInput).append("<|im_end|>\n")
|
||||
sb.append("<|im_start|>assistant\n")
|
||||
return sb.toString()
|
||||
}
|
||||
|
||||
/** Strip <think>…</think>, special tokens, and leading/trailing whitespace. */
|
||||
private fun cleanResponse(raw: String): String {
|
||||
var text = raw
|
||||
val thinkEnd = text.indexOf("</think>")
|
||||
if (thinkEnd >= 0) {
|
||||
text = text.substring(thinkEnd + "</think>".length)
|
||||
} else {
|
||||
val thinkStart = text.indexOf("<think>")
|
||||
val assistantTag = text.indexOf("assistant")
|
||||
if (thinkStart >= 0) {
|
||||
nlog("WARN: <think> block never closed, no response generated")
|
||||
return ""
|
||||
} else if (assistantTag >= 0) {
|
||||
text = text.substring(assistantTag + "assistant".length)
|
||||
}
|
||||
} else if (text.indexOf("<think>") >= 0) {
|
||||
nlog("WARN: <think> block never closed")
|
||||
return ""
|
||||
}
|
||||
return text
|
||||
.replace("<|im_start|>", "")
|
||||
|
|
@ -152,82 +180,9 @@ class ExecuTorchLlmEngine(
|
|||
.trim()
|
||||
}
|
||||
|
||||
private fun deployRunnerScript() {
|
||||
val script = """
|
||||
#!/bin/sh
|
||||
cd $RUNNER_DIR
|
||||
export LD_LIBRARY_PATH=$RUNNER_DIR
|
||||
export ADSP_LIBRARY_PATH=$RUNNER_DIR
|
||||
|
||||
TEMP=${'$'}1
|
||||
SEQ_LEN=${'$'}2
|
||||
|
||||
PROMPT=${'$'}(base64 -d $RUNNER_DIR/outputs/prompt.b64)
|
||||
|
||||
rm -f $RUNNER_DIR/outputs/response.txt
|
||||
|
||||
SYSTEM_ARGS=""
|
||||
if [ -s $RUNNER_DIR/outputs/system.b64 ]; then
|
||||
SYSTEM=${'$'}(base64 -d $RUNNER_DIR/outputs/system.b64)
|
||||
SYSTEM_ARGS="--system_prompt"
|
||||
fi
|
||||
|
||||
if [ -n "${'$'}SYSTEM_ARGS" ]; then
|
||||
exec ./qnn_llama_runner \
|
||||
--model_path hybrid_llama_qnn.pte \
|
||||
--tokenizer_path tokenizer.json \
|
||||
--decoder_model_version qwen3 \
|
||||
--output_path $RUNNER_DIR/outputs/response.txt \
|
||||
--performance_output_path $RUNNER_DIR/outputs/perf.txt \
|
||||
--shared_buffer \
|
||||
--system_prompt "${'$'}SYSTEM" \
|
||||
--prompt "${'$'}PROMPT" \
|
||||
--temperature ${'$'}TEMP \
|
||||
--seq_len ${'$'}SEQ_LEN \
|
||||
--eval_mode 0
|
||||
else
|
||||
exec ./qnn_llama_runner \
|
||||
--model_path hybrid_llama_qnn.pte \
|
||||
--tokenizer_path tokenizer.json \
|
||||
--decoder_model_version qwen3 \
|
||||
--output_path $RUNNER_DIR/outputs/response.txt \
|
||||
--performance_output_path $RUNNER_DIR/outputs/perf.txt \
|
||||
--shared_buffer \
|
||||
--prompt "${'$'}PROMPT" \
|
||||
--temperature ${'$'}TEMP \
|
||||
--seq_len ${'$'}SEQ_LEN \
|
||||
--eval_mode 0
|
||||
fi
|
||||
""".trimIndent()
|
||||
|
||||
writeFileRoot("$RUNNER_DIR/run_llm.sh", script)
|
||||
execRoot("chmod 755 $RUNNER_DIR/run_llm.sh")
|
||||
}
|
||||
|
||||
override fun release() {
|
||||
try { llmModule?.resetNative() } catch (_: Throwable) {}
|
||||
llmModule = null
|
||||
loaded = false
|
||||
}
|
||||
|
||||
private fun writeFileRoot(path: String, content: String) {
|
||||
try {
|
||||
val process = Runtime.getRuntime().exec(arrayOf("su", "-c", "cat > $path"))
|
||||
process.outputStream.bufferedWriter().use { it.write(content) }
|
||||
process.waitFor()
|
||||
} catch (e: Exception) {
|
||||
Log.e(TAG, "writeFileRoot failed: ${e.message}")
|
||||
}
|
||||
}
|
||||
|
||||
private fun execRoot(cmd: String): String {
|
||||
return try {
|
||||
val process = Runtime.getRuntime().exec(arrayOf("su", "-c", cmd))
|
||||
val result = process.inputStream.bufferedReader().readText()
|
||||
val error = process.errorStream.bufferedReader().readText()
|
||||
process.waitFor()
|
||||
if (error.isNotEmpty() && result.isEmpty()) error else result
|
||||
} catch (e: Exception) {
|
||||
Log.e(TAG, "execRoot failed: ${e.message}")
|
||||
""
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -518,7 +518,7 @@ class KazeiaService : Service() {
|
|||
|
||||
// LLM: Qwen3-4B on Hexagon V79 via in-process ExecuTorch LlmModule (QNN backend).
|
||||
_loadingState.value = LoadingState(50, "LLM Qwen3-4B NPU…")
|
||||
llm = ExecuTorchLlmEngine { msg -> log(msg) }
|
||||
llm = ExecuTorchLlmEngine(this@KazeiaService) { msg -> log(msg) }
|
||||
try {
|
||||
llm.load("${KazeiaApplication.MODELS_DIR}/qwen3-4b", com.kazeia.core.LlmConfig())
|
||||
} catch (e: Exception) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue