From 809a6d4fedb94bfe1dafef2d48a0491ed1a2a202 Mon Sep 17 00:00:00 2001 From: Kazeia Team Date: Tue, 14 Apr 2026 10:39:50 +0200 Subject: [PATCH] =?UTF-8?q?LLM=20no-root:=20migrate=20to=20in-process=20Ll?= =?UTF-8?q?mModule=20(JNI)=20=E2=80=94=20zero=20su=20calls?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The root cause of the previous su-c requirement was that Qualcomm's FastRPC kernel driver rejects processes spawned via ProcessBuilder fork+exec because they lose supplementary GIDs on exec. Zygote-forked app processes retain the proper init-configured credentials and are accepted by the adsprpcd service, which is why ORT-QNN (Whisper, in-process) worked while the subprocess qnn_llama_runner did not. Running the LLM in-process via ExecuTorch's LlmModule bypasses the fork+exec path entirely. What this commit does: - ExecuTorchLlmEngine now uses org.pytorch.executorch.extension.llm.LlmModule with MODEL_TYPE_QNN_LLAMA=4 (routes to example::Runner in jni_layer_llama.cpp, the same C++ runner that qnn_llama_runner embeds). - All su, ProcessBuilder, file-based prompt/response plumbing, and run_llm.sh gone. ChatML template is built in Kotlin; tokens stream in via LlmCallback. Supporting changes under executorch-patches/llm_in_process_jni.patch: 1. backends/qualcomm/CMakeLists.txt — gate PyQnnManagerAdaptor on NOT ANDROID. The original guard (CMAKE_SYSTEM_PROCESSOR MATCHES x86_64) misfires in a nested scope during Android cross-compile and tried to build the host Python bindings. 2. extension/android/jni/jni_layer_llama.cpp — hardcode decoder_model="qwen3" (was "llama3") and pass eval_mode=0 (EvalMode::kKVCached) + shared_buffer=true to match our hybrid_llama_qnn.pte which only contains kv_forward, not prefill_forward. Build: scripts/build_android_library.sh arm64-v8a with QNN_SDK_ROOT pointing to /opt/Kazeia/qnn_sdk_242/qairt/2.42.0.251225 and EXECUTORCH_BUILD_QNN=ON. 
Produces libexecutorch_jni.so (192 MB) with QNN v2.42 backend + the llama runner code, plus libqnn_executorch_backend.so. Both staged in jniLibs. Validated on OnePlus Pad 3: LlmModule.load() completes in 4.2 s, no su prompts, Pipeline ready with STT(WhisperHybridEngine) → [VoiceCommands → LLM] → TTS(Qwen3TtsEngine). TTS .pte still loads with the upgraded v2.42 runtime — no regression. Co-Authored-By: Claude Opus 4.6 (1M context) --- executorch-patches/llm_in_process_jni.patch | 40 +++ .../com/kazeia/llm/ExecuTorchLlmEngine.kt | 251 +++++++----------- .../java/com/kazeia/service/KazeiaService.kt | 2 +- 3 files changed, 144 insertions(+), 149 deletions(-) create mode 100644 executorch-patches/llm_in_process_jni.patch diff --git a/executorch-patches/llm_in_process_jni.patch b/executorch-patches/llm_in_process_jni.patch new file mode 100644 index 0000000..a4a64a5 --- /dev/null +++ b/executorch-patches/llm_in_process_jni.patch @@ -0,0 +1,40 @@ +diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt +index e93731e..4951e1d 100644 +--- a/backends/qualcomm/CMakeLists.txt ++++ b/backends/qualcomm/CMakeLists.txt +@@ -308,8 +308,8 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES Hexagon) + ) + endif() + +-# QNN pybind +-if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64") ++# QNN pybind — host Python bindings, not for Android cross-compile ++if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64" AND NOT ANDROID) + add_subdirectory( + ${EXECUTORCH_SOURCE_DIR}/third-party/pybind11 + ${CMAKE_CURRENT_BINARY_DIR}/pybind11 +diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp +index 45f2414..e1c2a8f 100644 +--- a/extension/android/jni/jni_layer_llama.cpp ++++ b/extension/android/jni/jni_layer_llama.cpp +@@ -171,14 +171,17 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { + model_path->toStdString().c_str(), + data_files_vector, + executorch::extension::Module::LoadMode::MmapUseMlockIgnoreErrors); +- std::string 
decoder_model = "llama3"; // use llama3 for now ++ std::string decoder_model = "qwen3"; // Kazeia: our .pte was exported with --decoder_model qwen3-4b + runner_ = std::make_unique>( // QNN runner + std::move(module), + decoder_model.c_str(), + model_path->toStdString().c_str(), + tokenizer_path->toStdString().c_str(), +- "", +- ""); ++ /* performance_output_path */ "", ++ /* dump_logits_path */ "", ++ /* temperature */ 0.7f, ++ /* eval_mode */ 0, // EvalMode::kKVCached (our .pte has only kv_forward, no prefill_forward) ++ /* shared_buffer */ true); + model_type_category_ = MODEL_TYPE_CATEGORY_LLM; + #endif + #if defined(EXECUTORCH_BUILD_MEDIATEK) diff --git a/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt b/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt index 85df1d0..18a207a 100644 --- a/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt +++ b/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt @@ -1,43 +1,49 @@ package com.kazeia.llm +import android.content.Context import android.util.Log import com.kazeia.core.* import kotlinx.coroutines.Dispatchers import kotlinx.coroutines.withContext import java.io.File +import org.pytorch.executorch.extension.llm.LlmCallback +import org.pytorch.executorch.extension.llm.LlmModule /** - * LLM Engine using ExecuTorch + QNN backend via subprocess. - * Calls qnn_llama_runner binary with root access (Magisk su). + * LLM Engine using ExecuTorch LlmModule in-process — **no root required**. + * + * Runs Qwen3-4B via `org.pytorch.executorch.extension.llm.LlmModule`, which + * wraps the same C++ example::Runner as the standalone qnn_llama_runner binary + * but inside the app's own process. The QNN HTP backend works because the + * DSP fastrpc service accepts the Zygote-forked app process (unlike + * ProcessBuilder-spawned subprocesses which lose supplementary GIDs on exec + * and get rejected by the fastrpc credential checks).
+ * + * Model + tokenizer live in /data/local/tmp/kazeia-et/ (readable by the app + * on this device's permissive SELinux policy). libexecutorch.so + QNN libs + * are bundled in jniLibs. * * Current tablet config: Qwen3-4B KV-mode, ~18-22 tok/s on Hexagon V79 * (Snapdragon 8 Elite), TTFT 0.9 s, RSS 1.76 GB. - * - * Why root: the runner binary plus its QNN v2.42 .so deps live in - * /data/local/tmp/kazeia-et/ (shell_data_file SELinux context). Untrusted - * apps can't exec binaries from there. The Hexagon DSP fastrpc service also - * refuses to load the v2.42 Skel from the app's own files dir — only from - * nativeLibraryDir — but that dir already holds the TTS stack's v2.31 Skel - * (same filename, different version, can't coexist). Rebuilding everything - * against one QNN version would eliminate the conflict, but would require - * re-exporting the TTS .pte with the new runtime (tooling currently broken - * on the flatc schema/dataclass mismatch in the qnn_venv). */ class ExecuTorchLlmEngine( + private val context: Context, private val onLog: ((String) -> Unit)? = null ) : LlmEngine { companion object { private const val TAG = "ExecuTorchLLM" - private const val RUNNER_DIR = "/data/local/tmp/kazeia-et" // /no_think disables Qwen3's chain-of-thought block so the full token - // budget goes to the actual answer (without it, 120-200 tokens get - // consumed by leaving nothing to speak). - // Short-response directive keeps TTS latency manageable — each sentence - // costs ~3-5 s on the .pte path, so 1-2 sentences is the sweet spot. + // budget goes to the actual answer. Short-response directive keeps + // TTS latency manageable. private const val SYSTEM_PROMPT = "Tu es Kazeia, un compagnon bienveillant d'écoute émotionnelle. Réponds toujours en français, en 1 ou 2 phrases courtes (40 mots maximum). Pas de raisonnement, donne directement la réponse. 
/no_think" + + private const val MODEL_DIR = "/data/local/tmp/kazeia-et" + private const val MODEL_PATH = "$MODEL_DIR/hybrid_llama_qnn.pte" + private const val TOKENIZER_PATH = "$MODEL_DIR/tokenizer.json" } + private var llmModule: LlmModule? = null private var modelName = "" private var loaded = false @@ -48,77 +54,88 @@ class ExecuTorchLlmEngine( override suspend fun load(modelPath: String, config: LlmConfig) { withContext(Dispatchers.IO) { - val check = execRoot("ls $RUNNER_DIR/qnn_llama_runner $RUNNER_DIR/hybrid_llama_qnn.pte $RUNNER_DIR/tokenizer.json 2>&1") - if (check.contains("No such file")) { - nlog("ERROR: runner or model not found in $RUNNER_DIR") + if (!File(MODEL_PATH).exists()) { + nlog("ERROR: model not found at $MODEL_PATH") + return@withContext + } + if (!File(TOKENIZER_PATH).exists()) { + nlog("ERROR: tokenizer not found at $TOKENIZER_PATH") return@withContext } - deployRunnerScript() + try { + val t0 = System.currentTimeMillis() + // MODEL_TYPE_QNN_LLAMA=4 selects the Qualcomm runner path in + // jni_layer_llama.cpp, which uses example::Runner (same code + // as the qnn_llama_runner binary) instead of the generic + // TextLLMRunner. Our .pte was exported with + // --decoder_model qwen3-4b which requires this path. + val MODEL_TYPE_QNN_LLAMA = 4 + llmModule = LlmModule(MODEL_TYPE_QNN_LLAMA, MODEL_PATH, TOKENIZER_PATH, 0.7f) + nlog("LlmModule instantiated in ${System.currentTimeMillis() - t0}ms") - writeFileRoot("$RUNNER_DIR/outputs/prompt.b64", - android.util.Base64.encodeToString("Bonjour".toByteArray(), android.util.Base64.NO_WRAP)) - if (SYSTEM_PROMPT.isNotEmpty()) { - writeFileRoot("$RUNNER_DIR/outputs/system.b64", - android.util.Base64.encodeToString(SYSTEM_PROMPT.toByteArray(), android.util.Base64.NO_WRAP)) - } else { - execRoot("rm -f $RUNNER_DIR/outputs/system.b64") - } - val test = execRoot("sh $RUNNER_DIR/run_llm.sh 0.0 80 2>&1") + // Load the PTE into QNN HTP (calls the native load()). 
+ val loadResult = llmModule!!.load() + if (loadResult != 0) { + nlog("ERROR: LlmModule.load() returned $loadResult") + llmModule = null + return@withContext + } + nlog("LlmModule loaded in ${System.currentTimeMillis() - t0}ms total") - if (test.contains("Generated Tokens") || test.contains("Rate:")) { loaded = true - val rateMatch = Regex("Generated \\d+ tokens:.*Rate:\\s+([\\d.]+)").find(test) - val rate = rateMatch?.groupValues?.get(1) ?: "?" - modelName = "Qwen3 (${rate} tok/s NPU)" + modelName = "Qwen3-4B LlmModule" nlog("Ready: $modelName") - } else { - nlog("ERROR: test failed: ${test.takeLast(200)}") + } catch (e: Throwable) { + nlog("ERROR: LlmModule init failed: ${e.javaClass.simpleName}: ${e.message}") + llmModule = null } } } - override fun isLoaded(): Boolean = loaded + override fun isLoaded(): Boolean = loaded && llmModule != null override suspend fun generate( prompt: String, params: SamplingParams, onToken: ((String) -> Boolean)? ): GenerationResult = withContext(Dispatchers.IO) { - if (!loaded) throw IllegalStateException("Model not loaded") + val mod = llmModule ?: throw IllegalStateException("Model not loaded") val startTime = System.currentTimeMillis() - - writeFileRoot("$RUNNER_DIR/outputs/prompt.b64", - android.util.Base64.encodeToString(prompt.toByteArray(), android.util.Base64.NO_WRAP)) - if (SYSTEM_PROMPT.isNotEmpty()) { - writeFileRoot("$RUNNER_DIR/outputs/system.b64", - android.util.Base64.encodeToString(SYSTEM_PROMPT.toByteArray(), android.util.Base64.NO_WRAP)) - } else { - execRoot("rm -f $RUNNER_DIR/outputs/system.b64") - } - + val fullPrompt = buildChatTemplate(prompt) nlog("Prompt: '${prompt.take(80)}'") + val responseBuilder = StringBuilder() + var firstTokenMs = -1L + + val cb = object : LlmCallback { + override fun onResult(result: String) { + if (firstTokenMs < 0) firstTokenMs = System.currentTimeMillis() - startTime + responseBuilder.append(result) + onToken?.invoke(result) + } + override fun onStats(stats: String) { + 
nlog("stats: ${stats.take(200)}") + } + } + val seqLen = minOf(params.maxNewTokens, 512) - val output = execRoot("sh $RUNNER_DIR/run_llm.sh ${params.temperature} $seqLen 2>&1") - - val tokenCount = Regex("Generated Tokens:\\s+(\\d+)").find(output) - ?.groupValues?.get(1)?.toIntOrNull() ?: 0 - val rate = Regex("Generated \\d+ tokens:.*Rate:\\s+([\\d.]+)").find(output) - ?.groupValues?.get(1)?.toFloatOrNull() ?: 0f - val ttft = Regex("Time to first generated token:\\s+([\\d.]+)").find(output) - ?.groupValues?.get(1)?.toFloatOrNull() ?: 0f - - val responseRaw = execRoot("cat $RUNNER_DIR/outputs/response.txt 2>/dev/null") - nlog("RAW: ${responseRaw.take(300)}") - val responseText = extractResponse(responseRaw) + val rc = try { + mod.generate(fullPrompt, seqLen, cb) + } catch (e: Throwable) { + nlog("generate() threw: ${e.message}") + -1 + } val elapsed = System.currentTimeMillis() - startTime - nlog("Response: '$responseText'") - nlog("Stats: ${tokenCount}tok ${rate}tok/s TTFT=${ttft}s ${elapsed}ms") + val rawText = responseBuilder.toString() + val responseText = cleanResponse(rawText) + val tokenCount = rawText.length / 4 // rough estimate without a tokenizer + val rate = if (elapsed > 0) (tokenCount * 1000f) / elapsed else 0f - onToken?.invoke(responseText) + nlog("Response: '${responseText.take(80)}'") + nlog("Stats: rc=$rc ~${tokenCount}tok ~${"%.1f".format(rate)}tok/s TTFT=${firstTokenMs}ms total=${elapsed}ms") GenerationResult( text = responseText, @@ -128,20 +145,31 @@ class ExecuTorchLlmEngine( ) } - private fun extractResponse(raw: String): String { + /** + * Wrap user input in Qwen3's ChatML template so the instruct model + * actually follows the system directive instead of echoing the prompt. + * Terminating with `<|im_start|>assistant\n` signals the model to begin + * its reply; no trailing tokens. 
+ */ + private fun buildChatTemplate(userInput: String): String { + val sb = StringBuilder() + if (SYSTEM_PROMPT.isNotEmpty()) { + sb.append("<|im_start|>system\n").append(SYSTEM_PROMPT).append("<|im_end|>\n") + } + sb.append("<|im_start|>user\n").append(userInput).append("<|im_end|>\n") + sb.append("<|im_start|>assistant\n") + return sb.toString() + } + + /** Strip the <think>...</think> reasoning block, special tokens, and leading/trailing whitespace. */ + private fun cleanResponse(raw: String): String { var text = raw val thinkEnd = text.indexOf("") if (thinkEnd >= 0) { text = text.substring(thinkEnd + "".length) - } else { - val thinkStart = text.indexOf("") - val assistantTag = text.indexOf("assistant") - if (thinkStart >= 0) { - nlog("WARN: block never closed, no response generated") - return "" - } else if (assistantTag >= 0) { - text = text.substring(assistantTag + "assistant".length) - } + } else if (text.indexOf("") >= 0) { + nlog("WARN: block never closed") + return "" } return text .replace("<|im_start|>", "") @@ -152,82 +180,9 @@ class ExecuTorchLlmEngine( .trim() } - private fun deployRunnerScript() { - val script = """ -#!/bin/sh -cd $RUNNER_DIR -export LD_LIBRARY_PATH=$RUNNER_DIR -export ADSP_LIBRARY_PATH=$RUNNER_DIR - -TEMP=${'$'}1 -SEQ_LEN=${'$'}2 - -PROMPT=${'$'}(base64 -d $RUNNER_DIR/outputs/prompt.b64) - -rm -f $RUNNER_DIR/outputs/response.txt - -SYSTEM_ARGS="" -if [ -s $RUNNER_DIR/outputs/system.b64 ]; then - SYSTEM=${'$'}(base64 -d $RUNNER_DIR/outputs/system.b64) - SYSTEM_ARGS="--system_prompt" -fi - -if [ -n "${'$'}SYSTEM_ARGS" ]; then - exec ./qnn_llama_runner \ - --model_path hybrid_llama_qnn.pte \ - --tokenizer_path tokenizer.json \ - --decoder_model_version qwen3 \ - --output_path $RUNNER_DIR/outputs/response.txt \ - --performance_output_path $RUNNER_DIR/outputs/perf.txt \ - --shared_buffer \ - --system_prompt "${'$'}SYSTEM" \ - --prompt "${'$'}PROMPT" \ - --temperature ${'$'}TEMP \ - --seq_len ${'$'}SEQ_LEN \ - --eval_mode 0 -else - exec ./qnn_llama_runner \ - --model_path
hybrid_llama_qnn.pte \ - --tokenizer_path tokenizer.json \ - --decoder_model_version qwen3 \ - --output_path $RUNNER_DIR/outputs/response.txt \ - --performance_output_path $RUNNER_DIR/outputs/perf.txt \ - --shared_buffer \ - --prompt "${'$'}PROMPT" \ - --temperature ${'$'}TEMP \ - --seq_len ${'$'}SEQ_LEN \ - --eval_mode 0 -fi -""".trimIndent() - - writeFileRoot("$RUNNER_DIR/run_llm.sh", script) - execRoot("chmod 755 $RUNNER_DIR/run_llm.sh") - } - override fun release() { + try { llmModule?.resetNative() } catch (_: Throwable) {} + llmModule = null loaded = false } - - private fun writeFileRoot(path: String, content: String) { - try { - val process = Runtime.getRuntime().exec(arrayOf("su", "-c", "cat > $path")) - process.outputStream.bufferedWriter().use { it.write(content) } - process.waitFor() - } catch (e: Exception) { - Log.e(TAG, "writeFileRoot failed: ${e.message}") - } - } - - private fun execRoot(cmd: String): String { - return try { - val process = Runtime.getRuntime().exec(arrayOf("su", "-c", cmd)) - val result = process.inputStream.bufferedReader().readText() - val error = process.errorStream.bufferedReader().readText() - process.waitFor() - if (error.isNotEmpty() && result.isEmpty()) error else result - } catch (e: Exception) { - Log.e(TAG, "execRoot failed: ${e.message}") - "" - } - } } diff --git a/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaService.kt b/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaService.kt index 82eaa24..401ebc1 100644 --- a/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaService.kt +++ b/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaService.kt @@ -518,7 +518,7 @@ class KazeiaService : Service() { // LLM: Qwen3-4B on Hexagon V79 via qnn_llama_runner. 
_loadingState.value = LoadingState(50, "LLM Qwen3-4B NPU…") - llm = ExecuTorchLlmEngine { msg -> log(msg) } + llm = ExecuTorchLlmEngine(this@KazeiaService) { msg -> log(msg) } try { llm.load("${KazeiaApplication.MODELS_DIR}/qwen3-4b", com.kazeia.core.LlmConfig()) } catch (e: Exception) {