LLM no-root: migrate to in-process LlmModule (JNI) — zero su calls

The root cause of the previous `su -c` requirement was that Qualcomm's FastRPC kernel driver rejects processes spawned via ProcessBuilder (fork+exec), because they lose their supplementary GIDs on exec. Zygote-forked app processes retain the proper init-configured credentials and are accepted by the adsprpcd service, which is why ORT-QNN (Whisper, in-process) worked while the qnn_llama_runner subprocess did not. Running the LLM in-process via ExecuTorch's LlmModule bypasses the fork+exec path entirely.

What this commit does:
- ExecuTorchLlmEngine now uses org.pytorch.executorch.extension.llm.LlmModule with MODEL_TYPE_QNN_LLAMA=4 (routes to example::Runner in jni_layer_llama.cpp, the same C++ runner that qnn_llama_runner embeds).
- All su, ProcessBuilder, file-based prompt/response plumbing, and run_llm.sh are gone. The ChatML template is built in Kotlin; tokens stream in via LlmCallback.

Supporting changes under executorch-patches/llm_in_process_jni.patch:
1. backends/qualcomm/CMakeLists.txt — gate PyQnnManagerAdaptor on NOT ANDROID. The original guard (CMAKE_SYSTEM_PROCESSOR MATCHES x86_64) misfires in a nested scope during Android cross-compile and tries to build the host Python bindings.
2. extension/android/jni/jni_layer_llama.cpp — hardcode decoder_model="qwen3" (was "llama3") and pass eval_mode=0 (EvalMode::kKVCached) + shared_buffer=true to match our hybrid_llama_qnn.pte, which only contains kv_forward, not prefill_forward.

Build: scripts/build_android_library.sh arm64-v8a with QNN_SDK_ROOT pointing to /opt/Kazeia/qnn_sdk_242/qairt/2.42.0.251225 and EXECUTORCH_BUILD_QNN=ON. Produces libexecutorch_jni.so (192 MB) with the QNN v2.42 backend + the llama runner code, plus libqnn_executorch_backend.so. Both are staged in jniLibs.

Validated on OnePlus Pad 3: LlmModule.load() completes in 4.2 s, no su prompts, Pipeline ready with STT(WhisperHybridEngine) → [VoiceCommands → LLM] → TTS(Qwen3TtsEngine). The TTS .pte still loads with the upgraded v2.42 runtime — no regression.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-14 10:39:50 +02:00 · 2026-04-14 10:39:50 +02:00 · 809a6d4fed
parent 6e6a2d9f82
commit 809a6d4fed
3 changed files with 144 additions and 149 deletions
--- a/executorch-patches/llm_in_process_jni.patch
+++ b/executorch-patches/llm_in_process_jni.patch
@ -0,0 +1,40 @@
+diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt
+index e93731e..4951e1d 100644
+--- a/backends/qualcomm/CMakeLists.txt
+++ b/backends/qualcomm/CMakeLists.txt
+@@ -308,8 +308,8 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES Hexagon)
+   )
+ endif()
+ 
+-# QNN pybind
+-if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64")
+# QNN pybind — host Python bindings, not for Android cross-compile
+if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "x86_64" AND NOT ANDROID)
+   add_subdirectory(
+     ${EXECUTORCH_SOURCE_DIR}/third-party/pybind11
+     ${CMAKE_CURRENT_BINARY_DIR}/pybind11
+diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp
+index 45f2414..e1c2a8f 100644
+--- a/extension/android/jni/jni_layer_llama.cpp
+++ b/extension/android/jni/jni_layer_llama.cpp
+@@ -171,14 +171,17 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass<ExecuTorchLlmJni> {
+           model_path->toStdString().c_str(),
+           data_files_vector,
+           executorch::extension::Module::LoadMode::MmapUseMlockIgnoreErrors);
+-      std::string decoder_model = "llama3"; // use llama3 for now
+      std::string decoder_model = "qwen3"; // Kazeia: our .pte was exported with --decoder_model qwen3-4b
+       runner_ = std::make_unique<example::Runner<uint16_t>>( // QNN runner
+           std::move(module),
+           decoder_model.c_str(),
+           model_path->toStdString().c_str(),
+           tokenizer_path->toStdString().c_str(),
+-          "",
+-          "");
+          /* performance_output_path */ "",
+          /* dump_logits_path */ "",
+          /* temperature */ 0.7f,
+          /* eval_mode */ 0, // EvalMode::kKVCached (our .pte has only kv_forward, no prefill_forward)
+          /* shared_buffer */ true);
+       model_type_category_ = MODEL_TYPE_CATEGORY_LLM;
+ #endif
+ #if defined(EXECUTORCH_BUILD_MEDIATEK)
--- a/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt
+++ b/kazeia-android/app/src/main/java/com/kazeia/llm/ExecuTorchLlmEngine.kt
@ -1,43 +1,49 @@
 package com.kazeia.llm

+import android.content.Context
 import android.util.Log
 import com.kazeia.core.*
 import kotlinx.coroutines.Dispatchers
 import kotlinx.coroutines.withContext
 import java.io.File
+import org.pytorch.executorch.extension.llm.LlmCallback
+import org.pytorch.executorch.extension.llm.LlmModule

 /**
- * LLM Engine using ExecuTorch + QNN backend via subprocess.
- * Calls qnn_llama_runner binary with root access (Magisk su).
+ * LLM Engine using ExecuTorch LlmModule in-process — **no root required**.
+ *
+ * Runs Qwen3-4B via `org.pytorch.executorch.extension.llm.LlmModule`, which
+ * wraps the same C++ example::Runner as the standalone qnn_llama_runner binary
+ * but inside the app's own process. The QNN HTP backend works because the
+ * DSP fastrpc service accepts the Zygote-forked app process (unlike
+ * ProcessBuilder-spawned subprocesses which lose supplementary GIDs on exec
+ * and get rejected by the fastrpc credential checks).
+ *
+ * Model + tokenizer live in /data/local/tmp/kazeia-et/ (readable by the app
+ * on this device's permissive SELinux policy). libexecutorch.so + QNN libs
+ * are bundled in jniLibs.
 *
 * Current tablet config: Qwen3-4B KV-mode, ~18-22 tok/s on Hexagon V79
 * (Snapdragon 8 Elite), TTFT 0.9 s, RSS 1.76 GB.
- *
- * Why root: the runner binary plus its QNN v2.42 .so deps live in
- * /data/local/tmp/kazeia-et/ (shell_data_file SELinux context). Untrusted
- * apps can't exec binaries from there. The Hexagon DSP fastrpc service also
- * refuses to load the v2.42 Skel from the app's own files dir — only from
- * nativeLibraryDir — but that dir already holds the TTS stack's v2.31 Skel
- * (same filename, different version, can't coexist). Rebuilding everything
- * against one QNN version would eliminate the conflict, but would require
- * re-exporting the TTS .pte with the new runtime (tooling currently broken
- * on the flatc schema/dataclass mismatch in the qnn_venv).
 */
 class ExecuTorchLlmEngine(
+    private val context: Context,
    private val onLog: ((String) -> Unit)? = null
 ) : LlmEngine {

    companion object {
        private const val TAG = "ExecuTorchLLM"
-        private const val RUNNER_DIR = "/data/local/tmp/kazeia-et"
        // /no_think disables Qwen3's chain-of-thought block so the full token
-        // budget goes to the actual answer (without it, 120-200 tokens get
-        // consumed by <think>…</think> leaving nothing to speak).
-        // Short-response directive keeps TTS latency manageable — each sentence
-        // costs ~3-5 s on the .pte path, so 1-2 sentences is the sweet spot.
+        // budget goes to the actual answer. Short-response directive keeps
+        // TTS latency manageable.
        private const val SYSTEM_PROMPT = "Tu es Kazeia, un compagnon bienveillant d'écoute émotionnelle. Réponds toujours en français, en 1 ou 2 phrases courtes (40 mots maximum). Pas de raisonnement, donne directement la réponse. /no_think"
+
+        private const val MODEL_DIR = "/data/local/tmp/kazeia-et"
+        private const val MODEL_PATH = "$MODEL_DIR/hybrid_llama_qnn.pte"
+        private const val TOKENIZER_PATH = "$MODEL_DIR/tokenizer.json"
    }

+    private var llmModule: LlmModule? = null
    private var modelName = ""
    private var loaded = false

@ -48,77 +54,88 @@ class ExecuTorchLlmEngine(

    override suspend fun load(modelPath: String, config: LlmConfig) {
        withContext(Dispatchers.IO) {
-            val check = execRoot("ls $RUNNER_DIR/qnn_llama_runner $RUNNER_DIR/hybrid_llama_qnn.pte $RUNNER_DIR/tokenizer.json 2>&1")
-            if (check.contains("No such file")) {
-                nlog("ERROR: runner or model not found in $RUNNER_DIR")
+            if (!File(MODEL_PATH).exists()) {
+                nlog("ERROR: model not found at $MODEL_PATH")
+                return@withContext
+            }
+            if (!File(TOKENIZER_PATH).exists()) {
+                nlog("ERROR: tokenizer not found at $TOKENIZER_PATH")
                return@withContext
            }

-            deployRunnerScript()
+            try {
+                val t0 = System.currentTimeMillis()
+                // MODEL_TYPE_QNN_LLAMA=4 selects the Qualcomm runner path in
+                // jni_layer_llama.cpp, which uses example::Runner (same code
+                // as the qnn_llama_runner binary) instead of the generic
+                // TextLLMRunner. Our .pte was exported with
+                // --decoder_model qwen3-4b which requires this path.
+                val MODEL_TYPE_QNN_LLAMA = 4
+                llmModule = LlmModule(MODEL_TYPE_QNN_LLAMA, MODEL_PATH, TOKENIZER_PATH, 0.7f)
+                nlog("LlmModule instantiated in ${System.currentTimeMillis() - t0}ms")

-            writeFileRoot("$RUNNER_DIR/outputs/prompt.b64",
-                android.util.Base64.encodeToString("Bonjour".toByteArray(), android.util.Base64.NO_WRAP))
-            if (SYSTEM_PROMPT.isNotEmpty()) {
-                writeFileRoot("$RUNNER_DIR/outputs/system.b64",
-                    android.util.Base64.encodeToString(SYSTEM_PROMPT.toByteArray(), android.util.Base64.NO_WRAP))
-            } else {
-                execRoot("rm -f $RUNNER_DIR/outputs/system.b64")
+                // Load the PTE into QNN HTP (calls the native load()).
+                val loadResult = llmModule!!.load()
+                if (loadResult != 0) {
+                    nlog("ERROR: LlmModule.load() returned $loadResult")
+                    llmModule = null
+                    return@withContext
                }
-            val test = execRoot("sh $RUNNER_DIR/run_llm.sh 0.0 80 2>&1")
+                nlog("LlmModule loaded in ${System.currentTimeMillis() - t0}ms total")

-            if (test.contains("Generated Tokens") || test.contains("Rate:")) {
                loaded = true
-                val rateMatch = Regex("Generated \\d+ tokens:.*Rate:\\s+([\\d.]+)").find(test)
-                val rate = rateMatch?.groupValues?.get(1) ?: "?"
-                modelName = "Qwen3 (${rate} tok/s NPU)"
+                modelName = "Qwen3-4B LlmModule"
                nlog("Ready: $modelName")
-            } else {
-                nlog("ERROR: test failed: ${test.takeLast(200)}")
+            } catch (e: Throwable) {
+                nlog("ERROR: LlmModule init failed: ${e.javaClass.simpleName}: ${e.message}")
+                llmModule = null
            }
        }
    }

-    override fun isLoaded(): Boolean = loaded
+    override fun isLoaded(): Boolean = loaded && llmModule != null

    override suspend fun generate(
        prompt: String,
        params: SamplingParams,
        onToken: ((String) -> Boolean)?
    ): GenerationResult = withContext(Dispatchers.IO) {
-        if (!loaded) throw IllegalStateException("Model not loaded")
+        val mod = llmModule ?: throw IllegalStateException("Model not loaded")

        val startTime = System.currentTimeMillis()
-
-        writeFileRoot("$RUNNER_DIR/outputs/prompt.b64",
-            android.util.Base64.encodeToString(prompt.toByteArray(), android.util.Base64.NO_WRAP))
-        if (SYSTEM_PROMPT.isNotEmpty()) {
-            writeFileRoot("$RUNNER_DIR/outputs/system.b64",
-                android.util.Base64.encodeToString(SYSTEM_PROMPT.toByteArray(), android.util.Base64.NO_WRAP))
-        } else {
-            execRoot("rm -f $RUNNER_DIR/outputs/system.b64")
-        }
-
+        val fullPrompt = buildChatTemplate(prompt)
        nlog("Prompt: '${prompt.take(80)}'")

+        val responseBuilder = StringBuilder()
+        var firstTokenMs = -1L
+
+        val cb = object : LlmCallback {
+            override fun onResult(result: String) {
+                if (firstTokenMs < 0) firstTokenMs = System.currentTimeMillis() - startTime
+                responseBuilder.append(result)
+                onToken?.invoke(result)
+            }
+            override fun onStats(stats: String) {
+                nlog("stats: ${stats.take(200)}")
+            }
+        }
+
        val seqLen = minOf(params.maxNewTokens, 512)
-        val output = execRoot("sh $RUNNER_DIR/run_llm.sh ${params.temperature} $seqLen 2>&1")
-
-        val tokenCount = Regex("Generated Tokens:\\s+(\\d+)").find(output)
-            ?.groupValues?.get(1)?.toIntOrNull() ?: 0
-        val rate = Regex("Generated \\d+ tokens:.*Rate:\\s+([\\d.]+)").find(output)
-            ?.groupValues?.get(1)?.toFloatOrNull() ?: 0f
-        val ttft = Regex("Time to first generated token:\\s+([\\d.]+)").find(output)
-            ?.groupValues?.get(1)?.toFloatOrNull() ?: 0f
-
-        val responseRaw = execRoot("cat $RUNNER_DIR/outputs/response.txt 2>/dev/null")
-        nlog("RAW: ${responseRaw.take(300)}")
-        val responseText = extractResponse(responseRaw)
+        val rc = try {
+            mod.generate(fullPrompt, seqLen, cb)
+        } catch (e: Throwable) {
+            nlog("generate() threw: ${e.message}")
+            -1
+        }

        val elapsed = System.currentTimeMillis() - startTime
-        nlog("Response: '$responseText'")
-        nlog("Stats: ${tokenCount}tok ${rate}tok/s TTFT=${ttft}s ${elapsed}ms")
+        val rawText = responseBuilder.toString()
+        val responseText = cleanResponse(rawText)
+        val tokenCount = rawText.length / 4  // rough estimate without a tokenizer
+        val rate = if (elapsed > 0) (tokenCount * 1000f) / elapsed else 0f

-        onToken?.invoke(responseText)
+        nlog("Response: '${responseText.take(80)}'")
+        nlog("Stats: rc=$rc ~${tokenCount}tok ~${"%.1f".format(rate)}tok/s TTFT=${firstTokenMs}ms total=${elapsed}ms")

        GenerationResult(
            text = responseText,
@ -128,20 +145,31 @@ class ExecuTorchLlmEngine(
        )
    }

-    private fun extractResponse(raw: String): String {
+    /**
+     * Wrap user input in Qwen3's ChatML template so the instruct model
+     * actually follows the system directive instead of echoing the prompt.
+     * Terminating with `<|im_start|>assistant\n` signals the model to begin
+     * its reply; no trailing tokens.
+     */
+    private fun buildChatTemplate(userInput: String): String {
+        val sb = StringBuilder()
+        if (SYSTEM_PROMPT.isNotEmpty()) {
+            sb.append("<|im_start|>system\n").append(SYSTEM_PROMPT).append("<|im_end|>\n")
+        }
+        sb.append("<|im_start|>user\n").append(userInput).append("<|im_end|>\n")
+        sb.append("<|im_start|>assistant\n")
+        return sb.toString()
+    }
+
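+    /**
+     * Drop everything up to and including `</think>`, then strip special
+     * tokens and leading/trailing whitespace. Returns an empty string when a
+     * `<think>` block was opened but never closed.
+     */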
+    private fun cleanResponse(raw: String): String {
        var text = raw
        val thinkEnd = text.indexOf("</think>")
        if (thinkEnd >= 0) {
            text = text.substring(thinkEnd + "</think>".length)
-        } else {
-            val thinkStart = text.indexOf("<think>")
-            val assistantTag = text.indexOf("assistant")
-            if (thinkStart >= 0) {
-                nlog("WARN: <think> block never closed, no response generated")
+        } else if (text.indexOf("<think>") >= 0) {
+            nlog("WARN: <think> block never closed")
            return ""
-            } else if (assistantTag >= 0) {
-                text = text.substring(assistantTag + "assistant".length)
-            }
        }
        return text
            .replace("<|im_start|>", "")
@ -152,82 +180,9 @@ class ExecuTorchLlmEngine(
            .trim()
    }

-    private fun deployRunnerScript() {
-        val script = """
-#!/bin/sh
-cd $RUNNER_DIR
-export LD_LIBRARY_PATH=$RUNNER_DIR
-export ADSP_LIBRARY_PATH=$RUNNER_DIR
-
-TEMP=${'$'}1
-SEQ_LEN=${'$'}2
-
-PROMPT=${'$'}(base64 -d $RUNNER_DIR/outputs/prompt.b64)
-
-rm -f $RUNNER_DIR/outputs/response.txt
-
-SYSTEM_ARGS=""
-if [ -s $RUNNER_DIR/outputs/system.b64 ]; then
-  SYSTEM=${'$'}(base64 -d $RUNNER_DIR/outputs/system.b64)
-  SYSTEM_ARGS="--system_prompt"
-fi
-
-if [ -n "${'$'}SYSTEM_ARGS" ]; then
-  exec ./qnn_llama_runner \
-    --model_path hybrid_llama_qnn.pte \
-    --tokenizer_path tokenizer.json \
-    --decoder_model_version qwen3 \
-    --output_path $RUNNER_DIR/outputs/response.txt \
-    --performance_output_path $RUNNER_DIR/outputs/perf.txt \
-    --shared_buffer \
-    --system_prompt "${'$'}SYSTEM" \
-    --prompt "${'$'}PROMPT" \
-    --temperature ${'$'}TEMP \
-    --seq_len ${'$'}SEQ_LEN \
-    --eval_mode 0
-else
-  exec ./qnn_llama_runner \
-    --model_path hybrid_llama_qnn.pte \
-    --tokenizer_path tokenizer.json \
-    --decoder_model_version qwen3 \
-    --output_path $RUNNER_DIR/outputs/response.txt \
-    --performance_output_path $RUNNER_DIR/outputs/perf.txt \
-    --shared_buffer \
-    --prompt "${'$'}PROMPT" \
-    --temperature ${'$'}TEMP \
-    --seq_len ${'$'}SEQ_LEN \
-    --eval_mode 0
-fi
-""".trimIndent()
-
-        writeFileRoot("$RUNNER_DIR/run_llm.sh", script)
-        execRoot("chmod 755 $RUNNER_DIR/run_llm.sh")
-    }
-
    override fun release() {
+        try { llmModule?.resetNative() } catch (_: Throwable) {}
+        llmModule = null
        loaded = false
    }
-
-    private fun writeFileRoot(path: String, content: String) {
-        try {
-            val process = Runtime.getRuntime().exec(arrayOf("su", "-c", "cat > $path"))
-            process.outputStream.bufferedWriter().use { it.write(content) }
-            process.waitFor()
-        } catch (e: Exception) {
-            Log.e(TAG, "writeFileRoot failed: ${e.message}")
-        }
-    }
-
-    private fun execRoot(cmd: String): String {
-        return try {
-            val process = Runtime.getRuntime().exec(arrayOf("su", "-c", cmd))
-            val result = process.inputStream.bufferedReader().readText()
-            val error = process.errorStream.bufferedReader().readText()
-            process.waitFor()
-            if (error.isNotEmpty() && result.isEmpty()) error else result
-        } catch (e: Exception) {
-            Log.e(TAG, "execRoot failed: ${e.message}")
-            ""
-        }
-    }
 }
--- a/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaService.kt
+++ b/kazeia-android/app/src/main/java/com/kazeia/service/KazeiaService.kt
@ -518,7 +518,7 @@ class KazeiaService : Service() {

                // LLM: Qwen3-4B on Hexagon V79 via in-process LlmModule (QNN runner).
                _loadingState.value = LoadingState(50, "LLM Qwen3-4B NPU…")
-                llm = ExecuTorchLlmEngine { msg -> log(msg) }
+                llm = ExecuTorchLlmEngine(this@KazeiaService) { msg -> log(msg) }
                try {
                    llm.load("${KazeiaApplication.MODELS_DIR}/qwen3-4b", com.kazeia.core.LlmConfig())
                } catch (e: Exception) {